Diffstat (limited to 'lib/Target')
145 files changed, 6415 insertions, 1672 deletions
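The largest single addition below is a new Falkor-specific pass, AArch64FalkorHWPFFix.cpp, which folds each load's destination register, base register, and offset into a small hardware-prefetcher tag and then rewrites the base register of strided loads whose tags collide. A minimal standalone sketch of that tag computation follows (the bit layout mirrors the makeTag() helper added below; the register and offset encodings in the example are invented for illustration, not real AArch64 encodings):

#include <cstdio>

// Mirrors makeTag() from AArch64FalkorHWPFFix.cpp: the prefetcher tag is built
// from the low 4 bits of the destination register encoding, the low 4 bits of
// the base register encoding, and the low 6 bits of the offset.
static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) {
  return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8);
}

int main() {
  // Invented encodings for two loads in the same inner loop.
  unsigned TagA = makeTag(/*Dest=*/1, /*Base=*/2, /*Offset=*/0);
  unsigned TagB = makeTag(/*Dest=*/17, /*Base=*/18, /*Offset=*/64);
  // 1 vs 17 and 2 vs 18 agree in their low 4 bits, and 0 vs 64 agree in their
  // low 6 bits, so both loads hash to the same tag; the late pass breaks such
  // collisions by copying the base into a scratch register with a free tag.
  std::printf("TagA=0x%x TagB=0x%x collide=%d\n", TagA, TagB, TagA == TagB);
  return 0;
}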
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h index 37b9690d0434a..1dda746a6be1e 100644 --- a/lib/Target/AArch64/AArch64.h +++ b/lib/Target/AArch64/AArch64.h @@ -44,6 +44,8 @@ ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64A57FPLoadBalancing(); FunctionPass *createAArch64A53Fix835769(); +FunctionPass *createFalkorHWPFFixPass(); +FunctionPass *createFalkorMarkStridedAccessesPass(); FunctionPass *createAArch64CleanupLocalDynamicTLSPass(); @@ -66,6 +68,8 @@ void initializeAArch64VectorByElementOptPass(PassRegistry&); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); +void initializeFalkorHWPFFixPass(PassRegistry&); +void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&); void initializeLDTLSCleanupPass(PassRegistry&); } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 53eef79c4df3f..436bf1193304c 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -50,6 +50,9 @@ def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", def FeatureSPE : SubtargetFeature<"spe", "HasSPE", "true", "Enable Statistical Profiling extension">; +def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true", + "Enable Scalable Vector Extension (SVE) instructions">; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -269,6 +272,7 @@ def ProcExynosM2 : SubtargetFeature<"exynosm2", "ARMProcFamily", "ExynosM1", FeatureCrypto, FeatureCustomCheapAsMoveHandling, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon, FeaturePostRAScheduler, diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td index 938779d23690d..291bc5ea858e3 100644 --- a/lib/Target/AArch64/AArch64CallingConvention.td +++ b/lib/Target/AArch64/AArch64CallingConvention.td @@ -118,6 +118,13 @@ def RetCC_AArch64_AAPCS : CallingConv<[ CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>> ]>; +// Vararg functions on windows pass floats in integer registers +def CC_AArch64_Win64_VarArg : CallingConv<[ + CCIfType<[f16, f32], CCPromoteToType<f64>>, + CCIfType<[f64], CCBitConvertToType<i64>>, + CCDelegateTo<CC_AArch64_AAPCS> +]>; + // Darwin uses a calling convention which differs in only two ways // from the standard one at this level: diff --git a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp index ee54550c9900b..b72f23b109d94 100644 --- a/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp +++ b/lib/Target/AArch64/AArch64DeadRegisterDefinitionsPass.cpp @@ -102,6 +102,10 @@ void AArch64DeadRegisterDefinitions::processMachineBasicBlock( case AArch64::LDADDALh: case AArch64::LDADDALs: case AArch64::LDADDALd: + case AArch64::LDCLRALb: + case AArch64::LDCLRALh: + case AArch64::LDCLRALs: + case AArch64::LDCLRALd: case AArch64::LDEORALb: case AArch64::LDEORALh: case AArch64::LDEORALs: diff --git a/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp new file mode 100644 index 0000000000000..c0e22355a9ff6 --- /dev/null +++ b/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp @@ -0,0 +1,790 @@ +//===-- AArch64FalkorHWPFFix.cpp - Avoid HW 
prefetcher pitfalls on Falkor--===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file For Falkor, we want to avoid HW prefetcher instruction tag collisions +/// that may inhibit the HW prefetching. This is done in two steps. Before +/// ISel, we mark strided loads (i.e. those that will likely benefit from +/// prefetching) with metadata. Then, after opcodes have been finalized, we +/// insert MOVs and re-write loads to prevent unintentional tag collisions. +// ===---------------------------------------------------------------------===// + +#include "AArch64.h" +#include "AArch64InstrInfo.h" +#include "AArch64TargetMachine.h" +#include "llvm/ADT/DepthFirstIterator.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/ScalarEvolutionExpressions.h" +#include "llvm/CodeGen/LiveRegUnits.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/Dominators.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" + +using namespace llvm; + +#define DEBUG_TYPE "falkor-hwpf-fix" + +STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked"); +STATISTIC(NumCollisionsAvoided, + "Number of HW prefetch tag collisions avoided"); +STATISTIC(NumCollisionsNotAvoided, + "Number of HW prefetch tag collisions not avoided due to lack of registers"); + +namespace { + +class FalkorMarkStridedAccesses { public: + FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE) + : LI(LI), SE(SE) {} + + bool run(); + +private: + bool runOnLoop(Loop &L); + + LoopInfo &LI; + ScalarEvolution &SE; +}; + +class FalkorMarkStridedAccessesLegacy : public FunctionPass { +public: + static char ID; // Pass ID, replacement for typeid + FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) { + initializeFalkorMarkStridedAccessesLegacyPass( + *PassRegistry::getPassRegistry()); + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetPassConfig>(); + AU.addPreserved<DominatorTreeWrapperPass>(); + AU.addRequired<LoopInfoWrapperPass>(); + AU.addPreserved<LoopInfoWrapperPass>(); + AU.addRequired<ScalarEvolutionWrapperPass>(); + // FIXME: For some reason, preserving SE here breaks LSR (even if + // this pass changes nothing).
+ // AU.addPreserved<ScalarEvolutionWrapperPass>(); + } + + bool runOnFunction(Function &F) override; +}; +} // namespace + +char FalkorMarkStridedAccessesLegacy::ID = 0; +INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) +INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) +INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE, + "Falkor HW Prefetch Fix", false, false) + +FunctionPass *llvm::createFalkorMarkStridedAccessesPass() { + return new FalkorMarkStridedAccessesLegacy(); +} + +bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) { + TargetPassConfig &TPC = getAnalysis<TargetPassConfig>(); + const AArch64Subtarget *ST = + TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F); + if (ST->getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(F)) + return false; + + LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo(); + ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE(); + + FalkorMarkStridedAccesses LDP(LI, SE); + return LDP.run(); +} + +bool FalkorMarkStridedAccesses::run() { + bool MadeChange = false; + + for (Loop *L : LI) + for (auto LIt = df_begin(L), LE = df_end(L); LIt != LE; ++LIt) + MadeChange |= runOnLoop(**LIt); + + return MadeChange; +} + +bool FalkorMarkStridedAccesses::runOnLoop(Loop &L) { + // Only mark strided loads in the inner-most loop + if (!L.empty()) + return false; + + bool MadeChange = false; + + for (BasicBlock *BB : L.blocks()) { + for (Instruction &I : *BB) { + LoadInst *LoadI = dyn_cast<LoadInst>(&I); + if (!LoadI) + continue; + + Value *PtrValue = LoadI->getPointerOperand(); + if (L.isLoopInvariant(PtrValue)) + continue; + + const SCEV *LSCEV = SE.getSCEV(PtrValue); + const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV); + if (!LSCEVAddRec || !LSCEVAddRec->isAffine()) + continue; + + LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD, + MDNode::get(LoadI->getContext(), {})); + ++NumStridedLoadsMarked; + DEBUG(dbgs() << "Load: " << I << " marked as strided\n"); + MadeChange = true; + } + } + + return MadeChange; +} + +namespace { + +class FalkorHWPFFix : public MachineFunctionPass { +public: + static char ID; + + FalkorHWPFFix() : MachineFunctionPass(ID) { + initializeFalkorHWPFFixPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &Fn) override; + + virtual void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<MachineLoopInfo>(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + MachineFunctionProperties getRequiredProperties() const override { + return MachineFunctionProperties().set( + MachineFunctionProperties::Property::NoVRegs); + } + +private: + void runOnLoop(MachineLoop &L, MachineFunction &Fn); + + const AArch64InstrInfo *TII; + const TargetRegisterInfo *TRI; + DenseMap<unsigned, SmallVector<MachineInstr *, 4>> TagMap; + bool Modified; +}; + +/// Bits from load opcodes used to compute HW prefetcher instruction tags. 
+struct LoadInfo { + LoadInfo() + : DestReg(0), BaseReg(0), BaseRegIdx(-1), OffsetOpnd(nullptr), + IsPrePost(false) {} + unsigned DestReg; + unsigned BaseReg; + int BaseRegIdx; + const MachineOperand *OffsetOpnd; + bool IsPrePost; +}; + +} // namespace + +char FalkorHWPFFix::ID = 0; + +INITIALIZE_PASS_BEGIN(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(FalkorHWPFFix, "falkor-hwpf-fix-late", + "Falkor HW Prefetch Fix Late Phase", false, false) + +static unsigned makeTag(unsigned Dest, unsigned Base, unsigned Offset) { + return (Dest & 0xf) | ((Base & 0xf) << 4) | ((Offset & 0x3f) << 8); +} + +static Optional<LoadInfo> getLoadInfo(const MachineInstr &MI) { + int DestRegIdx; + int BaseRegIdx; + int OffsetIdx; + bool IsPrePost; + + switch (MI.getOpcode()) { + default: + return None; + + case AArch64::LD1i8: + case AArch64::LD1i16: + case AArch64::LD1i32: + case AArch64::LD1i64: + case AArch64::LD2i8: + case AArch64::LD2i16: + case AArch64::LD2i32: + case AArch64::LD2i64: + case AArch64::LD3i8: + case AArch64::LD3i16: + case AArch64::LD3i32: + case AArch64::LD4i8: + case AArch64::LD4i16: + case AArch64::LD4i32: + DestRegIdx = 0; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD3i64: + case AArch64::LD4i64: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d: + case AArch64::LD1Onev2s: + case AArch64::LD1Onev4h: + case AArch64::LD1Onev8b: + case AArch64::LD1Onev2d: + case AArch64::LD1Onev4s: + case AArch64::LD1Onev8h: + case AArch64::LD1Onev16b: + case AArch64::LD1Rv1d: + case AArch64::LD1Rv2s: + case AArch64::LD1Rv4h: + case AArch64::LD1Rv8b: + case AArch64::LD1Rv2d: + case AArch64::LD1Rv4s: + case AArch64::LD1Rv8h: + case AArch64::LD1Rv16b: + case AArch64::LD1Twov1d: + case AArch64::LD1Twov2s: + case AArch64::LD1Twov4h: + case AArch64::LD1Twov8b: + case AArch64::LD2Twov2s: + case AArch64::LD2Twov4s: + case AArch64::LD2Twov8b: + case AArch64::LD2Rv1d: + case AArch64::LD2Rv2s: + case AArch64::LD2Rv4s: + case AArch64::LD2Rv8b: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d: + case AArch64::LD1Twov4s: + case AArch64::LD1Twov8h: + case AArch64::LD1Twov16b: + case AArch64::LD1Threev1d: + case AArch64::LD1Threev2s: + case AArch64::LD1Threev4h: + case AArch64::LD1Threev8b: + case AArch64::LD1Threev2d: + case AArch64::LD1Threev4s: + case AArch64::LD1Threev8h: + case AArch64::LD1Threev16b: + case AArch64::LD1Fourv1d: + case AArch64::LD1Fourv2s: + case AArch64::LD1Fourv4h: + case AArch64::LD1Fourv8b: + case AArch64::LD1Fourv2d: + case AArch64::LD1Fourv4s: + case AArch64::LD1Fourv8h: + case AArch64::LD1Fourv16b: + case AArch64::LD2Twov2d: + case AArch64::LD2Twov4h: + case AArch64::LD2Twov8h: + case AArch64::LD2Twov16b: + case AArch64::LD2Rv2d: + case AArch64::LD2Rv4h: + case AArch64::LD2Rv8h: + case AArch64::LD2Rv16b: + case AArch64::LD3Threev2s: + case AArch64::LD3Threev4h: + case AArch64::LD3Threev8b: + case AArch64::LD3Threev2d: + case AArch64::LD3Threev4s: + case AArch64::LD3Threev8h: + case AArch64::LD3Threev16b: + case AArch64::LD3Rv1d: + case AArch64::LD3Rv2s: + case AArch64::LD3Rv4h: + case AArch64::LD3Rv8b: + case AArch64::LD3Rv2d: + case AArch64::LD3Rv4s: + case AArch64::LD3Rv8h: + case AArch64::LD3Rv16b: + case AArch64::LD4Fourv2s: + case AArch64::LD4Fourv4h: + case AArch64::LD4Fourv8b: + case AArch64::LD4Fourv2d: + case 
AArch64::LD4Fourv4s: + case AArch64::LD4Fourv8h: + case AArch64::LD4Fourv16b: + case AArch64::LD4Rv1d: + case AArch64::LD4Rv2s: + case AArch64::LD4Rv4h: + case AArch64::LD4Rv8b: + case AArch64::LD4Rv2d: + case AArch64::LD4Rv4s: + case AArch64::LD4Rv8h: + case AArch64::LD4Rv16b: + DestRegIdx = -1; + BaseRegIdx = 1; + OffsetIdx = -1; + IsPrePost = false; + break; + + case AArch64::LD1i8_POST: + case AArch64::LD1i16_POST: + case AArch64::LD1i32_POST: + case AArch64::LD1i64_POST: + case AArch64::LD2i8_POST: + case AArch64::LD2i16_POST: + case AArch64::LD2i32_POST: + case AArch64::LD2i64_POST: + case AArch64::LD3i8_POST: + case AArch64::LD3i16_POST: + case AArch64::LD3i32_POST: + case AArch64::LD4i8_POST: + case AArch64::LD4i16_POST: + case AArch64::LD4i32_POST: + DestRegIdx = 1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD3i64_POST: + case AArch64::LD4i64_POST: + DestRegIdx = -1; + BaseRegIdx = 4; + OffsetIdx = 5; + IsPrePost = false; + break; + + case AArch64::LD1Onev1d_POST: + case AArch64::LD1Onev2s_POST: + case AArch64::LD1Onev4h_POST: + case AArch64::LD1Onev8b_POST: + case AArch64::LD1Onev2d_POST: + case AArch64::LD1Onev4s_POST: + case AArch64::LD1Onev8h_POST: + case AArch64::LD1Onev16b_POST: + case AArch64::LD1Rv1d_POST: + case AArch64::LD1Rv2s_POST: + case AArch64::LD1Rv4h_POST: + case AArch64::LD1Rv8b_POST: + case AArch64::LD1Rv2d_POST: + case AArch64::LD1Rv4s_POST: + case AArch64::LD1Rv8h_POST: + case AArch64::LD1Rv16b_POST: + case AArch64::LD1Twov1d_POST: + case AArch64::LD1Twov2s_POST: + case AArch64::LD1Twov4h_POST: + case AArch64::LD1Twov8b_POST: + case AArch64::LD2Twov2s_POST: + case AArch64::LD2Twov4s_POST: + case AArch64::LD2Twov8b_POST: + case AArch64::LD2Rv1d_POST: + case AArch64::LD2Rv2s_POST: + case AArch64::LD2Rv4s_POST: + case AArch64::LD2Rv8b_POST: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LD1Twov2d_POST: + case AArch64::LD1Twov4s_POST: + case AArch64::LD1Twov8h_POST: + case AArch64::LD1Twov16b_POST: + case AArch64::LD1Threev1d_POST: + case AArch64::LD1Threev2s_POST: + case AArch64::LD1Threev4h_POST: + case AArch64::LD1Threev8b_POST: + case AArch64::LD1Threev2d_POST: + case AArch64::LD1Threev4s_POST: + case AArch64::LD1Threev8h_POST: + case AArch64::LD1Threev16b_POST: + case AArch64::LD1Fourv1d_POST: + case AArch64::LD1Fourv2s_POST: + case AArch64::LD1Fourv4h_POST: + case AArch64::LD1Fourv8b_POST: + case AArch64::LD1Fourv2d_POST: + case AArch64::LD1Fourv4s_POST: + case AArch64::LD1Fourv8h_POST: + case AArch64::LD1Fourv16b_POST: + case AArch64::LD2Twov2d_POST: + case AArch64::LD2Twov4h_POST: + case AArch64::LD2Twov8h_POST: + case AArch64::LD2Twov16b_POST: + case AArch64::LD2Rv2d_POST: + case AArch64::LD2Rv4h_POST: + case AArch64::LD2Rv8h_POST: + case AArch64::LD2Rv16b_POST: + case AArch64::LD3Threev2s_POST: + case AArch64::LD3Threev4h_POST: + case AArch64::LD3Threev8b_POST: + case AArch64::LD3Threev2d_POST: + case AArch64::LD3Threev4s_POST: + case AArch64::LD3Threev8h_POST: + case AArch64::LD3Threev16b_POST: + case AArch64::LD3Rv1d_POST: + case AArch64::LD3Rv2s_POST: + case AArch64::LD3Rv4h_POST: + case AArch64::LD3Rv8b_POST: + case AArch64::LD3Rv2d_POST: + case AArch64::LD3Rv4s_POST: + case AArch64::LD3Rv8h_POST: + case AArch64::LD3Rv16b_POST: + case AArch64::LD4Fourv2s_POST: + case AArch64::LD4Fourv4h_POST: + case AArch64::LD4Fourv8b_POST: + case AArch64::LD4Fourv2d_POST: + case AArch64::LD4Fourv4s_POST: + case AArch64::LD4Fourv8h_POST: + case AArch64::LD4Fourv16b_POST: + 
case AArch64::LD4Rv1d_POST: + case AArch64::LD4Rv2s_POST: + case AArch64::LD4Rv4h_POST: + case AArch64::LD4Rv8b_POST: + case AArch64::LD4Rv2d_POST: + case AArch64::LD4Rv4s_POST: + case AArch64::LD4Rv8h_POST: + case AArch64::LD4Rv16b_POST: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDRBBroW: + case AArch64::LDRBBroX: + case AArch64::LDRBBui: + case AArch64::LDRBroW: + case AArch64::LDRBroX: + case AArch64::LDRBui: + case AArch64::LDRDl: + case AArch64::LDRDroW: + case AArch64::LDRDroX: + case AArch64::LDRDui: + case AArch64::LDRHHroW: + case AArch64::LDRHHroX: + case AArch64::LDRHHui: + case AArch64::LDRHroW: + case AArch64::LDRHroX: + case AArch64::LDRHui: + case AArch64::LDRQl: + case AArch64::LDRQroW: + case AArch64::LDRQroX: + case AArch64::LDRQui: + case AArch64::LDRSBWroW: + case AArch64::LDRSBWroX: + case AArch64::LDRSBWui: + case AArch64::LDRSBXroW: + case AArch64::LDRSBXroX: + case AArch64::LDRSBXui: + case AArch64::LDRSHWroW: + case AArch64::LDRSHWroX: + case AArch64::LDRSHWui: + case AArch64::LDRSHXroW: + case AArch64::LDRSHXroX: + case AArch64::LDRSHXui: + case AArch64::LDRSWl: + case AArch64::LDRSWroW: + case AArch64::LDRSWroX: + case AArch64::LDRSWui: + case AArch64::LDRSl: + case AArch64::LDRSroW: + case AArch64::LDRSroX: + case AArch64::LDRSui: + case AArch64::LDRWl: + case AArch64::LDRWroW: + case AArch64::LDRWroX: + case AArch64::LDRWui: + case AArch64::LDRXl: + case AArch64::LDRXroW: + case AArch64::LDRXroX: + case AArch64::LDRXui: + case AArch64::LDURBBi: + case AArch64::LDURBi: + case AArch64::LDURDi: + case AArch64::LDURHHi: + case AArch64::LDURHi: + case AArch64::LDURQi: + case AArch64::LDURSBWi: + case AArch64::LDURSBXi: + case AArch64::LDURSHWi: + case AArch64::LDURSHXi: + case AArch64::LDURSWi: + case AArch64::LDURSi: + case AArch64::LDURWi: + case AArch64::LDURXi: + DestRegIdx = 0; + BaseRegIdx = 1; + OffsetIdx = 2; + IsPrePost = false; + break; + + case AArch64::LDRBBpost: + case AArch64::LDRBBpre: + case AArch64::LDRBpost: + case AArch64::LDRBpre: + case AArch64::LDRDpost: + case AArch64::LDRDpre: + case AArch64::LDRHHpost: + case AArch64::LDRHHpre: + case AArch64::LDRHpost: + case AArch64::LDRHpre: + case AArch64::LDRQpost: + case AArch64::LDRQpre: + case AArch64::LDRSBWpost: + case AArch64::LDRSBWpre: + case AArch64::LDRSBXpost: + case AArch64::LDRSBXpre: + case AArch64::LDRSHWpost: + case AArch64::LDRSHWpre: + case AArch64::LDRSHXpost: + case AArch64::LDRSHXpre: + case AArch64::LDRSWpost: + case AArch64::LDRSWpre: + case AArch64::LDRSpost: + case AArch64::LDRSpre: + case AArch64::LDRWpost: + case AArch64::LDRWpre: + case AArch64::LDRXpost: + case AArch64::LDRXpre: + DestRegIdx = 1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = true; + break; + + case AArch64::LDPDi: + case AArch64::LDPQi: + DestRegIdx = -1; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDPSWi: + case AArch64::LDPSi: + case AArch64::LDPWi: + case AArch64::LDPXi: + DestRegIdx = 0; + BaseRegIdx = 2; + OffsetIdx = 3; + IsPrePost = false; + break; + + case AArch64::LDPQpost: + case AArch64::LDPQpre: + DestRegIdx = -1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = true; + break; + + case AArch64::LDPDpost: + case AArch64::LDPDpre: + case AArch64::LDPSWpost: + case AArch64::LDPSWpre: + case AArch64::LDPSpost: + case AArch64::LDPSpre: + case AArch64::LDPWpost: + case AArch64::LDPWpre: + case AArch64::LDPXpost: + case AArch64::LDPXpre: + DestRegIdx = 1; + BaseRegIdx = 3; + OffsetIdx = 4; + IsPrePost = 
true; + break; + } + + LoadInfo LI; + LI.DestReg = DestRegIdx == -1 ? 0 : MI.getOperand(DestRegIdx).getReg(); + LI.BaseReg = MI.getOperand(BaseRegIdx).getReg(); + LI.BaseRegIdx = BaseRegIdx; + LI.OffsetOpnd = OffsetIdx == -1 ? nullptr : &MI.getOperand(OffsetIdx); + LI.IsPrePost = IsPrePost; + return LI; +} + +static Optional<unsigned> getTag(const TargetRegisterInfo *TRI, + const MachineInstr &MI, const LoadInfo &LI) { + unsigned Dest = LI.DestReg ? TRI->getEncodingValue(LI.DestReg) : 0; + unsigned Base = TRI->getEncodingValue(LI.BaseReg); + unsigned Off; + if (LI.OffsetOpnd == nullptr) + Off = 0; + else if (LI.OffsetOpnd->isGlobal() || LI.OffsetOpnd->isSymbol() || + LI.OffsetOpnd->isCPI()) + return None; + else if (LI.OffsetOpnd->isReg()) + Off = (1 << 5) | TRI->getEncodingValue(LI.OffsetOpnd->getReg()); + else + Off = LI.OffsetOpnd->getImm() >> 2; + + return makeTag(Dest, Base, Off); +} + +void FalkorHWPFFix::runOnLoop(MachineLoop &L, MachineFunction &Fn) { + // Build the initial tag map for the whole loop. + TagMap.clear(); + for (MachineBasicBlock *MBB : L.getBlocks()) + for (MachineInstr &MI : *MBB) { + Optional<LoadInfo> LInfo = getLoadInfo(MI); + if (!LInfo) + continue; + Optional<unsigned> Tag = getTag(TRI, MI, *LInfo); + if (!Tag) + continue; + TagMap[*Tag].push_back(&MI); + } + + bool AnyCollisions = false; + for (auto &P : TagMap) { + auto Size = P.second.size(); + if (Size > 1) { + for (auto *MI : P.second) { + if (TII->isStridedAccess(*MI)) { + AnyCollisions = true; + break; + } + } + } + if (AnyCollisions) + break; + } + // Nothing to fix. + if (!AnyCollisions) + return; + + MachineRegisterInfo &MRI = Fn.getRegInfo(); + + // Go through all the basic blocks in the current loop and fix any streaming + // loads to avoid collisions with any other loads. + LiveRegUnits LR(*TRI); + for (MachineBasicBlock *MBB : L.getBlocks()) { + LR.clear(); + LR.addLiveOuts(*MBB); + for (auto I = MBB->rbegin(); I != MBB->rend(); LR.stepBackward(*I), ++I) { + MachineInstr &MI = *I; + if (!TII->isStridedAccess(MI)) + continue; + + LoadInfo LdI = *getLoadInfo(MI); + unsigned OldTag = *getTag(TRI, MI, LdI); + auto &OldCollisions = TagMap[OldTag]; + if (OldCollisions.size() <= 1) + continue; + + bool Fixed = false; + DEBUG(dbgs() << "Attempting to fix tag collision: " << MI); + + for (unsigned ScratchReg : AArch64::GPR64RegClass) { + if (!LR.available(ScratchReg) || MRI.isReserved(ScratchReg)) + continue; + + LoadInfo NewLdI(LdI); + NewLdI.BaseReg = ScratchReg; + unsigned NewTag = *getTag(TRI, MI, NewLdI); + // Scratch reg tag would collide too, so don't use it. + if (TagMap.count(NewTag)) + continue; + + DEBUG(dbgs() << "Changing base reg to: " << PrintReg(ScratchReg, TRI) + << '\n'); + + // Rewrite: + // Xd = LOAD Xb, off + // to: + // Xc = MOV Xb + // Xd = LOAD Xc, off + DebugLoc DL = MI.getDebugLoc(); + BuildMI(*MBB, &MI, DL, TII->get(AArch64::ORRXrs), ScratchReg) + .addReg(AArch64::XZR) + .addReg(LdI.BaseReg) + .addImm(0); + MachineOperand &BaseOpnd = MI.getOperand(LdI.BaseRegIdx); + BaseOpnd.setReg(ScratchReg); + + // If the load does a pre/post increment, then insert a MOV after as + // well to update the real base register. + if (LdI.IsPrePost) { + DEBUG(dbgs() << "Doing post MOV of incremented reg: " + << PrintReg(ScratchReg, TRI) << '\n'); + MI.getOperand(0).setReg( + ScratchReg); // Change tied operand pre/post update dest. 
+ BuildMI(*MBB, std::next(MachineBasicBlock::iterator(MI)), DL, + TII->get(AArch64::ORRXrs), LdI.BaseReg) + .addReg(AArch64::XZR) + .addReg(ScratchReg) + .addImm(0); + } + + for (int I = 0, E = OldCollisions.size(); I != E; ++I) + if (OldCollisions[I] == &MI) { + std::swap(OldCollisions[I], OldCollisions[E - 1]); + OldCollisions.pop_back(); + break; + } + + // Update TagMap to reflect instruction changes to reduce the number + // of later MOVs to be inserted. This needs to be done after + // OldCollisions is updated since it may be relocated by this + // insertion. + TagMap[NewTag].push_back(&MI); + ++NumCollisionsAvoided; + Fixed = true; + Modified = true; + break; + } + if (!Fixed) + ++NumCollisionsNotAvoided; + } + } +} + +bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) { + auto &ST = static_cast<const AArch64Subtarget &>(Fn.getSubtarget()); + if (ST.getProcFamily() != AArch64Subtarget::Falkor) + return false; + + if (skipFunction(*Fn.getFunction())) + return false; + + TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo()); + TRI = ST.getRegisterInfo(); + + assert(TRI->trackLivenessAfterRegAlloc(Fn) && + "Register liveness not available!"); + + MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>(); + + Modified = false; + + for (MachineLoop *I : LI) + for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L) + // Only process inner-loops + if (L->empty()) + runOnLoop(**L, Fn); + + return Modified; +} + +FunctionPass *llvm::createFalkorHWPFFixPass() { return new FalkorHWPFFix(); } diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 3682b62d2b84d..97396057dce07 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -5138,6 +5138,7 @@ bool AArch64FastISel::fastSelectInstruction(const Instruction *I) { return selectOperator(I, I->getOpcode()); // Silence warnings. (void)&CC_AArch64_DarwinPCS_VarArg; + (void)&CC_AArch64_Win64_VarArg; } namespace llvm { diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp index e96ee7d29b3e8..4907d082eda09 100644 --- a/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -41,6 +41,10 @@ // | | // |-----------------------------------| // | | +// | (Win64 only) varargs from reg | +// | | +// |-----------------------------------| +// | | // | prev_fp, prev_lr | // | (a.k.a. 
"frame record") | // |-----------------------------------| <- fp(=x29) @@ -950,7 +954,13 @@ static void computeCalleeSaveRegisterPairs( CC == CallingConv::PreserveMost || (Count & 1) == 0) && "Odd number of callee-saved regs to spill!"); - unsigned Offset = AFI->getCalleeSavedStackSize(); + int Offset = AFI->getCalleeSavedStackSize(); + + unsigned GPRSaveSize = AFI->getVarArgsGPRSize(); + const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>(); + bool IsWin64 = Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()); + if (IsWin64) + Offset -= alignTo(GPRSaveSize, 16); for (unsigned i = 0; i < Count; ++i) { RegPairInfo RPI; diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 04687847c1a30..06005f6b68861 100644 --- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -239,10 +239,17 @@ bool AArch64DAGToDAGISel::SelectInlineAsmMemoryOperand( case InlineAsm::Constraint_i: case InlineAsm::Constraint_m: case InlineAsm::Constraint_Q: - // Require the address to be in a register. That is safe for all AArch64 - // variants and it is hard to do anything much smarter without knowing - // how the operand is used. - OutOps.push_back(Op); + // We need to make sure that this one operand does not end up in XZR, thus + // require the address to be in a PointerRegClass register. + const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo(); + const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF); + SDLoc dl(Op); + SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i64); + SDValue NewOp = + SDValue(CurDAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, + dl, Op.getValueType(), + Op, RC), 0); + OutOps.push_back(NewOp); return false; } return true; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 60fde5caa3393..c6150f9e5d1d7 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2650,9 +2650,13 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, case CallingConv::PreserveMost: case CallingConv::CXX_FAST_TLS: case CallingConv::Swift: + if (Subtarget->isTargetWindows() && IsVarArg) + return CC_AArch64_Win64_VarArg; if (!Subtarget->isTargetDarwin()) return CC_AArch64_AAPCS; return IsVarArg ? CC_AArch64_DarwinPCS_VarArg : CC_AArch64_DarwinPCS; + case CallingConv::Win64: + return IsVarArg ? CC_AArch64_Win64_VarArg : CC_AArch64_AAPCS; } } @@ -2668,6 +2672,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const { MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); // Assign locations to all of the incoming arguments. SmallVector<CCValAssign, 16> ArgLocs; @@ -2824,10 +2829,12 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // varargs AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); if (isVarArg) { - if (!Subtarget->isTargetDarwin()) { + if (!Subtarget->isTargetDarwin() || IsWin64) { // The AAPCS variadic function ABI is identical to the non-variadic // one. As a result there may be more arguments in registers and we should // save them for future reference. + // Win64 variadic functions also pass arguments in registers, but all float + // arguments are passed in integer registers. 
saveVarArgRegisters(CCInfo, DAG, DL, Chain); } @@ -2869,6 +2876,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, MachineFrameInfo &MFI = MF.getFrameInfo(); AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>(); auto PtrVT = getPointerTy(DAG.getDataLayout()); + bool IsWin64 = Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv()); SmallVector<SDValue, 8> MemOps; @@ -2881,7 +2889,10 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, unsigned GPRSaveSize = 8 * (NumGPRArgRegs - FirstVariadicGPR); int GPRIdx = 0; if (GPRSaveSize != 0) { - GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); + if (IsWin64) + GPRIdx = MFI.CreateFixedObject(GPRSaveSize, -(int)GPRSaveSize, false); + else + GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false); SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT); @@ -2890,7 +2901,11 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); SDValue Store = DAG.getStore( Val.getValue(1), DL, Val, FIN, - MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); + IsWin64 + ? MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), + GPRIdx, + (i - FirstVariadicGPR) * 8) + : MachinePointerInfo::getStack(DAG.getMachineFunction(), i * 8)); MemOps.push_back(Store); FIN = DAG.getNode(ISD::ADD, DL, PtrVT, FIN, DAG.getConstant(8, DL, PtrVT)); @@ -2899,7 +2914,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo, FuncInfo->setVarArgsGPRIndex(GPRIdx); FuncInfo->setVarArgsGPRSize(GPRSaveSize); - if (Subtarget->hasFPARMv8()) { + if (Subtarget->hasFPARMv8() && !IsWin64) { static const MCPhysReg FPRArgRegs[] = { AArch64::Q0, AArch64::Q1, AArch64::Q2, AArch64::Q3, AArch64::Q4, AArch64::Q5, AArch64::Q6, AArch64::Q7}; @@ -4491,6 +4506,21 @@ SDValue AArch64TargetLowering::LowerDarwin_VASTART(SDValue Op, MachinePointerInfo(SV)); } +SDValue AArch64TargetLowering::LowerWin64_VASTART(SDValue Op, + SelectionDAG &DAG) const { + AArch64FunctionInfo *FuncInfo = + DAG.getMachineFunction().getInfo<AArch64FunctionInfo>(); + + SDLoc DL(Op); + SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsGPRSize() > 0 + ? FuncInfo->getVarArgsGPRIndex() + : FuncInfo->getVarArgsStackIndex(), + getPointerTy(DAG.getDataLayout())); + const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue(); + return DAG.getStore(Op.getOperand(0), DL, FR, Op.getOperand(1), + MachinePointerInfo(SV)); +} + SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const { // The layout of the va_list struct is specified in the AArch64 Procedure Call @@ -4562,8 +4592,14 @@ SDValue AArch64TargetLowering::LowerAAPCS_VASTART(SDValue Op, SDValue AArch64TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { - return Subtarget->isTargetDarwin() ? LowerDarwin_VASTART(Op, DAG) - : LowerAAPCS_VASTART(Op, DAG); + MachineFunction &MF = DAG.getMachineFunction(); + + if (Subtarget->isCallingConvWin64(MF.getFunction()->getCallingConv())) + return LowerWin64_VASTART(Op, DAG); + else if (Subtarget->isTargetDarwin()) + return LowerDarwin_VASTART(Op, DAG); + else + return LowerAAPCS_VASTART(Op, DAG); } SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, @@ -4571,7 +4607,8 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op, // AAPCS has three pointers and two ints (= 32 bytes), Darwin has single // pointer. SDLoc DL(Op); - unsigned VaListSize = Subtarget->isTargetDarwin() ? 
8 : 32; + unsigned VaListSize = + Subtarget->isTargetDarwin() || Subtarget->isTargetWindows() ? 8 : 32; const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue(); const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue(); @@ -7451,6 +7488,14 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy, return (DL.getTypeSizeInBits(VecTy) + 127) / 128; } +MachineMemOperand::Flags +AArch64TargetLowering::getMMOFlags(const Instruction &I) const { + if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor && + I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr) + return MOStridedAccess; + return MachineMemOperand::MONone; +} + bool AArch64TargetLowering::isLegalInterleavedAccessType( VectorType *VecTy, const DataLayout &DL) const { @@ -10567,9 +10612,6 @@ AArch64TargetLowering::shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const { if (Size > 128) return AtomicExpansionKind::None; // Nand not supported in LSE. if (AI->getOperation() == AtomicRMWInst::Nand) return AtomicExpansionKind::LLSC; - // Currently leaving And and Sub to LLSC - if ((AI->getOperation() == AtomicRMWInst::And) || (AI->getOperation() == AtomicRMWInst::Sub)) - return AtomicExpansionKind::LLSC; // Leave 128 bits to LLSC. return (Subtarget->hasLSE() && Size < 128) ? AtomicExpansionKind::None : AtomicExpansionKind::LLSC; } @@ -10783,7 +10825,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const { unsigned AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const { - if (Subtarget->isTargetDarwin()) + if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) return getPointerTy(DL).getSizeInBits(); return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32; diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h index ecc2517fb288d..3b0e0f1de8946 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.h +++ b/lib/Target/AArch64/AArch64ISelLowering.h @@ -408,6 +408,19 @@ public: bool isIntDivCheap(EVT VT, AttributeList Attr) const override; + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, + const SelectionDAG &DAG) const override { + // Do not merge to float value size (128 bytes) if no implicit + // float attribute is set. 
+ + bool NoFloat = DAG.getMachineFunction().getFunction()->hasFnAttribute( + Attribute::NoImplicitFloat); + + if (NoFloat) + return (MemVT.getSizeInBits() <= 64); + return true; + } + bool isCheapToSpeculateCttz() const override { return true; } @@ -455,6 +468,8 @@ public: unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL) const; + MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override; + private: bool isExtFreeImpl(const Instruction *Ext) const override; @@ -541,6 +556,7 @@ private: SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; SDValue LowerAAPCS_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerDarwin_VASTART(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerWin64_VASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/AArch64/AArch64InstrAtomics.td b/lib/Target/AArch64/AArch64InstrAtomics.td index de283b70210fe..eec41ddbc159f 100644 --- a/lib/Target/AArch64/AArch64InstrAtomics.td +++ b/lib/Target/AArch64/AArch64InstrAtomics.td @@ -451,3 +451,13 @@ def : Pat<(atomic_swap_8 GPR64:$Rn, GPR32:$Rs), (SWPALb GPR32:$Rs, GPR64sp:$Rn)> def : Pat<(atomic_swap_16 GPR64:$Rn, GPR32:$Rs), (SWPALh GPR32:$Rs, GPR64sp:$Rn)>; def : Pat<(atomic_swap_32 GPR64:$Rn, GPR32:$Rs), (SWPALs GPR32:$Rs, GPR64sp:$Rn)>; def : Pat<(atomic_swap_64 GPR64:$Rn, GPR64:$Rs), (SWPALd GPR64:$Rs, GPR64sp:$Rn)>; + +def : Pat<(atomic_load_sub_8 GPR64:$Rn, GPR32:$Rs), (LDADDALb (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_16 GPR64:$Rn, GPR32:$Rs), (LDADDALh (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_32 GPR64:$Rn, GPR32:$Rs), (LDADDALs (SUBWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_sub_64 GPR64:$Rn, GPR64:$Rs), (LDADDALd (SUBXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; + +def : Pat<(atomic_load_and_8 GPR64:$Rn, GPR32:$Rs), (LDCLRALb (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_16 GPR64:$Rn, GPR32:$Rs), (LDCLRALh (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_32 GPR64:$Rn, GPR32:$Rs), (LDCLRALs (ORNWrr WZR, GPR32:$Rs), GPR64sp:$Rn)>; +def : Pat<(atomic_load_and_64 GPR64:$Rn, GPR64:$Rs), (LDCLRALd (ORNXrr XZR, GPR64:$Rs), GPR64sp:$Rn)>; diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index dba3e4bdf82f1..c0c6055c358f7 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -52,9 +52,6 @@ using namespace llvm; #define GET_INSTRINFO_CTOR_DTOR #include "AArch64GenInstrInfo.inc" -static const MachineMemOperand::Flags MOSuppressPair = - MachineMemOperand::MOTargetFlag1; - static cl::opt<unsigned> TBZDisplacementBits("aarch64-tbz-offset-bits", cl::Hidden, cl::init(14), cl::desc("Restrict range of TB[N]Z instructions (DEBUG)")); @@ -1715,6 +1712,13 @@ void AArch64InstrInfo::suppressLdStPair(MachineInstr &MI) const { (*MI.memoperands_begin())->setFlags(MOSuppressPair); } +/// Check all MachineMemOperands for a hint that the load/store is strided. 
+bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const { + return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) { + return MMO->getFlags() & MOStridedAccess; + }); +} + bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const { switch (Opc) { default: @@ -4433,7 +4437,8 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = - {{MOSuppressPair, "aarch64-suppress-pair"}}; + {{MOSuppressPair, "aarch64-suppress-pair"}, + {MOStridedAccess, "aarch64-strided-access"}}; return makeArrayRef(TargetFlags); } diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 0809ede4df2a5..1765a0263ea44 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -27,6 +27,13 @@ namespace llvm { class AArch64Subtarget; class AArch64TargetMachine; +static const MachineMemOperand::Flags MOSuppressPair = + MachineMemOperand::MOTargetFlag1; +static const MachineMemOperand::Flags MOStridedAccess = + MachineMemOperand::MOTargetFlag2; + +#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access" + class AArch64InstrInfo final : public AArch64GenInstrInfo { const AArch64RegisterInfo RI; const AArch64Subtarget &Subtarget; @@ -81,6 +88,9 @@ public: /// unprofitable. bool isLdStPairSuppressed(const MachineInstr &MI) const; + /// Return true if the given load or store is a strided memory access. + bool isStridedAccess(const MachineInstr &MI) const; + /// Return true if this is an unscaled load/store. bool isUnscaledLdSt(unsigned Opc) const; @@ -356,7 +366,7 @@ enum AArch64FrameOffsetStatus { /// If result == AArch64FrameOffsetCannotUpdate, @p MI cannot be updated to /// use an offset.eq /// If result & AArch64FrameOffsetIsLegal, @p Offset can completely be -/// rewriten in @p MI. +/// rewritten in @p MI. /// If result & AArch64FrameOffsetCanUpdate, @p Offset contains the /// amount that is off the limit of the legal offset. 
/// If set, @p OutUseUnscaledOp will contain the whether @p MI should be diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 0be14673eb20b..0dcf07f984124 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -37,6 +37,8 @@ def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">, AssemblerPredicate<"FeatureFullFP16", "fullfp16">; def HasSPE : Predicate<"Subtarget->hasSPE()">, AssemblerPredicate<"FeatureSPE", "spe">; +def HasSVE : Predicate<"Subtarget->hasSVE()">, + AssemblerPredicate<"FeatureSVE", "sve">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 4a0a7c36baf8b..ffb27834c31c6 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -82,7 +82,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() { setAction({Op, 1, s1}, Legal); } - for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) + for (unsigned BinOp : {G_FADD, G_FSUB, G_FMA, G_FMUL, G_FDIV}) for (auto Ty : {s32, s64}) setAction({BinOp, Ty}, Legal); diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp index fab92e139dd09..9f7dcb3fe1c3f 100644 --- a/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -74,7 +74,7 @@ const uint32_t * AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_AArch64_NoRegs_RegMask; if (CC == CallingConv::AnyReg) return CSR_AArch64_AllRegs_RegMask; @@ -167,7 +167,7 @@ bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const { const TargetRegisterClass * AArch64RegisterInfo::getPointerRegClass(const MachineFunction &MF, unsigned Kind) const { - return &AArch64::GPR64RegClass; + return &AArch64::GPR64spRegClass; } const TargetRegisterClass * diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index a3238cf3b60f0..ea61124527365 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -134,7 +134,9 @@ void AArch64Subtarget::initializeProperties() { case CortexA72: PrefFunctionAlignment = 4; break; - case CortexA73: break; + case CortexA73: + PrefFunctionAlignment = 4; + break; case Others: break; } } @@ -171,7 +173,8 @@ struct AArch64GISelActualAccessor : public GISelAccessor { AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU, const std::string &FS, const TargetMachine &TM, bool LittleEndian) - : AArch64GenSubtargetInfo(TT, CPU, FS), ReserveX18(TT.isOSDarwin()), + : AArch64GenSubtargetInfo(TT, CPU, FS), + ReserveX18(TT.isOSDarwin() || TT.isOSWindows()), IsLittle(LittleEndian), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS, CPU)), TSInfo(), TLInfo(TM, *this), GISel() { diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index db53946cbc77f..5a1f45ee25528 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -70,6 +70,7 @@ protected: bool HasFullFP16 = false; bool HasSPE = false; bool HasLSLFast = false; + bool HasSVE = false; // HasZeroCycleRegMove 
- Has zero-cycle register mov instructions. bool HasZeroCycleRegMove = false; @@ -251,6 +252,7 @@ public: bool hasFullFP16() const { return HasFullFP16; } bool hasSPE() const { return HasSPE; } bool hasLSLFast() const { return HasLSLFast; } + bool hasSVE() const { return HasSVE; } bool isLittleEndian() const { return IsLittle; } @@ -304,6 +306,17 @@ public: bool enableEarlyIfConversion() const override; std::unique_ptr<PBQPRAConstraint> getCustomPBQPConstraints() const override; + + bool isCallingConvWin64(CallingConv::ID CC) const { + switch (CC) { + case CallingConv::C: + return isTargetWindows(); + case CallingConv::Win64: + return true; + default: + return false; + } + } }; } // End llvm namespace diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 6237b8f3e7b9b..ba28c01a2effb 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -138,6 +138,9 @@ static cl::opt<int> EnableGlobalISelAtO( cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"), cl::init(-1)); +static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix", + cl::init(true), cl::Hidden); + extern "C" void LLVMInitializeAArch64Target() { // Register the target. RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget()); @@ -158,6 +161,8 @@ extern "C" void LLVMInitializeAArch64Target() { initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); + initializeFalkorHWPFFixPass(*PR); + initializeFalkorMarkStridedAccessesLegacyPass(*PR); initializeLDTLSCleanupPass(*PR); } @@ -182,7 +187,7 @@ static std::string computeDataLayout(const Triple &TT, if (TT.isOSBinFormatMachO()) return "e-m:o-i64:64-i128:128-n32:64-S128"; if (TT.isOSBinFormatCOFF()) - return "e-m:w-i64:64-i128:128-n32:64-S128"; + return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128"; if (LittleEndian) return "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; return "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"; @@ -346,8 +351,12 @@ void AArch64PassConfig::addIRPasses() { // // Run this before LSR to remove the multiplies involved in computing the // pointer values N iterations ahead. - if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch) - addPass(createLoopDataPrefetchPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoopDataPrefetch) + addPass(createLoopDataPrefetchPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorMarkStridedAccessesPass()); + } TargetPassConfig::addIRPasses(); @@ -478,8 +487,12 @@ void AArch64PassConfig::addPreSched2() { // Expand some pseudo instructions to allow proper scheduling. addPass(createAArch64ExpandPseudoPass()); // Use load/store pair instructions when possible. 
- if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt) - addPass(createAArch64LoadStoreOptimizationPass()); + if (TM->getOptLevel() != CodeGenOpt::None) { + if (EnableLoadStoreOpt) + addPass(createAArch64LoadStoreOptimizationPass()); + if (EnableFalkorHWPFFix) + addPass(createFalkorHWPFFixPass()); + } } void AArch64PassConfig::addPreEmitPass() { diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h index fefa7e26b79f6..85de02e859e0c 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.h +++ b/lib/Target/AArch64/AArch64TargetMachine.h @@ -36,6 +36,8 @@ public: ~AArch64TargetMachine() override; const AArch64Subtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some, targets is + // deprecated and should not be used. const AArch64Subtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index e841fb8945191..a79d518205450 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -86,7 +86,7 @@ private: bool parseOperand(OperandVector &Operands, bool isCondCode, bool invertCondCode); - bool showMatchError(SMLoc Loc, unsigned ErrCode); + bool showMatchError(SMLoc Loc, unsigned ErrCode, OperandVector &Operands); bool parseDirectiveArch(SMLoc L); bool parseDirectiveCPU(SMLoc L); @@ -3257,7 +3257,10 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, } } -bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { +std::string AArch64MnemonicSpellCheck(StringRef S, uint64_t FBS); + +bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode, + OperandVector &Operands) { switch (ErrCode) { case Match_MissingFeature: return Error(Loc, @@ -3380,8 +3383,12 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) { return Error(Loc, "expected readable system register"); case Match_MSR: return Error(Loc, "expected writable system register or pstate"); - case Match_MnemonicFail: - return Error(Loc, "unrecognized instruction mnemonic"); + case Match_MnemonicFail: { + std::string Suggestion = AArch64MnemonicSpellCheck( + ((AArch64Operand &)*Operands[0]).getToken(), + ComputeAvailableFeatures(STI->getFeatureBits())); + return Error(Loc, "unrecognized instruction mnemonic" + Suggestion); + } default: llvm_unreachable("unexpected error code!"); } @@ -3707,7 +3714,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(IDLoc, Msg); } case Match_MnemonicFail: - return showMatchError(IDLoc, MatchResult); + return showMatchError(IDLoc, MatchResult, Operands); case Match_InvalidOperand: { SMLoc ErrorLoc = IDLoc; @@ -3726,7 +3733,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, ((AArch64Operand &)*Operands[ErrorInfo]).isTokenSuffix()) MatchResult = Match_InvalidSuffix; - return showMatchError(ErrorLoc, MatchResult); + return showMatchError(ErrorLoc, MatchResult, Operands); } case Match_InvalidMemoryIndexed1: case Match_InvalidMemoryIndexed2: @@ -3784,7 +3791,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = ((AArch64Operand &)*Operands[ErrorInfo]).getStartLoc(); if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc; - return showMatchError(ErrorLoc, MatchResult); + return showMatchError(ErrorLoc, MatchResult, Operands); } } diff --git 
a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt index 02b12b5e90ca2..f7e0a5c7bed39 100644 --- a/lib/Target/AArch64/CMakeLists.txt +++ b/lib/Target/AArch64/CMakeLists.txt @@ -47,6 +47,7 @@ add_llvm_target(AArch64CodeGen AArch64ConditionalCompares.cpp AArch64DeadRegisterDefinitionsPass.cpp AArch64ExpandPseudoInsts.cpp + AArch64FalkorHWPFFix.cpp AArch64FastISel.cpp AArch64A53Fix835769.cpp AArch64FrameLowering.cpp diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index a7a7daf4b4a55..2bd0cbf9f7c6a 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -104,8 +104,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case FK_Data_1: return 1; - case FK_Data_2: case AArch64::fixup_aarch64_movw: + case FK_Data_2: + case FK_SecRel_2: return 2; case AArch64::fixup_aarch64_pcrel_branch14: @@ -124,6 +125,7 @@ static unsigned getFixupKindNumBytes(unsigned Kind) { case AArch64::fixup_aarch64_pcrel_branch26: case AArch64::fixup_aarch64_pcrel_call26: case FK_Data_4: + case FK_SecRel_4: return 4; case FK_Data_8: @@ -218,6 +220,8 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case FK_Data_2: case FK_Data_4: case FK_Data_8: + case FK_SecRel_2: + case FK_SecRel_4: return Value; } } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp index 7862a03e771c1..31762b9e4cd50 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFObjectWriter.cpp @@ -27,8 +27,7 @@ namespace { class AArch64WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter { public: AArch64WinCOFFObjectWriter() - : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) { - } + : MCWinCOFFObjectTargetWriter(COFF::IMAGE_FILE_MACHINE_ARM64) {} ~AArch64WinCOFFObjectWriter() override = default; @@ -36,19 +35,59 @@ public: const MCFixup &Fixup, bool IsCrossSection, const MCAsmBackend &MAB) const override; - bool recordRelocation(const MCFixup &) const override; + bool recordRelocation(const MCFixup &) const override; }; } // end anonymous namespace -unsigned -AArch64WinCOFFObjectWriter::getRelocType(MCContext &Ctx, - const MCValue &Target, - const MCFixup &Fixup, - bool IsCrossSection, - const MCAsmBackend &MAB) const { - const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); - report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); +unsigned AArch64WinCOFFObjectWriter::getRelocType( + MCContext &Ctx, const MCValue &Target, const MCFixup &Fixup, + bool IsCrossSection, const MCAsmBackend &MAB) const { + auto Modifier = Target.isAbsolute() ? 
MCSymbolRefExpr::VK_None + : Target.getSymA()->getKind(); + + switch (static_cast<unsigned>(Fixup.getKind())) { + default: { + const MCFixupKindInfo &Info = MAB.getFixupKindInfo(Fixup.getKind()); + report_fatal_error(Twine("unsupported relocation type: ") + Info.Name); + } + + case FK_Data_4: + switch (Modifier) { + default: + return COFF::IMAGE_REL_ARM64_ADDR32; + case MCSymbolRefExpr::VK_COFF_IMGREL32: + return COFF::IMAGE_REL_ARM64_ADDR32NB; + case MCSymbolRefExpr::VK_SECREL: + return COFF::IMAGE_REL_ARM64_SECREL; + } + + case FK_Data_8: + return COFF::IMAGE_REL_ARM64_ADDR64; + + case FK_SecRel_2: + return COFF::IMAGE_REL_ARM64_SECTION; + + case FK_SecRel_4: + return COFF::IMAGE_REL_ARM64_SECREL; + + case AArch64::fixup_aarch64_add_imm12: + return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12A; + + case AArch64::fixup_aarch64_ldst_imm12_scale1: + case AArch64::fixup_aarch64_ldst_imm12_scale2: + case AArch64::fixup_aarch64_ldst_imm12_scale4: + case AArch64::fixup_aarch64_ldst_imm12_scale8: + case AArch64::fixup_aarch64_ldst_imm12_scale16: + return COFF::IMAGE_REL_ARM64_PAGEOFFSET_12L; + + case AArch64::fixup_aarch64_pcrel_adrp_imm21: + return COFF::IMAGE_REL_ARM64_PAGEBASE_REL21; + + case AArch64::fixup_aarch64_pcrel_branch26: + case AArch64::fixup_aarch64_pcrel_call26: + return COFF::IMAGE_REL_ARM64_BRANCH26; + } } bool AArch64WinCOFFObjectWriter::recordRelocation(const MCFixup &Fixup) const { diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 5a799b2d88d0d..568682899be51 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -56,7 +56,7 @@ extern char &AMDGPUMachineCFGStructurizerID; void initializeAMDGPUAlwaysInlinePass(PassRegistry&); -ModulePass *createAMDGPUAnnotateKernelFeaturesPass(); +Pass *createAMDGPUAnnotateKernelFeaturesPass(); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); extern char &AMDGPUAnnotateKernelFeaturesID; diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp index 7235d8fae3327..c68e5861ff25b 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp @@ -15,8 +15,10 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/CallGraphSCCPass.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Module.h" @@ -26,26 +28,27 @@ using namespace llvm; namespace { -class AMDGPUAnnotateKernelFeatures : public ModulePass { +class AMDGPUAnnotateKernelFeatures : public CallGraphSCCPass { private: + const TargetMachine *TM = nullptr; AMDGPUAS AS; - static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS); - void addAttrToCallers(Function *Intrin, StringRef AttrName); - bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>); + bool addFeatureAttributes(Function &F); public: static char ID; - AMDGPUAnnotateKernelFeatures() : ModulePass(ID) {} - bool runOnModule(Module &M) override; + AMDGPUAnnotateKernelFeatures() : CallGraphSCCPass(ID) {} + + bool doInitialization(CallGraph &CG) override; + bool runOnSCC(CallGraphSCC &SCC) override; StringRef getPassName() const override { return "AMDGPU Annotate Kernel Features"; } void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesAll(); - ModulePass::getAnalysisUsage(AU); + CallGraphSCCPass::getAnalysisUsage(AU); } static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS 
AS); @@ -121,16 +124,130 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively( return false; } -// Return true if an addrspacecast is used that requires the queue ptr. -bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F, - AMDGPUAS AS) { +// We do not need to note the x workitem or workgroup id because they are always +// initialized. +// +// TODO: We should not add the attributes if the known compile time workgroup +// size is 1 for y/z. +static StringRef intrinsicToAttrName(Intrinsic::ID ID, + bool &NonKernelOnly, + bool &IsQueuePtr) { + switch (ID) { + case Intrinsic::amdgcn_workitem_id_x: + NonKernelOnly = true; + return "amdgpu-work-item-id-x"; + case Intrinsic::amdgcn_workgroup_id_x: + NonKernelOnly = true; + return "amdgpu-work-group-id-x"; + case Intrinsic::amdgcn_workitem_id_y: + case Intrinsic::r600_read_tidig_y: + return "amdgpu-work-item-id-y"; + case Intrinsic::amdgcn_workitem_id_z: + case Intrinsic::r600_read_tidig_z: + return "amdgpu-work-item-id-z"; + case Intrinsic::amdgcn_workgroup_id_y: + case Intrinsic::r600_read_tgid_y: + return "amdgpu-work-group-id-y"; + case Intrinsic::amdgcn_workgroup_id_z: + case Intrinsic::r600_read_tgid_z: + return "amdgpu-work-group-id-z"; + case Intrinsic::amdgcn_dispatch_ptr: + return "amdgpu-dispatch-ptr"; + case Intrinsic::amdgcn_dispatch_id: + return "amdgpu-dispatch-id"; + case Intrinsic::amdgcn_kernarg_segment_ptr: + case Intrinsic::amdgcn_implicitarg_ptr: + return "amdgpu-kernarg-segment-ptr"; + case Intrinsic::amdgcn_queue_ptr: + case Intrinsic::trap: + case Intrinsic::debugtrap: + IsQueuePtr = true; + return "amdgpu-queue-ptr"; + default: + return ""; + } +} + +static bool handleAttr(Function &Parent, const Function &Callee, + StringRef Name) { + if (Callee.hasFnAttribute(Name)) { + Parent.addFnAttr(Name); + return true; + } + + return false; +} + +static void copyFeaturesToFunction(Function &Parent, const Function &Callee, + bool &NeedQueuePtr) { + // X ids unnecessarily propagated to kernels. + static const StringRef AttrNames[] = { + { "amdgpu-work-item-id-x" }, + { "amdgpu-work-item-id-y" }, + { "amdgpu-work-item-id-z" }, + { "amdgpu-work-group-id-x" }, + { "amdgpu-work-group-id-y" }, + { "amdgpu-work-group-id-z" }, + { "amdgpu-dispatch-ptr" }, + { "amdgpu-dispatch-id" }, + { "amdgpu-kernarg-segment-ptr" } + }; + + if (handleAttr(Parent, Callee, "amdgpu-queue-ptr")) + NeedQueuePtr = true; + + for (StringRef AttrName : AttrNames) + handleAttr(Parent, Callee, AttrName); +} + +bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) { + const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F); + bool HasFlat = ST.hasFlatAddressSpace(); + bool HasApertureRegs = ST.hasApertureRegs(); SmallPtrSet<const Constant *, 8> ConstantExprVisited; - for (const BasicBlock &BB : F) { - for (const Instruction &I : BB) { + bool Changed = false; + bool NeedQueuePtr = false; + bool HaveCall = false; + bool IsFunc = !AMDGPU::isEntryFunctionCC(F.getCallingConv()); + + for (BasicBlock &BB : F) { + for (Instruction &I : BB) { + CallSite CS(&I); + if (CS) { + Function *Callee = CS.getCalledFunction(); + + // TODO: Do something with indirect calls. 
+ if (!Callee) { + if (!CS.isInlineAsm()) + HaveCall = true; + continue; + } + + Intrinsic::ID IID = Callee->getIntrinsicID(); + if (IID == Intrinsic::not_intrinsic) { + HaveCall = true; + copyFeaturesToFunction(F, *Callee, NeedQueuePtr); + Changed = true; + } else { + bool NonKernelOnly = false; + StringRef AttrName = intrinsicToAttrName(IID, + NonKernelOnly, NeedQueuePtr); + if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) { + F.addFnAttr(AttrName); + Changed = true; + } + } + } + + if (NeedQueuePtr || HasApertureRegs) + continue; + if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) { - if (castRequiresQueuePtr(ASC, AS)) - return true; + if (castRequiresQueuePtr(ASC, AS)) { + NeedQueuePtr = true; + continue; + } } for (const Use &U : I.operands()) { @@ -138,100 +255,57 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F, if (!OpC) continue; - if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) - return true; + if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS)) { + NeedQueuePtr = true; + break; + } } } } - return false; -} - -void AMDGPUAnnotateKernelFeatures::addAttrToCallers(Function *Intrin, - StringRef AttrName) { - SmallPtrSet<Function *, 4> SeenFuncs; - - for (User *U : Intrin->users()) { - // CallInst is the only valid user for an intrinsic. - CallInst *CI = cast<CallInst>(U); - - Function *CallingFunction = CI->getParent()->getParent(); - if (SeenFuncs.insert(CallingFunction).second) - CallingFunction->addFnAttr(AttrName); + if (NeedQueuePtr) { + F.addFnAttr("amdgpu-queue-ptr"); + Changed = true; } -} - -bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics( - Module &M, - ArrayRef<StringRef[2]> IntrinsicToAttr) { - bool Changed = false; - for (const StringRef *Arr : IntrinsicToAttr) { - if (Function *Fn = M.getFunction(Arr[0])) { - addAttrToCallers(Fn, Arr[1]); - Changed = true; - } + // TODO: We could refine this to captured pointers that could possibly be + // accessed by flat instructions. For now this is mostly a poor way of + // estimating whether there are calls before argument lowering. + if (HasFlat && !IsFunc && HaveCall) { + F.addFnAttr("amdgpu-flat-scratch"); + Changed = true; } return Changed; } -bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) { +bool AMDGPUAnnotateKernelFeatures::runOnSCC(CallGraphSCC &SCC) { + Module &M = SCC.getCallGraph().getModule(); Triple TT(M.getTargetTriple()); - AS = AMDGPU::getAMDGPUAS(M); - - static const StringRef IntrinsicToAttr[][2] = { - // .x omitted - { "llvm.amdgcn.workitem.id.y", "amdgpu-work-item-id-y" }, - { "llvm.amdgcn.workitem.id.z", "amdgpu-work-item-id-z" }, - - { "llvm.amdgcn.workgroup.id.y", "amdgpu-work-group-id-y" }, - { "llvm.amdgcn.workgroup.id.z", "amdgpu-work-group-id-z" }, - - { "llvm.r600.read.tgid.y", "amdgpu-work-group-id-y" }, - { "llvm.r600.read.tgid.z", "amdgpu-work-group-id-z" }, - - // .x omitted - { "llvm.r600.read.tidig.y", "amdgpu-work-item-id-y" }, - { "llvm.r600.read.tidig.z", "amdgpu-work-item-id-z" } - }; - static const StringRef HSAIntrinsicToAttr[][2] = { - { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" }, - { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" }, - { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" }, - { "llvm.trap", "amdgpu-queue-ptr" }, - { "llvm.debugtrap", "amdgpu-queue-ptr" } - }; - - // TODO: We should not add the attributes if the known compile time workgroup - // size is 1 for y/z. - - // TODO: Intrinsics that require queue ptr. 
+ bool Changed = false; + for (CallGraphNode *I : SCC) { + Function *F = I->getFunction(); + if (!F || F->isDeclaration()) + continue; - // We do not need to note the x workitem or workgroup id because they are - // always initialized. + Changed |= addFeatureAttributes(*F); + } - bool Changed = addAttrsForIntrinsics(M, IntrinsicToAttr); - if (TT.getOS() == Triple::AMDHSA || TT.getOS() == Triple::Mesa3D) { - Changed |= addAttrsForIntrinsics(M, HSAIntrinsicToAttr); - for (Function &F : M) { - if (F.hasFnAttribute("amdgpu-queue-ptr")) - continue; + return Changed; +} - auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); - bool HasApertureRegs = TPC && TPC->getTM<TargetMachine>() - .getSubtarget<AMDGPUSubtarget>(F) - .hasApertureRegs(); - if (!HasApertureRegs && hasAddrSpaceCast(F, AS)) - F.addFnAttr("amdgpu-queue-ptr"); - } - } +bool AMDGPUAnnotateKernelFeatures::doInitialization(CallGraph &CG) { + auto *TPC = getAnalysisIfAvailable<TargetPassConfig>(); + if (!TPC) + report_fatal_error("TargetMachine is required"); - return Changed; + AS = AMDGPU::getAMDGPUAS(CG.getModule()); + TM = &TPC->getTM<TargetMachine>(); + return false; } -ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { +Pass *llvm::createAMDGPUAnnotateKernelFeaturesPass() { return new AMDGPUAnnotateKernelFeatures(); } diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 83ad1a5c6ee30..2247814cfe55d 100644 --- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -268,20 +268,11 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { CurrentProgramInfo.ScratchSize, getFunctionCodeSize(MF)); - OutStreamer->emitRawComment(" codeLenInByte = " + - Twine(getFunctionCodeSize(MF)), false); - OutStreamer->emitRawComment( - " NumSgprs: " + Twine(CurrentProgramInfo.NumSGPR), false); - OutStreamer->emitRawComment( - " NumVgprs: " + Twine(CurrentProgramInfo.NumVGPR), false); - OutStreamer->emitRawComment( " FloatMode: " + Twine(CurrentProgramInfo.FloatMode), false); OutStreamer->emitRawComment( " IeeeMode: " + Twine(CurrentProgramInfo.IEEEMode), false); OutStreamer->emitRawComment( - " ScratchSize: " + Twine(CurrentProgramInfo.ScratchSize), false); - OutStreamer->emitRawComment( " LDSByteSize: " + Twine(CurrentProgramInfo.LDSSize) + " bytes/workgroup (compile time only)", false); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 2553cf4da0feb..258b1737deb38 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -573,6 +573,8 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FSUB); setTargetDAGCombine(ISD::FNEG); setTargetDAGCombine(ISD::FABS); + setTargetDAGCombine(ISD::AssertZext); + setTargetDAGCombine(ISD::AssertSext); } //===----------------------------------------------------------------------===// @@ -883,7 +885,7 @@ CCAssignFn *AMDGPUCallLowering::CCAssignFnForReturn(CallingConv::ID CC, /// When the SelectionDAGBuilder computes the Ins, it takes care of splitting /// input values across multiple registers. Each item in the Ins array -/// represents a single value that will be stored in regsters. Ins[x].VT is +/// represents a single value that will be stored in registers. Ins[x].VT is /// the value type of the value that will be stored in the register, so /// whatever SDNode we lower the argument to needs to be this type. 
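runOnSCC relies on CallGraphSCCPass visiting strongly connected components bottom-up, so a callee's attributes are already final when its callers are processed. A rough standalone model of that ordering, assuming a hypothetical three-function chain in which only the leaf touches the queue pointer:

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Hypothetical call chain: kernel -> helper -> leaf.
  std::map<std::string, std::vector<std::string>> Callees = {
      {"leaf", {}}, {"helper", {"leaf"}}, {"kernel", {"helper"}}};
  std::map<std::string, std::set<std::string>> Attrs = {
      {"leaf", {"amdgpu-queue-ptr"}}, {"helper", {}}, {"kernel", {}}};

  // Bottom-up order, as a CallGraphSCCPass would see these single-node SCCs:
  // each visit copies the already-final attributes of the callees.
  for (const char *F : {"leaf", "helper", "kernel"})
    for (const std::string &C : Callees[F])
      Attrs[F].insert(Attrs[C].begin(), Attrs[C].end());

  std::cout << "kernel needs queue ptr: "
            << Attrs["kernel"].count("amdgpu-queue-ptr") << "\n"; // prints 1
  return 0;
}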
/// @@ -2591,6 +2593,31 @@ SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N, return SDValue(CSrc, 0); } +// FIXME: This should go in generic DAG combiner with an isTruncateFree check, +// but isTruncateFree is inaccurate for i16 now because of SALU vs. VALU +// issues. +SDValue AMDGPUTargetLowering::performAssertSZExtCombine(SDNode *N, + DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; + SDValue N0 = N->getOperand(0); + + // (vt2 (assertzext (truncate vt0:x), vt1)) -> + // (vt2 (truncate (assertzext vt0:x, vt1))) + if (N0.getOpcode() == ISD::TRUNCATE) { + SDValue N1 = N->getOperand(1); + EVT ExtVT = cast<VTSDNode>(N1)->getVT(); + SDLoc SL(N); + + SDValue Src = N0.getOperand(0); + EVT SrcVT = Src.getValueType(); + if (SrcVT.bitsGE(ExtVT)) { + SDValue NewInReg = DAG.getNode(N->getOpcode(), SL, SrcVT, Src, N1); + return DAG.getNode(ISD::TRUNCATE, SL, N->getValueType(0), NewInReg); + } + } + + return SDValue(); +} /// Split the 64-bit value \p LHS into two 32-bit components, and perform the /// binary operation \p Opc to it with the corresponding constant operands. SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl( @@ -3521,6 +3548,9 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N, break; } + case ISD::AssertZext: + case ISD::AssertSext: + return performAssertSZExtCombine(N, DCI); } return SDValue(); } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index a45234e2b39f2..d85aada6053a1 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -76,6 +76,7 @@ protected: SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performAssertSZExtCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL, unsigned Opc, SDValue LHS, diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 1bc5a52053ecb..7796176290108 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -277,7 +277,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU( // Make sure requested values are compatible with values implied by requested // minimum/maximum flat work group sizes. 
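The combine above rewrites (assertzext (truncate x), vt1) into (truncate (assertzext x, vt1)), which is sound whenever the asserted width fits in the source type. A small standalone check of that equivalence on concrete bit patterns (the assertZext helpers below model the node's guarantee and are not LLVM APIs):

#include <cassert>
#include <cstdint>

// Model AssertZext: the producer promises the value already fits in
// 'Bits' zero-extended bits, so the node itself is a pass-through.
static uint32_t assertZext32(uint32_t V, unsigned Bits) {
  assert((Bits >= 32 || (V >> Bits) == 0) && "assertion would be violated");
  return V;
}

static uint16_t assertZext16(uint16_t V, unsigned Bits) {
  assert((Bits >= 16 || (V >> Bits) == 0) && "assertion would be violated");
  return V;
}

int main() {
  uint32_t X = 0x000000AB;          // a 32-bit value known to fit in 8 bits
  unsigned AssertedBits = 8;

  // Original form: truncate to 16 bits, then assert the 8-bit guarantee.
  uint16_t A = assertZext16(static_cast<uint16_t>(X), AssertedBits);

  // Combined form: assert on the wider value first, then truncate.
  uint16_t B = static_cast<uint16_t>(assertZext32(X, AssertedBits));

  // Both orders observe the same 16-bit value.
  return A == B ? 0 : 1;
}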
if (RequestedFlatWorkGroupSize && - Requested.first > MinImpliedByFlatWorkGroupSize) + Requested.first < MinImpliedByFlatWorkGroupSize) return Default; return Requested; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index 22cede59086ab..d4b6a5fe8020b 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -359,6 +359,10 @@ public: return FP64FP16Denormals; } + bool supportsMinMaxDenormModes() const { + return getGeneration() >= AMDGPUSubtarget::GFX9; + } + bool hasFPExceptions() const { return FPExceptions; } diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index e3c90f250600a..b37c274102bc8 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1208,7 +1208,7 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const { } bool AMDGPUOperand::isLiteralImm(MVT type) const { - // Check that this imediate can be added as literal + // Check that this immediate can be added as literal if (!isImmTy(ImmTyNone)) { return false; } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index f26e49295e690..966c6fec20c63 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -87,6 +87,7 @@ DECODE_OPERAND(Decode##RegClass##RegisterClass, decodeOperand_##RegClass) DECODE_OPERAND_REG(VGPR_32) DECODE_OPERAND_REG(VS_32) DECODE_OPERAND_REG(VS_64) +DECODE_OPERAND_REG(VS_128) DECODE_OPERAND_REG(VReg_64) DECODE_OPERAND_REG(VReg_96) @@ -318,6 +319,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VS_64(unsigned Val) const { return decodeSrcOp(OPW64, Val); } +MCOperand AMDGPUDisassembler::decodeOperand_VS_128(unsigned Val) const { + return decodeSrcOp(OPW128, Val); +} + MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const { return decodeSrcOp(OPW16, Val); } diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h index 3d71db909e20d..4c755be099995 100644 --- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h +++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h @@ -70,6 +70,7 @@ public: MCOperand decodeOperand_VGPR_32(unsigned Val) const; MCOperand decodeOperand_VS_32(unsigned Val) const; MCOperand decodeOperand_VS_64(unsigned Val) const; + MCOperand decodeOperand_VS_128(unsigned Val) const; MCOperand decodeOperand_VSrc16(unsigned Val) const; MCOperand decodeOperand_VSrcV216(unsigned Val) const; diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index 3af242d9ea660..0aad8f0843d62 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -653,6 +653,7 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI, // again. The same constant folded instruction could also have a second // use operand. NextUse = MRI->use_begin(Dst.getReg()); + FoldList.clear(); continue; } diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp index 08a64de385018..7334781916d81 100644 --- a/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -158,7 +158,7 @@ SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg( // No replacement necessary. 
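The getWavesPerEU change flips the comparison so a requested waves-per-EU range is rejected when its minimum falls below the minimum implied by the flat work-group size. A simplified standalone sketch of the corrected check, with made-up numbers and ignoring the other validations the real function performs:

#include <iostream>
#include <utility>

using WaveRange = std::pair<unsigned, unsigned>; // {min, max} waves per EU

// Simplified version of the corrected validation: a requested range is only
// honoured if its minimum is not below the minimum number of waves implied
// by the flat work-group size.
static WaveRange pickWavesPerEU(WaveRange Requested, WaveRange Default,
                                unsigned MinImpliedByFlatWorkGroupSize) {
  if (Requested.first < MinImpliedByFlatWorkGroupSize)
    return Default;               // incompatible request: fall back
  return Requested;
}

int main() {
  WaveRange Default{4, 10};
  std::cout << pickWavesPerEU({2, 8}, Default, 3).first << "\n"; // 4 (fallback)
  std::cout << pickWavesPerEU({6, 8}, Default, 3).first << "\n"; // 6 (honoured)
  return 0;
}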
if (ScratchWaveOffsetReg == AMDGPU::NoRegister || !MRI.isPhysRegUsed(ScratchWaveOffsetReg)) { - assert(MFI->getStackPtrOffsetReg() == AMDGPU::NoRegister); + assert(MFI->getStackPtrOffsetReg() == AMDGPU::SP_REG); return std::make_pair(AMDGPU::NoRegister, AMDGPU::NoRegister); } @@ -246,13 +246,16 @@ void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF, // this point it appears we need the setup. This part of the prolog should be // emitted after frame indices are eliminated. - if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit()) + if (MFI->hasFlatScratchInit()) emitFlatScratchInit(ST, MF, MBB); unsigned SPReg = MFI->getStackPtrOffsetReg(); - if (SPReg != AMDGPU::NoRegister) { + if (SPReg != AMDGPU::SP_REG) { + assert(MRI.isReserved(SPReg) && "SPReg used but not reserved"); + DebugLoc DL; - int64_t StackSize = MF.getFrameInfo().getStackSize(); + const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); + int64_t StackSize = FrameInfo.getStackSize(); if (StackSize == 0) { BuildMI(MBB, MBB.begin(), DL, TII->get(AMDGPU::COPY), SPReg) diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index 2ba570b9ebbbc..2356405f09199 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1171,8 +1171,7 @@ static void allocateSystemSGPRs(CCState &CCInfo, static void reservePrivateMemoryRegs(const TargetMachine &TM, MachineFunction &MF, const SIRegisterInfo &TRI, - SIMachineFunctionInfo &Info, - bool NeedSP) { + SIMachineFunctionInfo &Info) { // Now that we've figured out where the scratch register inputs are, see if // should reserve the arguments and use them directly. MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -1234,15 +1233,6 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM, Info.setScratchWaveOffsetReg(ReservedOffsetReg); } } - - if (NeedSP) { - unsigned ReservedStackPtrOffsetReg = TRI.reservedStackPtrOffsetReg(MF); - Info.setStackPtrOffsetReg(ReservedStackPtrOffsetReg); - - assert(Info.getStackPtrOffsetReg() != Info.getFrameOffsetReg()); - assert(!TRI.isSubRegister(Info.getScratchRSrcReg(), - Info.getStackPtrOffsetReg())); - } } SDValue SITargetLowering::LowerFormalArguments( @@ -1380,10 +1370,37 @@ SDValue SITargetLowering::LowerFormalArguments( unsigned Reg = VA.getLocReg(); const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT); + EVT ValVT = VA.getValVT(); Reg = MF.addLiveIn(Reg, RC); SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT); + // If this is an 8 or 16-bit value, it is really passed promoted + // to 32 bits. Insert an assert[sz]ext to capture this, then + // truncate to the right size. 
+ switch (VA.getLocInfo()) { + case CCValAssign::Full: + break; + case CCValAssign::BCvt: + Val = DAG.getNode(ISD::BITCAST, DL, ValVT, Val); + break; + case CCValAssign::SExt: + Val = DAG.getNode(ISD::AssertSext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::ZExt: + Val = DAG.getNode(ISD::AssertZext, DL, VT, Val, + DAG.getValueType(ValVT)); + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + case CCValAssign::AExt: + Val = DAG.getNode(ISD::TRUNCATE, DL, ValVT, Val); + break; + default: + llvm_unreachable("Unknown loc info!"); + } + if (IsShader && Arg.VT.isVector()) { // Build a vector from the registers Type *ParamType = FType->getParamType(Arg.getOrigArgIndex()); @@ -1410,25 +1427,13 @@ SDValue SITargetLowering::LowerFormalArguments( InVals.push_back(Val); } - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); - - // TODO: Could maybe omit SP if only tail calls? - bool NeedSP = FrameInfo.hasCalls() || FrameInfo.hasVarSizedObjects(); - // Start adding system SGPRs. if (IsEntryFunc) { allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader); - reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info, NeedSP); } else { CCInfo.AllocateReg(Info->getScratchRSrcReg()); CCInfo.AllocateReg(Info->getScratchWaveOffsetReg()); CCInfo.AllocateReg(Info->getFrameOffsetReg()); - - if (NeedSP) { - unsigned StackPtrReg = findFirstFreeSGPR(CCInfo); - CCInfo.AllocateReg(StackPtrReg); - Info->setStackPtrOffsetReg(StackPtrReg); - } } return Chains.empty() ? Chain : @@ -4624,8 +4629,8 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { return DAG.isKnownNeverNaN(Op); } -static bool isCanonicalized(SDValue Op, const SISubtarget *ST, - unsigned MaxDepth=5) { +static bool isCanonicalized(SelectionDAG &DAG, SDValue Op, + const SISubtarget *ST, unsigned MaxDepth=5) { // If source is a result of another standard FP operation it is already in // canonical form. @@ -4663,7 +4668,7 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST, case ISD::FNEG: case ISD::FABS: return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1); case ISD::FSIN: case ISD::FCOS: @@ -4672,16 +4677,19 @@ static bool isCanonicalized(SDValue Op, const SISubtarget *ST, // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. // For such targets need to check their input recursively. - // TODO: on GFX9+ we could return true without checking provided no-nan - // mode, since canonicalization is also used to quiet sNaNs. 
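With this hunk, an i8/i16 argument that arrives promoted in a 32-bit register is recovered by inserting an AssertSext/AssertZext followed by a truncate, the pattern other targets already use. A standalone illustration of why that round-trips the original value (plain integer arithmetic rather than SelectionDAG nodes; wrap-around conversions assume the usual two's-complement hosts):

#include <cassert>
#include <cstdint>

// Caller side: promote an i16 argument into a 32-bit register, either
// sign- or zero-extended depending on the calling convention.
static uint32_t promoteSExt(int16_t V) {
  return static_cast<uint32_t>(static_cast<int32_t>(V));
}
static uint32_t promoteZExt(uint16_t V) { return static_cast<uint32_t>(V); }

// Callee side: the AssertSext/AssertZext node only records the guarantee;
// recovering the argument is then a plain truncate of the 32-bit register.
static int16_t recoverSExt(uint32_t Reg) { return static_cast<int16_t>(Reg); }
static uint16_t recoverZExt(uint32_t Reg) { return static_cast<uint16_t>(Reg); }

int main() {
  int16_t S = -1234;
  uint16_t U = 0xBEEF;
  assert(recoverSExt(promoteSExt(S)) == S);
  assert(recoverZExt(promoteZExt(U)) == U);
  return 0;
}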
case ISD::FMINNUM: case ISD::FMAXNUM: case ISD::FMINNAN: case ISD::FMAXNAN: + if (ST->supportsMinMaxDenormModes() && + DAG.isKnownNeverNaN(Op.getOperand(0)) && + DAG.isKnownNeverNaN(Op.getOperand(1))) + return true; + return (MaxDepth > 0) && - isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) && - isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1); + isCanonicalized(DAG, Op.getOperand(0), ST, MaxDepth - 1) && + isCanonicalized(DAG, Op.getOperand(1), ST, MaxDepth - 1); case ISD::ConstantFP: { auto F = cast<ConstantFPSDNode>(Op)->getValueAPF(); @@ -4700,11 +4708,19 @@ SDValue SITargetLowering::performFCanonicalizeCombine( if (!CFP) { SDValue N0 = N->getOperand(0); + EVT VT = N0.getValueType().getScalarType(); + auto ST = getSubtarget(); + + if (((VT == MVT::f32 && ST->hasFP32Denormals()) || + (VT == MVT::f64 && ST->hasFP64Denormals()) || + (VT == MVT::f16 && ST->hasFP16Denormals())) && + DAG.isKnownNeverNaN(N0)) + return N0; bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) && - isCanonicalized(N0, getSubtarget())) + isCanonicalized(DAG, N0, ST)) return N0; return SDValue(); @@ -5813,3 +5829,44 @@ SITargetLowering::getConstraintType(StringRef Constraint) const { } return TargetLowering::getConstraintType(Constraint); } + +// Figure out which registers should be reserved for stack access. Only after +// the function is legalized do we know all of the non-spill stack objects or if +// calls are present. +void SITargetLowering::finalizeLowering(MachineFunction &MF) const { + MachineRegisterInfo &MRI = MF.getRegInfo(); + SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>(); + const MachineFrameInfo &MFI = MF.getFrameInfo(); + const SISubtarget &ST = MF.getSubtarget<SISubtarget>(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); + + if (Info->isEntryFunction()) { + // Callable functions have fixed registers used for stack access. + reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info); + } + + // We have to assume the SP is needed in case there are calls in the function + // during lowering. Calls are only detected after the function is + // lowered. We're about to reserve registers, so don't bother using it if we + // aren't really going to use it. 
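isCanonicalized now takes the SelectionDAG so the min/max cases can return early on subtargets with supportsMinMaxDenormModes() when both operands are known non-NaN; otherwise it keeps recursing into the operands under a small depth limit. A rough standalone sketch of that depth-limited recursive check, using a hypothetical expression type:

#include <iostream>
#include <memory>
#include <vector>

// Hypothetical expression node: a leaf is "canonical" iff Canonical is set;
// an interior node is canonical iff all of its operands are.
struct Expr {
  bool Canonical = false;
  std::vector<std::shared_ptr<Expr>> Ops;
};

// Depth-limited recursion: once the fuel runs out, conservatively answer
// "no", mirroring the MaxDepth parameter of isCanonicalized().
static bool isCanonicalized(const Expr &E, unsigned MaxDepth = 5) {
  if (E.Ops.empty())
    return E.Canonical;
  if (MaxDepth == 0)
    return false;
  for (const auto &Op : E.Ops)
    if (!isCanonicalized(*Op, MaxDepth - 1))
      return false;
  return true;
}

int main() {
  auto Leaf = std::make_shared<Expr>();
  Leaf->Canonical = true;
  Expr Node;
  Node.Ops.push_back(Leaf);
  std::cout << isCanonicalized(Node) << "\n"; // 1
  return 0;
}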
+ bool NeedSP = !Info->isEntryFunction() || + MFI.hasVarSizedObjects() || + MFI.hasCalls(); + + if (NeedSP) { + unsigned ReservedStackPtrOffsetReg = TRI->reservedStackPtrOffsetReg(MF); + Info->setStackPtrOffsetReg(ReservedStackPtrOffsetReg); + + assert(Info->getStackPtrOffsetReg() != Info->getFrameOffsetReg()); + assert(!TRI->isSubRegister(Info->getScratchRSrcReg(), + Info->getStackPtrOffsetReg())); + MRI.replaceRegWith(AMDGPU::SP_REG, Info->getStackPtrOffsetReg()); + } + + MRI.replaceRegWith(AMDGPU::PRIVATE_RSRC_REG, Info->getScratchRSrcReg()); + MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg()); + MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG, + Info->getScratchWaveOffsetReg()); + + TargetLoweringBase::finalizeLowering(MF); +} diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 83392a7ab1b21..e6bb3d6cd4191 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -232,6 +232,8 @@ public: ConstraintType getConstraintType(StringRef Constraint) const override; SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL, SDValue V) const; + + void finalizeLowering(MachineFunction &MF) const override; }; } // End namespace llvm diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 160f8837d49c8..a7e0feb10b9f1 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3408,8 +3408,8 @@ void SIInstrInfo::legalizeOperands(MachineInstr &MI) const { } void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { - SmallVector<MachineInstr *, 128> Worklist; - Worklist.push_back(&TopInst); + SetVectorType Worklist; + Worklist.insert(&TopInst); while (!Worklist.empty()) { MachineInstr &Inst = *Worklist.pop_back_val(); @@ -3610,7 +3610,7 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const { } } -void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, +void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3635,7 +3635,7 @@ void SIInstrInfo::lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, } void SIInstrInfo::splitScalar64BitUnaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, + SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3686,7 +3686,7 @@ void SIInstrInfo::splitScalar64BitUnaryOp( } void SIInstrInfo::splitScalar64BitBinaryOp( - SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst, + SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3753,7 +3753,7 @@ void SIInstrInfo::splitScalar64BitBinaryOp( } void SIInstrInfo::splitScalar64BitBCNT( - SmallVectorImpl<MachineInstr *> &Worklist, MachineInstr &Inst) const { + SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = *Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3789,7 +3789,7 @@ void SIInstrInfo::splitScalar64BitBCNT( addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist); } -void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, +void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const { MachineBasicBlock &MBB = 
*Inst.getParent(); MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); @@ -3853,12 +3853,12 @@ void SIInstrInfo::splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, void SIInstrInfo::addUsersToMoveToVALUWorklist( unsigned DstReg, MachineRegisterInfo &MRI, - SmallVectorImpl<MachineInstr *> &Worklist) const { + SetVectorType &Worklist) const { for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg), E = MRI.use_end(); I != E;) { MachineInstr &UseMI = *I->getParent(); if (!canReadVGPR(UseMI, I.getOperandNo())) { - Worklist.push_back(&UseMI); + Worklist.insert(&UseMI); do { ++I; @@ -3869,7 +3869,7 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist( } } -void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, +void SIInstrInfo::movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const { unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); @@ -3932,7 +3932,7 @@ void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, } void SIInstrInfo::addSCCDefUsersToVALUWorklist( - MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const { + MachineInstr &SCCDefInst, SetVectorType &Worklist) const { // This assumes that all the users of SCC are in the same block // as the SCC def. for (MachineInstr &MI : @@ -3943,7 +3943,7 @@ void SIInstrInfo::addSCCDefUsersToVALUWorklist( return; if (MI.findRegisterUseOperandIdx(AMDGPU::SCC) != -1) - Worklist.push_back(&MI); + Worklist.insert(&MI); } } diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index d00c0d4a7f4ea..3dd5bc89e6c77 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -19,6 +19,7 @@ #include "AMDGPUInstrInfo.h" #include "SIDefines.h" #include "SIRegisterInfo.h" +#include "llvm/ADT/SetVector.h" namespace llvm { @@ -38,6 +39,8 @@ private: EXECZ = 3 }; + typedef SmallSetVector<MachineInstr *, 32> SetVectorType; + static unsigned getBranchOpcode(BranchPredicate Cond); static BranchPredicate getBranchPredicate(unsigned Opcode); @@ -56,30 +59,30 @@ private: void swapOperands(MachineInstr &Inst) const; - void lowerScalarAbs(SmallVectorImpl<MachineInstr *> &Worklist, + void lowerScalarAbs(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalar64BitUnaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitUnaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitBinaryOp(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitBinaryOp(SetVectorType &Worklist, MachineInstr &Inst, unsigned Opcode) const; - void splitScalar64BitBCNT(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitBCNT(SetVectorType &Worklist, MachineInstr &Inst) const; - void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist, + void splitScalar64BitBFE(SetVectorType &Worklist, MachineInstr &Inst) const; - void movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist, + void movePackToVALU(SetVectorType &Worklist, MachineRegisterInfo &MRI, MachineInstr &Inst) const; void addUsersToMoveToVALUWorklist( unsigned Reg, MachineRegisterInfo &MRI, - SmallVectorImpl<MachineInstr *> &Worklist) const; + SetVectorType &Worklist) const; void addSCCDefUsersToVALUWorklist(MachineInstr &SCCDefInst, - SmallVectorImpl<MachineInstr *> &Worklist) const; + SetVectorType &Worklist) const; const TargetRegisterClass * getDestEquivalentVGPRClass(const MachineInstr &Inst) const; diff --git 
a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index ffb01363e1313..088173680fa89 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1436,7 +1436,7 @@ class VOPProfile <list<ValueType> _ArgVT> { field bit IsPacked = isPackedType<Src0VT>.ret; field bit HasOpSel = IsPacked; - field bit HasOMod = !if(HasOpSel, 0, HasModifiers); + field bit HasOMod = !if(HasOpSel, 0, isFloatType<DstVT>.ret); field bit HasSDWAOMod = isFloatType<DstVT>.ret; field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index bcc685015cf5e..ba69e42d9125f 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -1060,7 +1060,7 @@ def : Pat < class FPToI1Pat<Instruction Inst, int KOne, ValueType kone_type, ValueType vt, SDPatternOperator fp_to_int> : Pat < (i1 (fp_to_int (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)))), - (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)) + (i1 (Inst 0, (kone_type KOne), $src0_modifiers, $src0, DSTCLAMP.NONE)) >; def : FPToI1Pat<V_CMP_EQ_F32_e64, CONST.FP32_ONE, i32, f32, fp_to_uint>; diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp index 3203c38dae344..a7c8166ff6d27 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -23,10 +23,10 @@ using namespace llvm; SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) : AMDGPUMachineFunction(MF), TIDReg(AMDGPU::NoRegister), - ScratchRSrcReg(AMDGPU::NoRegister), - ScratchWaveOffsetReg(AMDGPU::NoRegister), - FrameOffsetReg(AMDGPU::NoRegister), - StackPtrOffsetReg(AMDGPU::NoRegister), + ScratchRSrcReg(AMDGPU::PRIVATE_RSRC_REG), + ScratchWaveOffsetReg(AMDGPU::SCRATCH_WAVE_OFFSET_REG), + FrameOffsetReg(AMDGPU::FP_REG), + StackPtrOffsetReg(AMDGPU::SP_REG), PrivateSegmentBufferUserSGPR(AMDGPU::NoRegister), DispatchPtrUserSGPR(AMDGPU::NoRegister), QueuePtrUserSGPR(AMDGPU::NoRegister), @@ -42,6 +42,9 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkGroupIDZSystemSGPR(AMDGPU::NoRegister), WorkGroupInfoSystemSGPR(AMDGPU::NoRegister), PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister), + WorkItemIDXVGPR(AMDGPU::NoRegister), + WorkItemIDYVGPR(AMDGPU::NoRegister), + WorkItemIDZVGPR(AMDGPU::NoRegister), PSInputAddr(0), PSInputEnable(0), ReturnsVoid(true), @@ -87,12 +90,14 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ScratchWaveOffsetReg = AMDGPU::SGPR4; FrameOffsetReg = AMDGPU::SGPR5; StackPtrOffsetReg = AMDGPU::SGPR32; - return; + + // FIXME: Not really a system SGPR. + PrivateSegmentWaveByteOffsetSystemSGPR = ScratchWaveOffsetReg; } CallingConv::ID CC = F->getCallingConv(); if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) { - KernargSegmentPtr = true; + KernargSegmentPtr = !F->arg_empty(); WorkGroupIDX = true; WorkItemIDX = true; } else if (CC == CallingConv::AMDGPU_PS) { @@ -101,17 +106,25 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) if (ST.debuggerEmitPrologue()) { // Enable everything. 
+ WorkGroupIDX = true; WorkGroupIDY = true; WorkGroupIDZ = true; + WorkItemIDX = true; WorkItemIDY = true; WorkItemIDZ = true; } else { + if (F->hasFnAttribute("amdgpu-work-group-id-x")) + WorkGroupIDX = true; + if (F->hasFnAttribute("amdgpu-work-group-id-y")) WorkGroupIDY = true; if (F->hasFnAttribute("amdgpu-work-group-id-z")) WorkGroupIDZ = true; + if (F->hasFnAttribute("amdgpu-work-item-id-x")) + WorkItemIDX = true; + if (F->hasFnAttribute("amdgpu-work-item-id-y")) WorkItemIDY = true; @@ -119,25 +132,28 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) WorkItemIDZ = true; } - // X, XY, and XYZ are the only supported combinations, so make sure Y is - // enabled if Z is. - if (WorkItemIDZ) - WorkItemIDY = true; - const MachineFrameInfo &FrameInfo = MF.getFrameInfo(); bool MaySpill = ST.isVGPRSpillingEnabled(*F); - bool HasStackObjects = FrameInfo.hasStackObjects() || FrameInfo.hasCalls(); + bool HasStackObjects = FrameInfo.hasStackObjects(); + + if (isEntryFunction()) { + // X, XY, and XYZ are the only supported combinations, so make sure Y is + // enabled if Z is. + if (WorkItemIDZ) + WorkItemIDY = true; - if (HasStackObjects || MaySpill) { - PrivateSegmentWaveByteOffset = true; + if (HasStackObjects || MaySpill) { + PrivateSegmentWaveByteOffset = true; - // HS and GS always have the scratch wave offset in SGPR5 on GFX9. - if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && - (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) - PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + // HS and GS always have the scratch wave offset in SGPR5 on GFX9. + if (ST.getGeneration() >= AMDGPUSubtarget::GFX9 && + (CC == CallingConv::AMDGPU_HS || CC == CallingConv::AMDGPU_GS)) + PrivateSegmentWaveByteOffsetSystemSGPR = AMDGPU::SGPR5; + } } - if (ST.isAmdCodeObjectV2(MF)) { + bool IsCOV2 = ST.isAmdCodeObjectV2(MF); + if (IsCOV2) { if (HasStackObjects || MaySpill) PrivateSegmentBuffer = true; @@ -154,11 +170,15 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF) ImplicitBufferPtr = true; } - // We don't need to worry about accessing spills with flat instructions. - // TODO: On VI where we must use flat for global, we should be able to omit - // this if it is never used for generic access. - if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS()) - FlatScratchInit = true; + if (F->hasFnAttribute("amdgpu-kernarg-segment-ptr")) + KernargSegmentPtr = true; + + if (ST.hasFlatAddressSpace() && isEntryFunction() && IsCOV2) { + // TODO: This could be refined a lot. The attribute is a poor way of + // detecting calls that may require it before argument lowering. + if (HasStackObjects || F->hasFnAttribute("amdgpu-flat-scratch")) + FlatScratchInit = true; + } } unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer( diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 05aa249584bf1..4c7f38a09a484 100644 --- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -119,6 +119,11 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction { unsigned WorkGroupInfoSystemSGPR; unsigned PrivateSegmentWaveByteOffsetSystemSGPR; + // VGPR inputs. These are always v0, v1 and v2 for entry functions. + unsigned WorkItemIDXVGPR; + unsigned WorkItemIDYVGPR; + unsigned WorkItemIDZVGPR; + // Graphics info. 
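SIMachineFunctionInfo now starts the scratch, frame, and stack registers out as dedicated placeholder registers, and finalizeLowering() rewrites every use of a placeholder to the concrete register once frame objects and calls are known. A minimal standalone model of that two-phase emit-then-patch approach (the Instr type and register numbering are made up for illustration):

#include <cassert>
#include <vector>

// Hypothetical register ids: negative values are placeholders that must not
// survive past "finalize", mirroring SP_REG/FP_REG in the patch.
constexpr int SP_PLACEHOLDER = -1;
constexpr int FP_PLACEHOLDER = -2;

struct Instr { int Def; int Use; };

// Phase 1: instruction selection emits code against placeholders.
static std::vector<Instr> select() {
  return {{SP_PLACEHOLDER, FP_PLACEHOLDER}, {FP_PLACEHOLDER, SP_PLACEHOLDER}};
}

// Phase 2: once stack usage is known, replace every placeholder with the
// register actually reserved for it (like MRI.replaceRegWith()).
static void replaceRegWith(std::vector<Instr> &Code, int From, int To) {
  for (Instr &I : Code) {
    if (I.Def == From) I.Def = To;
    if (I.Use == From) I.Use = To;
  }
}

int main() {
  std::vector<Instr> Code = select();
  replaceRegWith(Code, SP_PLACEHOLDER, /*e.g. SGPR32=*/32);
  replaceRegWith(Code, FP_PLACEHOLDER, /*e.g. SGPR5=*/5);
  for (const Instr &I : Code)
    assert(I.Def >= 0 && I.Use >= 0); // no placeholders remain
  return 0;
}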
unsigned PSInputAddr; unsigned PSInputEnable; @@ -377,10 +382,13 @@ public: } void setStackPtrOffsetReg(unsigned Reg) { - assert(Reg != AMDGPU::NoRegister && "Should never be unset"); StackPtrOffsetReg = Reg; } + // Note the unset value for this is AMDGPU::SP_REG rather than + // NoRegister. This is mostly a workaround for MIR tests where state that + // can't be directly computed from the function is not preserved in serialized + // MIR. unsigned getStackPtrOffsetReg() const { return StackPtrOffsetReg; } diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp index ef6ad4ad0c8f3..4a3fbb4593bb3 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -207,7 +207,11 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const { assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg)); } + // We have to assume the SP is needed in case there are calls in the function, + // which is detected after the function is lowered. If we aren't really going + // to need SP, don't bother reserving it. unsigned StackPtrReg = MFI->getStackPtrOffsetReg(); + if (StackPtrReg != AMDGPU::NoRegister) { reserveRegisterTuples(Reserved, StackPtrReg); assert(!isSubRegister(ScratchRSrcReg, StackPtrReg)); diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td index fc808011cd889..54ea7805e18d6 100644 --- a/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/lib/Target/AMDGPU/SIRegisterInfo.td @@ -23,6 +23,13 @@ class SIReg <string n, bits<16> regIdx = 0> : Register<n>, def VCC_LO : SIReg<"vcc_lo", 106>; def VCC_HI : SIReg<"vcc_hi", 107>; +// Pseudo-registers: Used as placeholders during isel and immediately +// replaced, never seeing the verifier. +def PRIVATE_RSRC_REG : SIReg<"", 0>; +def FP_REG : SIReg<"", 0>; +def SP_REG : SIReg<"", 0>; +def SCRATCH_WAVE_OFFSET_REG : SIReg<"", 0>; + // VCC for 64-bit instructions def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>, DwarfRegAlias<VCC_LO> { @@ -267,7 +274,8 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7, def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32, (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI, TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT, - SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> { + SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT, + FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> { let AllocationPriority = 7; } @@ -314,7 +322,8 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add TTMP_128R let isAllocatable = 0; } -def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, (add SGPR_128, TTMP_128)> { +def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v16i8, v2i64], 32, + (add SGPR_128, TTMP_128)> { let AllocationPriority = 10; } @@ -464,7 +473,9 @@ defm SCSrc : RegInlineOperand<"SReg", "SCSrc"> ; defm VSrc : RegImmOperand<"VS", "VSrc">; -def VSrc_128 : RegisterOperand<VReg_128>; +def VSrc_128 : RegisterOperand<VReg_128> { + let DecoderMethod = "DecodeVS_128RegisterClass"; +} //===----------------------------------------------------------------------===// // VSrc_* Operands with an VGPR diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 26515b27bb77d..67ad904ca9723 100644 --- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -539,23 +539,9 @@ bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) { } bool isRegIntersect(unsigned 
Reg0, unsigned Reg1, const MCRegisterInfo* TRI) { - - if (Reg0 == Reg1) { - return true; + for (MCRegAliasIterator R(Reg0, TRI, true); R.isValid(); ++R) { + if (*R == Reg1) return true; } - - unsigned SubReg0 = TRI->getSubReg(Reg0, 1); - if (SubReg0 == 0) { - return TRI->getSubRegIndex(Reg1, Reg0) > 0; - } - - for (unsigned Idx = 2; SubReg0 > 0; ++Idx) { - if (isRegIntersect(Reg1, SubReg0, TRI)) { - return true; - } - SubReg0 = TRI->getSubReg(Reg0, Idx); - } - return false; } diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 7b9bc71ad4c77..d5acb49b4f39c 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -117,7 +117,10 @@ class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> : class getVOP2Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies { list<dag> ret = !if(P.HasModifiers, [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))], [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))]); } @@ -813,9 +816,11 @@ let SubtargetPredicate = isVI in { // Aliases to simplify matching of floating-point instructions that // are VOP2 on SI and VOP3 on VI. -class SI2_VI3Alias <string name, Instruction inst> : InstAlias < +class SI2_VI3Alias <string name, VOP3_Real inst> : InstAlias < name#" $dst, $src0, $src1", - (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0) + !if(inst.Pfl.HasOMod, + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0, 0), + (inst VGPR_32:$dst, 0, VCSrc_f32:$src0, 0, VCSrc_f32:$src1, 0)) >, PredicateControl { let UseInstAsmMatchConverter = 0; let AsmVariantName = AMDGPUAsmVariants.VOP3; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index a8ca593f14ed0..92ed0706dc011 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -12,17 +12,21 @@ //===----------------------------------------------------------------------===// class getVOP3ModPat<VOPProfile P, SDPatternOperator node> { + dag src0 = !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)); + list<dag> ret3 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), (P.Src2VT (VOP3Mods P.Src2VT:$src2, i32:$src2_modifiers))))]; list<dag> ret2 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)), + (node (P.Src0VT src0), (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers))))]; list<dag> ret1 = [(set P.DstVT:$vdst, - (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod))))]; + (node (P.Src0VT src0)))]; list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3, !if(!eq(P.NumSrcArgs, 2), ret2, @@ -92,6 +96,7 @@ class VOP3_Profile<VOPProfile P> : VOPProfile<P.ArgVT> { class VOP3b_Profile<ValueType vt> : VOPProfile<[vt, vt, vt, vt]> { // v_div_scale_{f32|f64} do not support input modifiers. 
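The isRegIntersect rewrite earlier in this hunk reduces the overlap test to walking every alias of Reg0, including Reg0 itself, and comparing against Reg1, instead of chasing sub-register indices by hand. A standalone sketch of the same overlap-by-shared-alias idea over a toy alias table (the register names and table are illustrative, not a real register file):

#include <iostream>
#include <map>
#include <set>
#include <string>

// Toy alias table: each register maps to every register it overlaps,
// including itself (what MCRegAliasIterator enumerates with IncludeSelf).
static const std::map<std::string, std::set<std::string>> Aliases = {
    {"S0_S1", {"S0_S1", "S0", "S1"}},
    {"S0",    {"S0", "S0_S1"}},
    {"S1",    {"S1", "S0_S1"}},
    {"S2",    {"S2"}},
};

static bool isRegIntersect(const std::string &Reg0, const std::string &Reg1) {
  for (const std::string &A : Aliases.at(Reg0))
    if (A == Reg1)
      return true;
  return false;
}

int main() {
  std::cout << isRegIntersect("S0_S1", "S1") << "\n"; // 1: the pair covers S1
  std::cout << isRegIntersect("S0", "S2") << "\n";    // 0: disjoint registers
  return 0;
}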
let HasModifiers = 0; + let HasOMod = 0; let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst); let Asm64 = " $vdst, $sdst, $src0, $src1, $src2"; } diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index f2de1f9957260..3becf758aaa3e 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -34,6 +34,9 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_ let isCommutable = 1 in { def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>; +def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; +def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>; + def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>; def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>; def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>; @@ -41,7 +44,6 @@ def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16> def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>; def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; -def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>; def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>; def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>; @@ -50,6 +52,9 @@ def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16> def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>; } +def V_PK_SUB_U16 : VOP3PInst<"v_pk_sub_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>; +def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>; + def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>; def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>; def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>; @@ -71,6 +76,7 @@ multiclass VOP3P_Real_vi<bits<10> op> { } } +defm V_PK_MAD_I16 : VOP3P_Real_vi <0x380>; defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>; defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>; defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>; @@ -79,8 +85,10 @@ defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>; defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>; defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>; defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>; +defm V_PK_MAD_U16 : VOP3P_Real_vi <0x389>; defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>; +defm V_PK_SUB_U16 : VOP3P_Real_vi <0x38b>; defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>; defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>; defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>; diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td index f3482a22d5dcd..b636fc9be431b 100644 --- a/lib/Target/AMDGPU/VOPCInstructions.td +++ b/lib/Target/AMDGPU/VOPCInstructions.td @@ -148,6 +148,19 @@ class VOPCInstAlias <VOP3_Pseudo ps, Instruction inst, VOPProfile p = ps.Pfl> : let SubtargetPredicate = AssemblerPredicate; } +class getVOPCPat64 <PatLeaf cond, VOPProfile P> : LetDummies { + list<dag> ret = !if(P.HasModifiers, + [(set i1:$sdst, + (setcc (P.Src0VT + !if(P.HasOMod, + (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod), + (VOP3Mods0 
P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))), + (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), + cond))], + [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))]); +} + + multiclass VOPC_Pseudos <string opName, VOPC_Profile P, PatLeaf cond = COND_NULL, @@ -163,14 +176,7 @@ multiclass VOPC_Pseudos <string opName, let isCommutable = 1; } - def _e64 : VOP3_Pseudo<opName, P, - !if(P.HasModifiers, - [(set i1:$sdst, - (setcc (P.Src0VT (VOP3Mods0 P.Src0VT:$src0, i32:$src0_modifiers, - i1:$clamp, i32:$omod)), - (P.Src1VT (VOP3Mods P.Src1VT:$src1, i32:$src1_modifiers)), - cond))], - [(set i1:$sdst, (setcc P.Src0VT:$src0, P.Src1VT:$src1, cond))])>, + def _e64 : VOP3_Pseudo<opName, P, getVOPCPat64<cond, P>.ret>, Commutable_REV<revOp#"_e64", !eq(revOp, opName)> { let Defs = !if(DefExec, [EXEC], []); let SchedRW = P.Schedule; @@ -634,7 +640,7 @@ class FCMP_Pattern <PatLeaf cond, Instruction inst, ValueType vt> : Pat < (i64 (AMDGPUsetcc (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)), (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)), cond)), (inst $src0_modifiers, $src0, $src1_modifiers, $src1, - DSTCLAMP.NONE, DSTOMOD.NONE) + DSTCLAMP.NONE) >; def : FCMP_Pattern <COND_OEQ, V_CMP_EQ_F32_e64, f32>; diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index 77b7952b22a88..b47538ba0349a 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -136,6 +136,8 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> : let TSFlags = ps.TSFlags; let UseNamedOperandTable = ps.UseNamedOperandTable; let Uses = ps.Uses; + + VOPProfile Pfl = ps.Pfl; } // XXX - Is there any reason to distingusih this from regular VOP3 diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td index c40b4450a5b5b..e49c1babac210 100644 --- a/lib/Target/ARM/ARM.td +++ b/lib/Target/ARM/ARM.td @@ -17,144 +17,172 @@ include "llvm/Target/Target.td" //===----------------------------------------------------------------------===// -// ARM Helper classes. +// ARM Subtarget state. // -class ProcNoItin<string Name, list<SubtargetFeature> Features> - : Processor<Name, NoItineraries, Features>; +def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", + "true", "Thumb mode">; + +def ModeSoftFloat : SubtargetFeature<"soft-float","UseSoftFloat", + "true", "Use software floating " + "point features.">; -class Architecture<string fname, string aname, list<SubtargetFeature> features > - : SubtargetFeature<fname, "ARMArch", aname, - !strconcat(aname, " architecture"), features>; //===----------------------------------------------------------------------===// -// ARM Subtarget state. +// ARM Subtarget features. // -def ModeThumb : SubtargetFeature<"thumb-mode", "InThumbMode", "true", - "Thumb mode">; +// Floating Point, HW Division and Neon Support +def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", + "Enable VFP2 instructions">; -def ModeSoftFloat : SubtargetFeature<"soft-float", "UseSoftFloat", "true", - "Use software floating point features.">; +def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", + "Enable VFP3 instructions", + [FeatureVFP2]>; -//===----------------------------------------------------------------------===// -// ARM Subtarget features. 
-// +def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", + "Enable NEON instructions", + [FeatureVFP3]>; + +def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", + "Enable half-precision " + "floating point">; + +def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", + "Enable VFP4 instructions", + [FeatureVFP3, FeatureFP16]>; + +def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", + "true", "Enable ARMv8 FP", + [FeatureVFP4]>; + +def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", + "Enable full half-precision " + "floating point", + [FeatureFPARMv8]>; + +def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", + "Floating point unit supports " + "single precision only">; + +def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", + "Restrict FP to 16 double registers">; + +def FeatureHWDivThumb : SubtargetFeature<"hwdiv", + "HasHardwareDivideInThumb", "true", + "Enable divide instructions in Thumb">; + +def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", + "HasHardwareDivideInARM", "true", + "Enable divide instructions in ARM mode">; + +// Atomic Support +def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", + "Has data barrier (dmb/dsb) instructions">; + +def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", + "Has v7 clrex instruction">; -def FeatureVFP2 : SubtargetFeature<"vfp2", "HasVFPv2", "true", - "Enable VFP2 instructions">; -def FeatureVFP3 : SubtargetFeature<"vfp3", "HasVFPv3", "true", - "Enable VFP3 instructions", - [FeatureVFP2]>; -def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true", - "Enable NEON instructions", - [FeatureVFP3]>; -def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", - "Enable Thumb2 instructions">; -def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", - "Does not support ARM mode execution", - [ModeThumb]>; -def FeatureFP16 : SubtargetFeature<"fp16", "HasFP16", "true", - "Enable half-precision floating point">; -def FeatureVFP4 : SubtargetFeature<"vfp4", "HasVFPv4", "true", - "Enable VFP4 instructions", - [FeatureVFP3, FeatureFP16]>; -def FeatureFPARMv8 : SubtargetFeature<"fp-armv8", "HasFPARMv8", - "true", "Enable ARMv8 FP", - [FeatureVFP4]>; -def FeatureFullFP16 : SubtargetFeature<"fullfp16", "HasFullFP16", "true", - "Enable full half-precision floating point", - [FeatureFPARMv8]>; -def FeatureD16 : SubtargetFeature<"d16", "HasD16", "true", - "Restrict FP to 16 double registers">; -def FeatureHWDivThumb : SubtargetFeature<"hwdiv", "HasHardwareDivideInThumb", - "true", - "Enable divide instructions in Thumb">; -def FeatureHWDivARM : SubtargetFeature<"hwdiv-arm", - "HasHardwareDivideInARM", "true", - "Enable divide instructions in ARM mode">; -def FeatureDB : SubtargetFeature<"db", "HasDataBarrier", "true", - "Has data barrier (dmb / dsb) instructions">; -def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true", - "Has v7 clrex instruction">; def FeatureAcquireRelease : SubtargetFeature<"acquire-release", "HasAcquireRelease", "true", - "Has v8 acquire/release (lda/ldaex etc) instructions">; -def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", - "FP compare + branch is slow">; -def FeatureVFPOnlySP : SubtargetFeature<"fp-only-sp", "FPOnlySP", "true", - "Floating point unit supports single precision only">; -def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", - "Enable support for Performance Monitor extensions">; -def FeatureTrustZone : 
SubtargetFeature<"trustzone", "HasTrustZone", "true", - "Enable support for TrustZone security extensions">; -def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true", - "Enable support for ARMv8-M Security Extensions">; -def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", - "Enable support for Cryptography extensions", - [FeatureNEON]>; -def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", - "Enable support for CRC instructions">; + "Has v8 acquire/release (lda/ldaex " + " etc) instructions">; + + +def FeatureSlowFPBrcc : SubtargetFeature<"slow-fp-brcc", "SlowFPBrcc", "true", + "FP compare + branch is slow">; + +def FeaturePerfMon : SubtargetFeature<"perfmon", "HasPerfMon", "true", + "Enable support for Performance " + "Monitor extensions">; + + +// TrustZone Security Extensions +def FeatureTrustZone : SubtargetFeature<"trustzone", "HasTrustZone", "true", + "Enable support for TrustZone " + "security extensions">; + +def Feature8MSecExt : SubtargetFeature<"8msecext", "Has8MSecExt", "true", + "Enable support for ARMv8-M " + "Security Extensions">; + +def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true", + "Enable support for " + "Cryptography extensions", + [FeatureNEON]>; + +def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", + "Enable support for CRC instructions">; + + // Not to be confused with FeatureHasRetAddrStack (return address stack) -def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", - "Enable Reliability, Availability and Serviceability extensions">; -def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", - "Enable fast computation of positive address offsets">; -def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", - "CPU fuses AES crypto operations">; - -// Cyclone has preferred instructions for zeroing VFP registers, which can -// execute in 0 cycles. -def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", - "Has zero-cycle zeroing instructions">; - -// Whether or not it may be profitable to unpredicate certain instructions -// during if conversion. +def FeatureRAS : SubtargetFeature<"ras", "HasRAS", "true", + "Enable Reliability, Availability " + "and Serviceability extensions">; + +// Fast computation of non-negative address offsets +def FeatureFPAO : SubtargetFeature<"fpao", "HasFPAO", "true", + "Enable fast computation of " + "positive address offsets">; + +// Fast execution of AES crypto operations +def FeatureFuseAES : SubtargetFeature<"fuse-aes", "HasFuseAES", "true", + "CPU fuses AES crypto operations">; + +// Cyclone can zero VFP registers in 0 cycles. +def FeatureZCZeroing : SubtargetFeature<"zcz", "HasZeroCycleZeroing", "true", + "Has zero-cycle zeroing instructions">; + +// Whether it is profitable to unpredicate certain instructions during if-conversion def FeatureProfUnpredicate : SubtargetFeature<"prof-unpr", - "IsProfitableToUnpredicate", - "true", + "IsProfitableToUnpredicate", "true", "Is profitable to unpredicate">; // Some targets (e.g. Swift) have microcoded VGETLNi32. -def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32", - "HasSlowVGETLNi32", "true", - "Has slow VGETLNi32 - prefer VMOV">; +def FeatureSlowVGETLNi32 : SubtargetFeature<"slow-vgetlni32", + "HasSlowVGETLNi32", "true", + "Has slow VGETLNi32 - prefer VMOV">; // Some targets (e.g. Swift) have microcoded VDUP32. 
-def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", "true", - "Has slow VDUP32 - prefer VMOV">; +def FeatureSlowVDUP32 : SubtargetFeature<"slow-vdup32", "HasSlowVDUP32", + "true", + "Has slow VDUP32 - prefer VMOV">; // Some targets (e.g. Cortex-A9) prefer VMOVSR to VMOVDRR even when using NEON // for scalar FP, as this allows more effective execution domain optimization. -def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR", - "true", "Prefer VMOVSR">; +def FeaturePreferVMOVSR : SubtargetFeature<"prefer-vmovsr", "PreferVMOVSR", + "true", "Prefer VMOVSR">; // Swift has ISHST barriers compatible with Atomic Release semantics but weaker // than ISH def FeaturePrefISHSTBarrier : SubtargetFeature<"prefer-ishst", "PreferISHST", - "true", "Prefer ISHST barriers">; + "true", "Prefer ISHST barriers">; // Some targets (e.g. Cortex-A9) have muxed AGU and NEON/FPU. -def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", "true", - "Has muxed AGU and NEON/FPU">; +def FeatureMuxedUnits : SubtargetFeature<"muxed-units", "HasMuxedUnits", + "true", + "Has muxed AGU and NEON/FPU">; -// On some targets, a VLDM/VSTM starting with an odd register number needs more -// microops than single VLDRS. +// Whether VLDM/VSTM starting with odd register number need more microops +// than single VLDRS def FeatureSlowOddRegister : SubtargetFeature<"slow-odd-reg", "SlowOddRegister", - "true", "VLDM/VSTM starting with an odd register is slow">; + "true", "VLDM/VSTM starting " + "with an odd register is slow">; // Some targets have a renaming dependency when loading into D subregisters. def FeatureSlowLoadDSubreg : SubtargetFeature<"slow-load-D-subreg", "SlowLoadDSubregister", "true", "Loading into D subregs is slow">; + // Some targets (e.g. Cortex-A15) never want VMOVS to be widened to VMOVD. def FeatureDontWidenVMOVS : SubtargetFeature<"dont-widen-vmovs", "DontWidenVMOVS", "true", "Don't widen VMOVS to VMOVD">; // Whether or not it is profitable to expand VFP/NEON MLA/MLS instructions. -def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", "ExpandMLx", "true", - "Expand VFP/NEON MLA/MLS instructions">; +def FeatureExpandMLx : SubtargetFeature<"expand-fp-mlx", + "ExpandMLx", "true", + "Expand VFP/NEON MLA/MLS instructions">; // Some targets have special RAW hazards for VFP/NEON VMLA/VMLS. def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards", @@ -162,15 +190,18 @@ def FeatureHasVMLxHazards : SubtargetFeature<"vmlx-hazards", "HasVMLxHazards", // Some targets (e.g. Cortex-A9) want to convert VMOVRS, VMOVSR and VMOVS from // VFP to NEON, as an execution domain optimization. -def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", "UseNEONForFPMovs", - "true", "Convert VMOVSR, VMOVRS, VMOVS to NEON">; +def FeatureNEONForFPMovs : SubtargetFeature<"neon-fpmovs", + "UseNEONForFPMovs", "true", + "Convert VMOVSR, VMOVRS, " + "VMOVS to NEON">; // Some processors benefit from using NEON instructions for scalar // single-precision FP operations. This affects instruction selection and should // only be enabled if the handling of denormals is not important. -def FeatureNEONForFP : SubtargetFeature<"neonfp", "UseNEONForSinglePrecisionFP", - "true", - "Use NEON for single precision FP">; +def FeatureNEONForFP : SubtargetFeature<"neonfp", + "UseNEONForSinglePrecisionFP", + "true", + "Use NEON for single precision FP">; // On some processors, VLDn instructions that access unaligned data take one // extra cycle. 
Take that into account when computing operand latencies. @@ -181,18 +212,18 @@ def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign", // Some processors have a nonpipelined VFP coprocessor. def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp", "NonpipelinedVFP", "true", - "VFP instructions are not pipelined">; + "VFP instructions are not pipelined">; // Some processors have FP multiply-accumulate instructions that don't // play nicely with other VFP / NEON instructions, and it's generally better // to just not use them. -def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", - "Disable VFP / NEON MAC instructions">; +def FeatureHasSlowFPVMLx : SubtargetFeature<"slowfpvmlx", "SlowFPVMLx", "true", + "Disable VFP / NEON MAC instructions">; // Cortex-A8 / A9 Advanced SIMD has multiplier accumulator forwarding. def FeatureVMLxForwarding : SubtargetFeature<"vmlx-forwarding", - "HasVMLxForwarding", "true", - "Has multiplier accumulator forwarding">; + "HasVMLxForwarding", "true", + "Has multiplier accumulator forwarding">; // Disable 32-bit to 16-bit narrowing for experimentation. def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Pref32BitThumb", "true", @@ -213,14 +244,16 @@ def FeatureCheapPredicableCPSR : SubtargetFeature<"cheap-predicable-cpsr", "true", "Disable +1 predication cost for instructions updating CPSR">; -def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", - "AvoidMOVsShifterOperand", "true", - "Avoid movs instructions with shifter operand">; +def FeatureAvoidMOVsShOp : SubtargetFeature<"avoid-movs-shop", + "AvoidMOVsShifterOperand", "true", + "Avoid movs instructions with " + "shifter operand">; // Some processors perform return stack prediction. CodeGen should avoid issue // "normal" call instructions to callees which do not return. -def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", "HasRetAddrStack", "true", - "Has return address stack">; +def FeatureHasRetAddrStack : SubtargetFeature<"ret-addr-stack", + "HasRetAddrStack", "true", + "Has return address stack">; // Some processors have no branch predictor, which changes the expected cost of // taking a branch which affects the choice of whether to use predicated @@ -230,63 +263,80 @@ def FeatureHasNoBranchPredictor : SubtargetFeature<"no-branch-predictor", "Has no branch predictor">; /// DSP extension. -def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", - "Supports DSP instructions in ARM and/or Thumb2">; +def FeatureDSP : SubtargetFeature<"dsp", "HasDSP", "true", + "Supports DSP instructions in " + "ARM and/or Thumb2">; // Multiprocessing extension. -def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", - "Supports Multiprocessing extension">; +def FeatureMP : SubtargetFeature<"mp", "HasMPExtension", "true", + "Supports Multiprocessing extension">; // Virtualization extension - requires HW divide (ARMv7-AR ARMARM - 4.4.8). def FeatureVirtualization : SubtargetFeature<"virtualization", - "HasVirtualization", "true", - "Supports Virtualization extension", - [FeatureHWDivThumb, FeatureHWDivARM]>; + "HasVirtualization", "true", + "Supports Virtualization extension", + [FeatureHWDivThumb, FeatureHWDivARM]>; -// M-series ISA -def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", - "Is microcontroller profile ('M' series)">; +// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. +// See ARMInstrInfo.td for details. 
+def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", + "NaCl trap">; -// R-series ISA -def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass", - "Is realtime profile ('R' series)">; +def FeatureStrictAlign : SubtargetFeature<"strict-align", + "StrictAlign", "true", + "Disallow all unaligned memory " + "access">; + +def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", + "Generate calls via indirect call " + "instructions">; + +def FeatureExecuteOnly : SubtargetFeature<"execute-only", + "GenExecuteOnly", "true", + "Enable the generation of " + "execute only code.">; + +def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", + "Reserve R9, making it unavailable" + " as GPR">; + +def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", + "Don't use movt/movw pairs for " + "32-bit imms">; + +def FeatureNoNegativeImmediates + : SubtargetFeature<"no-neg-immediates", + "NegativeImmediates", "false", + "Convert immediates and instructions " + "to their negated or complemented " + "equivalent when the immediate does " + "not fit in the encoding.">; + + +//===----------------------------------------------------------------------===// +// ARM architecture class +// // A-series ISA def FeatureAClass : SubtargetFeature<"aclass", "ARMProcClass", "AClass", "Is application profile ('A' series)">; -// Special TRAP encoding for NaCl, which looks like a TRAP in Thumb too. -// See ARMInstrInfo.td for details. -def FeatureNaClTrap : SubtargetFeature<"nacl-trap", "UseNaClTrap", "true", - "NaCl trap">; - -def FeatureStrictAlign : SubtargetFeature<"strict-align", - "StrictAlign", "true", - "Disallow all unaligned memory " - "access">; +// R-series ISA +def FeatureRClass : SubtargetFeature<"rclass", "ARMProcClass", "RClass", + "Is realtime profile ('R' series)">; -def FeatureLongCalls : SubtargetFeature<"long-calls", "GenLongCalls", "true", - "Generate calls via indirect call " - "instructions">; +// M-series ISA +def FeatureMClass : SubtargetFeature<"mclass", "ARMProcClass", "MClass", + "Is microcontroller profile ('M' series)">; -def FeatureExecuteOnly - : SubtargetFeature<"execute-only", "GenExecuteOnly", "true", - "Enable the generation of execute only code.">; -def FeatureReserveR9 : SubtargetFeature<"reserve-r9", "ReserveR9", "true", - "Reserve R9, making it unavailable as " - "GPR">; +def FeatureThumb2 : SubtargetFeature<"thumb2", "HasThumb2", "true", + "Enable Thumb2 instructions">; -def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true", - "Don't use movt/movw pairs for 32-bit " - "imms">; +def FeatureNoARM : SubtargetFeature<"noarm", "NoARM", "true", + "Does not support ARM mode execution", + [ModeThumb]>; -def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", - "NegativeImmediates", "false", - "Convert immediates and instructions " - "to their negated or complemented " - "equivalent when the immediate does " - "not fit in the encoding.">; //===----------------------------------------------------------------------===// // ARM ISAa. 
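Editorial note on the ARM.td hunks above: apart from re-wrapping long description strings, the patch groups the profile-class features (A/R/M), Thumb2 and NoARM into an "ARM architecture class" section and moves the NaCl-trap and code-generation options next to them. Below is a minimal TableGen sketch of the pattern these records follow — a SubtargetFeature that flips a boolean on the subtarget and may imply other features, plus a processor definition that enables it. FeatureExample and example-cpu are hypothetical names used only for illustration and are not part of the patch.

// Hypothetical feature: sets the subtarget boolean HasExample and, like
// FeatureVirtualization above, lists implied features that are enabled with it.
def FeatureExample : SubtargetFeature<"example", "HasExample", "true",
                                      "Enable the hypothetical Example extension",
                                      [FeatureThumb2]>;

// Hypothetical processor: an architecture record plus tuning features,
// following the ProcNoItin pattern used for the real CPUs later in the file.
def : ProcNoItin<"example-cpu", [ARMv7a, FeatureExample]>;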
@@ -294,43 +344,57 @@ def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates", def HasV4TOps : SubtargetFeature<"v4t", "HasV4TOps", "true", "Support ARM v4T instructions">; + def HasV5TOps : SubtargetFeature<"v5t", "HasV5TOps", "true", "Support ARM v5T instructions", [HasV4TOps]>; + def HasV5TEOps : SubtargetFeature<"v5te", "HasV5TEOps", "true", - "Support ARM v5TE, v5TEj, and v5TExp instructions", + "Support ARM v5TE, v5TEj, and " + "v5TExp instructions", [HasV5TOps]>; + def HasV6Ops : SubtargetFeature<"v6", "HasV6Ops", "true", "Support ARM v6 instructions", [HasV5TEOps]>; + def HasV6MOps : SubtargetFeature<"v6m", "HasV6MOps", "true", "Support ARM v6M instructions", [HasV6Ops]>; + def HasV8MBaselineOps : SubtargetFeature<"v8m", "HasV8MBaselineOps", "true", "Support ARM v8M Baseline instructions", [HasV6MOps]>; + def HasV6KOps : SubtargetFeature<"v6k", "HasV6KOps", "true", "Support ARM v6k instructions", [HasV6Ops]>; + def HasV6T2Ops : SubtargetFeature<"v6t2", "HasV6T2Ops", "true", "Support ARM v6t2 instructions", [HasV8MBaselineOps, HasV6KOps, FeatureThumb2]>; + def HasV7Ops : SubtargetFeature<"v7", "HasV7Ops", "true", "Support ARM v7 instructions", [HasV6T2Ops, FeaturePerfMon, FeatureV7Clrex]>; + +def HasV8MMainlineOps : + SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true", + "Support ARM v8M Mainline instructions", + [HasV7Ops]>; + def HasV8Ops : SubtargetFeature<"v8", "HasV8Ops", "true", "Support ARM v8 instructions", [HasV7Ops, FeatureAcquireRelease]>; + def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true", "Support ARM v8.1a instructions", [HasV8Ops]>; -def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", + +def HasV8_2aOps : SubtargetFeature<"v8.2a", "HasV8_2aOps", "true", "Support ARM v8.2a instructions", [HasV8_1aOps]>; -def HasV8MMainlineOps : SubtargetFeature<"v8m.main", "HasV8MMainlineOps", "true", - "Support ARM v8M Mainline instructions", - [HasV7Ops]>; //===----------------------------------------------------------------------===// @@ -386,11 +450,17 @@ def ProcR52 : SubtargetFeature<"r52", "ARMProcFamily", "CortexR52", def ProcM3 : SubtargetFeature<"m3", "ARMProcFamily", "CortexM3", "Cortex-M3 ARM processors", []>; + //===----------------------------------------------------------------------===// -// ARM schedules. +// ARM Helper classes. // -include "ARMSchedule.td" +class Architecture<string fname, string aname, list<SubtargetFeature> features> + : SubtargetFeature<fname, "ARMArch", aname, + !strconcat(aname, " architecture"), features>; + +class ProcNoItin<string Name, list<SubtargetFeature> Features> + : Processor<Name, NoItineraries, Features>; //===----------------------------------------------------------------------===// @@ -547,12 +617,21 @@ def ARMv7s : Architecture<"armv7s", "ARMv7a", [ARMv7a]>; //===----------------------------------------------------------------------===// +// ARM schedules. +//===----------------------------------------------------------------------===// +// +include "ARMSchedule.td" + +//===----------------------------------------------------------------------===// // ARM processors // // Dummy CPU, used to target architectures def : ProcessorModel<"generic", CortexA8Model, []>; +// FIXME: Several processors below are not using their own scheduler +// model, but one of similar/previous processor. These should be fixed. 
+ def : ProcNoItin<"arm8", [ARMv4]>; def : ProcNoItin<"arm810", [ARMv4]>; def : ProcNoItin<"strongarm", [ARMv4]>; @@ -612,7 +691,6 @@ def : Processor<"arm1156t2f-s", ARMV6Itineraries, [ARMv6t2, FeatureVFP2, FeatureHasSlowFPVMLx]>; -// FIXME: A5 has currently the same Schedule model as A8 def : ProcessorModel<"cortex-a5", CortexA8Model, [ARMv7a, ProcA5, FeatureHasRetAddrStack, FeatureTrustZone, @@ -656,7 +734,6 @@ def : ProcessorModel<"cortex-a9", CortexA9Model, [ARMv7a, ProcA9, FeatureCheckVLDnAlign, FeatureMP]>; -// FIXME: A12 has currently the same Schedule model as A9 def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, FeatureHasRetAddrStack, FeatureTrustZone, @@ -666,7 +743,6 @@ def : ProcessorModel<"cortex-a12", CortexA9Model, [ARMv7a, ProcA12, FeatureVirtualization, FeatureMP]>; -// FIXME: A15 has currently the same Schedule model as A9. def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, FeatureDontWidenVMOVS, FeatureHasRetAddrStack, @@ -678,7 +754,6 @@ def : ProcessorModel<"cortex-a15", CortexA9Model, [ARMv7a, ProcA15, FeatureAvoidPartialCPSR, FeatureVirtualization]>; -// FIXME: A17 has currently the same Schedule model as A9 def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, FeatureHasRetAddrStack, FeatureTrustZone, @@ -688,9 +763,7 @@ def : ProcessorModel<"cortex-a17", CortexA9Model, [ARMv7a, ProcA17, FeatureAvoidPartialCPSR, FeatureVirtualization]>; -// FIXME: krait has currently the same Schedule model as A9 -// FIXME: krait has currently the same features as A9 plus VFP4 and hardware -// division features. +// FIXME: krait has currently the same features as A9 plus VFP4 and HWDiv def : ProcessorModel<"krait", CortexA9Model, [ARMv7a, ProcKrait, FeatureHasRetAddrStack, FeatureMuxedUnits, @@ -720,12 +793,10 @@ def : ProcessorModel<"swift", SwiftModel, [ARMv7a, ProcSwift, FeatureSlowVGETLNi32, FeatureSlowVDUP32]>; -// FIXME: R4 has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r4", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureAvoidPartialCPSR]>; -// FIXME: R4F has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureHasRetAddrStack, FeatureSlowFPBrcc, @@ -734,7 +805,6 @@ def : ProcessorModel<"cortex-r4f", CortexA8Model, [ARMv7r, ProcR4, FeatureD16, FeatureAvoidPartialCPSR]>; -// FIXME: R5 has currently the same ProcessorModel as A8. def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasRetAddrStack, FeatureVFP3, @@ -744,7 +814,6 @@ def : ProcessorModel<"cortex-r5", CortexA8Model, [ARMv7r, ProcR5, FeatureHasSlowFPVMLx, FeatureAvoidPartialCPSR]>; -// FIXME: R7 has currently the same ProcessorModel as A8 and is modelled as R5. 
def : ProcessorModel<"cortex-r7", CortexA8Model, [ARMv7r, ProcR7, FeatureHasRetAddrStack, FeatureVFP3, @@ -814,14 +883,14 @@ def : ProcNoItin<"cortex-a53", [ARMv8a, ProcA53, FeatureCRC, FeatureFPAO]>; -def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, - FeatureHWDivThumb, - FeatureHWDivARM, - FeatureCrypto, - FeatureCRC, - FeatureFPAO, - FeatureAvoidPartialCPSR, - FeatureCheapPredicableCPSR]>; +def : ProcessorModel<"cortex-a57", CortexA57Model, [ARMv8a, ProcA57, + FeatureHWDivThumb, + FeatureHWDivARM, + FeatureCrypto, + FeatureCRC, + FeatureFPAO, + FeatureAvoidPartialCPSR, + FeatureCheapPredicableCPSR]>; def : ProcNoItin<"cortex-a72", [ARMv8a, ProcA72, FeatureHWDivThumb, @@ -835,7 +904,6 @@ def : ProcNoItin<"cortex-a73", [ARMv8a, ProcA73, FeatureCrypto, FeatureCRC]>; -// Cyclone is very similar to swift def : ProcessorModel<"cyclone", SwiftModel, [ARMv8a, ProcSwift, FeatureHasRetAddrStack, FeatureNEONForFP, @@ -881,9 +949,7 @@ def : ProcessorModel<"cortex-r52", CortexR52Model, [ARMv8r, ProcR52, //===----------------------------------------------------------------------===// include "ARMRegisterInfo.td" - include "ARMRegisterBanks.td" - include "ARMCallingConv.td" //===----------------------------------------------------------------------===// @@ -891,7 +957,6 @@ include "ARMCallingConv.td" //===----------------------------------------------------------------------===// include "ARMInstrInfo.td" - def ARMInstrInfo : InstrInfo; //===----------------------------------------------------------------------===// @@ -912,7 +977,7 @@ def ARMAsmParserVariant : AsmParserVariant { } def ARM : Target { - // Pull in Instruction Info: + // Pull in Instruction Info. let InstructionSet = ARMInstrInfo; let AssemblyWriters = [ARMAsmWriter]; let AssemblyParserVariants = [ARMAsmParserVariant]; diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index e97a7ce5067f9..370c0a7f5c537 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -117,7 +117,7 @@ ARMBaseRegisterInfo::getCallPreservedMask(const MachineFunction &MF, CallingConv::ID CC) const { const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>(); if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return CSR_NoRegs_RegMask; if (STI.isTargetDarwin() && STI.getTargetLowering()->supportSwiftError() && @@ -163,7 +163,7 @@ ARMBaseRegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF, // both or otherwise does not want to enable this optimization, the function // should return NULL if (CC == CallingConv::GHC) - // This is academic becase all GHC calls are (supposed to be) tail calls + // This is academic because all GHC calls are (supposed to be) tail calls return nullptr; return STI.isTargetDarwin() ? CSR_iOS_ThisReturn_RegMask : CSR_AAPCS_ThisReturn_RegMask; diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 384f80356cc84..bf00ef61c2d1b 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -250,8 +250,7 @@ bool ARMFastISel::DefinesOptionalPredicate(MachineInstr *MI, bool *CPSR) { return false; // Look to see if our OptionalDef is defining CPSR or CCR. 
- for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); + for (const MachineOperand &MO : MI->operands()) { if (!MO.isReg() || !MO.isDef()) continue; if (MO.getReg() == ARM::CPSR) *CPSR = true; @@ -267,8 +266,8 @@ bool ARMFastISel::isARMNEONPred(const MachineInstr *MI) { AFI->isThumb2Function()) return MI->isPredicable(); - for (unsigned i = 0, e = MCID.getNumOperands(); i != e; ++i) - if (MCID.OpInfo[i].isPredicate()) + for (const MCOperandInfo &opInfo : MCID.operands()) + if (opInfo.isPredicate()) return true; return false; @@ -1972,7 +1971,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, break; } case CCValAssign::AExt: - // Intentional fall-through. Handle AExt and ZExt. + // Intentional fall-through. Handle AExt and ZExt. case CCValAssign::ZExt: { MVT DestVT = VA.getLocVT(); Arg = ARMEmitIntExt(ArgVT, Arg, DestVT, /*isZExt*/true); @@ -2001,6 +2000,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl<Value*> &Args, assert(VA.getLocVT() == MVT::f64 && "Custom lowering for v2f64 args not available"); + // FIXME: ArgLocs[++i] may extend beyond ArgLocs.size() CCValAssign &NextVA = ArgLocs[++i]; assert(VA.isRegLoc() && NextVA.isRegLoc() && @@ -2172,8 +2172,8 @@ bool ARMFastISel::SelectRet(const Instruction *I) { MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(RetOpc)); AddOptionalDefs(MIB); - for (unsigned i = 0, e = RetRegs.size(); i != e; ++i) - MIB.addReg(RetRegs[i], RegState::Implicit); + for (unsigned R : RetRegs) + MIB.addReg(R, RegState::Implicit); return true; } @@ -2233,8 +2233,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { ArgRegs.reserve(I->getNumOperands()); ArgVTs.reserve(I->getNumOperands()); ArgFlags.reserve(I->getNumOperands()); - for (unsigned i = 0; i < I->getNumOperands(); ++i) { - Value *Op = I->getOperand(i); + for (Value *Op : I->operands()) { unsigned Arg = getRegForValue(Op); if (Arg == 0) return false; @@ -2278,8 +2277,8 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) { MIB.addExternalSymbol(TLI.getLibcallName(Call)); // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (unsigned R : RegArgs) + MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). @@ -2423,8 +2422,8 @@ bool ARMFastISel::SelectCall(const Instruction *I, MIB.addExternalSymbol(IntrMemName, 0); // Add implicit physical register uses to the call. - for (unsigned i = 0, e = RegArgs.size(); i != e; ++i) - MIB.addReg(RegArgs[i], RegState::Implicit); + for (unsigned R : RegArgs) + MIB.addReg(R, RegState::Implicit); // Add a register mask with the call-preserved registers. // Proper defs for return values will be added by setPhysRegsDeadExcept(). 
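Editorial note: the ARMFastISel.cpp hunks in this file are mostly mechanical conversions of index-based loops into range-based for loops over the same containers (MI->operands(), MCID.operands(), RetRegs, RegArgs, I->operands(), F->args()). A self-contained C++ sketch of the idiom, using an ordinary std::vector<unsigned> as a stand-in for the register lists (the function names here are hypothetical):

#include <vector>

// Index-based form: the induction variable exists only to fetch the element.
unsigned sumIndexed(const std::vector<unsigned> &Regs) {
  unsigned Sum = 0;
  for (unsigned i = 0, e = Regs.size(); i != e; ++i)
    Sum += Regs[i];
  return Sum;
}

// Range-based form, as the patch uses for RetRegs and RegArgs: iterate the
// elements directly, removing the index bookkeeping without changing behaviour.
unsigned sumRanged(const std::vector<unsigned> &Regs) {
  unsigned Sum = 0;
  for (unsigned R : Regs)
    Sum += R;
  return Sum;
}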
@@ -2932,13 +2931,12 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, bool Found = false; bool isZExt; - for (unsigned i = 0, e = array_lengthof(FoldableLoadExtends); - i != e; ++i) { - if (FoldableLoadExtends[i].Opc[isThumb2] == MI->getOpcode() && - (uint64_t)FoldableLoadExtends[i].ExpectedImm == Imm && - MVT((MVT::SimpleValueType)FoldableLoadExtends[i].ExpectedVT) == VT) { + for (const FoldableLoadExtendsStruct &FLE : FoldableLoadExtends) { + if (FLE.Opc[isThumb2] == MI->getOpcode() && + (uint64_t)FLE.ExpectedImm == Imm && + MVT((MVT::SimpleValueType)FLE.ExpectedVT) == VT) { Found = true; - isZExt = FoldableLoadExtends[i].isZExt; + isZExt = FLE.isZExt; } } if (!Found) return false; @@ -3057,9 +3055,8 @@ bool ARMFastISel::fastLowerArguments() { }; const TargetRegisterClass *RC = &ARM::rGPRRegClass; - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); - I != E; ++I) { - unsigned ArgNo = I->getArgNo(); + for (const Argument &Arg : F->args()) { + unsigned ArgNo = Arg.getArgNo(); unsigned SrcReg = GPRArgRegs[ArgNo]; unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC); // FIXME: Unfortunately it's necessary to emit a copy from the livein copy. @@ -3069,7 +3066,7 @@ bool ARMFastISel::fastLowerArguments() { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY), ResultReg).addReg(DstReg, getKillRegState(true)); - updateValueMap(&*I, ResultReg); + updateValueMap(&Arg, ResultReg); } return true; diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp index 7f9fe55a5c38b..f75dd4de3f96c 100644 --- a/lib/Target/ARM/ARMISelDAGToDAG.cpp +++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2682,9 +2682,12 @@ void ARMDAGToDAGISel::Select(SDNode *N) { SDNode *ResNode; if (Subtarget->isThumb()) { - SDValue Pred = getAL(CurDAG, dl); - SDValue PredReg = CurDAG->getRegister(0, MVT::i32); - SDValue Ops[] = { CPIdx, Pred, PredReg, CurDAG->getEntryNode() }; + SDValue Ops[] = { + CPIdx, + getAL(CurDAG, dl), + CurDAG->getRegister(0, MVT::i32), + CurDAG->getEntryNode() + }; ResNode = CurDAG->getMachineNode(ARM::tLDRpci, dl, MVT::i32, MVT::Other, Ops); } else { @@ -2698,6 +2701,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) { ResNode = CurDAG->getMachineNode(ARM::LDRcp, dl, MVT::i32, MVT::Other, Ops); } + // Annotate the Node with memory operand information so that MachineInstr + // queries work properly. This e.g. gives the register allocation the + // required information for rematerialization. + MachineFunction& MF = CurDAG->getMachineFunction(); + MachineSDNode::mmo_iterator MemOp = MF.allocateMemRefsArray(1); + MemOp[0] = MF.getMachineMemOperand( + MachinePointerInfo::getConstantPool(MF), + MachineMemOperand::MOLoad, 4, 4); + + cast<MachineSDNode>(ResNode)->setMemRefs(MemOp, MemOp+1); + ReplaceNode(N, ResNode); return; } diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 29ef69ad0010f..faed6b867e2bc 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -722,6 +722,29 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { return false; break; } + case G_BRCOND: { + if (!validReg(MRI, I.getOperand(0).getReg(), 1, ARM::GPRRegBankID)) { + DEBUG(dbgs() << "Unsupported condition register for G_BRCOND"); + return false; + } + + // Set the flags. 
+ auto Test = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::TSTri)) + .addReg(I.getOperand(0).getReg()) + .addImm(1) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*Test, TII, TRI, RBI)) + return false; + + // Branch conditionally. + auto Branch = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(ARM::Bcc)) + .add(I.getOperand(1)) + .add(predOps(ARMCC::EQ, ARM::CPSR)); + if (!constrainSelectedInstRegOperands(*Branch, TII, TRI, RBI)) + return false; + I.eraseFromParent(); + return true; + } default: return false; } diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index f23e62595d2e9..1c17c07e4cb00 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -66,14 +66,16 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, s32}, Libcall); } - // FIXME: Support s8 and s16 as well - for (unsigned Op : {G_SREM, G_UREM}) + for (unsigned Op : {G_SREM, G_UREM}) { + for (auto Ty : {s8, s16}) + setAction({Op, Ty}, WidenScalar); if (ST.hasDivideInARMMode()) setAction({Op, s32}, Lower); else if (AEABI(ST)) setAction({Op, s32}, Custom); else setAction({Op, s32}, Libcall); + } for (unsigned Op : {G_SEXT, G_ZEXT}) { setAction({Op, s32}, Legal); @@ -88,6 +90,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_SELECT, p0}, Legal); setAction({G_SELECT, 1, s1}, Legal); + setAction({G_BRCOND, s1}, Legal); + setAction({G_CONSTANT, s32}, Legal); for (auto Ty : {s1, s8, s16}) setAction({G_CONSTANT, Ty}, WidenScalar); diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp index 13acea3c28a9e..48b02d40b2466 100644 --- a/lib/Target/ARM/ARMMCInstLower.cpp +++ b/lib/Target/ARM/ARMMCInstLower.cpp @@ -153,9 +153,7 @@ void llvm::LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, break; } - for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { - const MachineOperand &MO = MI->getOperand(i); - + for (const MachineOperand &MO : MI->operands()) { MCOperand MCOp; if (AP.lowerOperand(MO, MCOp)) { if (MCOp.isImm() && EncodeImms) { diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index c0c09e8c15afd..8449302358948 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -259,6 +259,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { break; case G_SELECT: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + (void)Ty; LLT Ty2 = MRI.getType(MI.getOperand(1).getReg()); (void)Ty2; assert(Ty.getSizeInBits() == 32 && "Unsupported size for G_SELECT"); @@ -282,6 +283,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } case G_FCMP: { LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + (void)Ty; LLT Ty1 = MRI.getType(MI.getOperand(2).getReg()); LLT Ty2 = MRI.getType(MI.getOperand(3).getReg()); (void)Ty2; @@ -329,6 +331,13 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { &ARM::ValueMappings[ARM::DPR3OpsIdx]}); break; } + case G_BR: + OperandsMapping = getOperandsMapping({nullptr}); + break; + case G_BRCOND: + OperandsMapping = + getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); + break; default: return getInvalidInstructionMapping(); } diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index f41da3e8e2232..22ce949367f34 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -47,6 +47,8 @@ 
public: ~ARMBaseTargetMachine() override; const ARMSubtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. const ARMSubtarget *getSubtargetImpl() const = delete; bool isLittleEndian() const { return isLittle; } diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index cc7a7c3849bca..81b0aa7f8b98f 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -515,8 +515,9 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const { const TargetInstrInfo &TII = *BB->getParent()->getSubtarget().getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); + bool isSelectOp = MI.getOpcode() == BPF::Select; - assert(MI.getOpcode() == BPF::Select && "Unexpected instr type to insert"); + assert((isSelectOp || MI.getOpcode() == BPF::Select_Ri) && "Unexpected instr type to insert"); // To "insert" a SELECT instruction, we actually have to insert the diamond // control-flow pattern. The incoming instruction knows the destination vreg @@ -548,48 +549,40 @@ BPFTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, // Insert Branch if Flag unsigned LHS = MI.getOperand(1).getReg(); - unsigned RHS = MI.getOperand(2).getReg(); int CC = MI.getOperand(3).getImm(); + int NewCC; switch (CC) { case ISD::SETGT: - BuildMI(BB, DL, TII.get(BPF::JSGT_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JSGT_rr : BPF::JSGT_ri; break; case ISD::SETUGT: - BuildMI(BB, DL, TII.get(BPF::JUGT_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JUGT_rr : BPF::JUGT_ri; break; case ISD::SETGE: - BuildMI(BB, DL, TII.get(BPF::JSGE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JSGE_rr : BPF::JSGE_ri; break; case ISD::SETUGE: - BuildMI(BB, DL, TII.get(BPF::JUGE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JUGE_rr : BPF::JUGE_ri; break; case ISD::SETEQ: - BuildMI(BB, DL, TII.get(BPF::JEQ_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JEQ_rr : BPF::JEQ_ri; break; case ISD::SETNE: - BuildMI(BB, DL, TII.get(BPF::JNE_rr)) - .addReg(LHS) - .addReg(RHS) - .addMBB(Copy1MBB); + NewCC = isSelectOp ? BPF::JNE_rr : BPF::JNE_ri; break; default: report_fatal_error("unimplemented select CondCode " + Twine(CC)); } + if (isSelectOp) + BuildMI(BB, DL, TII.get(NewCC)) + .addReg(LHS) + .addReg(MI.getOperand(2).getReg()) + .addMBB(Copy1MBB); + else + BuildMI(BB, DL, TII.get(NewCC)) + .addReg(LHS) + .addImm(MI.getOperand(2).getImm()) + .addMBB(Copy1MBB); // Copy0MBB: // %FalseValue = ... diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td index 5ad7772682084..f68357809add2 100644 --- a/lib/Target/BPF/BPFInstrInfo.td +++ b/lib/Target/BPF/BPFInstrInfo.td @@ -460,6 +460,11 @@ let usesCustomInserter = 1 in { "# Select PSEUDO $dst = $lhs $imm $rhs ? $src : $src2", [(set i64:$dst, (BPFselectcc i64:$lhs, i64:$rhs, (i64 imm:$imm), i64:$src, i64:$src2))]>; + def Select_Ri : Pseudo<(outs GPR:$dst), + (ins GPR:$lhs, i64imm:$rhs, i64imm:$imm, GPR:$src, GPR:$src2), + "# Select PSEUDO $dst = $lhs $imm $rhs ? 
$src : $src2", + [(set i64:$dst, + (BPFselectcc i64:$lhs, (i64 imm:$rhs), (i64 imm:$imm), i64:$src, i64:$src2))]>; } // load 64-bit global addr into register diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index b064778c4bbd3..d75d95a6baeae 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexbit" - #include "HexagonBitTracker.h" #include "HexagonTargetMachine.h" #include "llvm/ADT/BitVector.h" @@ -42,6 +40,8 @@ #include <utility> #include <vector> +#define DEBUG_TYPE "hexbit" + using namespace llvm; static cl::opt<bool> PreserveTiedOps("hexbit-keep-tied", cl::Hidden, diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td index 2dc74632e9be2..30ebf89c98083 100644 --- a/lib/Target/Hexagon/HexagonDepInstrInfo.td +++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td @@ -45863,6 +45863,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000000; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dccleaninva : HInst< (outs), @@ -45872,6 +45873,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000010; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dcfetch : HInst< (outs), @@ -45900,6 +45902,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000001; let isSoloAin1 = 1; +let hasSideEffects = 1; } def Y2_dczeroa : HInst< (outs), @@ -45909,6 +45912,7 @@ tc_30665cb0, TypeST>, Enc_ecbcc8 { let Inst{13-0} = 0b00000000000000; let Inst{31-21} = 0b10100000110; let isSoloAin1 = 1; +let hasSideEffects = 1; let mayStore = 1; } def Y2_icinva : HInst< diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index 03c4a83594b33..80361015e6499 100644 --- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -59,8 +59,6 @@ // J2_jump <BB#6>, %PC<imp-def,dead> // Successors according to CFG: BB#6 BB#3 -#define DEBUG_TYPE "hexagon-eif" - #include "Hexagon.h" #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" @@ -90,6 +88,8 @@ #include <cassert> #include <iterator> +#define DEBUG_TYPE "hexagon-eif" + using namespace llvm; namespace llvm { diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp index 734f3c6658d92..a2f6dd68c1a13 100644 --- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp +++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp @@ -86,8 +86,6 @@ // however, is that finding the locations where the implicit uses need // to be added, and updating the live ranges will be more involved. 
-#define DEBUG_TYPE "expand-condsets" - #include "HexagonInstrInfo.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SetVector.h" @@ -116,6 +114,8 @@ #include <set> #include <utility> +#define DEBUG_TYPE "expand-condsets" + using namespace llvm; static cl::opt<unsigned> OptTfrLimit("expand-condsets-tfr-limit", diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index c790579ccebc0..e5e75198b2d18 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -8,8 +8,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexagon-pei" - #include "HexagonFrameLowering.h" #include "HexagonBlockRanges.h" #include "HexagonInstrInfo.h" @@ -63,6 +61,8 @@ #include <utility> #include <vector> +#define DEBUG_TYPE "hexagon-pei" + // Hexagon stack frame layout as defined by the ABI: // // Incoming arguments diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp index bf31e16992840..0a955aedaf1a8 100644 --- a/lib/Target/Hexagon/HexagonGenInsert.cpp +++ b/lib/Target/Hexagon/HexagonGenInsert.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "hexinsert" - #include "BitTracker.h" #include "HexagonBitTracker.h" #include "HexagonInstrInfo.h" @@ -44,6 +42,8 @@ #include <utility> #include <vector> +#define DEBUG_TYPE "hexinsert" + using namespace llvm; static cl::opt<unsigned> VRegIndexCutoff("insert-vreg-cutoff", cl::init(~0U), diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp index 3470480d607dc..2da211563e0a1 100644 --- a/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -7,8 +7,6 @@ // //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "gen-pred" - #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "llvm/ADT/SetVector.h" @@ -35,6 +33,8 @@ #include <set> #include <utility> +#define DEBUG_TYPE "gen-pred" + using namespace llvm; namespace llvm { diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 67242764d4531..3997702bc962d 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -1286,16 +1286,6 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { MachinePointerInfo(SV)); } -// Creates a SPLAT instruction for a constant value VAL. -static SDValue createSplat(SelectionDAG &DAG, const SDLoc &dl, EVT VT, - SDValue Val) { - EVT T = VT.getVectorElementType(); - if (T == MVT::i8 || T == MVT::i16) - return DAG.getNode(HexagonISD::VSPLAT, dl, VT, Val); - - return SDValue(); -} - static bool isSExtFree(SDValue N) { // A sign-extend of a truncate of a sign-extend is free. if (N.getOpcode() == ISD::TRUNCATE && @@ -1374,79 +1364,6 @@ HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { return SDValue(); } -// Handle only specific vector loads. 
-SDValue HexagonTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const { - EVT VT = Op.getValueType(); - SDLoc DL(Op); - LoadSDNode *LoadNode = cast<LoadSDNode>(Op); - SDValue Chain = LoadNode->getChain(); - SDValue Ptr = Op.getOperand(1); - SDValue LoweredLoad; - SDValue Result; - SDValue Base = LoadNode->getBasePtr(); - ISD::LoadExtType Ext = LoadNode->getExtensionType(); - unsigned Alignment = LoadNode->getAlignment(); - SDValue LoadChain; - - if(Ext == ISD::NON_EXTLOAD) - Ext = ISD::ZEXTLOAD; - - if (VT == MVT::v4i16) { - if (Alignment == 2) { - SDValue Loads[4]; - // Base load. - Loads[0] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Base, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // Base+2 load. - SDValue Increment = DAG.getConstant(2, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[1] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // SHL 16, then OR base and base+2. - SDValue ShiftAmount = DAG.getConstant(16, DL, MVT::i32); - SDValue Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[1], ShiftAmount); - SDValue Tmp2 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[0]); - // Base + 4. - Increment = DAG.getConstant(4, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[2] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // Base + 6. - Increment = DAG.getConstant(6, DL, MVT::i32); - Ptr = DAG.getNode(ISD::ADD, DL, Base.getValueType(), Base, Increment); - Loads[3] = DAG.getExtLoad(Ext, DL, MVT::i32, Chain, Ptr, - LoadNode->getPointerInfo(), MVT::i16, Alignment, - LoadNode->getMemOperand()->getFlags()); - // SHL 16, then OR base+4 and base+6. - Tmp1 = DAG.getNode(ISD::SHL, DL, MVT::i32, Loads[3], ShiftAmount); - SDValue Tmp4 = DAG.getNode(ISD::OR, DL, MVT::i32, Tmp1, Loads[2]); - // Combine to i64. This could be optimised out later if we can - // affect reg allocation of this code. - Result = DAG.getNode(HexagonISD::COMBINE, DL, MVT::i64, Tmp4, Tmp2); - LoadChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, - Loads[0].getValue(1), Loads[1].getValue(1), - Loads[2].getValue(1), Loads[3].getValue(1)); - } else { - // Perform default type expansion. - Result = DAG.getLoad(MVT::i64, DL, Chain, Ptr, LoadNode->getPointerInfo(), - LoadNode->getAlignment(), - LoadNode->getMemOperand()->getFlags()); - LoadChain = Result.getValue(1); - } - } else - llvm_unreachable("Custom lowering unsupported load"); - - Result = DAG.getNode(ISD::BITCAST, DL, VT, Result); - // Since we pretend to lower a load, we need the original chain - // info attached to the result. - SDValue Ops[] = { Result, LoadChain }; - - return DAG.getMergeValues(Ops, DL); -} - SDValue HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const { EVT ValTy = Op.getValueType(); @@ -1971,18 +1888,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, // Handling of vector operations. // - // Custom lower v4i16 load only. Let v4i16 store to be - // promoted for now. 
promoteLdStType(MVT::v4i8, MVT::i32); promoteLdStType(MVT::v2i16, MVT::i32); promoteLdStType(MVT::v8i8, MVT::i64); + promoteLdStType(MVT::v4i16, MVT::i64); promoteLdStType(MVT::v2i32, MVT::i64); - setOperationAction(ISD::LOAD, MVT::v4i16, Custom); - setOperationAction(ISD::STORE, MVT::v4i16, Promote); - AddPromotedToType(ISD::LOAD, MVT::v4i16, MVT::i64); - AddPromotedToType(ISD::STORE, MVT::v4i16, MVT::i64); - // Set the action for vector operations to "expand", then override it with // either "custom" or "legal" for specific cases. static const unsigned VectExpOps[] = { @@ -2301,7 +2212,8 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE"; - case HexagonISD::VPACK: return "HexagonISD::VPACK"; + case HexagonISD::VPACKE: return "HexagonISD::VPACKE"; + case HexagonISD::VPACKO: return "HexagonISD::VPACKO"; case HexagonISD::VASL: return "HexagonISD::VASL"; case HexagonISD::VASR: return "HexagonISD::VASR"; case HexagonISD::VLSR: return "HexagonISD::VLSR"; @@ -2394,7 +2306,7 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) // Test if V1 is a SCALAR_TO_VECTOR. if (Lane == 0 && V1.getOpcode() == ISD::SCALAR_TO_VECTOR) - return createSplat(DAG, dl, VT, V1.getOperand(0)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0)); // Test if V1 is a BUILD_VECTOR which is equivalent to a SCALAR_TO_VECTOR // (and probably will turn into a SCALAR_TO_VECTOR once legalization @@ -2409,28 +2321,26 @@ HexagonTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) } } if (IsScalarToVector) - return createSplat(DAG, dl, VT, V1.getOperand(0)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, V1.getOperand(0)); } - return createSplat(DAG, dl, VT, DAG.getConstant(Lane, dl, MVT::i32)); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, + DAG.getConstant(Lane, dl, MVT::i32)); } if (UseHVX) { ArrayRef<int> Mask = SVN->getMask(); size_t MaskLen = Mask.size(); - int ElemSizeInBits = VT.getScalarSizeInBits(); - if ((Subtarget.useHVXSglOps() && (ElemSizeInBits * MaskLen) == 64 * 8) || - (Subtarget.useHVXDblOps() && (ElemSizeInBits * MaskLen) == 128 * 8)) { - // Return 1 for odd and 2 of even - StridedLoadKind Pattern = isStridedLoad(Mask); + unsigned SizeInBits = VT.getScalarSizeInBits() * MaskLen; + if ((Subtarget.useHVXSglOps() && SizeInBits == 64 * 8) || + (Subtarget.useHVXDblOps() && SizeInBits == 128 * 8)) { + StridedLoadKind Pattern = isStridedLoad(Mask); if (Pattern == StridedLoadKind::NoPattern) return SDValue(); - SDValue Vec0 = Op.getOperand(0); - SDValue Vec1 = Op.getOperand(1); - SDValue StridePattern = DAG.getConstant(Pattern, dl, MVT::i32); - SDValue Ops[] = { Vec1, Vec0, StridePattern }; - return DAG.getNode(HexagonISD::VPACK, dl, VT, Ops); + unsigned Opc = Pattern == StridedLoadKind::Even ? HexagonISD::VPACKE + : HexagonISD::VPACKO; + return DAG.getNode(Opc, dl, VT, {Op.getOperand(1), Op.getOperand(0)}); } // We used to assert in the "else" part here, but that is bad for Halide // Halide creates intermediate double registers by interleaving two @@ -2531,19 +2441,26 @@ HexagonTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (Size > 64) return SDValue(); - APInt APSplatBits, APSplatUndef; - unsigned SplatBitSize; - bool HasAnyUndefs; unsigned NElts = BVN->getNumOperands(); // Try to generate a SPLAT instruction. 
- if ((VT.getSimpleVT() == MVT::v4i8 || VT.getSimpleVT() == MVT::v4i16) && - (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, - HasAnyUndefs, 0, true) && SplatBitSize <= 16)) { - unsigned SplatBits = APSplatBits.getZExtValue(); - int32_t SextVal = ((int32_t) (SplatBits << (32 - SplatBitSize)) >> - (32 - SplatBitSize)); - return createSplat(DAG, dl, VT, DAG.getConstant(SextVal, dl, MVT::i32)); + if (VT == MVT::v4i8 || VT == MVT::v4i16 || VT == MVT::v2i32) { + APInt APSplatBits, APSplatUndef; + unsigned SplatBitSize; + bool HasAnyUndefs; + if (BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize, + HasAnyUndefs, 0, false)) { + if (SplatBitSize == VT.getVectorElementType().getSizeInBits()) { + unsigned ZV = APSplatBits.getZExtValue(); + assert(SplatBitSize <= 32 && "Can only handle up to i32"); + // Sign-extend the splat value from SplatBitSize to 32. + int32_t SV = SplatBitSize < 32 + ? int32_t(ZV << (32-SplatBitSize)) >> (32-SplatBitSize) + : int32_t(ZV); + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, + DAG.getConstant(SV, dl, MVT::i32)); + } + } } // Try to generate COMBINE to build v2i32 vectors. @@ -2974,8 +2891,6 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::BlockAddress: return LowerBlockAddress(Op, DAG); case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::VASTART: return LowerVASTART(Op, DAG); - // Custom lower some vector loads. - case ISD::LOAD: return LowerLOAD(Op, DAG); case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG); case ISD::SETCC: return LowerSETCC(Op, DAG); case ISD::VSELECT: return LowerVSELECT(Op, DAG); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index bfd2c94eeabaa..d66cbc95e9188 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -62,7 +62,8 @@ namespace HexagonISD { EXTRACTU, EXTRACTURP, VCOMBINE, - VPACK, + VPACKE, + VPACKO, TC_RETURN, EH_RETURN, DCFETCH, @@ -164,7 +165,6 @@ namespace HexagonISD { SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const; SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const; bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg, diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td index c611857ec26af..104a28654dd50 100644 --- a/lib/Target/Hexagon/HexagonIntrinsics.td +++ b/lib/Target/Hexagon/HexagonIntrinsics.td @@ -1366,6 +1366,18 @@ defm : MaskedStore <V6_vS32b_nqpred_ai, int_hexagon_V6_vmaskedstorenq>; defm : MaskedStore <V6_vS32b_nt_qpred_ai, int_hexagon_V6_vmaskedstorentq>; defm : MaskedStore <V6_vS32b_nt_nqpred_ai, int_hexagon_V6_vmaskedstorentnq>; +//******************************************************************* +// SYSTEM +//******************************************************************* + +def: T_R_pat<Y2_dccleana, int_hexagon_Y2_dccleana>; +def: T_R_pat<Y2_dccleaninva, int_hexagon_Y2_dccleaninva>; +def: T_R_pat<Y2_dcinva, int_hexagon_Y2_dcinva>; +def: T_R_pat<Y2_dczeroa, int_hexagon_Y2_dczeroa>; + +def: T_RR_pat<Y4_l2fetch, int_hexagon_Y4_l2fetch>; +def: T_RP_pat<Y5_l2fetch, int_hexagon_Y5_l2fetch>; + include "HexagonIntrinsicsV3.td" include "HexagonIntrinsicsV4.td" include "HexagonIntrinsicsV5.td" diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp 
index a331c978f59d2..374ffa3799b03 100644 --- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp +++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp @@ -10,8 +10,6 @@ // load/store instructions. //===----------------------------------------------------------------------===// -#define DEBUG_TYPE "opt-addr-mode" - #include "HexagonInstrInfo.h" #include "HexagonSubtarget.h" #include "MCTargetDesc/HexagonBaseInfo.h" @@ -36,6 +34,8 @@ #include <cassert> #include <cstdint> +#define DEBUG_TYPE "opt-addr-mode" + static cl::opt<int> CodeGrowthLimit("hexagon-amode-growth-limit", cl::Hidden, cl::init(0), cl::desc("Code growth limit for address mode " "optimization")); diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index ba98b8994937f..804a547d5b339 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -2250,6 +2250,12 @@ def: Storea_pat<SwapSt<atomic_store_16>, I32, addrgp, PS_storerhabs>; def: Storea_pat<SwapSt<atomic_store_32>, I32, addrgp, PS_storeriabs>; def: Storea_pat<SwapSt<atomic_store_64>, I64, addrgp, PS_storerdabs>; +// Prefer this pattern to S2_asl_i_p_or for the special case of joining +// two 32-bit words into a 64-bit word. +let AddedComplexity = 200 in +def: Pat<(or (shl (Aext64 I32:$a), (i32 32)), (Zext64 I32:$b)), + (A2_combinew I32:$a, I32:$b)>; + def: Pat<(or (or (or (shl (i64 (zext (and I32:$b, (i32 65535)))), (i32 16)), (i64 (zext (i32 (and I32:$a, (i32 65535)))))), (shl (i64 (anyext (and I32:$c, (i32 65535)))), (i32 32))), @@ -2971,45 +2977,40 @@ def: Pat<(v64i32 (HexagonVCOMBINE (v32i32 VecDblRegs:$Vs), (V6_vcombine_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, Requires<[UseHVXDbl]>; -def SDTHexagonVPACK: SDTypeProfile<1, 3, [SDTCisSameAs<1, 2>, - SDTCisInt<3>]>; - -def HexagonVPACK: SDNode<"HexagonISD::VPACK", SDTHexagonVPACK>; - -// 0 as the last argument denotes vpacke. 
1 denotes vpacko -def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs), - (v64i8 VectorRegs:$Vt), (i32 0))), - (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v64i8 (HexagonVPACK (v64i8 VectorRegs:$Vs), - (v64i8 VectorRegs:$Vt), (i32 1))), - (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs), - (v32i16 VectorRegs:$Vt), (i32 0))), - (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; -def: Pat<(v32i16 (HexagonVPACK (v32i16 VectorRegs:$Vs), - (v32i16 VectorRegs:$Vt), (i32 1))), - (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>, - Requires<[UseHVXSgl]>; - -def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs), - (v128i8 VecDblRegs:$Vt), (i32 0))), - (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v128i8 (HexagonVPACK (v128i8 VecDblRegs:$Vs), - (v128i8 VecDblRegs:$Vt), (i32 1))), - (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs), - (v64i16 VecDblRegs:$Vt), (i32 0))), - (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; -def: Pat<(v64i16 (HexagonVPACK (v64i16 VecDblRegs:$Vs), - (v64i16 VecDblRegs:$Vt), (i32 1))), - (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>, - Requires<[UseHVXDbl]>; +def SDTHexagonVPACK: SDTypeProfile<1, 2, [SDTCisSameAs<1, 2>, SDTCisVec<1>]>; + +def HexagonVPACKE: SDNode<"HexagonISD::VPACKE", SDTHexagonVPACK>; +def HexagonVPACKO: SDNode<"HexagonISD::VPACKO", SDTHexagonVPACK>; + +let Predicates = [UseHVXSgl] in { + def: Pat<(v64i8 (HexagonVPACKE (v64i8 VectorRegs:$Vs), + (v64i8 VectorRegs:$Vt))), + (V6_vpackeb VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v64i8 (HexagonVPACKO (v64i8 VectorRegs:$Vs), + (v64i8 VectorRegs:$Vt))), + (V6_vpackob VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v32i16 (HexagonVPACKE (v32i16 VectorRegs:$Vs), + (v32i16 VectorRegs:$Vt))), + (V6_vpackeh VectorRegs:$Vs, VectorRegs:$Vt)>; + def: Pat<(v32i16 (HexagonVPACKO (v32i16 VectorRegs:$Vs), + (v32i16 VectorRegs:$Vt))), + (V6_vpackoh VectorRegs:$Vs, VectorRegs:$Vt)>; +} + +let Predicates = [UseHVXDbl] in { + def: Pat<(v128i8 (HexagonVPACKE (v128i8 VecDblRegs:$Vs), + (v128i8 VecDblRegs:$Vt))), + (V6_vpackeb_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v128i8 (HexagonVPACKO (v128i8 VecDblRegs:$Vs), + (v128i8 VecDblRegs:$Vt))), + (V6_vpackob_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v64i16 (HexagonVPACKE (v64i16 VecDblRegs:$Vs), + (v64i16 VecDblRegs:$Vt))), + (V6_vpackeh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; + def: Pat<(v64i16 (HexagonVPACKO (v64i16 VecDblRegs:$Vs), + (v64i16 VecDblRegs:$Vt))), + (V6_vpackoh_128B VecDblRegs:$Vs, VecDblRegs:$Vt)>; +} def V2I1: PatLeaf<(v2i1 PredRegs:$R)>; def V4I1: PatLeaf<(v4i1 PredRegs:$R)>; @@ -3073,6 +3074,10 @@ def: Pat<(v4i8 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrb I32:$Rs)>; // four halfwords of 64-bits destination register. 
def: Pat<(v4i16 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrh I32:$Rs)>; +def: Pat<(v2i32 (HexagonVSPLAT s8_0ImmPred:$s8)), + (A2_combineii imm:$s8, imm:$s8)>; +def: Pat<(v2i32 (HexagonVSPLAT I32:$Rs)), (A2_combinew I32:$Rs, I32:$Rs)>; + class VArith_pat <InstHexagon MI, SDNode Op, PatFrag Type> : Pat <(Op Type:$Rss, Type:$Rtt), @@ -3099,14 +3104,11 @@ def: VArith_pat <A2_xorp, xor, V8I8>; def: VArith_pat <A2_xorp, xor, V4I16>; def: VArith_pat <A2_xorp, xor, V2I32>; -def: Pat<(v2i32 (sra V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (sra V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_asr_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v2i32 (srl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (srl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_lsr_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), - (i32 u5_0ImmPred:$c))))), +def: Pat<(v2i32 (shl V2I32:$b, (v2i32 (HexagonVSPLAT u5_0ImmPred:$c)))), (S2_asl_i_vw V2I32:$b, imm:$c)>; def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))), diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp index 34df2ebcc5206..ea86c9c42f478 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.cpp +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.cpp @@ -53,6 +53,10 @@ static cl::opt<bool> EmitJtInText("hexagon-emit-jt-text", cl::Hidden, cl::init(false), cl::desc("Emit hexagon jump tables in function section")); +static cl::opt<bool> + EmitLutInText("hexagon-emit-lut-text", cl::Hidden, cl::init(false), + cl::desc("Emit hexagon lookup tables in function section")); + // TraceGVPlacement controls messages for all builds. For builds with assertions // (debug or release), messages are also controlled by the usual debug flags // (e.g. -debug and -debug-only=globallayout) @@ -136,6 +140,13 @@ MCSection *HexagonTargetObjectFile::SelectSectionForGlobal( << (Kind.isBSS() ? "kind_bss " : "" ) << (Kind.isBSSLocal() ? "kind_bss_local " : "" )); + // If the lookup table is used by more than one function, do not place + // it in text section. + if (EmitLutInText && GO->getName().startswith("switch.table")) { + if (const Function *Fn = getLutUsedFunction(GO)) + return selectSectionForLookupTable(GO, TM, Fn); + } + if (isGlobalInSmallSection(GO, TM)) return selectSmallSectionForGlobal(GO, Kind, TM); @@ -402,3 +413,39 @@ MCSection *HexagonTargetObjectFile::selectSmallSectionForGlobal( // Otherwise, we work the same as ELF. return TargetLoweringObjectFileELF::SelectSectionForGlobal(GO, Kind, TM); } + +// Return the function that uses the lookup table. If there are more +// than one live function that uses this look table, bail out and place +// the lookup table in default section. +const Function * +HexagonTargetObjectFile::getLutUsedFunction(const GlobalObject *GO) const { + const Function *ReturnFn = nullptr; + for (auto U : GO->users()) { + // validate each instance of user to be a live function. 
+ auto *I = dyn_cast<Instruction>(U); + if (!I) + continue; + auto *Bb = I->getParent(); + if (!Bb) + continue; + auto *UserFn = Bb->getParent(); + if (!ReturnFn) + ReturnFn = UserFn; + else if (ReturnFn != UserFn) + return nullptr; + } + return ReturnFn; +} + +MCSection *HexagonTargetObjectFile::selectSectionForLookupTable( + const GlobalObject *GO, const TargetMachine &TM, const Function *Fn) const { + + SectionKind Kind = SectionKind::getText(); + // If the function has explicit section, place the lookup table in this + // explicit section. + if (Fn->hasSection()) + return getExplicitSectionGlobal(Fn, Kind, TM); + + const auto *FuncObj = dyn_cast<GlobalObject>(Fn); + return SelectSectionForGlobal(FuncObj, Kind, TM); +} diff --git a/lib/Target/Hexagon/HexagonTargetObjectFile.h b/lib/Target/Hexagon/HexagonTargetObjectFile.h index 373d850b53be7..eff44f097e03f 100644 --- a/lib/Target/Hexagon/HexagonTargetObjectFile.h +++ b/lib/Target/Hexagon/HexagonTargetObjectFile.h @@ -36,6 +36,8 @@ namespace llvm { bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference, const Function &F) const override; + const Function *getLutUsedFunction(const GlobalObject *GO) const; + private: MCSectionELF *SmallDataSection; MCSectionELF *SmallBSSSection; @@ -46,6 +48,10 @@ namespace llvm { MCSection *selectSmallSectionForGlobal(const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const; + + MCSection *selectSectionForLookupTable(const GlobalObject *GO, + const TargetMachine &TM, + const Function *Fn) const; }; } // namespace llvm diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index b72c9d5344787..e12188e706025 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -304,9 +304,6 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); - bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI); - bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); @@ -2514,16 +2511,6 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SEQIMacro: return expandSeqI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; - case Mips::MFTC0: case Mips::MTTC0: - case Mips::MFTGPR: case Mips::MTTGPR: - case Mips::MFTLO: case Mips::MTTLO: - case Mips::MFTHI: case Mips::MTTHI: - case Mips::MFTACX: case Mips::MTTACX: - case Mips::MFTDSP: case Mips::MTTDSP: - case Mips::MFTC1: case Mips::MTTC1: - case Mips::MFTHC1: case Mips::MTTHC1: - case Mips::CFTC1: case Mips::CTTC1: - return expandMXTRAlias(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; } } @@ -4895,212 +4882,6 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return false; } -// Map the DSP accumulator and control register to the corresponding gpr -// operand. Unlike the other alias, the m(f|t)t(lo|hi|acx) instructions -// do not map the DSP registers contigously to gpr registers. -static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) { - switch (Inst.getOpcode()) { - case Mips::MFTLO: - case Mips::MTTLO: - switch (Inst.getOperand(IsMFDSP ? 
1 : 0).getReg()) { - case Mips::AC0: - return Mips::ZERO; - case Mips::AC1: - return Mips::A0; - case Mips::AC2: - return Mips::T0; - case Mips::AC3: - return Mips::T4; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTHI: - case Mips::MTTHI: - switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { - case Mips::AC0: - return Mips::AT; - case Mips::AC1: - return Mips::A1; - case Mips::AC2: - return Mips::T1; - case Mips::AC3: - return Mips::T5; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTACX: - case Mips::MTTACX: - switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { - case Mips::AC0: - return Mips::V0; - case Mips::AC1: - return Mips::A2; - case Mips::AC2: - return Mips::T2; - case Mips::AC3: - return Mips::T6; - default: - llvm_unreachable("Unknown register for 'mttr' alias!"); - } - case Mips::MFTDSP: - case Mips::MTTDSP: - return Mips::S0; - default: - llvm_unreachable("Unknown instruction for 'mttr' dsp alias!"); - } -} - -// Map the floating point register operand to the corresponding register -// operand. -static unsigned getRegisterForMxtrFP(MCInst &Inst, bool IsMFTC1) { - switch (Inst.getOperand(IsMFTC1 ? 1 : 0).getReg()) { - case Mips::F0: return Mips::ZERO; - case Mips::F1: return Mips::AT; - case Mips::F2: return Mips::V0; - case Mips::F3: return Mips::V1; - case Mips::F4: return Mips::A0; - case Mips::F5: return Mips::A1; - case Mips::F6: return Mips::A2; - case Mips::F7: return Mips::A3; - case Mips::F8: return Mips::T0; - case Mips::F9: return Mips::T1; - case Mips::F10: return Mips::T2; - case Mips::F11: return Mips::T3; - case Mips::F12: return Mips::T4; - case Mips::F13: return Mips::T5; - case Mips::F14: return Mips::T6; - case Mips::F15: return Mips::T7; - case Mips::F16: return Mips::S0; - case Mips::F17: return Mips::S1; - case Mips::F18: return Mips::S2; - case Mips::F19: return Mips::S3; - case Mips::F20: return Mips::S4; - case Mips::F21: return Mips::S5; - case Mips::F22: return Mips::S6; - case Mips::F23: return Mips::S7; - case Mips::F24: return Mips::T8; - case Mips::F25: return Mips::T9; - case Mips::F26: return Mips::K0; - case Mips::F27: return Mips::K1; - case Mips::F28: return Mips::GP; - case Mips::F29: return Mips::SP; - case Mips::F30: return Mips::FP; - case Mips::F31: return Mips::RA; - default: llvm_unreachable("Unknown register for mttc1 alias!"); - } -} - -// Map the coprocessor operand the corresponding gpr register operand. -static unsigned getRegisterForMxtrC0(MCInst &Inst, bool IsMFTC0) { - switch (Inst.getOperand(IsMFTC0 ? 
1 : 0).getReg()) { - case Mips::COP00: return Mips::ZERO; - case Mips::COP01: return Mips::AT; - case Mips::COP02: return Mips::V0; - case Mips::COP03: return Mips::V1; - case Mips::COP04: return Mips::A0; - case Mips::COP05: return Mips::A1; - case Mips::COP06: return Mips::A2; - case Mips::COP07: return Mips::A3; - case Mips::COP08: return Mips::T0; - case Mips::COP09: return Mips::T1; - case Mips::COP010: return Mips::T2; - case Mips::COP011: return Mips::T3; - case Mips::COP012: return Mips::T4; - case Mips::COP013: return Mips::T5; - case Mips::COP014: return Mips::T6; - case Mips::COP015: return Mips::T7; - case Mips::COP016: return Mips::S0; - case Mips::COP017: return Mips::S1; - case Mips::COP018: return Mips::S2; - case Mips::COP019: return Mips::S3; - case Mips::COP020: return Mips::S4; - case Mips::COP021: return Mips::S5; - case Mips::COP022: return Mips::S6; - case Mips::COP023: return Mips::S7; - case Mips::COP024: return Mips::T8; - case Mips::COP025: return Mips::T9; - case Mips::COP026: return Mips::K0; - case Mips::COP027: return Mips::K1; - case Mips::COP028: return Mips::GP; - case Mips::COP029: return Mips::SP; - case Mips::COP030: return Mips::FP; - case Mips::COP031: return Mips::RA; - default: llvm_unreachable("Unknown register for mttc0 alias!"); - } -} - -/// Expand an alias of 'mftr' or 'mttr' into the full instruction, by producing -/// an mftr or mttr with the correctly mapped gpr register, u, sel and h bits. -bool MipsAsmParser::expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, - const MCSubtargetInfo *STI) { - MipsTargetStreamer &TOut = getTargetStreamer(); - unsigned rd = 0; - unsigned u = 1; - unsigned sel = 0; - unsigned h = 0; - bool IsMFTR = false; - switch (Inst.getOpcode()) { - case Mips::MFTC0: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTC0: - u = 0; - rd = getRegisterForMxtrC0(Inst, IsMFTR); - sel = Inst.getOperand(2).getImm(); - break; - case Mips::MFTGPR: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTGPR: - rd = Inst.getOperand(IsMFTR ? 1 : 0).getReg(); - break; - case Mips::MFTLO: - case Mips::MFTHI: - case Mips::MFTACX: - case Mips::MFTDSP: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::MTTLO: - case Mips::MTTHI: - case Mips::MTTACX: - case Mips::MTTDSP: - rd = getRegisterForMxtrDSP(Inst, IsMFTR); - sel = 1; - break; - case Mips::MFTHC1: - h = 1; - LLVM_FALLTHROUGH; - case Mips::MFTC1: - IsMFTR = true; - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 2; - break; - case Mips::MTTHC1: - h = 1; - LLVM_FALLTHROUGH; - case Mips::MTTC1: - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 2; - break; - case Mips::CFTC1: - IsMFTR = true; - LLVM_FALLTHROUGH; - case Mips::CTTC1: - rd = getRegisterForMxtrFP(Inst, IsMFTR); - sel = 3; - break; - } - unsigned Op0 = IsMFTR ? Inst.getOperand(0).getReg() : rd; - unsigned Op1 = - IsMFTR ? rd - : (Inst.getOpcode() != Mips::MTTDSP ? Inst.getOperand(1).getReg() - : Inst.getOperand(0).getReg()); - - TOut.emitRRIII(IsMFTR ? 
Mips::MFTR : Mips::MTTR, Op0, Op1, u, sel, h, IDLoc, - STI); - return false; -} - unsigned MipsAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst, const OperandVector &Operands) { diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 7caeb08589af6..2907b77158575 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -193,21 +193,6 @@ void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, STI); } -void MipsTargetStreamer::emitRRIII(unsigned Opcode, unsigned Reg0, - unsigned Reg1, int16_t Imm0, int16_t Imm1, - int16_t Imm2, SMLoc IDLoc, - const MCSubtargetInfo *STI) { - MCInst TmpInst; - TmpInst.setOpcode(Opcode); - TmpInst.addOperand(MCOperand::createReg(Reg0)); - TmpInst.addOperand(MCOperand::createReg(Reg1)); - TmpInst.addOperand(MCOperand::createImm(Imm0)); - TmpInst.addOperand(MCOperand::createImm(Imm1)); - TmpInst.addOperand(MCOperand::createImm(Imm2)); - TmpInst.setLoc(IDLoc); - getStreamer().EmitInstruction(TmpInst, *STI); -} - void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI) { diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index d2f0fdcc6cc11..6ceb055775387 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -190,6 +190,9 @@ def FeatureMadd4 : SubtargetFeature<"nomadd4", "DisableMadd4", "true", def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">; +def FeatureLongCalls : SubtargetFeature<"long-calls", "UseLongCalls", "true", + "Disable use of the jal instruction">; + //===----------------------------------------------------------------------===// // Mips processors supported. //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index a6ec9fb2e5987..20319f85696cd 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -364,6 +364,18 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::UDIV, MVT::i64, Expand); setOperationAction(ISD::UREM, MVT::i64, Expand); + if (!(Subtarget.hasDSP() && Subtarget.hasMips32r2())) { + setOperationAction(ISD::ADDC, MVT::i32, Expand); + setOperationAction(ISD::ADDE, MVT::i32, Expand); + } + + setOperationAction(ISD::ADDC, MVT::i64, Expand); + setOperationAction(ISD::ADDE, MVT::i64, Expand); + setOperationAction(ISD::SUBC, MVT::i32, Expand); + setOperationAction(ISD::SUBE, MVT::i32, Expand); + setOperationAction(ISD::SUBC, MVT::i64, Expand); + setOperationAction(ISD::SUBE, MVT::i64, Expand); + // Operations not directly supported by Mips. setOperationAction(ISD::BR_CC, MVT::f32, Expand); setOperationAction(ISD::BR_CC, MVT::f64, Expand); @@ -469,6 +481,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM, setTargetDAGCombine(ISD::AND); setTargetDAGCombine(ISD::OR); setTargetDAGCombine(ISD::ADD); + setTargetDAGCombine(ISD::SUB); setTargetDAGCombine(ISD::AssertZext); setTargetDAGCombine(ISD::SHL); @@ -923,14 +936,127 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, } } +static SDValue performMADD_MSUBCombine(SDNode *ROOTNode, SelectionDAG &CurDAG, + const MipsSubtarget &Subtarget) { + // ROOTNode must have a multiplication as an operand for the match to be + // successful. 
+ if (ROOTNode->getOperand(0).getOpcode() != ISD::MUL &&
+ ROOTNode->getOperand(1).getOpcode() != ISD::MUL)
+ return SDValue();
+
+ // We don't handle vector types here.
+ if (ROOTNode->getValueType(0).isVector())
+ return SDValue();
+
+ // For MIPS64, madd / msub instructions are inefficient to use with 64 bit
+ // arithmetic. E.g.
+ // (add (mul a b) c) =>
+ // let res = (madd (mthi (drotr c 32))x(mtlo c) a b) in
+ // MIPS64: (or (dsll (mfhi res) 32) (dsrl (dsll (mflo res) 32) 32)
+ // or
+ // MIPS64R2: (dins (mflo res) (mfhi res) 32 32)
+ //
+ // The overhead of setting up the Hi/Lo registers and reassembling the
+ // result makes this a dubious optimization for MIPS64. The core of the
+ // problem is that Hi/Lo contain the upper and lower 32 bits of the
+ // operand and result.
+ //
+ // It requires a chain of 4 add/mul for MIPS64R2 to get better code
+ // density than doing it naively, 5 for MIPS64. Additionally, using
+ // madd/msub on MIPS64 requires the operands actually be 32 bit sign
+ // extended operands, not true 64 bit values.
+ //
+ // FIXME: For the moment, disable this completely for MIPS64.
+ if (Subtarget.hasMips64())
+ return SDValue();
+
+ SDValue Mult = ROOTNode->getOperand(0).getOpcode() == ISD::MUL
+ ? ROOTNode->getOperand(0)
+ : ROOTNode->getOperand(1);
+
+ SDValue AddOperand = ROOTNode->getOperand(0).getOpcode() == ISD::MUL
+ ? ROOTNode->getOperand(1)
+ : ROOTNode->getOperand(0);
+
+ // Transform this to a MADD only if the user of this node is the add.
+ // If there are other users of the mul, this function returns here.
+ if (!Mult.hasOneUse())
+ return SDValue();
+
+ // maddu and madd are unusual instructions in that on MIPS64 bits 63..31
+ // must be in canonical form, i.e. sign extended. For MIPS32, the operands
+ // of the multiply must have 32 or more sign bits, otherwise we cannot
+ // perform this optimization. We have to check this here as we're performing
+ // this optimization pre-legalization.
+ SDValue MultLHS = Mult->getOperand(0);
+ SDValue MultRHS = Mult->getOperand(1);
+
+ bool IsSigned = MultLHS->getOpcode() == ISD::SIGN_EXTEND &&
+ MultRHS->getOpcode() == ISD::SIGN_EXTEND;
+ bool IsUnsigned = MultLHS->getOpcode() == ISD::ZERO_EXTEND &&
+ MultRHS->getOpcode() == ISD::ZERO_EXTEND;
+
+ if (!IsSigned && !IsUnsigned)
+ return SDValue();
+
+ // Initialize accumulator.
+ SDLoc DL(ROOTNode);
+ SDValue TopHalf;
+ SDValue BottomHalf;
+ BottomHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
+ CurDAG.getIntPtrConstant(0, DL));
+
+ TopHalf = CurDAG.getNode(ISD::EXTRACT_ELEMENT, DL, MVT::i32, AddOperand,
+ CurDAG.getIntPtrConstant(1, DL));
+ SDValue ACCIn = CurDAG.getNode(MipsISD::MTLOHI, DL, MVT::Untyped,
+ BottomHalf,
+ TopHalf);
+
+ // Create MipsMAdd(u) / MipsMSub(u) node.
+ bool IsAdd = ROOTNode->getOpcode() == ISD::ADD;
+ unsigned Opcode = IsAdd ? (IsUnsigned ? MipsISD::MAddu : MipsISD::MAdd)
+ : (IsUnsigned ?
MipsISD::MSubu : MipsISD::MSub); + SDValue MAddOps[3] = { + CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(0)), + CurDAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Mult->getOperand(1)), ACCIn}; + EVT VTs[2] = {MVT::i32, MVT::i32}; + SDValue MAdd = CurDAG.getNode(Opcode, DL, VTs, MAddOps); + + SDValue ResLo = CurDAG.getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); + SDValue ResHi = CurDAG.getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); + SDValue Combined = + CurDAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, ResLo, ResHi); + return Combined; +} + +static SDValue performSUBCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const MipsSubtarget &Subtarget) { + // (sub v0 (mul v1, v2)) => (msub v1, v2, v0) + if (DCI.isBeforeLegalizeOps()) { + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) + return performMADD_MSUBCombine(N, DAG, Subtarget); + + return SDValue(); + } + + return SDValue(); +} + static SDValue performADDCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const MipsSubtarget &Subtarget) { - // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) + // (add v0 (mul v1, v2)) => (madd v1, v2, v0) + if (DCI.isBeforeLegalizeOps()) { + if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && + !Subtarget.inMips16Mode() && N->getValueType(0) == MVT::i64) + return performMADD_MSUBCombine(N, DAG, Subtarget); - if (DCI.isBeforeLegalizeOps()) return SDValue(); + } + // (add v0, (add v1, abs_lo(tjt))) => (add (add v0, v1), abs_lo(tjt)) SDValue Add = N->getOperand(1); if (Add.getOpcode() != ISD::ADD) @@ -1058,6 +1184,8 @@ SDValue MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) return performAssertZextCombine(N, DAG, DCI, Subtarget); case ISD::SHL: return performSHLCombine(N, DAG, DCI, Subtarget); + case ISD::SUB: + return performSUBCombine(N, DAG, DCI, Subtarget); } return SDValue(); @@ -3021,6 +3149,20 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, EVT Ty = Callee.getValueType(); bool GlobalOrExternal = false, IsCallReloc = false; + // The long-calls feature is ignored in case of PIC. + // While we do not support -mshared / -mno-shared properly, + // ignore long-calls in case of -mabicalls too. + if (Subtarget.useLongCalls() && !Subtarget.isABICalls() && !IsPIC) { + // Get the address of the callee into a register to prevent + // using of the `jal` instruction for the direct call. + if (auto *N = dyn_cast<GlobalAddressSDNode>(Callee)) + Callee = Subtarget.hasSym32() ? getAddrNonPIC(N, SDLoc(N), Ty, DAG) + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + else if (auto *N = dyn_cast<ExternalSymbolSDNode>(Callee)) + Callee = Subtarget.hasSym32() ? 
getAddrNonPIC(N, SDLoc(N), Ty, DAG) + : getAddrNonPICSym64(N, SDLoc(N), Ty, DAG); + } + if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) { if (IsPIC) { const GlobalValue *Val = G->getGlobal(); diff --git a/lib/Target/Mips/MipsInstrFPU.td b/lib/Target/Mips/MipsInstrFPU.td index 94f3a74be98bc..0333fe6520fab 100644 --- a/lib/Target/Mips/MipsInstrFPU.td +++ b/lib/Target/Mips/MipsInstrFPU.td @@ -443,8 +443,17 @@ let AdditionalPredicates = [NotInMicroMips] in { } def MFC1 : MMRel, MFC1_FT<"mfc1", GPR32Opnd, FGR32Opnd, II_MFC1, bitconvert>, MFC1_FM<0>; +def MFC1_D64 : MFC1_FT<"mfc1", GPR32Opnd, FGR64Opnd, II_MFC1>, MFC1_FM<0>, + FGR_64 { + let DecoderNamespace = "Mips64"; +} def MTC1 : MMRel, MTC1_FT<"mtc1", FGR32Opnd, GPR32Opnd, II_MTC1, bitconvert>, MFC1_FM<4>; +def MTC1_D64 : MTC1_FT<"mtc1", FGR64Opnd, GPR32Opnd, II_MTC1>, MFC1_FM<4>, + FGR_64 { + let DecoderNamespace = "Mips64"; +} + let AdditionalPredicates = [NotInMicroMips] in { def MFHC1_D32 : MMRel, MFC1_FT<"mfhc1", GPR32Opnd, AFGR64Opnd, II_MFHC1>, MFC1_FM<3>, ISA_MIPS32R2, FGR_32; diff --git a/lib/Target/Mips/MipsMTInstrFormats.td b/lib/Target/Mips/MipsMTInstrFormats.td index edc0981e6278a..64bee5bfba18e 100644 --- a/lib/Target/Mips/MipsMTInstrFormats.td +++ b/lib/Target/Mips/MipsMTInstrFormats.td @@ -35,8 +35,6 @@ class FIELD5<bits<5> Val> { def FIELD5_1_DMT_EMT : FIELD5<0b00001>; def FIELD5_2_DMT_EMT : FIELD5<0b01111>; def FIELD5_1_2_DVPE_EVPE : FIELD5<0b00000>; -def FIELD5_MFTR : FIELD5<0b01000>; -def FIELD5_MTTR : FIELD5<0b01100>; class COP0_MFMC0_MT<FIELD5 Op1, FIELD5 Op2, OPCODE1 sc> : MipsMTInst { bits<32> Inst; @@ -52,25 +50,6 @@ class COP0_MFMC0_MT<FIELD5 Op1, FIELD5 Op2, OPCODE1 sc> : MipsMTInst { let Inst{2-0} = 0b001; } -class COP0_MFTTR_MT<FIELD5 Op> : MipsMTInst { - bits<32> Inst; - - bits<5> rt; - bits<5> rd; - bits<1> u; - bits<1> h; - bits<3> sel; - let Inst{31-26} = 0b010000; // COP0 - let Inst{25-21} = Op.Value; // MFMC0 - let Inst{20-16} = rt; - let Inst{15-11} = rd; - let Inst{10-6} = 0b00000; // rx - currently unsupported. - let Inst{5} = u; - let Inst{4} = h; - let Inst{3} = 0b0; - let Inst{2-0} = sel; -} - class SPECIAL3_MT_FORK : MipsMTInst { bits<32> Inst; diff --git a/lib/Target/Mips/MipsMTInstrInfo.td b/lib/Target/Mips/MipsMTInstrInfo.td index 72e626cbec40a..ab6693f60fd94 100644 --- a/lib/Target/Mips/MipsMTInstrInfo.td +++ b/lib/Target/Mips/MipsMTInstrInfo.td @@ -6,13 +6,6 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This file describes the MIPS MT ASE as defined by MD00378 1.12. -// -// TODO: Add support for the microMIPS encodings for the MT ASE and add the -// instruction mappings. 
-// -//===----------------------------------------------------------------------===// //===----------------------------------------------------------------------===// // MIPS MT Instruction Encodings @@ -34,10 +27,6 @@ class FORK_ENC : SPECIAL3_MT_FORK; class YIELD_ENC : SPECIAL3_MT_YIELD; -class MFTR_ENC : COP0_MFTTR_MT<FIELD5_MFTR>; - -class MTTR_ENC : COP0_MFTTR_MT<FIELD5_MTTR>; - //===----------------------------------------------------------------------===// // MIPS MT Instruction Descriptions //===----------------------------------------------------------------------===// @@ -50,22 +39,6 @@ class MT_1R_DESC_BASE<string instr_asm, InstrItinClass Itin = NoItinerary> { InstrItinClass Itinerary = Itin; } -class MFTR_DESC { - dag OutOperandList = (outs GPR32Opnd:$rd); - dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); - string AsmString = "mftr\t$rd, $rt, $u, $sel, $h"; - list<dag> Pattern = []; - InstrItinClass Itinerary = II_MFTR; -} - -class MTTR_DESC { - dag OutOperandList = (outs GPR32Opnd:$rd); - dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); - string AsmString = "mttr\t$rt, $rd, $u, $sel, $h"; - list<dag> Pattern = []; - InstrItinClass Itinerary = II_MTTR; -} - class FORK_DESC { dag OutOperandList = (outs GPR32Opnd:$rs, GPR32Opnd:$rd); dag InOperandList = (ins GPR32Opnd:$rt); @@ -106,74 +79,9 @@ let hasSideEffects = 1, isNotDuplicable = 1, def FORK : FORK_ENC, FORK_DESC, ASE_MT; def YIELD : YIELD_ENC, YIELD_DESC, ASE_MT; - - def MFTR : MFTR_ENC, MFTR_DESC, ASE_MT; - - def MTTR : MTTR_ENC, MTTR_DESC, ASE_MT; } //===----------------------------------------------------------------------===// -// MIPS MT Pseudo Instructions - used to support mtfr & mttr aliases. -//===----------------------------------------------------------------------===// -def MFTC0 : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins COP0Opnd:$rt, - uimm3:$sel), - "mftc0 $rd, $rt, $sel">, ASE_MT; - -def MFTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rt, - uimm3:$sel), - "mftgpr $rd, $rt">, ASE_MT; - -def MFTLO : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mftlo $rt, $ac">, ASE_MT; - -def MFTHI : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mfthi $rt, $ac">, ASE_MT; - -def MFTACX : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), - "mftacx $rt, $ac">, ASE_MT; - -def MFTDSP : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins), - "mftdsp $rt">, ASE_MT; - -def MFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), - "mftc1 $rt, $ft">, ASE_MT; - -def MFTHC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), - "mfthc1 $rt, $ft">, ASE_MT; - -def CFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGRCCOpnd:$ft), - "cftc1 $rt, $ft">, ASE_MT; - - -def MTTC0 : MipsAsmPseudoInst<(outs COP0Opnd:$rd), (ins GPR32Opnd:$rt, - uimm3:$sel), - "mttc0 $rt, $rd, $sel">, ASE_MT; - -def MTTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins GPR32Opnd:$rd), - "mttgpr $rd, $rt">, ASE_MT; - -def MTTLO : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mttlo $rt, $ac">, ASE_MT; - -def MTTHI : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mtthi $rt, $ac">, ASE_MT; - -def MTTACX : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), - "mttacx $rt, $ac">, ASE_MT; - -def MTTDSP : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rt), - "mttdsp $rt">, ASE_MT; - -def MTTC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), - "mttc1 $rt, $ft">, ASE_MT; - 
-def MTTHC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), - "mtthc1 $rt, $ft">, ASE_MT; - -def CTTC1 : MipsAsmPseudoInst<(outs FGRCCOpnd:$ft), (ins GPR32Opnd:$rt), - "cttc1 $rt, $ft">, ASE_MT; - -//===----------------------------------------------------------------------===// // MIPS MT Instruction Definitions //===----------------------------------------------------------------------===// @@ -187,22 +95,4 @@ let AdditionalPredicates = [NotInMicroMips] in { def : MipsInstAlias<"evpe", (EVPE ZERO), 1>, ASE_MT; def : MipsInstAlias<"yield $rs", (YIELD ZERO, GPR32Opnd:$rs), 1>, ASE_MT; - - def : MipsInstAlias<"mftc0 $rd, $rt", (MFTC0 GPR32Opnd:$rd, COP0Opnd:$rt, 0), - 1>, ASE_MT; - - def : MipsInstAlias<"mftlo $rt", (MFTLO GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mfthi $rt", (MFTHI GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mftacx $rt", (MFTACX GPR32Opnd:$rt, AC0), 1>, ASE_MT; - - def : MipsInstAlias<"mttc0 $rd, $rt", (MTTC0 COP0Opnd:$rt, GPR32Opnd:$rd, 0), - 1>, ASE_MT; - - def : MipsInstAlias<"mttlo $rt", (MTTLO AC0, GPR32Opnd:$rt), 1>, ASE_MT; - - def : MipsInstAlias<"mtthi $rt", (MTTHI AC0, GPR32Opnd:$rt), 1>, ASE_MT; - - def : MipsInstAlias<"mttacx $rt", (MTTACX AC0, GPR32Opnd:$rt), 1>, ASE_MT; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp index 49ae6dd4cd399..4be26dd25dc04 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp @@ -245,46 +245,64 @@ void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) { } } -void MipsSEDAGToDAGISel::selectAddESubE(unsigned MOp, SDValue InFlag, - SDValue CmpLHS, const SDLoc &DL, - SDNode *Node) const { - unsigned Opc = InFlag.getOpcode(); (void)Opc; - - assert(((Opc == ISD::ADDC || Opc == ISD::ADDE) || - (Opc == ISD::SUBC || Opc == ISD::SUBE)) && - "(ADD|SUB)E flag operand must come from (ADD|SUB)C/E insn"); - - unsigned SLTuOp = Mips::SLTu, ADDuOp = Mips::ADDu; - if (Subtarget->isGP64bit()) { - SLTuOp = Mips::SLTu64; - ADDuOp = Mips::DADDu; - } - - SDValue Ops[] = { CmpLHS, InFlag.getOperand(1) }; +void MipsSEDAGToDAGISel::selectAddE(SDNode *Node, const SDLoc &DL) const { + SDValue InFlag = Node->getOperand(2); + unsigned Opc = InFlag.getOpcode(); SDValue LHS = Node->getOperand(0), RHS = Node->getOperand(1); EVT VT = LHS.getValueType(); - SDNode *Carry = CurDAG->getMachineNode(SLTuOp, DL, VT, Ops); - - if (Subtarget->isGP64bit()) { - // On 64-bit targets, sltu produces an i64 but our backend currently says - // that SLTu64 produces an i32. We need to fix this in the long run but for - // now, just make the DAG type-correct by asserting the upper bits are zero. - Carry = CurDAG->getMachineNode(Mips::SUBREG_TO_REG, DL, VT, - CurDAG->getTargetConstant(0, DL, VT), - SDValue(Carry, 0), - CurDAG->getTargetConstant(Mips::sub_32, DL, - VT)); + // In the base case, we can rely on the carry bit from the addsc + // instruction. + if (Opc == ISD::ADDC) { + SDValue Ops[3] = {LHS, RHS, InFlag}; + CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Ops); + return; } - // Generate a second addition only if we know that RHS is not a - // constant-zero node. 
- SDNode *AddCarry = Carry;
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS);
- if (!C || C->getZExtValue())
- AddCarry = CurDAG->getMachineNode(ADDuOp, DL, VT, SDValue(Carry, 0), RHS);
+ assert(Opc == ISD::ADDE && "ISD::ADDE not in a chain of ADDE nodes!");
+
+ // The more complex case is when there is a chain of ISD::ADDE nodes like:
+ // (adde (adde (adde (addc a b) c) d) e).
+ //
+ // The addwc instruction does not write to the carry bit; instead it writes
+ // to bit 20 of the dsp control register. To match this series of nodes, each
+ // intermediate adde node must be expanded to write the carry bit before the
+ // addition.
+
+ // Start by reading the overflow field for addsc and moving the value to the
+ // carry field. The usage of 1 here with MipsISD::RDDSP / Mips::WRDSP
+ // corresponds to reading/writing the entire control register to/from a GPR.
+
+ SDValue CstOne = CurDAG->getTargetConstant(1, DL, MVT::i32);
+
+ SDValue OuFlag = CurDAG->getTargetConstant(20, DL, MVT::i32);
+
+ SDNode *DSPCtrlField =
+ CurDAG->getMachineNode(Mips::RDDSP, DL, MVT::i32, MVT::Glue, CstOne, InFlag);
+
+ SDNode *Carry = CurDAG->getMachineNode(
+ Mips::EXT, DL, MVT::i32, SDValue(DSPCtrlField, 0), OuFlag, CstOne);
- CurDAG->SelectNodeTo(Node, MOp, VT, MVT::Glue, LHS, SDValue(AddCarry, 0));
+ SDValue Ops[4] = {SDValue(DSPCtrlField, 0),
+ CurDAG->getTargetConstant(6, DL, MVT::i32), CstOne,
+ SDValue(Carry, 0)};
+ SDNode *DSPCFWithCarry = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, Ops);
+
+ // My reading of the MIPS DSP 3.01 specification isn't as clear as I
+ // would like about whether bit 20 always gets overwritten by addwc.
+ // Hence take an extremely conservative view and presume it's sticky. We
+ // therefore need to clear it.
+
+ SDValue Zero = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+ SDValue InsOps[4] = {Zero, OuFlag, CstOne, SDValue(DSPCFWithCarry, 0)};
+ SDNode *DSPCtrlFinal = CurDAG->getMachineNode(Mips::INS, DL, MVT::i32, InsOps);
+
+ SDNode *WrDSP = CurDAG->getMachineNode(Mips::WRDSP, DL, MVT::Glue,
+ SDValue(DSPCtrlFinal, 0), CstOne);
+
+ SDValue Operands[3] = {LHS, RHS, SDValue(WrDSP, 0)};
+ CurDAG->SelectNodeTo(Node, Mips::ADDWC, VT, MVT::Glue, Operands);
}
/// Match frameindex
@@ -765,19 +783,8 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
switch(Opcode) {
default: break;
- case ISD::SUBE: {
- SDValue InFlag = Node->getOperand(2);
- unsigned Opc = Subtarget->isGP64bit() ? Mips::DSUBu : Mips::SUBu;
- selectAddESubE(Opc, InFlag, InFlag.getOperand(0), DL, Node);
- return true;
- }
- case ISD::ADDE: {
- if (Subtarget->hasDSP()) // Select DSP instructions, ADDSC and ADDWC.
- break;
- SDValue InFlag = Node->getOperand(2);
- unsigned Opc = Subtarget->isGP64bit() ?
Mips::DADDu : Mips::ADDu; - selectAddESubE(Opc, InFlag, InFlag.getValue(0), DL, Node); + selectAddE(Node, DL); return true; } diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.h b/lib/Target/Mips/MipsSEISelDAGToDAG.h index f89a350cab044..6f38289c5a457 100644 --- a/lib/Target/Mips/MipsSEISelDAGToDAG.h +++ b/lib/Target/Mips/MipsSEISelDAGToDAG.h @@ -41,8 +41,7 @@ private: const SDLoc &dl, EVT Ty, bool HasLo, bool HasHi); - void selectAddESubE(unsigned MOp, SDValue InFlag, SDValue CmpLHS, - const SDLoc &DL, SDNode *Node) const; + void selectAddE(SDNode *Node, const SDLoc &DL) const; bool selectAddrFrameIndex(SDValue Addr, SDValue &Base, SDValue &Offset) const; bool selectAddrFrameIndexOffset(SDValue Addr, SDValue &Base, SDValue &Offset, diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp index 06a97b9d123ec..72b2738bfac44 100644 --- a/lib/Target/Mips/MipsSEISelLowering.cpp +++ b/lib/Target/Mips/MipsSEISelLowering.cpp @@ -179,8 +179,6 @@ MipsSETargetLowering::MipsSETargetLowering(const MipsTargetMachine &TM, setOperationAction(ISD::LOAD, MVT::i32, Custom); setOperationAction(ISD::STORE, MVT::i32, Custom); - setTargetDAGCombine(ISD::ADDE); - setTargetDAGCombine(ISD::SUBE); setTargetDAGCombine(ISD::MUL); setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom); @@ -421,163 +419,6 @@ SDValue MipsSETargetLowering::LowerOperation(SDValue Op, return MipsTargetLowering::LowerOperation(Op, DAG); } -// selectMADD - -// Transforms a subgraph in CurDAG if the following pattern is found: -// (addc multLo, Lo0), (adde multHi, Hi0), -// where, -// multHi/Lo: product of multiplication -// Lo0: initial value of Lo register -// Hi0: initial value of Hi register -// Return true if pattern matching was successful. -static bool selectMADD(SDNode *ADDENode, SelectionDAG *CurDAG) { - // ADDENode's second operand must be a flag output of an ADDC node in order - // for the matching to be successful. - SDNode *ADDCNode = ADDENode->getOperand(2).getNode(); - - if (ADDCNode->getOpcode() != ISD::ADDC) - return false; - - SDValue MultHi = ADDENode->getOperand(0); - SDValue MultLo = ADDCNode->getOperand(0); - SDNode *MultNode = MultHi.getNode(); - unsigned MultOpc = MultHi.getOpcode(); - - // MultHi and MultLo must be generated by the same node, - if (MultLo.getNode() != MultNode) - return false; - - // and it must be a multiplication. - if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) - return false; - - // MultLo amd MultHi must be the first and second output of MultNode - // respectively. - if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) - return false; - - // Transform this to a MADD only if ADDENode and ADDCNode are the only users - // of the values of MultNode, in which case MultNode will be removed in later - // phases. - // If there exist users other than ADDENode or ADDCNode, this function returns - // here, which will result in MultNode being mapped to a single MULT - // instruction node rather than a pair of MULT and MADD instructions being - // produced. - if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) - return false; - - SDLoc DL(ADDENode); - - // Initialize accumulator. - SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - ADDCNode->getOperand(1), - ADDENode->getOperand(1)); - - // create MipsMAdd(u) node - MultOpc = MultOpc == ISD::UMUL_LOHI ? 
MipsISD::MAddu : MipsISD::MAdd; - - SDValue MAdd = CurDAG->getNode(MultOpc, DL, MVT::Untyped, - MultNode->getOperand(0),// Factor 0 - MultNode->getOperand(1),// Factor 1 - ACCIn); - - // replace uses of adde and addc here - if (!SDValue(ADDCNode, 0).use_empty()) { - SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MAdd); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDCNode, 0), LoOut); - } - if (!SDValue(ADDENode, 0).use_empty()) { - SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MAdd); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(ADDENode, 0), HiOut); - } - - return true; -} - -// selectMSUB - -// Transforms a subgraph in CurDAG if the following pattern is found: -// (addc Lo0, multLo), (sube Hi0, multHi), -// where, -// multHi/Lo: product of multiplication -// Lo0: initial value of Lo register -// Hi0: initial value of Hi register -// Return true if pattern matching was successful. -static bool selectMSUB(SDNode *SUBENode, SelectionDAG *CurDAG) { - // SUBENode's second operand must be a flag output of an SUBC node in order - // for the matching to be successful. - SDNode *SUBCNode = SUBENode->getOperand(2).getNode(); - - if (SUBCNode->getOpcode() != ISD::SUBC) - return false; - - SDValue MultHi = SUBENode->getOperand(1); - SDValue MultLo = SUBCNode->getOperand(1); - SDNode *MultNode = MultHi.getNode(); - unsigned MultOpc = MultHi.getOpcode(); - - // MultHi and MultLo must be generated by the same node, - if (MultLo.getNode() != MultNode) - return false; - - // and it must be a multiplication. - if (MultOpc != ISD::SMUL_LOHI && MultOpc != ISD::UMUL_LOHI) - return false; - - // MultLo amd MultHi must be the first and second output of MultNode - // respectively. - if (MultHi.getResNo() != 1 || MultLo.getResNo() != 0) - return false; - - // Transform this to a MSUB only if SUBENode and SUBCNode are the only users - // of the values of MultNode, in which case MultNode will be removed in later - // phases. - // If there exist users other than SUBENode or SUBCNode, this function returns - // here, which will result in MultNode being mapped to a single MULT - // instruction node rather than a pair of MULT and MSUB instructions being - // produced. - if (!MultHi.hasOneUse() || !MultLo.hasOneUse()) - return false; - - SDLoc DL(SUBENode); - - // Initialize accumulator. - SDValue ACCIn = CurDAG->getNode(MipsISD::MTLOHI, DL, MVT::Untyped, - SUBCNode->getOperand(0), - SUBENode->getOperand(0)); - - // create MipsSub(u) node - MultOpc = MultOpc == ISD::UMUL_LOHI ? 
MipsISD::MSubu : MipsISD::MSub; - - SDValue MSub = CurDAG->getNode(MultOpc, DL, MVT::Glue, - MultNode->getOperand(0),// Factor 0 - MultNode->getOperand(1),// Factor 1 - ACCIn); - - // replace uses of sube and subc here - if (!SDValue(SUBCNode, 0).use_empty()) { - SDValue LoOut = CurDAG->getNode(MipsISD::MFLO, DL, MVT::i32, MSub); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBCNode, 0), LoOut); - } - if (!SDValue(SUBENode, 0).use_empty()) { - SDValue HiOut = CurDAG->getNode(MipsISD::MFHI, DL, MVT::i32, MSub); - CurDAG->ReplaceAllUsesOfValueWith(SDValue(SUBENode, 0), HiOut); - } - - return true; -} - -static SDValue performADDECombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (Subtarget.hasMips32() && !Subtarget.hasMips32r6() && - N->getValueType(0) == MVT::i32 && selectMADD(N, &DAG)) - return SDValue(N, 0); - - return SDValue(); -} - // Fold zero extensions into MipsISD::VEXTRACT_[SZ]EXT_ELT // // Performs the following transformations: @@ -820,19 +661,6 @@ static SDValue performORCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue performSUBECombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const MipsSubtarget &Subtarget) { - if (DCI.isBeforeLegalize()) - return SDValue(); - - if (Subtarget.hasMips32() && N->getValueType(0) == MVT::i32 && - selectMSUB(N, &DAG)) - return SDValue(N, 0); - - return SDValue(); -} - static SDValue genConstMult(SDValue X, uint64_t C, const SDLoc &DL, EVT VT, EVT ShiftTy, SelectionDAG &DAG) { // Clear the upper (64 - VT.sizeInBits) bits. @@ -1110,16 +938,12 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { SDValue Val; switch (N->getOpcode()) { - case ISD::ADDE: - return performADDECombine(N, DAG, DCI, Subtarget); case ISD::AND: Val = performANDCombine(N, DAG, DCI, Subtarget); break; case ISD::OR: Val = performORCombine(N, DAG, DCI, Subtarget); break; - case ISD::SUBE: - return performSUBECombine(N, DAG, DCI, Subtarget); case ISD::MUL: return performMULCombine(N, DAG, DCI, this); case ISD::SHL: @@ -3596,9 +3420,17 @@ MipsSETargetLowering::emitST_F16_PSEUDO(MachineInstr &MI, : (Subtarget.isABI_O32() ? &Mips::GPR32RegClass : &Mips::GPR64RegClass); const bool UsingMips32 = RC == &Mips::GPR32RegClass; - unsigned Rs = RegInfo.createVirtualRegister(RC); + unsigned Rs = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); BuildMI(*BB, MI, DL, TII->get(Mips::COPY_U_H), Rs).addReg(Ws).addImm(0); + if(!UsingMips32) { + unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR64RegClass); + BuildMI(*BB, MI, DL, TII->get(Mips::SUBREG_TO_REG), Tmp) + .addImm(0) + .addReg(Rs) + .addImm(Mips::sub_32); + Rs = Tmp; + } BuildMI(*BB, MI, DL, TII->get(UsingMips32 ? 
Mips::SH : Mips::SH64)) .addReg(Rs) .addReg(Rt) @@ -3649,6 +3481,12 @@ MipsSETargetLowering::emitLD_F16_PSEUDO(MachineInstr &MI, for (unsigned i = 1; i < MI.getNumOperands(); i++) MIB.add(MI.getOperand(i)); + if(!UsingMips32) { + unsigned Tmp = RegInfo.createVirtualRegister(&Mips::GPR32RegClass); + BuildMI(*BB, MI, DL, TII->get(Mips::COPY), Tmp).addReg(Rt, 0, Mips::sub_32); + Rt = Tmp; + } + BuildMI(*BB, MI, DL, TII->get(Mips::FILL_H), Wd).addReg(Rt); MI.eraseFromParent(); @@ -3716,6 +3554,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, assert(Subtarget.hasMSA() && Subtarget.hasMips32r2()); bool IsFGR64onMips64 = Subtarget.hasMips64() && IsFGR64; + bool IsFGR64onMips32 = !Subtarget.hasMips64() && IsFGR64; const TargetInstrInfo *TII = Subtarget.getInstrInfo(); DebugLoc DL = MI.getDebugLoc(); @@ -3726,7 +3565,9 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - unsigned MFC1Opc = IsFGR64onMips64 ? Mips::DMFC1 : Mips::MFC1; + unsigned MFC1Opc = IsFGR64onMips64 + ? Mips::DMFC1 + : (IsFGR64onMips32 ? Mips::MFC1_D64 : Mips::MFC1); unsigned FILLOpc = IsFGR64onMips64 ? Mips::FILL_D : Mips::FILL_W; // Perform the register class copy as mentioned above. @@ -3735,7 +3576,7 @@ MipsSETargetLowering::emitFPROUND_PSEUDO(MachineInstr &MI, BuildMI(*BB, MI, DL, TII->get(FILLOpc), Wtemp).addReg(Rtemp); unsigned WPHI = Wtemp; - if (!Subtarget.hasMips64() && IsFGR64) { + if (IsFGR64onMips32) { unsigned Rtemp2 = RegInfo.createVirtualRegister(GPRRC); BuildMI(*BB, MI, DL, TII->get(Mips::MFHC1_D64), Rtemp2).addReg(Fs); unsigned Wtemp2 = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); @@ -3829,7 +3670,9 @@ MipsSETargetLowering::emitFPEXTEND_PSEUDO(MachineInstr &MI, MachineRegisterInfo &RegInfo = BB->getParent()->getRegInfo(); const TargetRegisterClass *GPRRC = IsFGR64onMips64 ? &Mips::GPR64RegClass : &Mips::GPR32RegClass; - unsigned MTC1Opc = IsFGR64onMips64 ? Mips::DMTC1 : Mips::MTC1; + unsigned MTC1Opc = IsFGR64onMips64 + ? Mips::DMTC1 + : (IsFGR64onMips32 ? Mips::MTC1_D64 : Mips::MTC1); unsigned COPYOpc = IsFGR64onMips64 ? 
Mips::COPY_S_D : Mips::COPY_S_W; unsigned Wtemp = RegInfo.createVirtualRegister(&Mips::MSA128WRegClass); diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index 8ec55ab6284da..c2947bb44ef58 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -226,7 +226,6 @@ def II_MFC1 : InstrItinClass; def II_MFHC1 : InstrItinClass; def II_MFC2 : InstrItinClass; def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo -def II_MFTR : InstrItinClass; def II_MOD : InstrItinClass; def II_MODU : InstrItinClass; def II_MOVE : InstrItinClass; @@ -256,7 +255,6 @@ def II_MTC1 : InstrItinClass; def II_MTHC1 : InstrItinClass; def II_MTC2 : InstrItinClass; def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo -def II_MTTR : InstrItinClass; def II_MUL : InstrItinClass; def II_MUH : InstrItinClass; def II_MUHU : InstrItinClass; @@ -666,14 +664,12 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_MFHC0 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MFC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MFC2 , [InstrStage<2, [ALU]>]>, - InstrItinData<II_MFTR , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTC0 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTHC0 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTC2 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MFHC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]>, - InstrItinData<II_MTTR , [InstrStage<2, [ALU]>]>, InstrItinData<II_CACHE , [InstrStage<1, [ALU]>]>, InstrItinData<II_PREF , [InstrStage<1, [ALU]>]>, InstrItinData<II_CACHEE , [InstrStage<1, [ALU]>]>, diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index 7619e7b08612b..cce3b8c4c8d18 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -152,6 +152,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // HasMT -- support MT ASE. bool HasMT; + // Disable use of the `jal` instruction. 
+ bool UseLongCalls = false; + InstrItineraryData InstrItins; // We can override the determination of whether we are in mips16 mode @@ -269,6 +272,8 @@ public: bool useSoftFloat() const { return IsSoftFloat; } + bool useLongCalls() const { return UseLongCalls; } + bool enableLongBranchPass() const { return hasStandardEncoding() || allowMixed16_32(); } diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index af24838665e1f..7d9f99ce071e8 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -119,9 +119,6 @@ public: SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, SMLoc IDLoc, const MCSubtargetInfo *STI); - void emitRRIII(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm0, - int16_t Imm1, int16_t Imm2, SMLoc IDLoc, - const MCSubtargetInfo *STI); void emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI); void emitDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount, diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp index a00b56af0490a..92c8c224b71b4 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp @@ -271,7 +271,8 @@ unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo, unsigned RegBits = getMachineOpValue(MI, MI.getOperand(OpNo+1), Fixups, STI) << 12; const MCOperand &MO = MI.getOperand(OpNo); - assert(MO.isImm()); + assert(MO.isImm() && !(MO.getImm() % 16) && + "Expecting an immediate that is a multiple of 16"); return ((getMachineOpValue(MI, MO, Fixups, STI) >> 4) & 0xFFF) | RegBits; } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 3aaf7ef2c2a02..901539b682baa 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -178,7 +178,7 @@ namespace { /// a base register plus a signed 16-bit displacement [r+imm]. bool SelectAddrImm(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, false); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0); } /// SelectAddrImmOffs - Return true if the operand is valid for a preinc @@ -211,7 +211,11 @@ namespace { /// a base register plus a signed 16-bit displacement that is a multiple of 4. /// Suitable for use by STD and friends. bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) { - return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, true); + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 4); + } + + bool SelectAddrImmX16(SDValue N, SDValue &Disp, SDValue &Base) { + return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 16); } // Select an address into a single register. 
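For reference, the constraint that the new SelectAddrImmX16 hook and the tightened getMemRIX16Encoding assert encode is that a DQ-form displacement must fit in a signed 16-bit field and be a multiple of 16. A minimal stand-alone C++ sketch of that check, for illustration only (the helper name fitsDQFormDisp is not from the patch; the actual in-tree check is the isOffsetMultipleOf/SelectAddressRegImm code shown below):

static bool fitsDQFormDisp(int64_t Disp) {
  // Signed 16-bit displacement whose low four bits are zero, i.e. a
  // multiple of 16 -- the requirement for DQ-form loads/stores (lxv/stxv).
  return Disp >= -32768 && Disp <= 32767 && (Disp % 16) == 0;
}
// fitsDQFormDisp(48) -> true  (a DQ-form access such as lxv can be used)
// fitsDQFormDisp(20) -> false (falls back to the X-form lxvx/stxvx)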
@@ -305,6 +309,7 @@ private: bool AllUsersSelectZero(SDNode *N); void SwapAllSelectUsers(SDNode *N); + bool isOffsetMultipleOf(SDNode *N, unsigned Val) const; void transferMemOperands(SDNode *N, SDNode *Result); }; @@ -2999,6 +3004,25 @@ SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare, return get64BitZExtCompare(LHS, RHS, CC, RHSValue, dl); } +/// Does this node represent a load/store node whose address can be represented +/// with a register plus an immediate that's a multiple of \p Val: +bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const { + LoadSDNode *LDN = dyn_cast<LoadSDNode>(N); + StoreSDNode *STN = dyn_cast<StoreSDNode>(N); + SDValue AddrOp; + if (LDN) + AddrOp = LDN->getOperand(1); + else if (STN) + AddrOp = STN->getOperand(2); + + short Imm = 0; + if (AddrOp.getOpcode() == ISD::ADD) + return isIntS16Immediate(AddrOp.getOperand(1), Imm) && !(Imm % Val); + + // If the address comes from the outside, the offset will be zero. + return AddrOp.getOpcode() == ISD::CopyFromReg; +} + void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) { // Transfer memoperands. MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 0e069ec1665f7..b3a3c73f6df03 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -2130,12 +2130,12 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) { /// Returns true if the address N can be represented by a base register plus /// a signed 16-bit displacement [r+imm], and if it is not better -/// represented as reg+reg. If Aligned is true, only accept displacements -/// suitable for STD and friends, i.e. multiples of 4. +/// represented as reg+reg. If \p Alignment is non-zero, only accept +/// displacements that are multiples of that value. bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG, - bool Aligned) const { + unsigned Alignment) const { // FIXME dl should come from parent load or store, not from address SDLoc dl(N); // If this can be more profitably realized as r+r, fail. @@ -2145,7 +2145,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, if (N.getOpcode() == ISD::ADD) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) { Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType()); @@ -2169,7 +2169,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, } else if (N.getOpcode() == ISD::OR) { int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && - (!Aligned || (imm & 3) == 0)) { + (!Alignment || (imm % Alignment) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add // (for better address arithmetic) if the LHS and RHS of the OR are // provably disjoint. @@ -2196,7 +2196,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" int16_t Imm; - if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { + if (isIntS16Immediate(CN, Imm) && (!Alignment || (Imm % Alignment) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? 
PPC::ZERO8 : PPC::ZERO, CN->getValueType(0)); @@ -2206,7 +2206,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // Handle 32-bit sext immediates with LIS + addr mode. if ((CN->getValueType(0) == MVT::i32 || (int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) && - (!Aligned || (CN->getZExtValue() & 3) == 0)) { + (!Alignment || (CN->getZExtValue() % Alignment) == 0)) { int Addr = (int)CN->getZExtValue(); // Otherwise, break this down into an LIS + disp. @@ -2321,14 +2321,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base, // LDU/STU can only handle immediates that are a multiple of 4. if (VT != MVT::i64) { - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, false)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0)) return false; } else { // LDU/STU need an address with at least 4-byte alignment. if (Alignment < 4) return false; - if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, true)) + if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4)) return false; } diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 821927d3b1576..49d7d8220af16 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -616,7 +616,7 @@ namespace llvm { /// is not better represented as reg+reg. If Aligned is true, only accept /// displacements suitable for STD and friends, i.e. multiples of 4. bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base, - SelectionDAG &DAG, bool Aligned) const; + SelectionDAG &DAG, unsigned Alignment) const; /// SelectAddressRegRegOnly - Given the specified addressed, force it to be /// represented as an indexed [r+r] operation. diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index 13b4f9ab962da..e74ba38c351f0 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1635,8 +1635,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, if (equalityOnly) { // We need to check the uses of the condition register in order to reject // non-equality comparisons. - for (MachineRegisterInfo::use_instr_iterator I =MRI->use_instr_begin(CRReg), - IE = MRI->use_instr_end(); I != IE; ++I) { + for (MachineRegisterInfo::use_instr_iterator + I = MRI->use_instr_begin(CRReg), IE = MRI->use_instr_end(); + I != IE; ++I) { MachineInstr *UseMI = &*I; if (UseMI->getOpcode() == PPC::BCC) { unsigned Pred = UseMI->getOperand(0).getImm(); @@ -1658,8 +1659,9 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, for (MachineBasicBlock::iterator EL = CmpInstr.getParent()->end(); I != EL; ++I) { bool FoundUse = false; - for (MachineRegisterInfo::use_instr_iterator J =MRI->use_instr_begin(CRReg), - JE = MRI->use_instr_end(); J != JE; ++J) + for (MachineRegisterInfo::use_instr_iterator + J = MRI->use_instr_begin(CRReg), JE = MRI->use_instr_end(); + J != JE; ++J) if (&*J == &*I) { FoundUse = true; break; diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 6d9f55206b6af..dd7fc2659102a 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -405,6 +405,25 @@ def unaligned4sextloadi32 : PatFrag<(ops node:$ptr), (sextloadi32 node:$ptr), [{ return cast<LoadSDNode>(N)->getAlignment() < 4; }]>; +// This is a somewhat weaker condition than actually checking for 16-byte +// alignment. 
It is simply checking that the displacement can be represented +// as an immediate that is a multiple of 16 (i.e. the requirements for DQ-Form +// instructions). +def quadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return isOffsetMultipleOf(N, 16); +}]>; +def quadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return isOffsetMultipleOf(N, 16); +}]>; +def nonQuadwOffsetLoad : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return !isOffsetMultipleOf(N, 16); +}]>; +def nonQuadwOffsetStore : PatFrag<(ops node:$val, node:$ptr), + (store node:$val, node:$ptr), [{ + return !isOffsetMultipleOf(N, 16); +}]>; + //===----------------------------------------------------------------------===// // PowerPC Flag Definitions. @@ -815,7 +834,8 @@ def pred : Operand<OtherVT> { def iaddr : ComplexPattern<iPTR, 2, "SelectAddrImm", [], []>; def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>; def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>; -def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" +def ixaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX4", [], []>; // "std" +def iqaddr : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv" // The address in a single register. This is used with the SjLj // pseudo-instructions. diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 43635a8919e2b..942e8b392b82b 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -2606,37 +2606,41 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { } // IsLittleEndian, HasP9Vector // D-Form Load/Store - def : Pat<(v4i32 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4f32 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2i64 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (load iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddr:$src)), (LXV memrix16:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddr:$src)), (LXV memrix16:$src)>; - - def : Pat<(store v4f32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v4i32:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v2f64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(store v2i64:$rS, iaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddr:$dst), + def : Pat<(v4i32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4f32 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2i64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (quadwOffsetLoad iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iqaddr:$src)), (LXV memrix16:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iqaddr:$src)), (LXV memrix16:$src)>; + + def : Pat<(quadwOffsetStore v4f32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(quadwOffsetStore v2i64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddr:$dst), + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iqaddr:$dst), (STXV $rS, memrix16:$dst)>; - def : Pat<(v2f64 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v2i64 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : 
Pat<(v4f32 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4i32 (load xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xaddr:$src)), (LXVX xaddr:$src)>; - def : Pat<(store v2f64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v2i64:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v4f32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(store v4i32:$rS, xaddr:$dst), (STXVX $rS, xaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xaddr:$dst), - (STXVX $rS, xaddr:$dst)>; - def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xaddr:$dst), - (STXVX $rS, xaddr:$dst)>; + def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2i64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>; + def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; + def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst), + (STXVX $rS, xoaddr:$dst)>; def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))), (v4i32 (LXVWSX xoaddr:$src))>; def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))), @@ -2788,21 +2792,21 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in { let isPseudo = 1 in { def DFLOADf32 : Pseudo<(outs vssrc:$XT), (ins memrix:$src), "#DFLOADf32", - [(set f32:$XT, (load iaddr:$src))]>; + [(set f32:$XT, (load ixaddr:$src))]>; def DFLOADf64 : Pseudo<(outs vsfrc:$XT), (ins memrix:$src), "#DFLOADf64", - [(set f64:$XT, (load iaddr:$src))]>; + [(set f64:$XT, (load ixaddr:$src))]>; def DFSTOREf32 : Pseudo<(outs), (ins vssrc:$XT, memrix:$dst), "#DFSTOREf32", - [(store f32:$XT, iaddr:$dst)]>; + [(store f32:$XT, ixaddr:$dst)]>; def DFSTOREf64 : Pseudo<(outs), (ins vsfrc:$XT, memrix:$dst), "#DFSTOREf64", - [(store f64:$XT, iaddr:$dst)]>; + [(store f64:$XT, ixaddr:$dst)]>; } - def : Pat<(f64 (extloadf32 iaddr:$src)), - (COPY_TO_REGCLASS (DFLOADf32 iaddr:$src), VSFRC)>; - def : Pat<(f32 (fpround (extloadf32 iaddr:$src))), - (f32 (DFLOADf32 iaddr:$src))>; + def : Pat<(f64 (extloadf32 ixaddr:$src)), + (COPY_TO_REGCLASS (DFLOADf32 ixaddr:$src), VSFRC)>; + def : Pat<(f32 (fpround (extloadf32 ixaddr:$src))), + (f32 (DFLOADf32 ixaddr:$src))>; } // end HasP9Vector, AddedComplexity // Integer extend helper dags 32 -> 64 @@ -2881,13 +2885,13 @@ def FltToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 xoaddr:$A))))); } def FltToLongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 iaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (extloadf32 ixaddr:$A))))); } def FltToULongLoad { dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 xoaddr:$A))))); } def FltToULongLoadP9 { - dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 iaddr:$A))))); + dag A = (i64 (PPCmfvsr (PPCfctiduz (f64 (extloadf32 ixaddr:$A))))); } def FltToLong { dag A = (i64 (PPCmfvsr 
(PPCfctidz (fpextend f32:$A)))); @@ -2911,13 +2915,13 @@ def DblToIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load xoaddr:$A))))); } def DblToIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load iaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwz (f64 (load ixaddr:$A))))); } def DblToUIntLoad { dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load xoaddr:$A))))); } def DblToUIntLoadP9 { - dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load iaddr:$A))))); + dag A = (i32 (PPCmfvsr (PPCfctiwuz (f64 (load ixaddr:$A))))); } def DblToLongLoad { dag A = (i64 (PPCmfvsr (PPCfctidz (f64 (load xoaddr:$A))))); @@ -3088,17 +3092,17 @@ let AddedComplexity = 400 in { (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>; def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPSXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + (XSCVDPSXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)), (v4i32 (XXSPLTW (COPY_TO_REGCLASS - (XSCVDPUXWS (DFLOADf64 iaddr:$A)), VSRC), 1))>; + (XSCVDPUXWS (DFLOADf64 ixaddr:$A)), VSRC), 1))>; def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)), (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS - (DFLOADf32 iaddr:$A), + (DFLOADf32 ixaddr:$A), VSFRC)), 0))>; } diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 8af7f7e981171..9207165c46a6d 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -754,19 +754,31 @@ bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF, return false; } -// Figure out if the offset in the instruction must be a multiple of 4. -// This is true for instructions like "STD". -static bool usesIXAddr(const MachineInstr &MI) { +// If the offset must be a multiple of some value, return what that value is. +static unsigned offsetMinAlign(const MachineInstr &MI) { unsigned OpC = MI.getOpcode(); switch (OpC) { default: - return false; + return 1; case PPC::LWA: case PPC::LWA_32: case PPC::LD: + case PPC::LDU: case PPC::STD: - return true; + case PPC::STDU: + case PPC::DFLOADf32: + case PPC::DFLOADf64: + case PPC::DFSTOREf32: + case PPC::DFSTOREf64: + case PPC::LXSD: + case PPC::LXSSP: + case PPC::STXSD: + case PPC::STXSSP: + return 4; + case PPC::LXV: + case PPC::STXV: + return 16; } } @@ -852,9 +864,6 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, MI.getOperand(FIOperandNum).ChangeToRegister( FrameIndex < 0 ? getBaseRegister(MF) : getFrameRegister(MF), false); - // Figure out if the offset in the instruction is shifted right two bits. - bool isIXAddr = usesIXAddr(MI); - // If the instruction is not present in ImmToIdxMap, then it has no immediate // form (and must be r+r). bool noImmForm = !MI.isInlineAsm() && OpC != TargetOpcode::STACKMAP && @@ -883,7 +892,8 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // happen in invalid code. 
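An illustrative aside on the offsetMinAlign change above: the old usesIXAddr boolean only encoded the DS-form multiple-of-4 rule, while the new hook returns the required alignment itself, so the DQ-form LXV/STXV multiple-of-16 rule falls out of the same modulus test. A minimal C++ sketch of that test follows; the helper name is invented for illustration and is not part of the patch.
#include <cstdint>
// Sketch only: mirrors the "isInt<16>(Offset) && (Offset % offsetMinAlign(MI)) == 0"
// test introduced above, with the required alignment passed in directly
// (1 for D-form, 4 for DS-form such as STD, 16 for DQ-form such as STXV).
static bool isLegalImmediateOffset(int64_t Offset, unsigned MinAlign) {
  const bool FitsSigned16 = Offset >= -32768 && Offset <= 32767;
  return FitsSigned16 && (Offset % MinAlign) == 0;
}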
assert(OpC != PPC::DBG_VALUE && "This should be handled in a target-independent way"); - if (!noImmForm && ((isInt<16>(Offset) && (!isIXAddr || (Offset & 3) == 0)) || + if (!noImmForm && ((isInt<16>(Offset) && + ((Offset % offsetMinAlign(MI)) == 0)) || OpC == TargetOpcode::STACKMAP || OpC == TargetOpcode::PATCHPOINT)) { MI.getOperand(OffsetOperandNo).ChangeToImmediate(Offset); @@ -1076,5 +1086,5 @@ bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, return MI->getOpcode() == PPC::DBG_VALUE || // DBG_VALUE is always Reg+Imm MI->getOpcode() == TargetOpcode::STACKMAP || MI->getOpcode() == TargetOpcode::PATCHPOINT || - (isInt<16>(Offset) && (!usesIXAddr(*MI) || (Offset & 3) == 0)); + (isInt<16>(Offset) && (Offset % offsetMinAlign(*MI)) == 0); } diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index 2dc3828334ac5..be705507b5347 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -41,6 +41,8 @@ public: ~PPCTargetMachine() override; const PPCSubtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. const PPCSubtarget *getSubtargetImpl() const = delete; // Pass Pipeline Configuration diff --git a/lib/Target/Sparc/Sparc.td b/lib/Target/Sparc/Sparc.td index 11004c5a952fc..91cab00b2b651 100644 --- a/lib/Target/Sparc/Sparc.td +++ b/lib/Target/Sparc/Sparc.td @@ -20,6 +20,10 @@ include "llvm/Target/Target.td" // SPARC Subtarget features. // +def FeatureSoftMulDiv + : SubtargetFeature<"soft-mul-div", "UseSoftMulDiv", "true", + "Use software emulation for integer multiply and divide">; + def FeatureV9 : SubtargetFeature<"v9", "IsV9", "true", "Enable SPARC-V9 instructions">; @@ -75,7 +79,7 @@ class Proc<string Name, list<SubtargetFeature> Features> : Processor<Name, NoItineraries, Features>; def : Proc<"generic", []>; -def : Proc<"v7", []>; +def : Proc<"v7", [FeatureSoftMulDiv]>; def : Proc<"v8", []>; def : Proc<"supersparc", []>; def : Proc<"sparclite", []>; diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index 9e7e3c6b705a9..6767a59a97571 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -1689,6 +1689,19 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM, setOperationAction(ISD::MULHS, MVT::i32, Expand); setOperationAction(ISD::MUL, MVT::i32, Expand); + if (Subtarget->useSoftMulDiv()) { + // .umul works for both signed and unsigned + setOperationAction(ISD::SMUL_LOHI, MVT::i32, Expand); + setOperationAction(ISD::UMUL_LOHI, MVT::i32, Expand); + setLibcallName(RTLIB::MUL_I32, ".umul"); + + setOperationAction(ISD::SDIV, MVT::i32, Expand); + setLibcallName(RTLIB::SDIV_I32, ".div"); + + setOperationAction(ISD::UDIV, MVT::i32, Expand); + setLibcallName(RTLIB::UDIV_I32, ".udiv"); + } + if (Subtarget->is64Bit()) { setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand); diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index ae45c8be67524..3194ad4aeb6b1 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -27,6 +27,9 @@ def Is32Bit : Predicate<"!Subtarget->is64Bit()">; // True when generating 64-bit code. This also implies HasV9. 
def Is64Bit : Predicate<"Subtarget->is64Bit()">; +def UseSoftMulDiv : Predicate<"Subtarget->useSoftMulDiv()">, + AssemblerPredicate<"FeatureSoftMulDiv">; + // HasV9 - This predicate is true when the target processor supports V9 // instructions. Note that the machine may be running in 32-bit mode. def HasV9 : Predicate<"Subtarget->isV9()">, diff --git a/lib/Target/Sparc/SparcSubtarget.cpp b/lib/Target/Sparc/SparcSubtarget.cpp index 43ddef3cc96e9..daac56add87ce 100644 --- a/lib/Target/Sparc/SparcSubtarget.cpp +++ b/lib/Target/Sparc/SparcSubtarget.cpp @@ -28,6 +28,7 @@ void SparcSubtarget::anchor() { } SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) { + UseSoftMulDiv = false; IsV9 = false; IsLeon = false; V8DeprecatedInsts = false; diff --git a/lib/Target/Sparc/SparcSubtarget.h b/lib/Target/Sparc/SparcSubtarget.h index fa42da425ff2d..d18139984b87f 100644 --- a/lib/Target/Sparc/SparcSubtarget.h +++ b/lib/Target/Sparc/SparcSubtarget.h @@ -32,6 +32,7 @@ class StringRef; class SparcSubtarget : public SparcGenSubtargetInfo { Triple TargetTriple; virtual void anchor(); + bool UseSoftMulDiv; bool IsV9; bool IsLeon; bool V8DeprecatedInsts; @@ -76,6 +77,7 @@ public: bool enableMachineScheduler() const override; + bool useSoftMulDiv() const { return UseSoftMulDiv; } bool isV9() const { return IsV9; } bool isLeon() const { return IsLeon; } bool isVIS() const { return IsVIS; } diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index ee23692ad1db2..33680789ee082 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -275,6 +275,10 @@ public: SMLoc getEndLoc() const override { return EndLoc; } void print(raw_ostream &OS) const override; + /// getLocRange - Get the range between the first and last token of this + /// operand. + SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); } + // Used by the TableGen code to add particular types of operand // to an instruction. 
void addRegOperands(MCInst &Inst, unsigned N) const { @@ -1164,6 +1168,8 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands, return false; } +std::string SystemZMnemonicSpellCheck(StringRef S, uint64_t FBS); + bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -1209,8 +1215,13 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(ErrorLoc, "invalid operand for instruction"); } - case Match_MnemonicFail: - return Error(IDLoc, "invalid instruction"); + case Match_MnemonicFail: { + uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + std::string Suggestion = SystemZMnemonicSpellCheck( + ((SystemZOperand &)*Operands[0]).getToken(), FBS); + return Error(IDLoc, "invalid instruction" + Suggestion, + ((SystemZOperand &)*Operands[0]).getLocRange()); + } } llvm_unreachable("Unexpected match type"); diff --git a/lib/Target/SystemZ/LLVMBuild.txt b/lib/Target/SystemZ/LLVMBuild.txt index 6f8431db7b11c..9b8b141fd52ab 100644 --- a/lib/Target/SystemZ/LLVMBuild.txt +++ b/lib/Target/SystemZ/LLVMBuild.txt @@ -31,5 +31,5 @@ has_jit = 1 type = Library name = SystemZCodeGen parent = SystemZ -required_libraries = Analysis AsmPrinter CodeGen Core MC SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target +required_libraries = Analysis AsmPrinter CodeGen Core MC Scalar SelectionDAG Support SystemZAsmPrinter SystemZDesc SystemZInfo Target add_to_library_groups = SystemZ diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td index c5faa0d62881d..fda9c30fe3fcc 100644 --- a/lib/Target/SystemZ/SystemZFeatures.td +++ b/lib/Target/SystemZ/SystemZFeatures.td @@ -189,6 +189,58 @@ def Arch11NewFeatures : SystemZFeatureList<[ //===----------------------------------------------------------------------===// // +// New features added in the Twelfth Edition of the z/Architecture +// +//===----------------------------------------------------------------------===// + +def FeatureMiscellaneousExtensions2 : SystemZFeature< + "miscellaneous-extensions-2", "MiscellaneousExtensions2", + "Assume that the miscellaneous-extensions facility 2 is installed" +>; + +def FeatureGuardedStorage : SystemZFeature< + "guarded-storage", "GuardedStorage", + "Assume that the guarded-storage facility is installed" +>; + +def FeatureMessageSecurityAssist7 : SystemZFeature< + "message-security-assist-extension7", "MessageSecurityAssist7", + "Assume that the message-security-assist extension facility 7 is installed" +>; + +def FeatureMessageSecurityAssist8 : SystemZFeature< + "message-security-assist-extension8", "MessageSecurityAssist8", + "Assume that the message-security-assist extension facility 8 is installed" +>; + +def FeatureVectorEnhancements1 : SystemZFeature< + "vector-enhancements-1", "VectorEnhancements1", + "Assume that the vector enhancements facility 1 is installed" +>; +def FeatureNoVectorEnhancements1 : SystemZMissingFeature<"VectorEnhancements1">; + +def FeatureVectorPackedDecimal : SystemZFeature< + "vector-packed-decimal", "VectorPackedDecimal", + "Assume that the vector packed decimal facility is installed" +>; + +def FeatureInsertReferenceBitsMultiple : SystemZFeature< + "insert-reference-bits-multiple", "InsertReferenceBitsMultiple", + "Assume that the insert-reference-bits-multiple facility is installed" +>; + +def Arch12NewFeatures : SystemZFeatureList<[ + FeatureMiscellaneousExtensions2, + FeatureGuardedStorage, + 
FeatureMessageSecurityAssist7, + FeatureMessageSecurityAssist8, + FeatureVectorEnhancements1, + FeatureVectorPackedDecimal, + FeatureInsertReferenceBitsMultiple +]>; + +//===----------------------------------------------------------------------===// +// // Cumulative supported and unsupported feature sets // //===----------------------------------------------------------------------===// @@ -201,9 +253,13 @@ def Arch10SupportedFeatures : SystemZFeatureAdd<Arch9SupportedFeatures.List, Arch10NewFeatures.List>; def Arch11SupportedFeatures : SystemZFeatureAdd<Arch10SupportedFeatures.List, Arch11NewFeatures.List>; +def Arch12SupportedFeatures + : SystemZFeatureAdd<Arch11SupportedFeatures.List, Arch12NewFeatures.List>; -def Arch11UnsupportedFeatures +def Arch12UnsupportedFeatures : SystemZFeatureList<[]>; +def Arch11UnsupportedFeatures + : SystemZFeatureAdd<Arch12UnsupportedFeatures.List, Arch12NewFeatures.List>; def Arch10UnsupportedFeatures : SystemZFeatureAdd<Arch11UnsupportedFeatures.List, Arch11NewFeatures.List>; def Arch9UnsupportedFeatures diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 2801141cd951f..2d916d2e15214 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -101,7 +101,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass); addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass); } - addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); + if (Subtarget.hasVectorEnhancements1()) + addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass); + else + addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass); if (Subtarget.hasVector()) { addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass); @@ -316,7 +319,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::AND, VT, Legal); setOperationAction(ISD::OR, VT, Legal); setOperationAction(ISD::XOR, VT, Legal); - setOperationAction(ISD::CTPOP, VT, Custom); + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::CTPOP, VT, Legal); + else + setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Legal); setOperationAction(ISD::CTLZ, VT, Legal); @@ -414,10 +420,60 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FROUND, MVT::v2f64, Legal); } + // The vector enhancements facility 1 has instructions for these. 
+ if (Subtarget.hasVectorEnhancements1()) { + setOperationAction(ISD::FADD, MVT::v4f32, Legal); + setOperationAction(ISD::FNEG, MVT::v4f32, Legal); + setOperationAction(ISD::FSUB, MVT::v4f32, Legal); + setOperationAction(ISD::FMUL, MVT::v4f32, Legal); + setOperationAction(ISD::FMA, MVT::v4f32, Legal); + setOperationAction(ISD::FDIV, MVT::v4f32, Legal); + setOperationAction(ISD::FABS, MVT::v4f32, Legal); + setOperationAction(ISD::FSQRT, MVT::v4f32, Legal); + setOperationAction(ISD::FRINT, MVT::v4f32, Legal); + setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal); + setOperationAction(ISD::FFLOOR, MVT::v4f32, Legal); + setOperationAction(ISD::FCEIL, MVT::v4f32, Legal); + setOperationAction(ISD::FTRUNC, MVT::v4f32, Legal); + setOperationAction(ISD::FROUND, MVT::v4f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f64, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::f64, Legal); + setOperationAction(ISD::FMINNAN, MVT::f64, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::v2f64, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v2f64, Legal); + setOperationAction(ISD::FMINNUM, MVT::v2f64, Legal); + setOperationAction(ISD::FMINNAN, MVT::v2f64, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMAXNAN, MVT::v4f32, Legal); + setOperationAction(ISD::FMINNUM, MVT::v4f32, Legal); + setOperationAction(ISD::FMINNAN, MVT::v4f32, Legal); + + setOperationAction(ISD::FMAXNUM, MVT::f128, Legal); + setOperationAction(ISD::FMAXNAN, MVT::f128, Legal); + setOperationAction(ISD::FMINNUM, MVT::f128, Legal); + setOperationAction(ISD::FMINNAN, MVT::f128, Legal); + } + // We have fused multiply-addition for f32 and f64 but not f128. setOperationAction(ISD::FMA, MVT::f32, Legal); setOperationAction(ISD::FMA, MVT::f64, Legal); - setOperationAction(ISD::FMA, MVT::f128, Expand); + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::FMA, MVT::f128, Legal); + else + setOperationAction(ISD::FMA, MVT::f128, Expand); + + // We don't have a copysign instruction on vector registers. + if (Subtarget.hasVectorEnhancements1()) + setOperationAction(ISD::FCOPYSIGN, MVT::f128, Expand); // Needed so that we don't try to implement f128 constant loads using // a load-and-extend of a f80 constant (in cases where the constant // @@ -425,6 +481,12 @@ for (MVT VT : MVT::fp_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::f80, Expand); + // We don't have extending load instructions on vector registers. + if (Subtarget.hasVectorEnhancements1()) { + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f32, Expand); + setLoadExtAction(ISD::EXTLOAD, MVT::f128, MVT::f64, Expand); + } + // Floating-point truncation and stores need to be done separately. 
setTruncStoreAction(MVT::f64, MVT::f32, Expand); setTruncStoreAction(MVT::f128, MVT::f32, Expand); @@ -489,7 +551,7 @@ bool SystemZTargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const { case MVT::f64: return true; case MVT::f128: - return false; + return Subtarget.hasVectorEnhancements1(); default: break; } @@ -1462,21 +1524,25 @@ static bool isIntrinsicWithCC(SDValue Op, unsigned &Opcode, unsigned &CCValid) { return true; case Intrinsic::s390_vfcedbs: + case Intrinsic::s390_vfcesbs: Opcode = SystemZISD::VFCMPES; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vfchdbs: + case Intrinsic::s390_vfchsbs: Opcode = SystemZISD::VFCMPHS; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vfchedbs: + case Intrinsic::s390_vfchesbs: Opcode = SystemZISD::VFCMPHES; CCValid = SystemZ::CCMASK_VCMP; return true; case Intrinsic::s390_vftcidb: + case Intrinsic::s390_vftcisb: Opcode = SystemZISD::VFTCI; CCValid = SystemZ::CCMASK_VCMP; return true; @@ -2316,11 +2382,15 @@ static SDValue expandV4F32ToV2F64(SelectionDAG &DAG, int Start, const SDLoc &DL, // Build a comparison of vectors CmpOp0 and CmpOp1 using opcode Opcode, // producing a result of type VT. -static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL, - EVT VT, SDValue CmpOp0, SDValue CmpOp1) { - // There is no hardware support for v4f32, so extend the vector into - // two v2f64s and compare those. - if (CmpOp0.getValueType() == MVT::v4f32) { +SDValue SystemZTargetLowering::getVectorCmp(SelectionDAG &DAG, unsigned Opcode, + const SDLoc &DL, EVT VT, + SDValue CmpOp0, + SDValue CmpOp1) const { + // There is no hardware support for v4f32 (unless we have the vector + // enhancements facility 1), so extend the vector into two v2f64s + // and compare those. + if (CmpOp0.getValueType() == MVT::v4f32 && + !Subtarget.hasVectorEnhancements1()) { SDValue H0 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp0); SDValue L0 = expandV4F32ToV2F64(DAG, 2, DL, CmpOp0); SDValue H1 = expandV4F32ToV2F64(DAG, 0, DL, CmpOp1); @@ -2334,9 +2404,11 @@ static SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &DL, // Lower a vector comparison of type CC between CmpOp0 and CmpOp1, producing // an integer mask of type VT. -static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT, - ISD::CondCode CC, SDValue CmpOp0, - SDValue CmpOp1) { +SDValue SystemZTargetLowering::lowerVectorSETCC(SelectionDAG &DAG, + const SDLoc &DL, EVT VT, + ISD::CondCode CC, + SDValue CmpOp0, + SDValue CmpOp1) const { bool IsFP = CmpOp0.getValueType().isFloatingPoint(); bool Invert = false; SDValue Cmp; @@ -2960,6 +3032,12 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op, // We define this so that it can be used for constant division. lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); + else if (Subtarget.hasMiscellaneousExtensions2()) + // SystemZISD::SMUL_LOHI returns the low result in the odd register and + // the high result in the even register. ISD::SMUL_LOHI is defined to + // return the low half first, so the results are in reverse order. 
+ lowerGR128Binary(DAG, DL, VT, SystemZISD::SMUL_LOHI, + Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); else { // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI: // @@ -4658,6 +4736,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(SELECT_CCMASK); OPCODE(ADJDYNALLOC); OPCODE(POPCNT); + OPCODE(SMUL_LOHI); OPCODE(UMUL_LOHI); OPCODE(SDIVREM); OPCODE(UDIVREM); @@ -6118,6 +6197,7 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::SelectF32: case SystemZ::SelectF64: case SystemZ::SelectF128: + case SystemZ::SelectVR128: return emitSelect(MI, MBB, 0); case SystemZ::CondStore8Mux: diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 6c9c404816f09..abe8b7233e60c 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -88,6 +88,7 @@ enum NodeType : unsigned { // Wrappers around the ISD opcodes of the same name. The output is GR128. // Input operands may be GR64 or GR32, depending on the instruction. + SMUL_LOHI, UMUL_LOHI, SDIVREM, UDIVREM, @@ -479,6 +480,12 @@ private: const SystemZSubtarget &Subtarget; // Implement LowerOperation for individual opcodes. + SDValue getVectorCmp(SelectionDAG &DAG, unsigned Opcode, + const SDLoc &DL, EVT VT, + SDValue CmpOp0, SDValue CmpOp1) const; + SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, + EVT VT, ISD::CondCode CC, + SDValue CmpOp0, SDValue CmpOp1) const; SDValue lowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBR_CC(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const; diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index 10172bd452034..02aeaadad0d9a 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -12,9 +12,12 @@ //===----------------------------------------------------------------------===// // C's ?: operator for floating-point operands. -def SelectF32 : SelectWrapper<FP32>; -def SelectF64 : SelectWrapper<FP64>; -def SelectF128 : SelectWrapper<FP128>; +def SelectF32 : SelectWrapper<f32, FP32>; +def SelectF64 : SelectWrapper<f64, FP64>; +let Predicates = [FeatureNoVectorEnhancements1] in + def SelectF128 : SelectWrapper<f128, FP128>; +let Predicates = [FeatureVectorEnhancements1] in + def SelectVR128 : SelectWrapper<f128, VR128>; defm CondStoreF32 : CondStores<FP32, nonvolatile_store, nonvolatile_load, bdxaddr20only>; @@ -69,8 +72,9 @@ let Defs = [CC], usesCustomInserter = 1 in { let Predicates = [FeatureVector] in { defm : CompareZeroFP<LTEBRCompare_VecPseudo, FP32>; defm : CompareZeroFP<LTDBRCompare_VecPseudo, FP64>; - defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>; } +let Predicates = [FeatureVector, FeatureNoVectorEnhancements1] in + defm : CompareZeroFP<LTXBRCompare_VecPseudo, FP128>; // Moves between 64-bit integer and floating-point registers. def LGDR : UnaryRRE<"lgdr", 0xB3CD, bitconvert, GR64, FP64>; @@ -83,8 +87,12 @@ let isCodeGenOnly = 1 in { } // The sign of an FP128 is in the high register. 
-def : Pat<(fcopysign FP32:$src1, FP128:$src2), - (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 FP128:$src2)))), + (CPSDRsd FP32:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP32:$src1, (f32 (fpround (f128 VR128:$src2)))), + (CPSDRsd FP32:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>; // fcopysign with an FP64 result. let isCodeGenOnly = 1 in @@ -92,8 +100,12 @@ let isCodeGenOnly = 1 in def CPSDRdd : BinaryRRFb<"cpsdr", 0xB372, fcopysign, FP64, FP64, FP64>; // The sign of an FP128 is in the high register. -def : Pat<(fcopysign FP64:$src1, FP128:$src2), - (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 FP128:$src2)))), + (CPSDRdd FP64:$src1, (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureVectorEnhancements1] in + def : Pat<(fcopysign FP64:$src1, (f64 (fpround (f128 VR128:$src2)))), + (CPSDRdd FP64:$src1, (EXTRACT_SUBREG VR128:$src2, subreg_r64))>; // fcopysign with an FP128 result. Use "upper" as the high half and leave // the low half as-is. @@ -101,12 +113,14 @@ class CopySign128<RegisterOperand cls, dag upper> : Pat<(fcopysign FP128:$src1, cls:$src2), (INSERT_SUBREG FP128:$src1, upper, subreg_h64)>; -def : CopySign128<FP32, (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64), - FP32:$src2)>; -def : CopySign128<FP64, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64), - FP64:$src2)>; -def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64), - (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : CopySign128<FP32, (CPSDRds (EXTRACT_SUBREG FP128:$src1, subreg_h64), + FP32:$src2)>; + def : CopySign128<FP64, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64), + FP64:$src2)>; + def : CopySign128<FP128, (CPSDRdd (EXTRACT_SUBREG FP128:$src1, subreg_h64), + (EXTRACT_SUBREG FP128:$src2, subreg_h64))>; +} defm LoadStoreF32 : MVCLoadStore<load, f32, MVCSequence, 4>; defm LoadStoreF64 : MVCLoadStore<load, f64, MVCSequence, 8>; @@ -166,20 +180,32 @@ def LEXBRA : TernaryRRFe<"lexbra", 0xB346, FP128, FP128>, def LDXBRA : TernaryRRFe<"ldxbra", 0xB345, FP128, FP128>, Requires<[FeatureFPExtension]>; -def : Pat<(f32 (fpround FP128:$src)), - (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; -def : Pat<(f64 (fpround FP128:$src)), - (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f32 (fpround FP128:$src)), + (EXTRACT_SUBREG (LEXBR FP128:$src), subreg_hr32)>; + def : Pat<(f64 (fpround FP128:$src)), + (EXTRACT_SUBREG (LDXBR FP128:$src), subreg_h64)>; +} // Extend register floating-point values to wider representations. 
-def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; -def LXEBR : UnaryRRE<"lxebr", 0xB306, fpextend, FP128, FP32>; -def LXDBR : UnaryRRE<"lxdbr", 0xB305, fpextend, FP128, FP64>; +def LDEBR : UnaryRRE<"ldebr", 0xB304, fpextend, FP64, FP32>; +def LXEBR : UnaryRRE<"lxebr", 0xB306, null_frag, FP128, FP32>; +def LXDBR : UnaryRRE<"lxdbr", 0xB305, null_frag, FP128, FP64>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f128 (fpextend (f32 FP32:$src))), (LXEBR FP32:$src)>; + def : Pat<(f128 (fpextend (f64 FP64:$src))), (LXDBR FP64:$src)>; +} // Extend memory floating-point values to wider representations. def LDEB : UnaryRXE<"ldeb", 0xED04, extloadf32, FP64, 4>; -def LXEB : UnaryRXE<"lxeb", 0xED06, extloadf32, FP128, 4>; -def LXDB : UnaryRXE<"lxdb", 0xED05, extloadf64, FP128, 8>; +def LXEB : UnaryRXE<"lxeb", 0xED06, null_frag, FP128, 4>; +def LXDB : UnaryRXE<"lxdb", 0xED05, null_frag, FP128, 8>; +let Predicates = [FeatureNoVectorEnhancements1] in { + def : Pat<(f128 (extloadf32 bdxaddr12only:$src)), + (LXEB bdxaddr12only:$src)>; + def : Pat<(f128 (extloadf64 bdxaddr12only:$src)), + (LXDB bdxaddr12only:$src)>; +} // Convert a signed integer register value to a floating-point one. def CEFBR : UnaryRRE<"cefbr", 0xB394, sint_to_fp, FP32, GR32>; @@ -426,16 +452,18 @@ def : Pat<(fmul (f64 (fpextend FP32:$src1)), // f128 multiplication of two FP64 registers. def MXDBR : BinaryRRE<"mxdbr", 0xB307, null_frag, FP128, FP64>; -def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))), - (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)), - FP64:$src1, subreg_h64), FP64:$src2)>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fmul (f128 (fpextend FP64:$src1)), (f128 (fpextend FP64:$src2))), + (MXDBR (INSERT_SUBREG (f128 (IMPLICIT_DEF)), + FP64:$src1, subreg_h64), FP64:$src2)>; // f128 multiplication of an FP64 register and an f64 memory. def MXDB : BinaryRXE<"mxdb", 0xED07, null_frag, FP128, load, 8>; -def : Pat<(fmul (f128 (fpextend FP64:$src1)), - (f128 (extloadf64 bdxaddr12only:$addr))), - (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), - bdxaddr12only:$addr)>; +let Predicates = [FeatureNoVectorEnhancements1] in + def : Pat<(fmul (f128 (fpextend FP64:$src1)), + (f128 (extloadf64 bdxaddr12only:$addr))), + (MXDB (INSERT_SUBREG (f128 (IMPLICIT_DEF)), FP64:$src1, subreg_h64), + bdxaddr12only:$addr)>; // Fused multiply-add. 
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_fma, FP32, FP32>; diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index 7620e06ccbc96..033a0a879d37d 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -1091,6 +1091,94 @@ class InstVRIe<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{7-0} = op{7-0}; } +class InstVRIf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<5> V3; + bits<8> I4; + bits<4> M5; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = V2{3-0}; + let Inst{31-28} = V3{3-0}; + let Inst{27-24} = 0; + let Inst{23-20} = M5; + let Inst{19-12} = I4; + let Inst{11} = V1{4}; + let Inst{10} = V2{4}; + let Inst{9} = V3{4}; + let Inst{8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<8> I3; + bits<8> I4; + bits<4> M5; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = V2{3-0}; + let Inst{31-24} = I4; + let Inst{23-20} = M5; + let Inst{19-12} = I3; + let Inst{11} = V1{4}; + let Inst{10} = V2{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIh<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> I2; + bits<4> I3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = 0; + let Inst{31-16} = I2; + let Inst{15-12} = I3; + let Inst{11} = V1{4}; + let Inst{10-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRIi<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<4> R2; + bits<8> I3; + bits<4> M4; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = V1{3-0}; + let Inst{35-32} = R2; + let Inst{31-24} = 0; + let Inst{23-20} = M4; + let Inst{19-12} = I3; + let Inst{11} = V1{4}; + let Inst{10-8} = 0; + let Inst{7-0} = op{7-0}; +} + // Depending on the instruction mnemonic, certain bits may be or-ed into // the M4 value provided as explicit operand. These are passed as m4or. 
class InstVRRa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern, @@ -1259,6 +1347,67 @@ class InstVRRf<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{7-0} = op{7-0}; } +class InstVRRg<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = V1{3-0}; + let Inst{31-12} = 0; + let Inst{11} = 0; + let Inst{10} = V1{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRRh<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<5> V2; + bits<4> M3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = V1{3-0}; + let Inst{31-28} = V2{3-0}; + let Inst{27-24} = 0; + let Inst{23-20} = M3; + let Inst{19-12} = 0; + let Inst{11} = 0; + let Inst{10} = V1{4}; + let Inst{9} = V2{4}; + let Inst{8} = 0; + let Inst{7-0} = op{7-0}; +} + +class InstVRRi<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<4> R1; + bits<5> V2; + bits<4> M3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = R1; + let Inst{35-32} = V2{3-0}; + let Inst{31-24} = 0; + let Inst{23-20} = M3; + let Inst{19-12} = 0; + let Inst{11} = 0; + let Inst{10} = V2{4}; + let Inst{9-8} = 0; + let Inst{7-0} = op{7-0}; +} + class InstVRSa<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1321,6 +1470,25 @@ class InstVRSc<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{7-0} = op{7-0}; } +class InstVRSd<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> BD2; + bits<4> R3; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = 0; + let Inst{35-32} = R3; + let Inst{31-16} = BD2; + let Inst{15-12} = V1{3-0}; + let Inst{11-9} = 0; + let Inst{8} = V1{4}; + let Inst{7-0} = op{7-0}; +} + class InstVRV<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1358,6 +1526,24 @@ class InstVRX<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> let Inst{7-0} = op{7-0}; } +class InstVSI<bits<16> op, dag outs, dag ins, string asmstr, list<dag> pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<5> V1; + bits<16> BD2; + bits<8> I3; + + let Inst{47-40} = op{15-8}; + let Inst{39-32} = I3; + let Inst{31-16} = BD2; + let Inst{15-12} = V1{3-0}; + let Inst{11-9} = 0; + let Inst{8} = V1{4}; + let Inst{7-0} = op{7-0}; +} + //===----------------------------------------------------------------------===// // Instruction classes for .insn directives //===----------------------------------------------------------------------===// @@ -1910,6 +2096,25 @@ class FixedCondBranchRX<CondVariant V, string mnemonic, bits<8> opcode> let M1 = V.ccmask; } +class CondBranchRXY<string mnemonic, bits<16> opcode> + : InstRXYb<opcode, (outs), (ins cond4:$valid, cond4:$M1, bdxaddr20only:$XBD2), + !subst("#", "${M1}", mnemonic)#"\t$XBD2", []> { + 
let CCMaskFirst = 1; +} + +class AsmCondBranchRXY<string mnemonic, bits<16> opcode> + : InstRXYb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr20only:$XBD2), + mnemonic#"\t$M1, $XBD2", []>; + +class FixedCondBranchRXY<CondVariant V, string mnemonic, bits<16> opcode, + SDPatternOperator operator = null_frag> + : InstRXYb<opcode, (outs), (ins bdxaddr20only:$XBD2), + !subst("#", V.suffix, mnemonic)#"\t$XBD2", + [(operator (load bdxaddr20only:$XBD2))]> { + let isAsmParserOnly = V.alternate; + let M1 = V.ccmask; +} + class CmpBranchRIEa<string mnemonic, bits<16> opcode, RegisterOperand cls, Immediate imm> : InstRIEa<opcode, (outs), (ins cls:$R1, imm:$I2, cond4:$M3), @@ -2272,6 +2477,24 @@ class StoreLengthVRSb<string mnemonic, bits<16> opcode, let AccessBytes = bytes; } +class StoreLengthVRSd<string mnemonic, bits<16> opcode, + SDPatternOperator operator, bits<5> bytes> + : InstVRSd<opcode, (outs), (ins VR128:$V1, GR32:$R3, bdaddr12only:$BD2), + mnemonic#"\t$V1, $R3, $BD2", + [(operator VR128:$V1, GR32:$R3, bdaddr12only:$BD2)]> { + let mayStore = 1; + let AccessBytes = bytes; +} + +class StoreLengthVSI<string mnemonic, bits<16> opcode, + SDPatternOperator operator, bits<5> bytes> + : InstVSI<opcode, (outs), (ins VR128:$V1, bdaddr12only:$BD2, imm32zx8:$I3), + mnemonic#"\t$V1, $BD2, $I3", + [(operator VR128:$V1, imm32zx8:$I3, bdaddr12only:$BD2)]> { + let mayStore = 1; + let AccessBytes = bytes; +} + class StoreMultipleRS<string mnemonic, bits<8> opcode, RegisterOperand cls, AddressingMode mode = bdaddr12only> : InstRSa<opcode, (outs), (ins cls:$R1, cls:$R3, mode:$BD2), @@ -2700,6 +2923,11 @@ class SideEffectBinaryRX<string mnemonic, bits<8> opcode, : InstRXa<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2), mnemonic##"\t$R1, $XBD2", []>; +class SideEffectBinaryRXY<string mnemonic, bits<16> opcode, + RegisterOperand cls> + : InstRXYa<opcode, (outs), (ins cls:$R1, bdxaddr20only:$XBD2), + mnemonic##"\t$R1, $XBD2", []>; + class SideEffectBinaryRILPC<string mnemonic, bits<12> opcode, RegisterOperand cls> : InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2), @@ -3188,6 +3416,11 @@ class BinaryVRIeFloatGeneric<string mnemonic, bits<16> opcode> (ins VR128:$V2, imm32zx12:$I3, imm32zx4:$M4, imm32zx4:$M5), mnemonic#"\t$V1, $V2, $I3, $M4, $M5", []>; +class BinaryVRIh<string mnemonic, bits<16> opcode> + : InstVRIh<opcode, (outs VR128:$V1), + (ins imm32zx16:$I2, imm32zx4:$I3), + mnemonic#"\t$V1, $I2, $I3", []>; + class BinaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0> : InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, imm32zx4:$M5), @@ -3316,6 +3549,10 @@ class BinaryVRRf<string mnemonic, bits<16> opcode, SDPatternOperator operator, mnemonic#"\t$V1, $R2, $R3", [(set tr.op:$V1, (tr.vt (operator GR64:$R2, GR64:$R3)))]>; +class BinaryVRRi<string mnemonic, bits<16> opcode, RegisterOperand cls> + : InstVRRi<opcode, (outs cls:$R1), (ins VR128:$V2, imm32zx4:$M3), + mnemonic#"\t$R1, $V2, $M3", []>; + class BinaryVRSa<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type> : InstVRSa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V3, shift12only:$BD2), @@ -3353,6 +3590,15 @@ class BinaryVRScGeneric<string mnemonic, bits<16> opcode> (ins VR128:$V3, shift12only:$BD2, imm32zx4: $M4), mnemonic#"\t$R1, $V3, $BD2, $M4", []>; +class BinaryVRSd<string mnemonic, bits<16> opcode, SDPatternOperator operator, + bits<5> bytes> + : InstVRSd<opcode, (outs VR128:$V1), (ins GR32:$R3, bdaddr12only:$BD2), + 
mnemonic#"\t$V1, $R3, $BD2", + [(set VR128:$V1, (operator GR32:$R3, bdaddr12only:$BD2))]> { + let mayLoad = 1; + let AccessBytes = bytes; +} + class BinaryVRX<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr, bits<5> bytes> : InstVRX<opcode, (outs VR128:$V1), (ins bdxaddr12only:$XBD2, imm32zx4:$M3), @@ -3398,6 +3644,15 @@ class StoreBinaryRSL<string mnemonic, bits<16> opcode, RegisterOperand cls> let mayStore = 1; } +class BinaryVSI<string mnemonic, bits<16> opcode, SDPatternOperator operator, + bits<5> bytes> + : InstVSI<opcode, (outs VR128:$V1), (ins bdaddr12only:$BD2, imm32zx8:$I3), + mnemonic#"\t$V1, $BD2, $I3", + [(set VR128:$V1, (operator imm32zx8:$I3, bdaddr12only:$BD2))]> { + let mayLoad = 1; + let AccessBytes = bytes; +} + class StoreBinaryVRV<string mnemonic, bits<16> opcode, bits<5> bytes, Immediate index> : InstVRV<opcode, (outs), (ins VR128:$V1, bdvaddr12only:$VBD2, index:$M3), @@ -3625,6 +3880,12 @@ class CompareVRRaFloatGeneric<string mnemonic, bits<16> opcode> let M5 = 0; } +class CompareVRRh<string mnemonic, bits<16> opcode> + : InstVRRh<opcode, (outs), (ins VR128:$V1, VR128:$V2, imm32zx4:$M3), + mnemonic#"\t$V1, $V2, $M3", []> { + let isCompare = 1; +} + class TestRXE<string mnemonic, bits<16> opcode, SDPatternOperator operator, RegisterOperand cls> : InstRXE<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2), @@ -3639,6 +3900,10 @@ class TestRSL<string mnemonic, bits<16> opcode> let mayLoad = 1; } +class TestVRRg<string mnemonic, bits<16> opcode> + : InstVRRg<opcode, (outs), (ins VR128:$V1), + mnemonic#"\t$V1", []>; + class SideEffectTernarySSc<string mnemonic, bits<8> opcode> : InstSSc<opcode, (outs), (ins bdladdr12onlylen4:$BDL1, shift12only:$BD2, imm32zx4:$I3), @@ -3842,6 +4107,11 @@ class TernaryVRId<string mnemonic, bits<16> opcode, SDPatternOperator operator, let M5 = type; } +class TernaryVRIi<string mnemonic, bits<16> opcode, RegisterOperand cls> + : InstVRIi<opcode, (outs VR128:$V1), + (ins cls:$R2, imm32zx8:$I3, imm32zx4:$M4), + mnemonic#"\t$V1, $R2, $I3, $M4", []>; + class TernaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> m4or> : InstVRRa<opcode, (outs tr1.op:$V1), @@ -3914,6 +4184,25 @@ class TernaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator, let M6 = 0; } +class TernaryVRRcFloat<string mnemonic, bits<16> opcode, + SDPatternOperator operator, TypedReg tr1, TypedReg tr2, + bits<4> type = 0, bits<4> m5 = 0> + : InstVRRc<opcode, (outs tr1.op:$V1), + (ins tr2.op:$V2, tr2.op:$V3, imm32zx4:$M6), + mnemonic#"\t$V1, $V2, $V3, $M6", + [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2), + (tr2.vt tr2.op:$V3), + imm32zx4:$M6)))]> { + let M4 = type; + let M5 = m5; +} + +class TernaryVRRcFloatGeneric<string mnemonic, bits<16> opcode> + : InstVRRc<opcode, (outs VR128:$V1), + (ins VR128:$V2, VR128:$V3, imm32zx4:$M4, imm32zx4:$M5, + imm32zx4:$M6), + mnemonic#"\t$V1, $V2, $V3, $M4, $M5, $M6", []>; + class TernaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<4> type = 0> : InstVRRd<opcode, (outs tr1.op:$V1), @@ -4019,20 +4308,38 @@ class QuaternaryVRIdGeneric<string mnemonic, bits<16> opcode> let DisableEncoding = "$V1src"; } +class QuaternaryVRIf<string mnemonic, bits<16> opcode> + : InstVRIf<opcode, (outs VR128:$V1), + (ins VR128:$V2, VR128:$V3, + imm32zx8:$I4, imm32zx4:$M5), + mnemonic#"\t$V1, $V2, $V3, $I4, $M5", []>; + +class QuaternaryVRIg<string mnemonic, bits<16> opcode> + : InstVRIg<opcode, 
(outs VR128:$V1), + (ins VR128:$V2, imm32zx8:$I3, + imm32zx8:$I4, imm32zx4:$M5), + mnemonic#"\t$V1, $V2, $I3, $I4, $M5", []>; + class QuaternaryVRRd<string mnemonic, bits<16> opcode, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, - bits<4> type, SDPatternOperator m6mask, bits<4> m6or> + TypedReg tr3, TypedReg tr4, bits<4> type, + SDPatternOperator m6mask = imm32zx4, bits<4> m6or = 0> : InstVRRd<opcode, (outs tr1.op:$V1), - (ins tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, m6mask:$M6), + (ins tr2.op:$V2, tr3.op:$V3, tr4.op:$V4, m6mask:$M6), mnemonic#"\t$V1, $V2, $V3, $V4, $M6", [(set tr1.op:$V1, (tr1.vt (operator (tr2.vt tr2.op:$V2), - (tr2.vt tr2.op:$V3), - (tr2.vt tr2.op:$V4), + (tr3.vt tr3.op:$V3), + (tr4.vt tr4.op:$V4), m6mask:$M6)))], m6or> { let M5 = type; } +class QuaternaryVRRdGeneric<string mnemonic, bits<16> opcode> + : InstVRRd<opcode, (outs VR128:$V1), + (ins VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5, imm32zx4:$M6), + mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>; + // Declare a pair of instructions, one which sets CC and one which doesn't. // The CC-setting form ends with "S" and sets the low bit of M6. // Also create aliases to make use of M6 operand optional in assembler. @@ -4041,13 +4348,15 @@ multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode, SDPatternOperator operator_cc, TypedReg tr1, TypedReg tr2, bits<4> type, bits<4> modifier = 0> { - def "" : QuaternaryVRRd<mnemonic, opcode, operator, tr1, tr2, type, + def "" : QuaternaryVRRd<mnemonic, opcode, operator, + tr1, tr2, tr2, tr2, type, imm32zx4even, !and (modifier, 14)>; def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4", (!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2, tr2.op:$V3, tr2.op:$V4, 0)>; let Defs = [CC] in - def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc, tr1, tr2, type, + def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc, + tr1, tr2, tr2, tr2, type, imm32zx4even, !add (!and (modifier, 14), 1)>; def : InstAlias<mnemonic#"s\t$V1, $V2, $V3, $V4", (!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2, @@ -4055,10 +4364,7 @@ multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode, } multiclass QuaternaryOptVRRdSPairGeneric<string mnemonic, bits<16> opcode> { - def "" : InstVRRd<opcode, (outs VR128:$V1), - (ins VR128:$V2, VR128:$V3, VR128:$V4, - imm32zx4:$M5, imm32zx4:$M6), - mnemonic#"\t$V1, $V2, $V3, $V4, $M5, $M6", []>; + def "" : QuaternaryVRRdGeneric<mnemonic, opcode>; def : InstAlias<mnemonic#"\t$V1, $V2, $V3, $V4, $M5", (!cast<Instruction>(NAME) VR128:$V1, VR128:$V2, VR128:$V3, VR128:$V4, imm32zx4:$M5, 0)>; @@ -4366,10 +4672,10 @@ class RotateSelectRIEfPseudo<RegisterOperand cls1, RegisterOperand cls2> // Implements "$dst = $cc & (8 >> CC) ? $src1 : $src2", where CC is // the value of the PSW's 2-bit condition code field. 
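To make the masking rule in the SelectWrapper comment above concrete, here is a small C++ sketch (illustrative only, not taken from the patch) of how the 4-bit mask operand picks between the two sources once the PSW condition code is known:
// Sketch only: "selectOnCCMask" is an invented name. CCMask corresponds to the
// 4-bit $cc operand of the pseudo, and CC to the PSW condition code (0-3).
static int selectOnCCMask(unsigned CCMask, unsigned CC, int Src1, int Src2) {
  return (CCMask & (8u >> CC)) != 0 ? Src1 : Src2;
}
// Example: a mask of 8 picks Src1 only when CC is 0 (the "equal" result of an
// integer compare); a mask of 15 picks Src1 for every condition code.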
-class SelectWrapper<RegisterOperand cls> +class SelectWrapper<ValueType vt, RegisterOperand cls> : Pseudo<(outs cls:$dst), (ins cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc), - [(set cls:$dst, (z_select_ccmask cls:$src1, cls:$src2, + [(set (vt cls:$dst), (z_select_ccmask cls:$src1, cls:$src2, imm32zx4:$valid, imm32zx4:$cc))]> { let usesCustomInserter = 1; // Although the instructions used by these nodes do not in themselves diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp index 66a5ff12be464..4533f4fdf21ae 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.cpp +++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp @@ -869,6 +869,37 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB, return; } + // Move 128-bit floating-point values between VR128 and FP128. + if (SystemZ::VR128BitRegClass.contains(DestReg) && + SystemZ::FP128BitRegClass.contains(SrcReg)) { + unsigned SrcRegHi = + RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_h64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + unsigned SrcRegLo = + RI.getMatchingSuperReg(RI.getSubReg(SrcReg, SystemZ::subreg_l64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + + BuildMI(MBB, MBBI, DL, get(SystemZ::VMRHG), DestReg) + .addReg(SrcRegHi, getKillRegState(KillSrc)) + .addReg(SrcRegLo, getKillRegState(KillSrc)); + return; + } + if (SystemZ::FP128BitRegClass.contains(DestReg) && + SystemZ::VR128BitRegClass.contains(SrcReg)) { + unsigned DestRegHi = + RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_h64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + unsigned DestRegLo = + RI.getMatchingSuperReg(RI.getSubReg(DestReg, SystemZ::subreg_l64), + SystemZ::subreg_r64, &SystemZ::VR128BitRegClass); + + if (DestRegHi != SrcReg) + copyPhysReg(MBB, MBBI, DL, DestRegHi, SrcReg, false); + BuildMI(MBB, MBBI, DL, get(SystemZ::VREPG), DestRegLo) + .addReg(SrcReg, getKillRegState(KillSrc)).addImm(1); + return; + } + // Everything else needs only one instruction. 
unsigned Opcode; if (SystemZ::GR64BitRegClass.contains(DestReg, SrcReg)) @@ -1434,6 +1465,7 @@ SystemZII::Branch SystemZInstrInfo::getBranchInfo(const MachineInstr &MI) const { switch (MI.getOpcode()) { case SystemZ::BR: + case SystemZ::BI: case SystemZ::J: case SystemZ::JG: return SystemZII::Branch(SystemZII::BranchNormal, SystemZ::CCMASK_ANY, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 4569be7602e45..f64c0d15ef83b 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -48,6 +48,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BC : CondBranchRX<"b#", 0x47>; def BCR : CondBranchRR<"b#r", 0x07>; + def BIC : CondBranchRXY<"bi#", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } } @@ -58,6 +60,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BCAsm : AsmCondBranchRX<"bc", 0x47>; def BCRAsm : AsmCondBranchRR<"bcr", 0x07>; + def BICAsm : AsmCondBranchRXY<"bic", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } // Define AsmParser extended mnemonics for each general condition-code mask @@ -69,6 +73,8 @@ let isBranch = 1, isTerminator = 1, Uses = [CC] in { let isIndirectBranch = 1 in { def BAsm#V : FixedCondBranchRX <CV<V>, "b#", 0x47>; def BRAsm#V : FixedCondBranchRR <CV<V>, "b#r", 0x07>; + def BIAsm#V : FixedCondBranchRXY<CV<V>, "bi#", 0xe347>, + Requires<[FeatureMiscellaneousExtensions2]>; } } } @@ -81,6 +87,8 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1 in { let isIndirectBranch = 1 in { def B : FixedCondBranchRX<CondAlways, "b", 0x47>; def BR : FixedCondBranchRR<CondAlways, "br", 0x07, brind>; + def BI : FixedCondBranchRXY<CondAlways, "bi", 0xe347, brind>, + Requires<[FeatureMiscellaneousExtensions2]>; } } @@ -316,9 +324,9 @@ let isReturn = 1, isTerminator = 1, hasCtrlDep = 1 in { // Select instructions //===----------------------------------------------------------------------===// -def Select32Mux : SelectWrapper<GRX32>, Requires<[FeatureHighWord]>; -def Select32 : SelectWrapper<GR32>; -def Select64 : SelectWrapper<GR64>; +def Select32Mux : SelectWrapper<i32, GRX32>, Requires<[FeatureHighWord]>; +def Select32 : SelectWrapper<i32, GR32>; +def Select64 : SelectWrapper<i64, GR64>; // We don't define 32-bit Mux stores if we don't have STOCFH, because the // low-only STOC should then always be used if possible. @@ -921,6 +929,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { // Addition of memory. defm AH : BinaryRXPair<"ah", 0x4A, 0xE37A, add, GR32, asextloadi16, 2>; defm A : BinaryRXPair<"a", 0x5A, 0xE35A, add, GR32, load, 4>; + def AGH : BinaryRXY<"agh", 0xE338, add, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def AGF : BinaryRXY<"agf", 0xE318, add, GR64, asextloadi32, 4>; def AG : BinaryRXY<"ag", 0xE308, add, GR64, load, 8>; @@ -1006,6 +1016,8 @@ let Defs = [CC], CCValues = 0xF, CompareZeroCCMask = 0x8 in { // Subtraction of memory. 
defm SH : BinaryRXPair<"sh", 0x4B, 0xE37B, sub, GR32, asextloadi16, 2>; defm S : BinaryRXPair<"s", 0x5B, 0xE35B, sub, GR32, load, 4>; + def SGH : BinaryRXY<"sgh", 0xE339, sub, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def SGF : BinaryRXY<"sgf", 0xE319, sub, GR64, asextloadi32, 4>; def SG : BinaryRXY<"sg", 0xE309, sub, GR64, load, 8>; } @@ -1207,6 +1219,15 @@ defm : RMWIByte<xor, bdaddr20pair, XIY>; // Multiplication //===----------------------------------------------------------------------===// +// Multiplication of a register, setting the condition code. We prefer these +// over MS(G)R if available, even though we cannot use the condition code, +// since they are three-operand instructions. +let Predicates = [FeatureMiscellaneousExtensions2], + Defs = [CC], isCommutable = 1 in { + def MSRKC : BinaryRRFa<"msrkc", 0xB9FD, mul, GR32, GR32, GR32>; + def MSGRKC : BinaryRRFa<"msgrkc", 0xB9ED, mul, GR64, GR64, GR64>; +} + // Multiplication of a register. let isCommutable = 1 in { def MSR : BinaryRRE<"msr", 0xB252, mul, GR32, GR32>; @@ -1226,21 +1247,37 @@ def MSGFI : BinaryRIL<"msgfi", 0xC20, mul, GR64, imm64sx32>; // Multiplication of memory. defm MH : BinaryRXPair<"mh", 0x4C, 0xE37C, mul, GR32, asextloadi16, 2>; defm MS : BinaryRXPair<"ms", 0x71, 0xE351, mul, GR32, load, 4>; +def MGH : BinaryRXY<"mgh", 0xE33C, mul, GR64, asextloadi16, 2>, + Requires<[FeatureMiscellaneousExtensions2]>; def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>; def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>; +// Multiplication of memory, setting the condition code. +let Predicates = [FeatureMiscellaneousExtensions2], Defs = [CC] in { + def MSC : BinaryRXY<"msc", 0xE353, null_frag, GR32, load, 4>; + def MSGC : BinaryRXY<"msgc", 0xE383, null_frag, GR64, load, 8>; +} + // Multiplication of a register, producing two results. -def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MGRK : BinaryRRFa<"mgrk", 0xB9EC, null_frag, GR128, GR64, GR64>, + Requires<[FeatureMiscellaneousExtensions2]>; def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>; def MLGR : BinaryRRE<"mlgr", 0xB986, null_frag, GR128, GR64>; +def : Pat<(z_smul_lohi GR64:$src1, GR64:$src2), + (MGRK GR64:$src1, GR64:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2), (MLGR (AEXT128 GR64:$src1), GR64:$src2)>; // Multiplication of memory, producing two results. 
def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>; def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>; +def MG : BinaryRXY<"mg", 0xE384, null_frag, GR128, load, 8>, + Requires<[FeatureMiscellaneousExtensions2]>; def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>; def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>; +def : Pat<(z_smul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), + (MG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), (MLG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; @@ -1765,8 +1802,29 @@ let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { GR128, GR128, GR128>; def PCC : SideEffectInherentRRE<"pcc", 0xB92C>; } + let Predicates = [FeatureMessageSecurityAssist5] in - def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; + def PPNO : SideEffectBinaryMemMemRRE<"ppno", 0xB93C, GR128, GR128>; + let Predicates = [FeatureMessageSecurityAssist7], isAsmParserOnly = 1 in + def PRNO : SideEffectBinaryMemMemRRE<"prno", 0xB93C, GR128, GR128>; + + let Predicates = [FeatureMessageSecurityAssist8] in + def KMA : SideEffectTernaryMemMemMemRRFb<"kma", 0xB929, + GR128, GR128, GR128>; +} + +//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureGuardedStorage] in { + def LGG : UnaryRXY<"lgg", 0xE34C, null_frag, GR64, 8>; + def LLGFSG : UnaryRXY<"llgfsg", 0xE348, null_frag, GR64, 4>; + + let mayLoad = 1 in + def LGSC : SideEffectBinaryRXY<"lgsc", 0xE34D, GR64>; + let mayStore = 1 in + def STGSC : SideEffectBinaryRXY<"stgsc", 0xE349, GR64>; } //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZInstrSystem.td b/lib/Target/SystemZ/SystemZInstrSystem.td index a9803c2d83e9f..0112ebf1eb10c 100644 --- a/lib/Target/SystemZ/SystemZInstrSystem.td +++ b/lib/Target/SystemZ/SystemZInstrSystem.td @@ -126,6 +126,10 @@ let hasSideEffects = 1, Defs = [CC] in let Predicates = [FeatureResetReferenceBitsMultiple], hasSideEffects = 1 in def RRBM : UnaryRRE<"rrbm", 0xB9AE, null_frag, GR64, GR64>; +// Insert reference bits multiple. +let Predicates = [FeatureInsertReferenceBitsMultiple], hasSideEffects = 1 in + def IRBM : UnaryRRE<"irbm", 0xB9AC, null_frag, GR64, GR64>; + // Perform frame management function. let hasSideEffects = 1 in def PFMF : SideEffectBinaryMemRRE<"pfmf", 0xB9AF, GR32, GR64>; diff --git a/lib/Target/SystemZ/SystemZInstrVector.td b/lib/Target/SystemZ/SystemZInstrVector.td index 0158fe6aec08d..c9a02d9c80821 100644 --- a/lib/Target/SystemZ/SystemZInstrVector.td +++ b/lib/Target/SystemZ/SystemZInstrVector.td @@ -14,7 +14,7 @@ let Predicates = [FeatureVector] in { // Register move. def VLR : UnaryVRRa<"vlr", 0xE756, null_frag, v128any, v128any>; - def VLR32 : UnaryAliasVRR<null_frag, v32eb, v32eb>; + def VLR32 : UnaryAliasVRR<null_frag, v32sb, v32sb>; def VLR64 : UnaryAliasVRR<null_frag, v64db, v64db>; // Load GR from VR element. @@ -141,7 +141,7 @@ let Predicates = [FeatureVector] in { // LEY and LDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - def VL32 : UnaryAliasVRX<load, v32eb, bdxaddr12pair>; + def VL32 : UnaryAliasVRX<load, v32sb, bdxaddr12pair>; def VL64 : UnaryAliasVRX<load, v64db, bdxaddr12pair>; // Load logical element and zero. 
@@ -154,6 +154,11 @@ let Predicates = [FeatureVector] in { (VLLEZF bdxaddr12only:$addr)>; def : Pat<(v2f64 (z_vllezf64 bdxaddr12only:$addr)), (VLLEZG bdxaddr12only:$addr)>; + let Predicates = [FeatureVectorEnhancements1] in { + def VLLEZLF : UnaryVRX<"vllezlf", 0xE704, z_vllezli32, v128f, 4, 6>; + def : Pat<(v4f32 (z_vllezlf32 bdxaddr12only:$addr)), + (VLLEZLF bdxaddr12only:$addr)>; + } // Load element. def VLEB : TernaryVRX<"vleb", 0xE700, z_vlei8, v128b, v128b, 1, imm32zx4>; @@ -170,6 +175,13 @@ let Predicates = [FeatureVector] in { def VGEG : TernaryVRV<"vgeg", 0xE712, 8, imm32zx1>; } +let Predicates = [FeatureVectorPackedDecimal] in { + // Load rightmost with length. The number of loaded bytes is only known + // at run time. + def VLRL : BinaryVSI<"vlrl", 0xE635, int_s390_vlrl, 0>; + def VLRLR : BinaryVRSd<"vlrlr", 0xE637, int_s390_vlrl, 0>; +} + // Use replicating loads if we're inserting a single element into an // undefined vector. This avoids a false dependency on the previous // register contents. @@ -219,7 +231,7 @@ let Predicates = [FeatureVector] in { // STEY and STDY offer full 20-bit displacement fields. It's often better // to use those instructions rather than force a 20-bit displacement // into a GPR temporary. - def VST32 : StoreAliasVRX<store, v32eb, bdxaddr12pair>; + def VST32 : StoreAliasVRX<store, v32sb, bdxaddr12pair>; def VST64 : StoreAliasVRX<store, v64db, bdxaddr12pair>; // Scatter element. @@ -227,6 +239,13 @@ let Predicates = [FeatureVector] in { def VSCEG : StoreBinaryVRV<"vsceg", 0xE71A, 8, imm32zx1>; } +let Predicates = [FeatureVectorPackedDecimal] in { + // Store rightmost with length. The number of stored bytes is only known + // at run time. + def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, int_s390_vstrl, 0>; + def VSTRLR : StoreLengthVRSd<"vstrlr", 0xE63F, int_s390_vstrl, 0>; +} + //===----------------------------------------------------------------------===// // Selects and permutes //===----------------------------------------------------------------------===// @@ -256,6 +275,10 @@ let Predicates = [FeatureVector] in { // Permute doubleword immediate. def VPDI : TernaryVRRc<"vpdi", 0xE784, z_permute_dwords, v128g, v128g>; + // Bit Permute. + let Predicates = [FeatureVectorEnhancements1] in + def VBPERM : BinaryVRRc<"vbperm", 0xE785, int_s390_vbperm, v128g, v128b>; + // Replicate. def VREP: BinaryVRIcGeneric<"vrep", 0xE74D>; def VREPB : BinaryVRIc<"vrepb", 0xE74D, z_splat, v128b, v128b, 0>; @@ -424,6 +447,10 @@ let Predicates = [FeatureVector] in { def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>; def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>; + // Not exclusive or. + let Predicates = [FeatureVectorEnhancements1] in + def VNX : BinaryVRRc<"vnx", 0xE76C, null_frag, v128any, v128any>; + // Exclusive or. def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>; @@ -567,6 +594,17 @@ let Predicates = [FeatureVector] in { def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>; def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>; + // Multiply sum logical. + let Predicates = [FeatureVectorEnhancements1] in { + def VMSL : QuaternaryVRRdGeneric<"vmsl", 0xE7B8>; + def VMSLG : QuaternaryVRRd<"vmslg", 0xE7B8, int_s390_vmslg, + v128q, v128g, v128g, v128q, 3>; + } + + // Nand. + let Predicates = [FeatureVectorEnhancements1] in + def VNN : BinaryVRRc<"vnn", 0xE76E, null_frag, v128any, v128any>; + // Nor. 
def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>; def : InstAlias<"vnot\t$V1, $V2", (VNO VR128:$V1, VR128:$V2, VR128:$V2), 0>; @@ -574,9 +612,19 @@ let Predicates = [FeatureVector] in { // Or. def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>; + // Or with complement. + let Predicates = [FeatureVectorEnhancements1] in + def VOC : BinaryVRRc<"voc", 0xE76F, null_frag, v128any, v128any>; + // Population count. def VPOPCT : UnaryVRRaGeneric<"vpopct", 0xE750>; def : Pat<(v16i8 (z_popcnt VR128:$x)), (VPOPCT VR128:$x, 0)>; + let Predicates = [FeatureVectorEnhancements1] in { + def VPOPCTB : UnaryVRRa<"vpopctb", 0xE750, ctpop, v128b, v128b, 0>; + def VPOPCTH : UnaryVRRa<"vpopcth", 0xE750, ctpop, v128h, v128h, 1>; + def VPOPCTF : UnaryVRRa<"vpopctf", 0xE750, ctpop, v128f, v128f, 2>; + def VPOPCTG : UnaryVRRa<"vpopctg", 0xE750, ctpop, v128g, v128g, 3>; + } // Element rotate left logical (with vector shift amount). def VERLLV : BinaryVRRcGeneric<"verllv", 0xE773>; @@ -724,6 +772,14 @@ multiclass BitwiseVectorOps<ValueType type> { (VNO VR128:$x, VR128:$y)>; def : Pat<(type (z_vnot VR128:$x)), (VNO VR128:$x, VR128:$x)>; } + let Predicates = [FeatureVectorEnhancements1] in { + def : Pat<(type (z_vnot (xor VR128:$x, VR128:$y))), + (VNX VR128:$x, VR128:$y)>; + def : Pat<(type (z_vnot (and VR128:$x, VR128:$y))), + (VNN VR128:$x, VR128:$y)>; + def : Pat<(type (or VR128:$x, (z_vnot VR128:$y))), + (VOC VR128:$x, VR128:$y)>; + } } defm : BitwiseVectorOps<v16i8>; @@ -879,6 +935,11 @@ let Predicates = [FeatureVector] in { def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>; def VFADB : BinaryVRRc<"vfadb", 0xE7E3, fadd, v128db, v128db, 3, 0>; def WFADB : BinaryVRRc<"wfadb", 0xE7E3, fadd, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFASB : BinaryVRRc<"vfasb", 0xE7E3, fadd, v128sb, v128sb, 2, 0>; + def WFASB : BinaryVRRc<"wfasb", 0xE7E3, fadd, v32sb, v32sb, 2, 8>; + def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, fadd, v128xb, v128xb, 4, 8>; + } // Convert from fixed 64-bit. def VCDG : TernaryVRRaFloatGeneric<"vcdg", 0xE7C3>; @@ -910,6 +971,11 @@ let Predicates = [FeatureVector] in { def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>; def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, fdiv, v128db, v128db, 3, 0>; def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, fdiv, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, fdiv, v128sb, v128sb, 2, 0>; + def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, fdiv, v32sb, v32sb, 2, 8>; + def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, fdiv, v128xb, v128xb, 4, 8>; + } // Load FP integer. def VFI : TernaryVRRaFloatGeneric<"vfi", 0xE7C7>; @@ -917,66 +983,213 @@ let Predicates = [FeatureVector] in { def WFIDB : TernaryVRRa<"wfidb", 0xE7C7, null_frag, v64db, v64db, 3, 8>; defm : VectorRounding<VFIDB, v128db>; defm : VectorRounding<WFIDB, v64db>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFISB : TernaryVRRa<"vfisb", 0xE7C7, int_s390_vfisb, v128sb, v128sb, 2, 0>; + def WFISB : TernaryVRRa<"wfisb", 0xE7C7, null_frag, v32sb, v32sb, 2, 8>; + def WFIXB : TernaryVRRa<"wfixb", 0xE7C7, null_frag, v128xb, v128xb, 4, 8>; + defm : VectorRounding<VFISB, v128sb>; + defm : VectorRounding<WFISB, v32sb>; + defm : VectorRounding<WFIXB, v128xb>; + } // Load lengthened. 
def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>; - def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128eb, 2, 0>; - def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32eb, 2, 8>; + def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_vextend, v128db, v128sb, 2, 0>; + def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, fpextend, v64db, v32sb, 2, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + let isAsmParserOnly = 1 in { + def VFLL : UnaryVRRaFloatGeneric<"vfll", 0xE7C4>; + def VFLLS : UnaryVRRa<"vflls", 0xE7C4, null_frag, v128db, v128sb, 2, 0>; + def WFLLS : UnaryVRRa<"wflls", 0xE7C4, null_frag, v64db, v32sb, 2, 8>; + } + def WFLLD : UnaryVRRa<"wflld", 0xE7C4, fpextend, v128xb, v64db, 3, 8>; + def : Pat<(f128 (fpextend (f32 VR32:$src))), + (WFLLD (WLDEB VR32:$src))>; + } - // Load rounded, + // Load rounded. def VLED : TernaryVRRaFloatGeneric<"vled", 0xE7C5>; - def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128eb, v128db, 3, 0>; - def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32eb, v64db, 3, 8>; + def VLEDB : TernaryVRRa<"vledb", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; + def WLEDB : TernaryVRRa<"wledb", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; def : Pat<(v4f32 (z_vround (v2f64 VR128:$src))), (VLEDB VR128:$src, 0, 0)>; - def : FPConversion<WLEDB, fpround, v32eb, v64db, 0, 0>; + def : FPConversion<WLEDB, fpround, v32sb, v64db, 0, 0>; + let Predicates = [FeatureVectorEnhancements1] in { + let isAsmParserOnly = 1 in { + def VFLR : TernaryVRRaFloatGeneric<"vflr", 0xE7C5>; + def VFLRD : TernaryVRRa<"vflrd", 0xE7C5, null_frag, v128sb, v128db, 3, 0>; + def WFLRD : TernaryVRRa<"wflrd", 0xE7C5, null_frag, v32sb, v64db, 3, 8>; + } + def WFLRX : TernaryVRRa<"wflrx", 0xE7C5, null_frag, v64db, v128xb, 4, 8>; + def : FPConversion<WFLRX, fpround, v64db, v128xb, 0, 0>; + def : Pat<(f32 (fpround (f128 VR128:$src))), + (WLEDB (WFLRX VR128:$src, 0, 3), 0, 0)>; + } + + // Maximum. + multiclass VectorMax<Instruction insn, TypedReg tr> { + def : FPMinMax<insn, fmaxnum, tr, 4>; + def : FPMinMax<insn, fmaxnan, tr, 1>; + } + let Predicates = [FeatureVectorEnhancements1] in { + def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>; + def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb, + v128db, v128db, 3, 0>; + def WFMAXDB : TernaryVRRcFloat<"wfmaxdb", 0xE7EF, null_frag, + v64db, v64db, 3, 8>; + def VFMAXSB : TernaryVRRcFloat<"vfmaxsb", 0xE7EF, int_s390_vfmaxsb, + v128sb, v128sb, 2, 0>; + def WFMAXSB : TernaryVRRcFloat<"wfmaxsb", 0xE7EF, null_frag, + v32sb, v32sb, 2, 8>; + def WFMAXXB : TernaryVRRcFloat<"wfmaxxb", 0xE7EF, null_frag, + v128xb, v128xb, 4, 8>; + defm : VectorMax<VFMAXDB, v128db>; + defm : VectorMax<WFMAXDB, v64db>; + defm : VectorMax<VFMAXSB, v128sb>; + defm : VectorMax<WFMAXSB, v32sb>; + defm : VectorMax<WFMAXXB, v128xb>; + } + + // Minimum. 
+ multiclass VectorMin<Instruction insn, TypedReg tr> { + def : FPMinMax<insn, fminnum, tr, 4>; + def : FPMinMax<insn, fminnan, tr, 1>; + } + let Predicates = [FeatureVectorEnhancements1] in { + def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>; + def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb, + v128db, v128db, 3, 0>; + def WFMINDB : TernaryVRRcFloat<"wfmindb", 0xE7EE, null_frag, + v64db, v64db, 3, 8>; + def VFMINSB : TernaryVRRcFloat<"vfminsb", 0xE7EE, int_s390_vfminsb, + v128sb, v128sb, 2, 0>; + def WFMINSB : TernaryVRRcFloat<"wfminsb", 0xE7EE, null_frag, + v32sb, v32sb, 2, 8>; + def WFMINXB : TernaryVRRcFloat<"wfminxb", 0xE7EE, null_frag, + v128xb, v128xb, 4, 8>; + defm : VectorMin<VFMINDB, v128db>; + defm : VectorMin<WFMINDB, v64db>; + defm : VectorMin<VFMINSB, v128sb>; + defm : VectorMin<WFMINSB, v32sb>; + defm : VectorMin<WFMINXB, v128xb>; + } // Multiply. def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>; def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, fmul, v128db, v128db, 3, 0>; def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, fmul, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, fmul, v128sb, v128sb, 2, 0>; + def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, fmul, v32sb, v32sb, 2, 8>; + def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, fmul, v128xb, v128xb, 4, 8>; + } // Multiply and add. def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>; def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, fma, v128db, v128db, 0, 3>; def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, fma, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, fma, v128sb, v128sb, 0, 2>; + def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, fma, v32sb, v32sb, 8, 2>; + def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, fma, v128xb, v128xb, 8, 4>; + } // Multiply and subtract. def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>; def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, fms, v128db, v128db, 0, 3>; def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, fms, v64db, v64db, 8, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, fms, v128sb, v128sb, 0, 2>; + def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, fms, v32sb, v32sb, 8, 2>; + def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, fms, v128xb, v128xb, 8, 4>; + } + + // Negative multiply and add. + let Predicates = [FeatureVectorEnhancements1] in { + def VFNMA : TernaryVRReFloatGeneric<"vfnma", 0xE79F>; + def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, fnma, v128db, v128db, 0, 3>; + def WFNMADB : TernaryVRRe<"wfnmadb", 0xE79F, fnma, v64db, v64db, 8, 3>; + def VFNMASB : TernaryVRRe<"vfnmasb", 0xE79F, fnma, v128sb, v128sb, 0, 2>; + def WFNMASB : TernaryVRRe<"wfnmasb", 0xE79F, fnma, v32sb, v32sb, 8, 2>; + def WFNMAXB : TernaryVRRe<"wfnmaxb", 0xE79F, fnma, v128xb, v128xb, 8, 4>; + } + + // Negative multiply and subtract. + let Predicates = [FeatureVectorEnhancements1] in { + def VFNMS : TernaryVRReFloatGeneric<"vfnms", 0xE79E>; + def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, fnms, v128db, v128db, 0, 3>; + def WFNMSDB : TernaryVRRe<"wfnmsdb", 0xE79E, fnms, v64db, v64db, 8, 3>; + def VFNMSSB : TernaryVRRe<"vfnmssb", 0xE79E, fnms, v128sb, v128sb, 0, 2>; + def WFNMSSB : TernaryVRRe<"wfnmssb", 0xE79E, fnms, v32sb, v32sb, 8, 2>; + def WFNMSXB : TernaryVRRe<"wfnmsxb", 0xE79E, fnms, v128xb, v128xb, 8, 4>; + } // Perform sign operation. 
def VFPSO : BinaryVRRaFloatGeneric<"vfpso", 0xE7CC>; def VFPSODB : BinaryVRRa<"vfpsodb", 0xE7CC, null_frag, v128db, v128db, 3, 0>; def WFPSODB : BinaryVRRa<"wfpsodb", 0xE7CC, null_frag, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFPSOSB : BinaryVRRa<"vfpsosb", 0xE7CC, null_frag, v128sb, v128sb, 2, 0>; + def WFPSOSB : BinaryVRRa<"wfpsosb", 0xE7CC, null_frag, v32sb, v32sb, 2, 8>; + def WFPSOXB : BinaryVRRa<"wfpsoxb", 0xE7CC, null_frag, v128xb, v128xb, 4, 8>; + } // Load complement. def VFLCDB : UnaryVRRa<"vflcdb", 0xE7CC, fneg, v128db, v128db, 3, 0, 0>; def WFLCDB : UnaryVRRa<"wflcdb", 0xE7CC, fneg, v64db, v64db, 3, 8, 0>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLCSB : UnaryVRRa<"vflcsb", 0xE7CC, fneg, v128sb, v128sb, 2, 0, 0>; + def WFLCSB : UnaryVRRa<"wflcsb", 0xE7CC, fneg, v32sb, v32sb, 2, 8, 0>; + def WFLCXB : UnaryVRRa<"wflcxb", 0xE7CC, fneg, v128xb, v128xb, 4, 8, 0>; + } // Load negative. def VFLNDB : UnaryVRRa<"vflndb", 0xE7CC, fnabs, v128db, v128db, 3, 0, 1>; def WFLNDB : UnaryVRRa<"wflndb", 0xE7CC, fnabs, v64db, v64db, 3, 8, 1>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLNSB : UnaryVRRa<"vflnsb", 0xE7CC, fnabs, v128sb, v128sb, 2, 0, 1>; + def WFLNSB : UnaryVRRa<"wflnsb", 0xE7CC, fnabs, v32sb, v32sb, 2, 8, 1>; + def WFLNXB : UnaryVRRa<"wflnxb", 0xE7CC, fnabs, v128xb, v128xb, 4, 8, 1>; + } // Load positive. def VFLPDB : UnaryVRRa<"vflpdb", 0xE7CC, fabs, v128db, v128db, 3, 0, 2>; def WFLPDB : UnaryVRRa<"wflpdb", 0xE7CC, fabs, v64db, v64db, 3, 8, 2>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFLPSB : UnaryVRRa<"vflpsb", 0xE7CC, fabs, v128sb, v128sb, 2, 0, 2>; + def WFLPSB : UnaryVRRa<"wflpsb", 0xE7CC, fabs, v32sb, v32sb, 2, 8, 2>; + def WFLPXB : UnaryVRRa<"wflpxb", 0xE7CC, fabs, v128xb, v128xb, 4, 8, 2>; + } // Square root. def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>; def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, fsqrt, v128db, v128db, 3, 0>; def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, fsqrt, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, fsqrt, v128sb, v128sb, 2, 0>; + def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, fsqrt, v32sb, v32sb, 2, 8>; + def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, fsqrt, v128xb, v128xb, 4, 8>; + } // Subtract. def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>; def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, fsub, v128db, v128db, 3, 0>; def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, fsub, v64db, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, fsub, v128sb, v128sb, 2, 0>; + def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, fsub, v32sb, v32sb, 2, 8>; + def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, fsub, v128xb, v128xb, 4, 8>; + } // Test data class immediate. 
let Defs = [CC] in { def VFTCI : BinaryVRIeFloatGeneric<"vftci", 0xE74A>; def VFTCIDB : BinaryVRIe<"vftcidb", 0xE74A, z_vftci, v128g, v128db, 3, 0>; def WFTCIDB : BinaryVRIe<"wftcidb", 0xE74A, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + def VFTCISB : BinaryVRIe<"vftcisb", 0xE74A, z_vftci, v128f, v128sb, 2, 0>; + def WFTCISB : BinaryVRIe<"wftcisb", 0xE74A, null_frag, v32f, v32sb, 2, 8>; + def WFTCIXB : BinaryVRIe<"wftcixb", 0xE74A, null_frag, v128q, v128xb, 4, 8>; + } } } @@ -989,12 +1202,20 @@ let Predicates = [FeatureVector] in { let Defs = [CC] in { def WFC : CompareVRRaFloatGeneric<"wfc", 0xE7CB>; def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_fcmp, v64db, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def WFCSB : CompareVRRa<"wfcsb", 0xE7CB, z_fcmp, v32sb, 2>; + def WFCXB : CompareVRRa<"wfcxb", 0xE7CB, z_fcmp, v128xb, 4>; + } } // Compare and signal scalar. let Defs = [CC] in { def WFK : CompareVRRaFloatGeneric<"wfk", 0xE7CA>; def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, null_frag, v64db, 3>; + let Predicates = [FeatureVectorEnhancements1] in { + def WFKSB : CompareVRRa<"wfksb", 0xE7CA, null_frag, v32sb, 2>; + def WFKXB : CompareVRRa<"wfkxb", 0xE7CA, null_frag, v128xb, 4>; + } } // Compare equal. @@ -1003,6 +1224,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCEDB : BinaryVRRcSPair<"wfcedb", 0xE7E8, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCESB : BinaryVRRcSPair<"vfcesb", 0xE7E8, z_vfcmpe, z_vfcmpes, + v128f, v128sb, 2, 0>; + defm WFCESB : BinaryVRRcSPair<"wfcesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCEXB : BinaryVRRcSPair<"wfcexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal equal. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKEDB : BinaryVRRcSPair<"vfkedb", 0xE7E8, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKEDB : BinaryVRRcSPair<"wfkedb", 0xE7E8, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKESB : BinaryVRRcSPair<"vfkesb", 0xE7E8, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKESB : BinaryVRRcSPair<"wfkesb", 0xE7E8, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKEXB : BinaryVRRcSPair<"wfkexb", 0xE7E8, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } // Compare high. def VFCH : BinaryVRRcSPairFloatGeneric<"vfch", 0xE7EB>; @@ -1010,6 +1253,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCHDB : BinaryVRRcSPair<"wfchdb", 0xE7EB, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHSB : BinaryVRRcSPair<"vfchsb", 0xE7EB, z_vfcmph, z_vfcmphs, + v128f, v128sb, 2, 0>; + defm WFCHSB : BinaryVRRcSPair<"wfchsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHXB : BinaryVRRcSPair<"wfchxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal high. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKHDB : BinaryVRRcSPair<"vfkhdb", 0xE7EB, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKHDB : BinaryVRRcSPair<"wfkhdb", 0xE7EB, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKHSB : BinaryVRRcSPair<"vfkhsb", 0xE7EB, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKHSB : BinaryVRRcSPair<"wfkhsb", 0xE7EB, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKHXB : BinaryVRRcSPair<"wfkhxb", 0xE7EB, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } // Compare high or equal. 
def VFCHE : BinaryVRRcSPairFloatGeneric<"vfche", 0xE7EA>; @@ -1017,6 +1282,28 @@ let Predicates = [FeatureVector] in { v128g, v128db, 3, 0>; defm WFCHEDB : BinaryVRRcSPair<"wfchedb", 0xE7EA, null_frag, null_frag, v64g, v64db, 3, 8>; + let Predicates = [FeatureVectorEnhancements1] in { + defm VFCHESB : BinaryVRRcSPair<"vfchesb", 0xE7EA, z_vfcmphe, z_vfcmphes, + v128f, v128sb, 2, 0>; + defm WFCHESB : BinaryVRRcSPair<"wfchesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 8>; + defm WFCHEXB : BinaryVRRcSPair<"wfchexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 8>; + } + + // Compare and signal high or equal. + let Predicates = [FeatureVectorEnhancements1] in { + defm VFKHEDB : BinaryVRRcSPair<"vfkhedb", 0xE7EA, null_frag, null_frag, + v128g, v128db, 3, 4>; + defm WFKHEDB : BinaryVRRcSPair<"wfkhedb", 0xE7EA, null_frag, null_frag, + v64g, v64db, 3, 12>; + defm VFKHESB : BinaryVRRcSPair<"vfkhesb", 0xE7EA, null_frag, null_frag, + v128f, v128sb, 2, 4>; + defm WFKHESB : BinaryVRRcSPair<"wfkhesb", 0xE7EA, null_frag, null_frag, + v32f, v32sb, 2, 12>; + defm WFKHEXB : BinaryVRRcSPair<"wfkhexb", 0xE7EA, null_frag, null_frag, + v128q, v128xb, 4, 12>; + } } //===----------------------------------------------------------------------===// @@ -1028,36 +1315,49 @@ def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (f128 VR128:$src))), (v16i8 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (f128 VR128:$src))), (v8i16 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (f128 VR128:$src))), (v4i32 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (f128 VR128:$src))), (v2i64 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (f128 VR128:$src))), (v4f32 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4i32 
VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (f128 VR128:$src))), (v2f64 VR128:$src)>; + +def : Pat<(f128 (bitconvert (v16i8 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v8i16 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4i32 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2i64 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v4f32 VR128:$src))), (f128 VR128:$src)>; +def : Pat<(f128 (bitconvert (v2f64 VR128:$src))), (f128 VR128:$src)>; //===----------------------------------------------------------------------===// // Replicating scalars @@ -1134,6 +1434,20 @@ let AddedComplexity = 4 in { } //===----------------------------------------------------------------------===// +// Support for 128-bit floating-point values in vector registers +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorEnhancements1] in { + def : Pat<(f128 (load bdxaddr12only:$addr)), + (VL bdxaddr12only:$addr)>; + def : Pat<(store (f128 VR128:$src), bdxaddr12only:$addr), + (VST VR128:$src, bdxaddr12only:$addr)>; + + def : Pat<(f128 fpimm0), (VZERO)>; + def : Pat<(f128 fpimmneg0), (WFLNXB (VZERO))>; +} + +//===----------------------------------------------------------------------===// // String instructions //===----------------------------------------------------------------------===// @@ -1202,3 +1516,37 @@ let Predicates = [FeatureVector] in { defm VSTRCZF : QuaternaryOptVRRdSPair<"vstrczf", 0xE78A, int_s390_vstrczf, z_vstrcz_cc, v128f, v128f, 2, 2>; } + +//===----------------------------------------------------------------------===// +// Packed-decimal instructions +//===----------------------------------------------------------------------===// + +let Predicates = [FeatureVectorPackedDecimal] in { + def VLIP : BinaryVRIh<"vlip", 0xE649>; + + def VPKZ : BinaryVSI<"vpkz", 0xE634, null_frag, 0>; + def VUPKZ : StoreLengthVSI<"vupkz", 0xE63C, null_frag, 0>; + + let Defs = [CC] in { + def VCVB : BinaryVRRi<"vcvb", 0xE650, GR32>; + def VCVBG : BinaryVRRi<"vcvbg", 0xE652, GR64>; + def VCVD : TernaryVRIi<"vcvd", 0xE658, GR32>; + def VCVDG : TernaryVRIi<"vcvdg", 0xE65A, GR64>; + + def VAP : QuaternaryVRIf<"vap", 0xE671>; + def VSP : QuaternaryVRIf<"vsp", 0xE673>; + + def VMP : QuaternaryVRIf<"vmp", 0xE678>; + def VMSP : QuaternaryVRIf<"vmsp", 0xE679>; + + def VDP : QuaternaryVRIf<"vdp", 0xE67A>; + def VRP : QuaternaryVRIf<"vrp", 0xE67B>; + def VSDP : QuaternaryVRIf<"vsdp", 0xE67E>; + + def VSRP : QuaternaryVRIg<"vsrp", 0xE659>; + def VPSOP : QuaternaryVRIg<"vpsop", 0xE65B>; + + def VTP : TestVRRg<"vtp", 0xE65F>; + def VCP : CompareVRRh<"vcp", 0xE677>; + } +} diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index 9c6d5819f8a7e..759a8bb0ce14d 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -181,6 +181,7 @@ def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask, [SDNPInGlue]>; def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; +def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>; def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>; def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>; def 
z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>; @@ -549,6 +550,12 @@ def z_fma : PatFrag<(ops node:$src1, node:$src2, node:$src3), def z_fms : PatFrag<(ops node:$src1, node:$src2, node:$src3), (fma node:$src2, node:$src3, (fneg node:$src1))>; +// Negative fused multiply-add and multiply-subtract. +def fnma : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (fneg (fma node:$src1, node:$src2, node:$src3))>; +def fnms : PatFrag<(ops node:$src1, node:$src2, node:$src3), + (fneg (fms node:$src1, node:$src2, node:$src3))>; + // Floating-point negative absolute. def fnabs : PatFrag<(ops node:$ptr), (fneg (fabs node:$ptr))>; @@ -624,6 +631,19 @@ def z_vllezf64 : PatFrag<(ops node:$addr), (scalar_to_vector (f64 (load node:$addr))), (z_vzero))>; +// Similarly for the high element of a zeroed vector. +def z_vllezli32 : z_vllez<i32, load, 0>; +def z_vllezlf32 : PatFrag<(ops node:$addr), + (bitconvert + (z_merge_high + (v2i64 + (bitconvert + (z_merge_high + (v4f32 (scalar_to_vector + (f32 (load node:$addr)))), + (v4f32 (z_vzero))))), + (v2i64 (z_vzero))))>; + // Store one element of a vector. class z_vste<ValueType scalartype, SDPatternOperator store> : PatFrag<(ops node:$vec, node:$addr, node:$index), diff --git a/lib/Target/SystemZ/SystemZPatterns.td b/lib/Target/SystemZ/SystemZPatterns.td index 16a7ed784d709..152521fb66a8d 100644 --- a/lib/Target/SystemZ/SystemZPatterns.td +++ b/lib/Target/SystemZ/SystemZPatterns.td @@ -167,3 +167,10 @@ class FPConversion<Instruction insn, SDPatternOperator operator, TypedReg tr1, TypedReg tr2, bits<3> suppress, bits<4> mode> : Pat<(tr1.vt (operator (tr2.vt tr2.op:$vec))), (insn tr2.op:$vec, suppress, mode)>; + +// Use INSN to perform minimum/maximum operation OPERATOR on type TR. +// FUNCTION is the type of minimum/maximum function to perform. +class FPMinMax<Instruction insn, SDPatternOperator operator, TypedReg tr, + bits<4> function> + : Pat<(tr.vt (operator (tr.vt tr.op:$vec1), (tr.vt tr.op:$vec2))), + (insn tr.op:$vec1, tr.op:$vec2, function)>; diff --git a/lib/Target/SystemZ/SystemZProcessors.td b/lib/Target/SystemZ/SystemZProcessors.td index 1cdc0949ff4ae..0dca4582dc0d6 100644 --- a/lib/Target/SystemZ/SystemZProcessors.td +++ b/lib/Target/SystemZ/SystemZProcessors.td @@ -33,3 +33,6 @@ def : ProcessorModel<"zEC12", ZEC12Model, Arch10SupportedFeatures.List>; def : ProcessorModel<"arch11", Z13Model, Arch11SupportedFeatures.List>; def : ProcessorModel<"z13", Z13Model, Arch11SupportedFeatures.List>; +def : ProcessorModel<"arch12", Z14Model, Arch12SupportedFeatures.List>; +def : ProcessorModel<"z14", Z14Model, Arch12SupportedFeatures.List>; + diff --git a/lib/Target/SystemZ/SystemZRegisterInfo.td b/lib/Target/SystemZ/SystemZRegisterInfo.td index 36809ea81dc1c..52ba1a584017a 100644 --- a/lib/Target/SystemZ/SystemZRegisterInfo.td +++ b/lib/Target/SystemZ/SystemZRegisterInfo.td @@ -260,10 +260,10 @@ defm VF128 : SystemZRegClass<"VF128", // All vector registers. defm VR128 : SystemZRegClass<"VR128", - [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], 128, - (add (sequence "V%u", 0, 7), - (sequence "V%u", 16, 31), - (sequence "V%u", 8, 15))>; + [f128, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], + 128, (add (sequence "V%u", 0, 7), + (sequence "V%u", 16, 31), + (sequence "V%u", 8, 15))>; // Attaches a ValueType to a register operand, to make the instruction // definitions easier.
@@ -272,7 +272,8 @@ class TypedReg<ValueType vtin, RegisterOperand opin> { RegisterOperand op = opin; } -def v32eb : TypedReg<f32, VR32>; +def v32f : TypedReg<i32, VR32>; +def v32sb : TypedReg<f32, VR32>; def v64g : TypedReg<i64, VR64>; def v64db : TypedReg<f64, VR64>; def v128b : TypedReg<v16i8, VR128>; @@ -280,8 +281,9 @@ def v128h : TypedReg<v8i16, VR128>; def v128f : TypedReg<v4i32, VR128>; def v128g : TypedReg<v2i64, VR128>; def v128q : TypedReg<v16i8, VR128>; -def v128eb : TypedReg<v4f32, VR128>; +def v128sb : TypedReg<v4f32, VR128>; def v128db : TypedReg<v2f64, VR128>; +def v128xb : TypedReg<f128, VR128>; def v128any : TypedReg<untyped, VR128>; //===----------------------------------------------------------------------===// diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td index 1ce0168f95e95..8dba89f70a422 100644 --- a/lib/Target/SystemZ/SystemZSchedule.td +++ b/lib/Target/SystemZ/SystemZSchedule.td @@ -59,7 +59,7 @@ def FPU2 : SchedWrite; def DFU : SchedWrite; def DFU2 : SchedWrite; -// Vector sub units (z13) +// Vector sub units (z13 and later) def VecBF : SchedWrite; def VecBF2 : SchedWrite; def VecDF : SchedWrite; @@ -75,6 +75,7 @@ def VecXsPm : SchedWrite; def VBU : SchedWrite; +include "SystemZScheduleZ14.td" include "SystemZScheduleZ13.td" include "SystemZScheduleZEC12.td" include "SystemZScheduleZ196.td" diff --git a/lib/Target/SystemZ/SystemZScheduleZ14.td b/lib/Target/SystemZ/SystemZScheduleZ14.td new file mode 100644 index 0000000000000..f11177af91a53 --- /dev/null +++ b/lib/Target/SystemZ/SystemZScheduleZ14.td @@ -0,0 +1,1611 @@ +//-- SystemZScheduleZ14.td - SystemZ Scheduling Definitions ----*- tblgen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Z14 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Z14Model : SchedMachineModel { + + let UnsupportedFeatures = Arch12UnsupportedFeatures.List; + + let IssueWidth = 8; + let MicroOpBufferSize = 60; // Issue queues + let LoadLatency = 1; // Optimistic load latency. + + let PostRAScheduler = 1; + + // Extra cycles for a mispredicted branch. + let MispredictPenalty = 20; +} + +let SchedModel = Z14Model in { + +// These definitions could be put in a subtarget common include file, +// but it seems the include system in Tablegen currently rejects +// multiple includes of same file. 
+def : WriteRes<GroupAlone, []> { + let NumMicroOps = 0; + let BeginGroup = 1; + let EndGroup = 1; +} +def : WriteRes<BeginGroup, []> { + let NumMicroOps = 0; + let BeginGroup = 1; +} +def : WriteRes<EndGroup, []> { + let NumMicroOps = 0; + let EndGroup = 1; +} +def : WriteRes<Lat2, []> { let Latency = 2; let NumMicroOps = 0;} +def : WriteRes<Lat3, []> { let Latency = 3; let NumMicroOps = 0;} +def : WriteRes<Lat4, []> { let Latency = 4; let NumMicroOps = 0;} +def : WriteRes<Lat5, []> { let Latency = 5; let NumMicroOps = 0;} +def : WriteRes<Lat6, []> { let Latency = 6; let NumMicroOps = 0;} +def : WriteRes<Lat7, []> { let Latency = 7; let NumMicroOps = 0;} +def : WriteRes<Lat8, []> { let Latency = 8; let NumMicroOps = 0;} +def : WriteRes<Lat9, []> { let Latency = 9; let NumMicroOps = 0;} +def : WriteRes<Lat10, []> { let Latency = 10; let NumMicroOps = 0;} +def : WriteRes<Lat11, []> { let Latency = 11; let NumMicroOps = 0;} +def : WriteRes<Lat12, []> { let Latency = 12; let NumMicroOps = 0;} +def : WriteRes<Lat15, []> { let Latency = 15; let NumMicroOps = 0;} +def : WriteRes<Lat20, []> { let Latency = 20; let NumMicroOps = 0;} +def : WriteRes<Lat30, []> { let Latency = 30; let NumMicroOps = 0;} + +// Execution units. +def Z14_FXaUnit : ProcResource<2>; +def Z14_FXbUnit : ProcResource<2>; +def Z14_LSUnit : ProcResource<2>; +def Z14_VecUnit : ProcResource<2>; +def Z14_VecFPdUnit : ProcResource<2> { let BufferSize = 1; /* blocking */ } +def Z14_VBUnit : ProcResource<2>; + +// Subtarget specific definitions of scheduling resources. +def : WriteRes<FXa, [Z14_FXaUnit]> { let Latency = 1; } +def : WriteRes<FXa2, [Z14_FXaUnit, Z14_FXaUnit]> { let Latency = 2; } +def : WriteRes<FXb, [Z14_FXbUnit]> { let Latency = 1; } +def : WriteRes<LSU, [Z14_LSUnit]> { let Latency = 4; } +def : WriteRes<VecBF, [Z14_VecUnit]> { let Latency = 8; } +def : WriteRes<VecBF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; } +def : WriteRes<VecDF, [Z14_VecUnit]> { let Latency = 8; } +def : WriteRes<VecDF2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 9; } +def : WriteRes<VecDFX, [Z14_VecUnit]> { let Latency = 1; } +def : WriteRes<VecDFX2, [Z14_VecUnit, Z14_VecUnit]> { let Latency = 2; } +def : WriteRes<VecFPd, [Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit, + Z14_VecFPdUnit, Z14_VecFPdUnit, Z14_VecFPdUnit]> + { let Latency = 30; } +def : WriteRes<VecMul, [Z14_VecUnit]> { let Latency = 5; } +def : WriteRes<VecStr, [Z14_VecUnit]> { let Latency = 4; } +def : WriteRes<VecXsPm, [Z14_VecUnit]> { let Latency = 3; } +def : WriteRes<VBU, [Z14_VBUnit]>; // Virtual Branching Unit + +// -------------------------- INSTRUCTIONS ---------------------------------- // + +// InstRW constructs have been used in order to preserve the +// readability of the InstrInfo files. + +// For each instruction, as matched by a regexp, provide a list of +// resources that it needs. These will be combined into a SchedClass. 
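As a reading aid, a representative entry of this form might look as follows (illustrative only; "SOMEOP" is a placeholder regex, not an opcode from this file, and the resources named are the ones defined above):

def : InstRW<[FXa, LSU, Lat8, GroupAlone], (instregex "SOMEOP(R)?$")>;
// One FXa micro-op plus one LSU micro-op; Lat8 is a zero-uop pseudo-write
// that only contributes extra latency, and GroupAlone requires the
// instruction to begin and end its own decoder group.

The real z14 entries that follow in the rest of this file all have the same shape.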
+ +//===----------------------------------------------------------------------===// +// Stack allocation +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "ADJDYNALLOC$")>; // Pseudo -> LA / LAY + +//===----------------------------------------------------------------------===// +// Branch instructions +//===----------------------------------------------------------------------===// + +// Branch +def : InstRW<[VBU], (instregex "(Call)?BRC(L)?(Asm.*)?$")>; +def : InstRW<[VBU], (instregex "(Call)?J(G)?(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "(Call)?BC(R)?(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "(Call)?B(R)?(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "BI(C)?(Asm.*)?$")>; +def : InstRW<[FXa, EndGroup], (instregex "BRCT(G)?$")>; +def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BRCTH$")>; +def : InstRW<[FXb, FXa, Lat2, GroupAlone], (instregex "BCT(G)?(R)?$")>; +def : InstRW<[FXa, FXa, FXb, FXb, Lat4, GroupAlone], + (instregex "B(R)?X(H|L).*$")>; + +// Compare and branch +def : InstRW<[FXb], (instregex "C(L)?(G)?(I|R)J(Asm.*)?$")>; +def : InstRW<[FXb, FXb, Lat2, GroupAlone], + (instregex "C(L)?(G)?(I|R)B(Call|Return|Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Trap instructions +//===----------------------------------------------------------------------===// + +// Trap +def : InstRW<[VBU], (instregex "(Cond)?Trap$")>; + +// Compare and trap +def : InstRW<[FXb], (instregex "C(G)?(I|R)T(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "CL(G)?RT(Asm.*)?$")>; +def : InstRW<[FXb], (instregex "CL(F|G)IT(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CL(G)?T(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Call and return instructions +//===----------------------------------------------------------------------===// + +// Call +def : InstRW<[VBU, FXa, FXa, Lat3, GroupAlone], (instregex "(Call)?BRAS$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BRASL$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "(Call)?BAS(R)?$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "TLS_(G|L)DCALL$")>; + +// Return +def : InstRW<[FXb, EndGroup], (instregex "Return$")>; +def : InstRW<[FXb], (instregex "CondReturn$")>; + +//===----------------------------------------------------------------------===// +// Select instructions +//===----------------------------------------------------------------------===// + +// Select pseudo +def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>; + +// CondStore pseudos +def : InstRW<[FXa], (instregex "CondStore16(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore16Mux(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore32(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore32Mux(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore64(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore8(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStore8Mux(Inv)?$")>; + +//===----------------------------------------------------------------------===// +// Move instructions +//===----------------------------------------------------------------------===// + +// Moves +def : InstRW<[FXb, LSU, Lat5], (instregex "MV(G|H)?HI$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "MVI(Y)?$")>; + +// Move character +def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex 
"MVCL(E|U)?$")>; + +// Pseudo -> reg move +def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>; +def : InstRW<[FXa], (instregex "EXTRACT_SUBREG$")>; +def : InstRW<[FXa], (instregex "INSERT_SUBREG$")>; +def : InstRW<[FXa], (instregex "REG_SEQUENCE$")>; +def : InstRW<[FXa], (instregex "SUBREG_TO_REG$")>; + +// Loads +def : InstRW<[LSU], (instregex "L(Y|FH|RL|Mux|CBB)?$")>; +def : InstRW<[LSU], (instregex "LG(RL)?$")>; +def : InstRW<[LSU], (instregex "L128$")>; + +def : InstRW<[FXa], (instregex "LLIH(F|H|L)$")>; +def : InstRW<[FXa], (instregex "LLIL(F|H|L)$")>; + +def : InstRW<[FXa], (instregex "LG(F|H)I$")>; +def : InstRW<[FXa], (instregex "LHI(Mux)?$")>; +def : InstRW<[FXa], (instregex "LR(Mux)?$")>; + +// Load and zero rightmost byte +def : InstRW<[LSU], (instregex "LZR(F|G)$")>; + +// Load and trap +def : InstRW<[FXb, LSU, Lat5], (instregex "L(FH|G)?AT$")>; + +// Load and test +def : InstRW<[FXa, LSU, Lat5], (instregex "LT(G)?$")>; +def : InstRW<[FXa], (instregex "LT(G)?R$")>; + +// Stores +def : InstRW<[FXb, LSU, Lat5], (instregex "STG(RL)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "ST128$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "ST(Y|FH|RL|Mux)?$")>; + +// String moves. +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>; + +//===----------------------------------------------------------------------===// +// Conditional move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>; + +//===----------------------------------------------------------------------===// +// Sign extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "L(B|H|G)R$")>; +def : InstRW<[FXa], (instregex "LG(B|H|F)R$")>; + +def : InstRW<[FXa, LSU, Lat5], (instregex "LTGF$")>; +def : InstRW<[FXa], (instregex "LTGFR$")>; + +def : InstRW<[FXa, LSU, Lat5], (instregex "LB(H|Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LH(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LH(H|Mux|RL)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LG(B|H|F)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LG(H|F)RL$")>; + +//===----------------------------------------------------------------------===// +// Zero extensions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LLCR(Mux)?$")>; +def : InstRW<[FXa], (instregex "LLHR(Mux)?$")>; +def : InstRW<[FXa], (instregex "LLG(C|H|F|T)R$")>; +def : InstRW<[LSU], (instregex "LLC(Mux)?$")>; +def : InstRW<[LSU], (instregex "LLH(Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LL(C|H)H$")>; +def : InstRW<[LSU], (instregex "LLHRL$")>; +def : InstRW<[LSU], (instregex "LLG(C|H|F|T|HRL|FRL)$")>; + +// Load and zero rightmost byte +def : InstRW<[LSU], (instregex "LLZRGF$")>; + +// Load and trap +def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>; + +//===----------------------------------------------------------------------===// +// Truncations +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex 
"STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Multi-register moves +//===----------------------------------------------------------------------===// + +// Load multiple (estimated average of 5 ops) +def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "LM(H|Y|G)?$")>; + +// Load multiple disjoint +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; + +// Store multiple (estimated average of ceil(5/2) FXb ops) +def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10, + GroupAlone], (instregex "STM(G|H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Byte swaps +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LRV(G)?R$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; + +//===----------------------------------------------------------------------===// +// Load address instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LA(Y|RL)?$")>; + +// Load the Global Offset Table address ( -> larl ) +def : InstRW<[FXa], (instregex "GOT$")>; + +//===----------------------------------------------------------------------===// +// Absolute and Negation +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "LP(G)?R$")>; +def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "L(N|P)GFR$")>; +def : InstRW<[FXa], (instregex "LN(R|GR)$")>; +def : InstRW<[FXa], (instregex "LC(R|GR)$")>; +def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "LCGFR$")>; + +//===----------------------------------------------------------------------===// +// Insertion +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "IC(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "IC32(Y)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "ICM(H|Y)?$")>; +def : InstRW<[FXa], (instregex "II(F|H|L)Mux$")>; +def : InstRW<[FXa], (instregex "IIHF(64)?$")>; +def : InstRW<[FXa], (instregex "IIHH(64)?$")>; +def : InstRW<[FXa], (instregex "IIHL(64)?$")>; +def : InstRW<[FXa], (instregex "IILF(64)?$")>; +def : InstRW<[FXa], (instregex "IILH(64)?$")>; +def : InstRW<[FXa], (instregex "IILL(64)?$")>; + +//===----------------------------------------------------------------------===// +// Addition +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "A(Y)?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "AH(Y)?$")>; +def : InstRW<[FXa], (instregex "AIH$")>; +def : InstRW<[FXa], (instregex "AFI(Mux)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "AG$")>; +def : InstRW<[FXa], (instregex "AGFI$")>; +def : InstRW<[FXa], (instregex "AGHI(K)?$")>; +def : InstRW<[FXa], (instregex "AGR(K)?$")>; +def : InstRW<[FXa], (instregex "AHI(K)?$")>; +def : InstRW<[FXa], (instregex "AHIMux(K)?$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "AL(Y)?$")>; +def : InstRW<[FXa], (instregex "AL(FI|HSIK)$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "ALG(F)?$")>; +def : InstRW<[FXa], (instregex "ALGHSIK$")>; +def : InstRW<[FXa], (instregex "ALGF(I|R)$")>; +def : InstRW<[FXa], (instregex "ALGR(K)?$")>; +def : 
InstRW<[FXa], (instregex "ALR(K)?$")>; +def : InstRW<[FXa], (instregex "AR(K)?$")>; +def : InstRW<[FXa], (instregex "A(L)?HHHR$")>; +def : InstRW<[FXa, Lat2], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXa], (instregex "ALSIH(N)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>; + +// Logical addition with carry +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>; +def : InstRW<[FXa, Lat2, GroupAlone], (instregex "ALC(G)?R$")>; + +// Add with sign extension (16/32 -> 64) +def : InstRW<[FXa, LSU, Lat6], (instregex "AG(F|H)$")>; +def : InstRW<[FXa, Lat2], (instregex "AGFR$")>; + +//===----------------------------------------------------------------------===// +// Subtraction +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "S(G|Y)?$")>; +def : InstRW<[FXa, LSU, Lat6], (instregex "SH(Y)?$")>; +def : InstRW<[FXa], (instregex "SGR(K)?$")>; +def : InstRW<[FXa], (instregex "SLFI$")>; +def : InstRW<[FXa, LSU, Lat5], (instregex "SL(G|GF|Y)?$")>; +def : InstRW<[FXa], (instregex "SLGF(I|R)$")>; +def : InstRW<[FXa], (instregex "SLGR(K)?$")>; +def : InstRW<[FXa], (instregex "SLR(K)?$")>; +def : InstRW<[FXa], (instregex "SR(K)?$")>; +def : InstRW<[FXa], (instregex "S(L)?HHHR$")>; +def : InstRW<[FXa, Lat2], (instregex "S(L)?HHLR$")>; + +// Subtraction with borrow +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "SLB(G)?$")>; +def : InstRW<[FXa, Lat2, GroupAlone], (instregex "SLB(G)?R$")>; + +// Subtraction with sign extension (16/32 -> 64) +def : InstRW<[FXa, LSU, Lat6], (instregex "SG(F|H)$")>; +def : InstRW<[FXa, Lat2], (instregex "SGFR$")>; + +//===----------------------------------------------------------------------===// +// AND +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "N(G|Y)?$")>; +def : InstRW<[FXa], (instregex "NGR(K)?$")>; +def : InstRW<[FXa], (instregex "NI(FMux|HMux|LMux)$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "NI(Y)?$")>; +def : InstRW<[FXa], (instregex "NIHF(64)?$")>; +def : InstRW<[FXa], (instregex "NIHH(64)?$")>; +def : InstRW<[FXa], (instregex "NIHL(64)?$")>; +def : InstRW<[FXa], (instregex "NILF(64)?$")>; +def : InstRW<[FXa], (instregex "NILH(64)?$")>; +def : InstRW<[FXa], (instregex "NILL(64)?$")>; +def : InstRW<[FXa], (instregex "NR(K)?$")>; +def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "NC$")>; + +//===----------------------------------------------------------------------===// +// OR +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "O(G|Y)?$")>; +def : InstRW<[FXa], (instregex "OGR(K)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "OI(Y)?$")>; +def : InstRW<[FXa], (instregex "OI(FMux|HMux|LMux)$")>; +def : InstRW<[FXa], (instregex "OIHF(64)?$")>; +def : InstRW<[FXa], (instregex "OIHH(64)?$")>; +def : InstRW<[FXa], (instregex "OIHL(64)?$")>; +def : InstRW<[FXa], (instregex "OILF(64)?$")>; +def : InstRW<[FXa], (instregex "OILH(64)?$")>; +def : InstRW<[FXa], (instregex "OILL(64)?$")>; +def : InstRW<[FXa], (instregex "OR(K)?$")>; +def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "OC$")>; + +//===----------------------------------------------------------------------===// +// XOR +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat5], (instregex "X(G|Y)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "XI(Y)?$")>; +def : 
InstRW<[FXa], (instregex "XIFMux$")>; +def : InstRW<[FXa], (instregex "XGR(K)?$")>; +def : InstRW<[FXa], (instregex "XIHF(64)?$")>; +def : InstRW<[FXa], (instregex "XILF(64)?$")>; +def : InstRW<[FXa], (instregex "XR(K)?$")>; +def : InstRW<[LSU, LSU, FXb, Lat9, BeginGroup], (instregex "XC$")>; + +//===----------------------------------------------------------------------===// +// Multiplication +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat9], (instregex "MS(GF|Y)?$")>; +def : InstRW<[FXa, Lat5], (instregex "MS(R|FI)$")>; +def : InstRW<[FXa, LSU, Lat11], (instregex "MSG$")>; +def : InstRW<[FXa, Lat7], (instregex "MSGR$")>; +def : InstRW<[FXa, Lat5], (instregex "MSGF(I|R)$")>; +def : InstRW<[FXa2, LSU, Lat12, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXa2, Lat8, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXa, Lat4], (instregex "MGHI$")>; +def : InstRW<[FXa, Lat4], (instregex "MHI$")>; +def : InstRW<[FXa, LSU, Lat8], (instregex "MH(Y)?$")>; +def : InstRW<[FXa2, Lat6, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXa2, LSU, Lat10, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXa, LSU, Lat8], (instregex "MGH$")>; +def : InstRW<[FXa, LSU, Lat12, GroupAlone], (instregex "MG$")>; +def : InstRW<[FXa, Lat8, GroupAlone], (instregex "MGRK$")>; +def : InstRW<[FXa, LSU, Lat9, GroupAlone], (instregex "MSC$")>; +def : InstRW<[FXa, LSU, Lat11, GroupAlone], (instregex "MSGC$")>; +def : InstRW<[FXa, Lat5], (instregex "MSRKC$")>; +def : InstRW<[FXa, Lat7], (instregex "MSGRKC$")>; + +//===----------------------------------------------------------------------===// +// Division and remainder +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>; +def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>; +def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; +def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>; +def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>; +def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>; +def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>; + +//===----------------------------------------------------------------------===// +// Shifts +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "SLL(G|K)?$")>; +def : InstRW<[FXa], (instregex "SRL(G|K)?$")>; +def : InstRW<[FXa], (instregex "SRA(G|K)?$")>; +def : InstRW<[FXa], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; + +// Rotate +def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>; + +// Rotate and insert +def : InstRW<[FXa], (instregex "RISBG(N|32)?$")>; +def : InstRW<[FXa], (instregex "RISBH(G|H|L)$")>; +def : InstRW<[FXa], (instregex "RISBL(G|H|L)$")>; +def : InstRW<[FXa], (instregex "RISBMux$")>; + +// Rotate and Select +def : InstRW<[FXa, FXa, Lat2, BeginGroup], (instregex "R(N|O|X)SBG$")>; + +//===----------------------------------------------------------------------===// +// Comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat5], (instregex "C(G|Y|Mux|RL)?$")>; +def : InstRW<[FXb], (instregex "C(F|H)I(Mux)?$")>; +def : InstRW<[FXb], (instregex "CG(F|H)I$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CG(HSI|RL)$")>; +def : InstRW<[FXb], (instregex 
"C(G)?R$")>; +def : InstRW<[FXb], (instregex "CIH$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CH(F|SI)$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CL(Y|Mux|FHSI)?$")>; +def : InstRW<[FXb], (instregex "CLFI(Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLG(HRL|HSI)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLGF(RL)?$")>; +def : InstRW<[FXb], (instregex "CLGF(I|R)$")>; +def : InstRW<[FXb], (instregex "CLGR$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLGRL$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLH(F|RL|HSI)$")>; +def : InstRW<[FXb], (instregex "CLIH$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLI(Y)?$")>; +def : InstRW<[FXb], (instregex "CLR$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "CLRL$")>; +def : InstRW<[FXb], (instregex "C(L)?HHR$")>; +def : InstRW<[FXb, Lat2], (instregex "C(L)?HLR$")>; + +// Compare halfword +def : InstRW<[FXb, LSU, Lat6], (instregex "CH(Y|RL)?$")>; +def : InstRW<[FXb, LSU, Lat6], (instregex "CGH(RL)?$")>; +def : InstRW<[FXa, FXb, LSU, Lat6, BeginGroup], (instregex "CHHSI$")>; + +// Compare with sign extension (32 -> 64) +def : InstRW<[FXb, LSU, Lat6], (instregex "CGF(RL)?$")>; +def : InstRW<[FXb, Lat2], (instregex "CGFR$")>; + +// Compare logical character +def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; + +// Test under mask +def : InstRW<[FXb, LSU, Lat5], (instregex "TM(Y)?$")>; +def : InstRW<[FXb], (instregex "TM(H|L)Mux$")>; +def : InstRW<[FXb], (instregex "TMHH(64)?$")>; +def : InstRW<[FXb], (instregex "TMHL(64)?$")>; +def : InstRW<[FXb], (instregex "TMLH(64)?$")>; +def : InstRW<[FXb], (instregex "TMLL(64)?$")>; + +// Compare logical characters under mask +def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>; + +//===----------------------------------------------------------------------===// +// Prefetch and execution hint +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "PFD(RL)?$")>; +def : InstRW<[FXb, Lat2], (instregex "BPP$")>; +def : InstRW<[FXb, EndGroup], (instregex "BPRP$")>; +def : InstRW<[FXb], (instregex "NIAI$")>; + +//===----------------------------------------------------------------------===// +// Atomic operations +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, EndGroup], (instregex "Serialize$")>; + +def : InstRW<[FXb, LSU, Lat5], (instregex "LAA(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAAL(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAN(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAO(G)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "LAX(G)?$")>; + +// Test and set +def : InstRW<[FXb, LSU, Lat5, EndGroup], (instregex "TS$")>; + +// Compare and swap +def : InstRW<[FXa, FXb, LSU, Lat6, GroupAlone], (instregex "CS(G|Y)?$")>; + +// Compare double and swap +def : InstRW<[FXa, FXa, FXb, FXb, FXa, LSU, Lat10, GroupAlone], + (instregex "CDS(Y)?$")>; +def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone], + (instregex "CDSG$")>; + +// Compare and swap and store +def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>; + +// Perform locked operation +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; + +// Load/store pair from/to quadword +def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPQ$")>; +def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex 
"STPQ$")>; + +// Load pair disjoint +def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; + +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR|A)?$")>; +def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC)$")>; +def : InstRW<[FXa, Lat30], (instregex "(PCC|PPNO|PRNO)$")>; + +//===----------------------------------------------------------------------===// +// Guarded storage +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "LGG$")>; +def : InstRW<[LSU, Lat5], (instregex "LLGFSG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)GSC$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone], + (instregex "CVBG$")>; +def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; +def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>; + +def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex "(M|D)P$")>; +def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>; +def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + +//===----------------------------------------------------------------------===// +// Access registers +//===----------------------------------------------------------------------===// + +// Extract/set/copy access register +def : InstRW<[LSU], (instregex "(EAR|SAR|CPYA)$")>; + +// Load address extended +def : InstRW<[LSU, FXa, Lat5, BeginGroup], (instregex "LAE(Y)?$")>; + +// Load/store access multiple (not modeled precisely) +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(L|ST)AM(Y)?$")>; + +//===----------------------------------------------------------------------===// +// Program mask and addressing mode +//===----------------------------------------------------------------------===// + 
+// Insert Program Mask +def : InstRW<[FXa, Lat3, EndGroup], (instregex "IPM$")>; + +// Set Program Mask +def : InstRW<[LSU, EndGroup], (instregex "SPM$")>; + +// Branch and link +def : InstRW<[FXa, FXa, FXb, Lat5, GroupAlone], (instregex "BAL(R)?$")>; + +// Test addressing mode +def : InstRW<[FXb], (instregex "TAM$")>; + +// Set addressing mode +def : InstRW<[FXb, Lat2, EndGroup], (instregex "SAM(24|31|64)$")>; + +// Branch (and save) and set mode. +def : InstRW<[FXa, FXb, Lat2, GroupAlone], (instregex "BSM$")>; +def : InstRW<[FXa, FXa, FXb, Lat3, GroupAlone], (instregex "BASSM$")>; + +//===----------------------------------------------------------------------===// +// Transactional execution +//===----------------------------------------------------------------------===// + +// Transaction begin +def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat15, GroupAlone], + (instregex "TBEGIN(C|_nofloat)?$")>; + +// Transaction end +def : InstRW<[FXb, GroupAlone], (instregex "TEND$")>; + +// Transaction abort +def : InstRW<[LSU, GroupAlone], (instregex "TABORT$")>; + +// Extract Transaction Nesting Depth +def : InstRW<[FXa], (instregex "ETND$")>; + +// Nontransactional store +def : InstRW<[FXb, LSU, Lat5], (instregex "NTSTG$")>; + +//===----------------------------------------------------------------------===// +// Processor assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "PPA$")>; + +//===----------------------------------------------------------------------===// +// Miscellaneous Instructions. +//===----------------------------------------------------------------------===// + +// Find leftmost one +def : InstRW<[FXa, FXa, Lat4, GroupAlone], (instregex "FLOGR$")>; + +// Population count +def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>; + +// Extend +def : InstRW<[FXa], (instregex "AEXT128$")>; +def : InstRW<[FXa], (instregex "ZEXT128$")>; + +// String instructions +def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>; + +// Execute +def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>; + +//===----------------------------------------------------------------------===// +// .insn directive instructions +//===----------------------------------------------------------------------===// + +// An "empty" sched-class will be assigned instead of the "invalid sched-class". +// getNumDecoderSlots() will then return 1 instead of 0. 
+def : InstRW<[], (instregex "Insn.*")>; + + +// ----------------------------- Floating point ----------------------------- // + +//===----------------------------------------------------------------------===// +// FP: Select instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa], (instregex "Select(F32|F64|F128|VR128)$")>; +def : InstRW<[FXa], (instregex "CondStoreF32(Inv)?$")>; +def : InstRW<[FXa], (instregex "CondStoreF64(Inv)?$")>; + +//===----------------------------------------------------------------------===// +// FP: Move instructions +//===----------------------------------------------------------------------===// + +// Load zero +def : InstRW<[FXb], (instregex "LZ(DR|ER)$")>; +def : InstRW<[FXb, FXb, Lat2, BeginGroup], (instregex "LZXR$")>; + +// Load +def : InstRW<[VecXsPm], (instregex "LER$")>; +def : InstRW<[FXb], (instregex "LD(R|R32|GR)$")>; +def : InstRW<[FXb, Lat3], (instregex "LGDR$")>; +def : InstRW<[FXb, FXb, Lat2, GroupAlone], (instregex "LXR$")>; + +// Load and Test +def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)BR$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "LTEBRCompare(_VecPseudo)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "LTDBRCompare(_VecPseudo)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXBR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], + (instregex "LTXBRCompare(_VecPseudo)?$")>; + +// Copy sign +def : InstRW<[VecXsPm], (instregex "CPSDRd(d|s)$")>; +def : InstRW<[VecXsPm], (instregex "CPSDRs(d|s)$")>; + +//===----------------------------------------------------------------------===// +// FP: Load instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm, LSU, Lat7], (instregex "LE(Y)?$")>; +def : InstRW<[LSU], (instregex "LD(Y|E32)?$")>; +def : InstRW<[LSU], (instregex "LX$")>; + +//===----------------------------------------------------------------------===// +// FP: Store instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat7], (instregex "STD(Y)?$")>; +def : InstRW<[FXb, LSU, Lat7], (instregex "STE(Y)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STX$")>; + +//===----------------------------------------------------------------------===// +// FP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[VecBF], (instregex "LEDBR(A)?$")>; +def : InstRW<[VecDF, VecDF, Lat20], (instregex "LEXBR(A)?$")>; +def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXBR(A)?$")>; + +// Load lengthened +def : InstRW<[VecBF, LSU, Lat12], (instregex "LDEB$")>; +def : InstRW<[VecBF], (instregex "LDEBR$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12 , GroupAlone], (instregex "LX(D|E)B$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)BR$")>; + +// Convert from fixed / logical +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)BR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)BR(A)?$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)BR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CEL(F|G)BR$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CDL(F|G)BR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CXL(F|G)BR$")>; + +// Convert to fixed / logical +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex 
"CF(E|D)BR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)BR(A)?$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XBR(A)?$")>; +def : InstRW<[FXb, VecBF, Lat11, GroupAlone], (instregex "CLFEBR$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLFDBR$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CLG(E|D)BR$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "CL(F|G)XBR$")>; + +//===----------------------------------------------------------------------===// +// FP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DBR$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)EBR$")>; +def : InstRW<[FXb], (instregex "LCDFR(_32)?$")>; +def : InstRW<[FXb], (instregex "LNDFR(_32)?$")>; +def : InstRW<[FXb], (instregex "LPDFR(_32)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XBR$")>; + +// Square root +def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)B$")>; +def : InstRW<[VecFPd], (instregex "SQ(E|D)BR$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXBR$")>; + +// Load FP integer +def : InstRW<[VecBF], (instregex "FIEBR(A)?$")>; +def : InstRW<[VecBF], (instregex "FIDBR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>; + +//===----------------------------------------------------------------------===// +// FP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>; +def : InstRW<[VecBF], (instregex "A(E|D)BR$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>; + +// Subtraction +def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>; +def : InstRW<[VecBF], (instregex "S(E|D)BR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXBR$")>; + +// Multiply +def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|EE)B$")>; +def : InstRW<[VecBF], (instregex "M(D|DE|EE)BR$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXDB$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>; +def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>; + +// Multiply and add / subtract +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>; + +// Division +def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>; +def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>; + +// Divide to integer +def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>; + +//===----------------------------------------------------------------------===// +// FP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>; +def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>; + +// Test Data Class +def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>; +def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TCXB$")>; + 
+//===----------------------------------------------------------------------===// +// FP: Floating-point control register instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; +def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; +def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; +def : InstRW<[FXa, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>; +def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>; + + +// --------------------- Hexadecimal floating point ------------------------- // + +//===----------------------------------------------------------------------===// +// HFP: Move instructions +//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[VecXsPm, Lat4], (instregex "LT(D|E)R$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>; +def : InstRW<[VecBF], (instregex "LEXR$")>; +def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>; + +// Load lengthened +def : InstRW<[LSU], (instregex "LDE$")>; +def : InstRW<[FXb], (instregex "LDER$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "LX(D|E)$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "LX(D|E)R$")>; + +// Convert from fixed +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CE(F|G)R$")>; +def : InstRW<[FXb, VecBF, Lat9, BeginGroup], (instregex "CD(F|G)R$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat12, GroupAlone], (instregex "CX(F|G)R$")>; + +// Convert to fixed +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CF(E|D)R$")>; +def : InstRW<[FXb, VecBF, Lat11, BeginGroup], (instregex "CG(E|D)R$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat20, BeginGroup], (instregex "C(F|G)XR$")>; + +// Convert BFP to HFP / HFP to BFP. 
+def : InstRW<[VecBF], (instregex "THD(E)?R$")>; +def : InstRW<[VecBF], (instregex "TB(E)?DR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load Complement / Negative / Positive +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)DR$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "L(C|N|P)ER$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "L(C|N|P)XR$")>; + +// Halve +def : InstRW<[VecBF], (instregex "H(E|D)R$")>; + +// Square root +def : InstRW<[VecFPd, LSU], (instregex "SQ(E|D)$")>; +def : InstRW<[VecFPd], (instregex "SQ(E|D)R$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "SQXR$")>; + +// Load FP integer +def : InstRW<[VecBF], (instregex "FIER$")>; +def : InstRW<[VecBF], (instregex "FIDR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>; +def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>; + +// Subtraction +def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>; +def : InstRW<[VecBF], (instregex "S(E|D|U|W)R$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXR$")>; + +// Multiply +def : InstRW<[VecBF, LSU, Lat12], (instregex "M(D|DE|E|EE)$")>; +def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>; +def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>; + +// Multiply and add / subtract +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>; + +// Division +def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>; +def : InstRW<[VecFPd], (instregex "D(E|D)R$")>; +def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>; + +//===----------------------------------------------------------------------===// +// HFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>; +def : InstRW<[VecBF], (instregex "C(E|D)R$")>; +def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>; + + +// ------------------------ Decimal floating point -------------------------- // + +//===----------------------------------------------------------------------===// +// DFP: Move instructions 
+//===----------------------------------------------------------------------===// + +// Load and Test +def : InstRW<[VecDF], (instregex "LTDTR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Conversion instructions +//===----------------------------------------------------------------------===// + +// Load rounded +def : InstRW<[VecDF, Lat15], (instregex "LEDTR$")>; +def : InstRW<[VecDF, VecDF, Lat20], (instregex "LDXTR$")>; + +// Load lengthened +def : InstRW<[VecDF], (instregex "LDETR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LXDTR$")>; + +// Convert from fixed / logical +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CD(F|G)TR(A)?$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CDL(F|G)TR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat30, GroupAlone], (instregex "CXL(F|G)TR$")>; + +// Convert to fixed / logical +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "C(F|G)DTR(A)?$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "C(F|G)XTR(A)?$")>; +def : InstRW<[FXb, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>; + +// Convert from / to signed / unsigned packed +def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>; +def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>; +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>; +def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>; + +// Convert from / to zoned +def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>; +def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXZT$")>; +def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CZDT$")>; +def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CZXT$")>; + +// Convert from / to packed +def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDPT$")>; +def : InstRW<[LSU, LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CXPT$")>; +def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>; +def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>; + +// Perform floating-point operation +def : InstRW<[FXb, Lat30], (instregex "PFPO$")>; + +//===----------------------------------------------------------------------===// +// DFP: Unary arithmetic +//===----------------------------------------------------------------------===// + +// Load FP integer +def : InstRW<[VecDF], (instregex "FIDTR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXTR$")>; + +// Extract biased exponent +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEDTR$")>; +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "EEXTR$")>; + +// Extract significance +def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "ESDTR$")>; +def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Binary arithmetic +//===----------------------------------------------------------------------===// + +// Addition +def : InstRW<[VecDF], (instregex "ADTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>; 
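// --- Editorial note, not part of the original diff ---
// Read as a pair, the two DFP addition entries above illustrate the pattern
// used throughout this file: ADTR(A) issues to a single VecDF resource with
// its default latency, while the 128-bit AXTR(A) occupies two VecDF2
// resources, carries an explicit Lat10 latency marker, and is tagged
// GroupAlone, i.e. it is modeled as needing a decoder group to itself.
// The LatN and Begin/End/GroupAlone markers adjust latency and decode
// grouping only; they do not add extra execution-unit usage.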
+ +// Subtraction +def : InstRW<[VecDF], (instregex "SDTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "SXTR(A)?$")>; + +// Multiply +def : InstRW<[VecDF, Lat30], (instregex "MDTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "MXTR(A)?$")>; + +// Division +def : InstRW<[VecDF, Lat30], (instregex "DDTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat30, GroupAlone], (instregex "DXTR(A)?$")>; + +// Quantize +def : InstRW<[VecDF], (instregex "QADTR$")>; +def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>; + +// Reround +def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>; + +// Shift significand left/right +def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; + +// Insert biased exponent +def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>; +def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>; + +//===----------------------------------------------------------------------===// +// DFP: Comparisons +//===----------------------------------------------------------------------===// + +// Compare +def : InstRW<[VecDF], (instregex "(K|C)DTR$")>; +def : InstRW<[VecDF, VecDF, Lat11, GroupAlone], (instregex "(K|C)XTR$")>; + +// Compare biased exponent +def : InstRW<[VecDF], (instregex "CEDTR$")>; +def : InstRW<[VecDF], (instregex "CEXTR$")>; + +// Test Data Class/Group +def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>; +def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; + + +// --------------------------------- Vector --------------------------------- // + +//===----------------------------------------------------------------------===// +// Vector: Move instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "VLR(32|64)?$")>; +def : InstRW<[FXb, Lat4], (instregex "VLGV(B|F|G|H)?$")>; +def : InstRW<[FXb], (instregex "VLVG(B|F|G|H)?$")>; +def : InstRW<[FXb, Lat2], (instregex "VLVGP(32)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Immediate instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VZERO$")>; +def : InstRW<[VecXsPm], (instregex "VONE$")>; +def : InstRW<[VecXsPm], (instregex "VGBM$")>; +def : InstRW<[VecXsPm], (instregex "VGM(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VREPI(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLEI(B|F|G|H)$")>; + +//===----------------------------------------------------------------------===// +// Vector: Loads +//===----------------------------------------------------------------------===// + +def : InstRW<[LSU], (instregex "VL(L|BB)?$")>; +def : InstRW<[LSU], (instregex "VL(32|64)$")>; +def : InstRW<[LSU], (instregex "VLLEZ(B|F|G|H|LF)?$")>; +def : InstRW<[LSU], (instregex "VLREP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, LSU, Lat7], (instregex "VLE(B|F|G|H)$")>; +def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VGE(F|G)$")>; +def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "VLM$")>; +def : InstRW<[LSU, Lat5], (instregex "VLRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Stores 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat8], (instregex "VST(L|32|64)?$")>; +def : InstRW<[FXb, LSU, Lat8], (instregex "VSTE(F|G)$")>; +def : InstRW<[FXb, LSU, VecXsPm, Lat11, BeginGroup], (instregex "VSTE(B|H)$")>; +def : InstRW<[LSU, LSU, FXb, FXb, FXb, FXb, FXb, Lat20, GroupAlone], + (instregex "VSTM$")>; +def : InstRW<[FXb, FXb, LSU, Lat12, BeginGroup], (instregex "VSCE(F|G)$")>; +def : InstRW<[FXb, LSU, Lat8], (instregex "VSTRL(R)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Selects and permutes +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VMRH(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMRL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VPERM$")>; +def : InstRW<[VecXsPm], (instregex "VPDI$")>; +def : InstRW<[VecXsPm], (instregex "VBPERM$")>; +def : InstRW<[VecXsPm], (instregex "VREP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VSEL$")>; + +//===----------------------------------------------------------------------===// +// Vector: Widening and narrowing +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VPK(F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VPKS(F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VPKS(F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VPKLS(F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VPKLS(F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VSEG(B|F|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPH(B|F|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPL(B|F)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPLH(B|F|H|W)?$")>; +def : InstRW<[VecXsPm], (instregex "VUPLL(B|F|H)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VA(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VACC(B|F|G|H|Q|C|CQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VAVG(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VAVGL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VN(C|O|N|X)?$")>; +def : InstRW<[VecXsPm], (instregex "VO(C)?$")>; +def : InstRW<[VecMul], (instregex "VCKSM$")>; +def : InstRW<[VecXsPm], (instregex "VCLZ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VCTZ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VX$")>; +def : InstRW<[VecMul], (instregex "VGFM?$")>; +def : InstRW<[VecMul], (instregex "VGFMA(B|F|G|H)?$")>; +def : InstRW<[VecMul], (instregex "VGFM(B|F|G|H)$")>; +def : InstRW<[VecXsPm], (instregex "VLC(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VLP(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMX(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMXL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMN(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VMNL(B|F|G|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAL(B|F)?$")>; +def : InstRW<[VecMul], (instregex "VMALE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMALH(B|F|H|W)?$")>; +def : InstRW<[VecMul], (instregex "VMALO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMAH(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VME(B|F|H)?$")>; +def : InstRW<[VecMul], 
(instregex "VMH(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VML(B|F)?$")>; +def : InstRW<[VecMul], (instregex "VMLE(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMLH(B|F|H|W)?$")>; +def : InstRW<[VecMul], (instregex "VMLO(B|F|H)?$")>; +def : InstRW<[VecMul], (instregex "VMO(B|F|H)?$")>; +def : InstRW<[VecBF2], (instregex "VMSL(G)?$")>; + +def : InstRW<[VecXsPm], (instregex "VPOPCT(B|F|G|H)?$")>; + +def : InstRW<[VecXsPm], (instregex "VERLL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VERLLV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VERIM(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESLV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRA(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRAV(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VESRLV(B|F|G|H)?$")>; + +def : InstRW<[VecXsPm], (instregex "VSL(DB)?$")>; +def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSLB$")>; +def : InstRW<[VecXsPm], (instregex "VSR(A|L)$")>; +def : InstRW<[VecXsPm, VecXsPm, Lat8], (instregex "VSR(A|L)B$")>; + +def : InstRW<[VecXsPm], (instregex "VSB(I|IQ|CBI|CBIQ)?$")>; +def : InstRW<[VecXsPm], (instregex "VSCBI(B|F|G|H|Q)?$")>; +def : InstRW<[VecXsPm], (instregex "VS(F|G|H|Q)?$")>; + +def : InstRW<[VecMul], (instregex "VSUM(B|H)?$")>; +def : InstRW<[VecMul], (instregex "VSUMG(F|H)?$")>; +def : InstRW<[VecMul], (instregex "VSUMQ(F|G)?$")>; + +//===----------------------------------------------------------------------===// +// Vector: Integer comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm, Lat4], (instregex "VEC(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VECL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm], (instregex "VCEQ(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCEQ(B|F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VCH(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCH(B|F|G|H)S$")>; +def : InstRW<[VecXsPm], (instregex "VCHL(B|F|G|H)?$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VCHL(B|F|G|H)S$")>; +def : InstRW<[VecStr, Lat5], (instregex "VTM$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point arithmetic +//===----------------------------------------------------------------------===// + +// Conversion and rounding +def : InstRW<[VecBF], (instregex "VCD(L)?G$")>; +def : InstRW<[VecBF], (instregex "VCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "VC(L)?GD$")>; +def : InstRW<[VecBF], (instregex "VC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "VL(DE|ED)$")>; +def : InstRW<[VecBF], (instregex "VL(DE|ED)B$")>; +def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>; +def : InstRW<[VecBF], (instregex "VFL(L|R)$")>; +def : InstRW<[VecBF], (instregex "VFL(LS|RD)$")>; +def : InstRW<[VecBF], (instregex "WFL(LS|RD)$")>; +def : InstRW<[VecBF2], (instregex "WFLLD$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WFLRX$")>; +def : InstRW<[VecBF2], (instregex "VFI$")>; +def : InstRW<[VecBF], (instregex "VFIDB$")>; +def : InstRW<[VecBF], (instregex "WFIDB$")>; +def : InstRW<[VecBF2], (instregex "VFISB$")>; +def : InstRW<[VecBF], (instregex "WFISB$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WFIXB$")>; + +// Sign operations +def : InstRW<[VecXsPm], (instregex "VFPSO$")>; +def : InstRW<[VecXsPm], 
(instregex "(V|W)FPSODB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FPSOSB$")>; +def : InstRW<[VecXsPm], (instregex "WFPSOXB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)SB$")>; +def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)XB$")>; + +// Minimum / maximum +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)$")>; +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)DB$")>; +def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)DB$")>; +def : InstRW<[VecXsPm], (instregex "VF(MAX|MIN)SB$")>; +def : InstRW<[VecXsPm], (instregex "WF(MAX|MIN)SB$")>; +def : InstRW<[VecDFX], (instregex "WF(MAX|MIN)XB$")>; + +// Test data class +def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCISB$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WFTCIXB$")>; + +// Add / subtract +def : InstRW<[VecBF2], (instregex "VF(A|S)$")>; +def : InstRW<[VecBF], (instregex "VF(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>; +def : InstRW<[VecBF2], (instregex "VF(A|S)SB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)SB$")>; +def : InstRW<[VecDF2, Lat10], (instregex "WF(A|S)XB$")>; + +// Multiply / multiply-and-add/subtract +def : InstRW<[VecBF2], (instregex "VFM$")>; +def : InstRW<[VecBF], (instregex "VFMDB$")>; +def : InstRW<[VecBF], (instregex "WFMDB$")>; +def : InstRW<[VecBF2], (instregex "VFMSB$")>; +def : InstRW<[VecBF], (instregex "WFMSB$")>; +def : InstRW<[VecDF2, Lat20], (instregex "WFMXB$")>; +def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)$")>; +def : InstRW<[VecBF], (instregex "VF(N)?M(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)DB$")>; +def : InstRW<[VecBF2], (instregex "VF(N)?M(A|S)SB$")>; +def : InstRW<[VecBF], (instregex "WF(N)?M(A|S)SB$")>; +def : InstRW<[VecDF2, Lat20], (instregex "WF(N)?M(A|S)XB$")>; + +// Divide / square root +def : InstRW<[VecFPd], (instregex "VFD$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FDSB$")>; +def : InstRW<[VecFPd], (instregex "WFDXB$")>; +def : InstRW<[VecFPd], (instregex "VFSQ$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>; +def : InstRW<[VecFPd], (instregex "(V|W)FSQSB$")>; +def : InstRW<[VecFPd], (instregex "WFSQXB$")>; + +//===----------------------------------------------------------------------===// +// Vector: Floating-point comparison +//===----------------------------------------------------------------------===// + +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)$")>; +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)DB$")>; +def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)DB$")>; +def : InstRW<[VecXsPm], (instregex "VF(C|K)(E|H|HE)SB$")>; +def : InstRW<[VecXsPm], (instregex "WF(C|K)(E|H|HE)SB$")>; +def : InstRW<[VecDFX], (instregex "WF(C|K)(E|H|HE)XB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "VF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)(E|H|HE)SBS$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)(E|H|HE)XBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)SB$")>; +def : InstRW<[VecDFX, Lat4], (instregex "WF(C|K)XB$")>; + +//===----------------------------------------------------------------------===// +// Vector: 
Floating-point insertion and extraction +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LEFR$")>; +def : InstRW<[FXb, Lat4], (instregex "LFER$")>; + +//===----------------------------------------------------------------------===// +// Vector: String instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecStr], (instregex "VFAE(B)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAEBS$")>; +def : InstRW<[VecStr], (instregex "VFAE(F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAE(F|H)S$")>; +def : InstRW<[VecStr], (instregex "VFAEZ(B|F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFAEZ(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VFEE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFEE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[VecStr], (instregex "VFENE(B|F|H|ZB|ZF|ZH)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VFENE(B|F|H|ZB|ZF|ZH)S$")>; +def : InstRW<[VecStr], (instregex "VISTR(B|F|H)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VISTR(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VSTRC(B|F|H)?$")>; +def : InstRW<[VecStr, Lat5], (instregex "VSTRC(B|F|H)S$")>; +def : InstRW<[VecStr], (instregex "VSTRCZ(B|F|H)$")>; +def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>; + +//===----------------------------------------------------------------------===// +// Vector: Packed-decimal instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[VecDF, VecDF, Lat10, GroupAlone], (instregex "VLIP$")>; +def : InstRW<[VecDFX, LSU, Lat12, GroupAlone], (instregex "VPKZ$")>; +def : InstRW<[VecDFX, FXb, LSU, Lat12, GroupAlone], (instregex "VUPKZ$")>; +def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVB(G)?$")>; +def : InstRW<[VecDF, VecDF, FXb, Lat20, GroupAlone], (instregex "VCVD(G)?$")>; +def : InstRW<[VecDFX], (instregex "V(A|S)P$")>; +def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "VM(S)?P$")>; +def : InstRW<[VecDF, VecDF, Lat30, GroupAlone], (instregex "V(D|R)P$")>; +def : InstRW<[VecDFX, Lat30, GroupAlone], (instregex "VSDP$")>; +def : InstRW<[VecDF, VecDF, Lat11], (instregex "VSRP$")>; +def : InstRW<[VecDFX], (instregex "VPSOP$")>; +def : InstRW<[VecDFX], (instregex "V(T|C)P$")>; + + +// -------------------------------- System ---------------------------------- // + +//===----------------------------------------------------------------------===// +// System: Program-Status Word Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>; +def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXa, Lat3], (instregex "IAC$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; + +//===----------------------------------------------------------------------===// +// System: Control Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LCTL(G)?$")>; +def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; +def : InstRW<[FXb, Lat30], (instregex "SSA(I)?R$")>; +def : InstRW<[FXb, 
Lat30], (instregex "ESEA$")>; + +//===----------------------------------------------------------------------===// +// System: Prefix-Register Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "SPX$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STPX$")>; + +//===----------------------------------------------------------------------===// +// System: Storage-Key and Real Memory Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "ISKE$")>; +def : InstRW<[FXb, Lat30], (instregex "IVSK$")>; +def : InstRW<[FXb, Lat30], (instregex "SSKE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "RRB(E|M)$")>; +def : InstRW<[FXb, Lat30], (instregex "IRBM$")>; +def : InstRW<[FXb, Lat30], (instregex "PFMF$")>; +def : InstRW<[FXb, Lat30], (instregex "TB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGIN$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PGOUT$")>; + +//===----------------------------------------------------------------------===// +// System: Dynamic-Address-Translation Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "IPTE(Opt)?(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "IDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "CRDTE(Opt)?$")>; +def : InstRW<[FXb, Lat30], (instregex "PTLB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "CSP(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LPTEA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LRA(Y|G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STRAG$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "LURA(G)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STUR(A|G)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>; + +//===----------------------------------------------------------------------===// +// System: Memory-move Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; +def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; + +//===----------------------------------------------------------------------===// +// System: Address-Space Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>; +def : InstRW<[FXb, Lat30], (instregex "PR$")>; +def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "RP$")>; +def : InstRW<[FXb, Lat30], (instregex "BS(G|A)$")>; +def : InstRW<[FXb, Lat20], (instregex "TAR$")>; + +//===----------------------------------------------------------------------===// +// System: Linkage-Stack Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>; +def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>; +def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>; + +//===----------------------------------------------------------------------===// +// System: Time-Related Instructions 
+//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "PTFF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>; +def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>; +def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>; +def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>; +def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone], + (instregex "STCK(F)?$")>; +def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone], + (instregex "STCKE$")>; +def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>; +def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Related Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, LSU, Lat30], (instregex "STAP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXb, Lat30], (instregex "PTF$")>; +def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>; + +//===----------------------------------------------------------------------===// +// System: Miscellaneous Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "SVC$")>; +def : InstRW<[FXb, GroupAlone], (instregex "MC$")>; +def : InstRW<[FXb, Lat30], (instregex "DIAG$")>; +def : InstRW<[FXb], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>; +def : InstRW<[FXb, Lat30], (instregex "SIGP$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIGA$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>; + +//===----------------------------------------------------------------------===// +// System: CPU-Measurement Facility Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb], (instregex "LPP$")>; +def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>; +def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>; +def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>; +def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>; + +//===----------------------------------------------------------------------===// +// System: I/O Instructions +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, Lat30], (instregex "(C|H|R|X)SCH$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "(M|S|ST|T)SCH$")>; +def : InstRW<[FXb, Lat30], (instregex "RCHP$")>; +def : InstRW<[FXb, Lat30], (instregex "SCHM$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "STC(PS|RW)$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "TPI$")>; +def : InstRW<[FXb, Lat30], (instregex "SAL$")>; + +} + diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index e3e1999d8ad8d..4d986e8391cf5 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -311,7 +311,7 @@ def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; def : InstRW<[FXU], (instregex "A(L)?HHHR$")>; 
-def : InstRW<[FXU, FXU, Lat3], (instregex "A(L)?HHLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "A(L)?HHLR$")>; def : InstRW<[FXU], (instregex "ALSIH(N)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; @@ -337,7 +337,7 @@ def : InstRW<[FXU], (instregex "SLGR(K)?$")>; def : InstRW<[FXU], (instregex "SLR(K)?$")>; def : InstRW<[FXU], (instregex "SR(K)?$")>; def : InstRW<[FXU], (instregex "S(L)?HHHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "S(L)?HHLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "S(L)?HHLR$")>; // Subtraction with borrow def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "SLB(G)?$")>; @@ -403,13 +403,13 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXU, Lat8], (instregex "MSGR$")>; def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -436,7 +436,8 @@ def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>; -def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -474,7 +475,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "CLI(Y)?$")>; def : InstRW<[FXU], (instregex "CLR$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "CLRL$")>; def : InstRW<[FXU], (instregex "C(L)?HHR$")>; -def : InstRW<[FXU, FXU, Lat3], (instregex "C(L)?HLR$")>; +def : InstRW<[FXU, FXU, Lat3, GroupAlone], (instregex "C(L)?HLR$")>; // Compare halfword def : InstRW<[FXU, LSU, FXU, Lat6, GroupAlone], (instregex "CH(Y|RL)?$")>; @@ -499,7 +500,7 @@ def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; // Compare logical characters under mask -def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; +def : InstRW<[FXU, FXU, LSU, Lat5, GroupAlone], (instregex "CLM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Prefetch @@ -532,7 +533,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -548,36 +549,44 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex 
"TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; +def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>; +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; -def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; -def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, GroupAlone], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; //===----------------------------------------------------------------------===// @@ -621,7 +630,7 @@ def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "BASSM$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; // Population count def : 
InstRW<[FXU, Lat3], (instregex "POPCNT$")>; @@ -632,14 +641,14 @@ def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -780,9 +789,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>; // Division @@ -791,7 +800,7 @@ def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -813,9 +822,9 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "TCXB$")>; def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; -def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; +def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[FXU, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>; def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>; @@ -900,16 +909,20 @@ def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>; def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>; +def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>; +def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[FPU, GroupAlone], (instregex 
"M(A|S)ER$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>; +def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>; // Division def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>; @@ -949,16 +962,21 @@ def : InstRW<[DFU, Lat20], (instregex "LDETR$")>; def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>; // Convert from fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>; +def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>; def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>; // Convert to fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>; -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>; // Convert from / to signed / unsigned packed def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>; @@ -967,7 +985,7 @@ def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "C(S|U)DTR$")>; def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXU, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -979,7 +997,7 @@ def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>; // Extract biased exponent def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>; -def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEXTR$")>; +def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>; // Extract significance def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>; @@ -1010,15 +1028,15 @@ def : InstRW<[DFU, Lat30], (instregex "QADTR$")>; def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXU, DFU, Lat30], (instregex 
"RRDTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1027,15 +1045,15 @@ def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; // Compare def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>; -def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>; +def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>; // Compare biased exponent def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>; -def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; +def : InstRW<[DFU2, Lat9], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>; // -------------------------------- System ---------------------------------- // @@ -1046,19 +1064,20 @@ def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : InstRW<[FXU, Lat3], (instregex "IPK$")>; -def : InstRW<[LSU], (instregex "SPKA$")>; -def : InstRW<[LSU], (instregex "SSM$")>; -def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; def : InstRW<[FXU, Lat3], (instregex "IAC$")>; -def : InstRW<[LSU], (instregex "SAC(F)?$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// // System: Control Register Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; -def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], + (instregex "STCT(L|G)$")>; def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; @@ -1103,16 +1122,17 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; //===----------------------------------------------------------------------===// def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCSK$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; -def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; //===----------------------------------------------------------------------===// // System: Address-Space Instructions 
//===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; -def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; def : InstRW<[FXU, Lat30], (instregex "PR$")>; def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; @@ -1124,7 +1144,7 @@ def : InstRW<[FXU, Lat20], (instregex "TAR$")>; // System: Linkage-Stack Instructions //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>; def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; @@ -1161,9 +1181,9 @@ def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; //===----------------------------------------------------------------------===// def : InstRW<[FXU, Lat30], (instregex "SVC$")>; -def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, GroupAlone], (instregex "MC$")>; def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; -def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRAC(E|G)$")>; def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; def : InstRW<[FXU, Lat30], (instregex "SIGP$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "SIGA$")>; @@ -1176,7 +1196,8 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; def : InstRW<[FXU], (instregex "LPP$")>; def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; -def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXU, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index 59f37205f4127..a0f2115eb9d72 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -69,7 +69,7 @@ def : WriteRes<LSU_lat1, [ZEC12_LSUnit]> { let Latency = 1; } def : WriteRes<FPU, [ZEC12_FPUnit]> { let Latency = 8; } def : WriteRes<FPU2, [ZEC12_FPUnit, ZEC12_FPUnit]> { let Latency = 9; } def : WriteRes<DFU, [ZEC12_DFUnit]> { let Latency = 2; } -def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_FPUnit]> { let Latency = 3; } +def : WriteRes<DFU2, [ZEC12_DFUnit, ZEC12_DFUnit]> { let Latency = 3; } def : WriteRes<VBU, [ZEC12_VBUnit]>; // Virtual Branching Unit // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -251,7 +251,7 @@ def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; // Load multiple disjoint -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], @@ -413,13 +413,13 @@ def : InstRW<[FXU, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXU, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXU, Lat8], (instregex "MSGR$")>; def : InstRW<[FXU, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXU, FXU, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXU, FXU, Lat9, GroupAlone], (instregex 
"MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -446,7 +446,8 @@ def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; def : InstRW<[FXU], (instregex "SLA(G|K)?$")>; -def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXU, FXU, FXU, FXU, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -544,7 +545,7 @@ def : InstRW<[FXU, FXU, FXU, FXU, FXU, FXU, LSU, LSU, Lat12, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -560,36 +561,44 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[FXU, FXU, FXU, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXU, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; +def : InstRW<[FXU, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXU, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "CVBG$")>; +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXU, FXU, FXU, DFU2, DFU2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXU, FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex 
"MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat15, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXU, DFU2, DFU2, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXU, FXU, DFU2, DFU2, LSU, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; -def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; -def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[DFU2, DFU2, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat5, GroupAlone], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; //===----------------------------------------------------------------------===// @@ -659,7 +668,7 @@ def : InstRW<[FXU], (instregex "PPA$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXU, FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; // Population count def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>; @@ -670,14 +679,14 @@ def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXU, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXU, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -818,9 +827,9 @@ def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DBR$")>; // Division @@ -829,7 +838,7 @@ def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[FPU, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -851,10 +860,10 @@ def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], 
(instregex "TCXB$")>; def : InstRW<[FXU, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; -def : InstRW<[LSU, Lat3, GroupAlone], (instregex "STFPC$")>; -def : InstRW<[FXU, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXU, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; -def : InstRW<[FXU, Lat2, GroupAlone], (instregex "SRNM(B|T)?$")>; +def : InstRW<[FXU, LSU, Lat3, GroupAlone], (instregex "STFPC$")>; +def : InstRW<[FXU, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "LFAS$")>; +def : InstRW<[FXU, GroupAlone], (instregex "SRNM(B|T)?$")>; // --------------------- Hexadecimal floating point ------------------------- // @@ -938,16 +947,20 @@ def : InstRW<[FPU], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MXD$")>; def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MXDR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "MXR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, Lat15, GroupAlone], (instregex "MY$")>; +def : InstRW<[FPU, FPU, LSU, Lat15, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[FPU2, FPU2, Lat10, GroupAlone], (instregex "MYR$")>; +def : InstRW<[FPU, Lat10, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)ER$")>; -def : InstRW<[FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; def : InstRW<[FPU, GroupAlone], (instregex "M(A|S)DR$")>; -def : InstRW<[FPU2, FPU2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[FPU2, FPU2, LSU, GroupAlone], (instregex "MAY$")>; +def : InstRW<[FPU2, FPU2, GroupAlone], (instregex "MAYR$")>; +def : InstRW<[FPU, FPU, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[FPU, GroupAlone], (instregex "MAY(H|L)R$")>; // Division def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)$")>; @@ -987,16 +1000,21 @@ def : InstRW<[DFU, Lat20], (instregex "LDETR$")>; def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "LXDTR$")>; // Convert from fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CD(F|G)TR(A)?$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CX(F|G)TR(A)?$")>; +def : InstRW<[FXU, DFU, Lat9, GroupAlone], (instregex "CDFTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CDGTR(A)?$")>; +def : InstRW<[FXU, DFU2, DFU2, GroupAlone], (instregex "CXFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "CXGTR(A)?$")>; def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CDL(F|G)TR$")>; -def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXL(F|G)TR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat11, GroupAlone], (instregex "CXLFTR$")>; +def : InstRW<[FXU, DFU2, DFU2, Lat6, GroupAlone], (instregex "CXLGTR$")>; // Convert to fixed / logical -def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "C(F|G)DTR(A)?$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "C(F|G)XTR(A)?$")>; -def : InstRW<[FXU, DFU, 
Lat30, GroupAlone], (instregex "CL(F|G)DTR$")>; -def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CL(F|G)XTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CFDTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "CGDTR(A)?$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CFXTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat30, GroupAlone], (instregex "CGXTR(A)?$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)DTR$")>; +def : InstRW<[FXU, DFU, DFU, Lat11, GroupAlone], (instregex "CL(F|G)XTR$")>; // Convert from / to signed / unsigned packed def : InstRW<[FXU, DFU, Lat12, GroupAlone], (instregex "CD(S|U)TR$")>; @@ -1007,11 +1025,11 @@ def : InstRW<[FXU, FXU, DFU2, DFU2, Lat20, GroupAlone], (instregex "C(S|U)XTR$") // Convert from / to zoned def : InstRW<[LSU, DFU2, Lat7, GroupAlone], (instregex "CDZT$")>; def : InstRW<[LSU, LSU, DFU2, DFU2, Lat10, GroupAlone], (instregex "CXZT$")>; -def : InstRW<[FXU, LSU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>; +def : InstRW<[FXU, LSU, DFU, DFU, Lat11, GroupAlone], (instregex "CZDT$")>; def : InstRW<[FXU, LSU, DFU, DFU, Lat15, GroupAlone], (instregex "CZXT$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXU, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -1023,7 +1041,7 @@ def : InstRW<[DFU2, DFU2, Lat20, GroupAlone], (instregex "FIXTR$")>; // Extract biased exponent def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEDTR$")>; -def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "EEXTR$")>; +def : InstRW<[FXU, DFU2, Lat15, GroupAlone], (instregex "EEXTR$")>; // Extract significance def : InstRW<[FXU, DFU, Lat15, GroupAlone], (instregex "ESDTR$")>; @@ -1054,15 +1072,15 @@ def : InstRW<[DFU, Lat30], (instregex "QADTR$")>; def : InstRW<[DFU2, DFU2, Lat30, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXU, DFU, Lat30], (instregex "RRDTR$")>; +def : InstRW<[FXU, DFU, Lat30, GroupAlone], (instregex "RRDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat30, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, DFU, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, DFU, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXU, DFU, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXU, DFU, Lat11, GroupAlone], (instregex "IEDTR$")>; def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1071,15 +1089,15 @@ def : InstRW<[FXU, DFU2, DFU2, Lat15, GroupAlone], (instregex "IEXTR$")>; // Compare def : InstRW<[DFU, Lat11], (instregex "(K|C)DTR$")>; -def : InstRW<[DFU, DFU, Lat15, GroupAlone], (instregex "(K|C)XTR$")>; +def : InstRW<[DFU, DFU, Lat15], (instregex "(K|C)XTR$")>; // Compare biased exponent def : InstRW<[DFU, Lat8], (instregex "CEDTR$")>; -def : InstRW<[DFU, Lat9], (instregex "CEXTR$")>; +def : InstRW<[DFU, DFU, Lat9], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, DFU, Lat15], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, DFU2, Lat15], (instregex "TD(C|G)XT$")>; // -------------------------------- System ---------------------------------- // @@ -1090,19 +1108,20 
@@ def : InstRW<[LSU, DFU2, DFU2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : InstRW<[FXU, Lat3], (instregex "IPK$")>; -def : InstRW<[LSU], (instregex "SPKA$")>; -def : InstRW<[LSU], (instregex "SSM$")>; -def : InstRW<[FXU], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXU, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXU, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; def : InstRW<[FXU, Lat3], (instregex "IAC$")>; -def : InstRW<[LSU], (instregex "SAC(F)?$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// // System: Control Register Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LCTL(G)?$")>; -def : InstRW<[LSU, Lat30], (instregex "STCT(L|G)$")>; +def : InstRW<[FXU, LSU, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex "STCT(L|G)$")>; def : InstRW<[LSU], (instregex "E(P|S)A(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "SSA(I)?R$")>; def : InstRW<[FXU, Lat30], (instregex "ESEA$")>; @@ -1148,16 +1167,17 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "TPROT$")>; //===----------------------------------------------------------------------===// def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; -def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; +def : InstRW<[LSU, Lat6, Lat30, GroupAlone], (instregex "MVCSK$")>; +def : InstRW<[LSU, Lat6, GroupAlone], (instregex "MVCDK$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "MVCOS$")>; -def : InstRW<[LSU, Lat30], (instregex "MVPG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; //===----------------------------------------------------------------------===// // System: Address-Space Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXU, LSU, Lat30], (instregex "LASP$")>; -def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "PC$")>; def : InstRW<[FXU, Lat30], (instregex "PR$")>; def : InstRW<[FXU, Lat30], (instregex "PT(I)?$")>; @@ -1169,7 +1189,7 @@ def : InstRW<[FXU, Lat20], (instregex "TAR$")>; // System: Linkage-Stack Instructions //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXU, LSU, Lat30, EndGroup], (instregex "BAKR$")>; def : InstRW<[FXU, Lat30], (instregex "EREG(G)?$")>; def : InstRW<[FXU, Lat30], (instregex "(E|M)STA$")>; @@ -1206,7 +1226,7 @@ def : InstRW<[FXU, Lat30], (instregex "PCKMO$")>; //===----------------------------------------------------------------------===// def : InstRW<[FXU, Lat30], (instregex "SVC$")>; -def : InstRW<[FXU], (instregex "MC$")>; +def : InstRW<[FXU, GroupAlone], (instregex "MC$")>; def : InstRW<[FXU, Lat30], (instregex "DIAG$")>; def : InstRW<[FXU], (instregex "TRAC(E|G)$")>; def : InstRW<[FXU, Lat30], (instregex "TRAP(2|4)$")>; @@ -1221,7 +1241,8 @@ def : InstRW<[FXU, LSU, Lat30], (instregex "SIE$")>; def : InstRW<[FXU], (instregex "LPP$")>; def : InstRW<[FXU, Lat30], (instregex "ECPGA$")>; def : InstRW<[FXU, Lat30], (instregex "E(C|P)CTR$")>; -def : InstRW<[FXU, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : 
InstRW<[FXU, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXU, LSU, Lat30], (instregex "L(P|S)CTL$")>; def : InstRW<[FXU, LSU, Lat30], (instregex "Q(S|CTR)I$")>; def : InstRW<[FXU, Lat30], (instregex "S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZShortenInst.cpp b/lib/Target/SystemZ/SystemZShortenInst.cpp index 7391df8342efd..13ceb371a425e 100644 --- a/lib/Target/SystemZ/SystemZShortenInst.cpp +++ b/lib/Target/SystemZ/SystemZShortenInst.cpp @@ -200,14 +200,26 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001AddCC(MI, SystemZ::ADBR); break; + case SystemZ::WFASB: + Changed |= shortenOn001AddCC(MI, SystemZ::AEBR); + break; + case SystemZ::WFDDB: Changed |= shortenOn001(MI, SystemZ::DDBR); break; + case SystemZ::WFDSB: + Changed |= shortenOn001(MI, SystemZ::DEBR); + break; + case SystemZ::WFIDB: Changed |= shortenFPConv(MI, SystemZ::FIDBRA); break; + case SystemZ::WFISB: + Changed |= shortenFPConv(MI, SystemZ::FIEBRA); + break; + case SystemZ::WLDEB: Changed |= shortenOn01(MI, SystemZ::LDEBR); break; @@ -220,30 +232,58 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) { Changed |= shortenOn001(MI, SystemZ::MDBR); break; + case SystemZ::WFMSB: + Changed |= shortenOn001(MI, SystemZ::MEEBR); + break; + case SystemZ::WFLCDB: Changed |= shortenOn01(MI, SystemZ::LCDFR); break; + case SystemZ::WFLCSB: + Changed |= shortenOn01(MI, SystemZ::LCDFR_32); + break; + case SystemZ::WFLNDB: Changed |= shortenOn01(MI, SystemZ::LNDFR); break; + case SystemZ::WFLNSB: + Changed |= shortenOn01(MI, SystemZ::LNDFR_32); + break; + case SystemZ::WFLPDB: Changed |= shortenOn01(MI, SystemZ::LPDFR); break; + case SystemZ::WFLPSB: + Changed |= shortenOn01(MI, SystemZ::LPDFR_32); + break; + case SystemZ::WFSQDB: Changed |= shortenOn01(MI, SystemZ::SQDBR); break; + case SystemZ::WFSQSB: + Changed |= shortenOn01(MI, SystemZ::SQEBR); + break; + case SystemZ::WFSDB: Changed |= shortenOn001AddCC(MI, SystemZ::SDBR); break; + case SystemZ::WFSSB: + Changed |= shortenOn001AddCC(MI, SystemZ::SEBR); + break; + case SystemZ::WFCDB: Changed |= shortenOn01(MI, SystemZ::CDBR); break; + case SystemZ::WFCSB: + Changed |= shortenOn01(MI, SystemZ::CEBR); + break; + case SystemZ::VL32: // For z13 we prefer LDE over LE to avoid partial register dependencies. 
Changed |= shortenOn0(MI, SystemZ::LDE32); diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index eb4a0962f7eb2..9cd09b0f911e0 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -47,6 +47,10 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, HasVector(false), HasLoadStoreOnCond2(false), HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false), HasDFPPackedConversion(false), + HasMiscellaneousExtensions2(false), HasGuardedStorage(false), + HasMessageSecurityAssist7(false), HasMessageSecurityAssist8(false), + HasVectorEnhancements1(false), HasVectorPackedDecimal(false), + HasInsertReferenceBitsMultiple(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(), FrameLowering() {} diff --git a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index b05a1bb6cafd0..4829f73e080e2 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -56,6 +56,13 @@ protected: bool HasLoadAndZeroRightmostByte; bool HasMessageSecurityAssist5; bool HasDFPPackedConversion; + bool HasMiscellaneousExtensions2; + bool HasGuardedStorage; + bool HasMessageSecurityAssist7; + bool HasMessageSecurityAssist8; + bool HasVectorEnhancements1; + bool HasVectorPackedDecimal; + bool HasInsertReferenceBitsMultiple; private: Triple TargetTriple; @@ -168,6 +175,33 @@ public: // Return true if the target has the vector facility. bool hasVector() const { return HasVector; } + // Return true if the target has the miscellaneous-extensions facility 2. + bool hasMiscellaneousExtensions2() const { + return HasMiscellaneousExtensions2; + } + + // Return true if the target has the guarded-storage facility. + bool hasGuardedStorage() const { return HasGuardedStorage; } + + // Return true if the target has the message-security-assist + // extension facility 7. + bool hasMessageSecurityAssist7() const { return HasMessageSecurityAssist7; } + + // Return true if the target has the message-security-assist + // extension facility 8. + bool hasMessageSecurityAssist8() const { return HasMessageSecurityAssist8; } + + // Return true if the target has the vector-enhancements facility 1. + bool hasVectorEnhancements1() const { return HasVectorEnhancements1; } + + // Return true if the target has the vector-packed-decimal facility. + bool hasVectorPackedDecimal() const { return HasVectorPackedDecimal; } + + // Return true if the target has the insert-reference-bits-multiple facility. + bool hasInsertReferenceBitsMultiple() const { + return HasInsertReferenceBitsMultiple; + } + // Return true if GV can be accessed using LARL for reloc model RM // and code model CM. 
bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const; diff --git a/lib/Target/SystemZ/SystemZTargetMachine.cpp b/lib/Target/SystemZ/SystemZTargetMachine.cpp index cb81c0e5276e7..025bf73d2df0d 100644 --- a/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -143,8 +143,10 @@ public: } // end anonymous namespace void SystemZPassConfig::addIRPasses() { - if (getOptLevel() != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) { addPass(createSystemZTDCPass()); + addPass(createLoopDataPrefetchPass()); + } TargetPassConfig::addIRPasses(); } diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 9ac768b2189d7..506dc74279932 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -372,6 +372,9 @@ int SystemZTTIImpl::getArithmeticInstrCost( Opcode == Instruction::FMul || Opcode == Instruction::FDiv) { switch (ScalarBits) { case 32: { + // The vector enhancements facility 1 provides v4f32 instructions. + if (ST->hasVectorEnhancements1()) + return NumVectors; // Return the cost of multiple scalar invocation plus the cost of // inserting and extracting the values. unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType()); diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 6923fc6fc9104..a0c6fa94f8c1e 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -56,6 +56,10 @@ public: unsigned getNumberOfRegisters(bool Vector); unsigned getRegisterBitWidth(bool Vector) const; + unsigned getCacheLineSize() { return 256; } + unsigned getPrefetchDistance() { return 2000; } + unsigned getMinPrefetchStride() { return 2048; } + bool prefersVectorizedAddressing() { return false; } bool supportsEfficientVectorElementLoadStore() { return true; } bool enableInterleavedAccessVectorization() { return true; } diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index fc4adddc149ba..6e08d4cff6eaf 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -37,6 +37,7 @@ endif() set(sources X86AsmPrinter.cpp X86CallFrameOptimization.cpp + X86CmovConversion.cpp X86ExpandPseudo.cpp X86FastISel.cpp X86FixupBWInsts.cpp diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 19c93cfff0fe9..91201d1fec85a 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -83,6 +83,9 @@ FunctionPass *createX86WinEHStatePass(); /// the MachineInstr to MC. FunctionPass *createX86ExpandPseudoPass(); +/// This pass converts X86 cmov instructions into branch when profitable. +FunctionPass *createX86CmovConverterPass(); + /// Return a Machine IR pass that selectively replaces /// certain byte and word instructions by equivalent 32 bit instructions, /// in order to eliminate partial register usage, false dependences on diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 4ca57fe9fb00f..54eabeac51264 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -814,10 +814,8 @@ def : Proc<"bdver4", [ FeatureMWAITX ]>; -// TODO: The scheduler model falls to BTVER2 model. -// The znver1 model has to be put in place. 
-// Zen -def: ProcessorModel<"znver1", BtVer2Model, [ +// Znver1 +def: ProcessorModel<"znver1", Znver1Model, [ FeatureADX, FeatureAES, FeatureAVX2, diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 6decb550ad5f8..26461986427d2 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -448,7 +448,7 @@ def RetCC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::Swift", CCDelegateTo<RetCC_X86_64_Swift>>, // Handle explicit CC selection - CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<RetCC_X86_Win64_C>>, + CCIfCC<"CallingConv::Win64", CCDelegateTo<RetCC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<RetCC_X86_64_C>>, // Handle Vectorcall CC @@ -1004,7 +1004,7 @@ def CC_X86_64 : CallingConv<[ CCIfCC<"CallingConv::HiPE", CCDelegateTo<CC_X86_64_HiPE>>, CCIfCC<"CallingConv::WebKit_JS", CCDelegateTo<CC_X86_64_WebKit_JS>>, CCIfCC<"CallingConv::AnyReg", CCDelegateTo<CC_X86_64_AnyReg>>, - CCIfCC<"CallingConv::X86_64_Win64", CCDelegateTo<CC_X86_Win64_C>>, + CCIfCC<"CallingConv::Win64", CCDelegateTo<CC_X86_Win64_C>>, CCIfCC<"CallingConv::X86_64_SysV", CCDelegateTo<CC_X86_64_C>>, CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win64_VectorCall>>, CCIfCC<"CallingConv::HHVM", CCDelegateTo<CC_X86_64_HHVM>>, diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp new file mode 100644 index 0000000000000..bfc834435de55 --- /dev/null +++ b/lib/Target/X86/X86CmovConversion.cpp @@ -0,0 +1,611 @@ +//====-- X86CmovConversion.cpp - Convert Cmov to Branch -------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// \file +/// This file implements a pass that converts X86 cmov instructions into +/// branches when profitable. This pass is conservative, i.e., it applies the +/// transformation if and only if it can guarantee a gain with high confidence. +/// +/// Thus, the optimization applies under the following conditions: +/// 1. Consider as candidates only CMOVs in innermost loops, assuming that +/// most hotspots are represented by these loops. +/// 2. Given a group of CMOV instructions that use the same EFLAGS def +/// instruction: +/// a. Consider them as candidates only if all have the same condition code +/// or the opposite one, to prevent generating more than one conditional +/// jump per EFLAGS def instruction. +/// b. Consider them as candidates only if all are profitable to be +/// converted, assuming that one bad conversion may cause a degradation. +/// 3. Apply the conversion only to loops that are found profitable and only +/// to CMOV candidates that were found profitable. +/// a. A loop is considered profitable only if the conversion will reduce +/// its depth cost by some threshold. +/// b. A CMOV is considered profitable if the cost of its condition is +/// higher than the average cost of its true-value and false-value by +/// 25% of the branch-misprediction penalty; this ensures no degradation +/// even with 25% branch misprediction. +/// +/// Note: This pass is assumed to run on SSA machine code.
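(Aside, not part of the patch: a conceptual, source-level sketch of the kind of code this pass targets. The function and names below are illustrative only; the pass itself identifies candidates on machine IR, not on C++ source.)

    #include <cstddef>
    #include <vector>

    // Hypothetical innermost hot loop. The select typically lowers to a CMOV on
    // x86, whose result waits on both A[I] and B[I]. Converted to a branch, a
    // well-predicted Pred[I] lets execution continue with only the chosen
    // operand, which is roughly what the depth-based profitability checks in
    // this file try to quantify. Assumes A and B are at least as long as Pred.
    int select_sum(const std::vector<int> &A, const std::vector<int> &B,
                   const std::vector<int> &Pred) {
      int Sum = 0;
      for (std::size_t I = 0; I < Pred.size(); ++I)
        Sum += Pred[I] ? A[I] : B[I];
      return Sum;
    }
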
+//===----------------------------------------------------------------------===// +// +// External interfaces: +// FunctionPass *llvm::createX86CmovConverterPass(); +// bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF); +// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetSchedule.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +using namespace llvm; + +#define DEBUG_TYPE "x86-cmov-converter" + +STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups"); +STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates"); +STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops"); +STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups"); + +namespace { +// This internal switch can be used to turn off the cmov/branch optimization. +static cl::opt<bool> + EnableCmovConverter("x86-cmov-converter", cl::desc("Enable the X86 cmov-to-branch optimization."), + cl::init(true), cl::Hidden); + +/// Converts X86 cmov instructions into branches when profitable. +class X86CmovConverterPass : public MachineFunctionPass { +public: + X86CmovConverterPass() : MachineFunctionPass(ID) {} + ~X86CmovConverterPass() {} + + StringRef getPassName() const override { return "X86 cmov Conversion"; } + bool runOnMachineFunction(MachineFunction &MF) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + +private: + /// Pass identification, replacement for typeid. + static char ID; + + const MachineRegisterInfo *MRI; + const TargetInstrInfo *TII; + TargetSchedModel TSchedModel; + + /// List of consecutive CMOV instructions. + typedef SmallVector<MachineInstr *, 2> CmovGroup; + typedef SmallVector<CmovGroup, 2> CmovGroups; + + /// Collect all CMOV-group-candidates in \p CurrLoop and update \p + /// CmovInstGroups accordingly. + /// + /// \param CurrLoop Loop being processed. + /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop. + /// \returns true iff it found any CMOV-group-candidate. + bool collectCmovCandidates(MachineLoop *CurrLoop, CmovGroups &CmovInstGroups); + + /// Check if it is profitable to transform each CMOV-group-candidate into a + /// branch. Remove all groups that are not profitable from \p CmovInstGroups. + /// + /// \param CurrLoop Loop being processed. + /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop. + /// \returns true iff any CMOV-group-candidate remains. + bool checkForProfitableCmovCandidates(MachineLoop *CurrLoop, + CmovGroups &CmovInstGroups); + + /// Convert the given list of consecutive CMOV instructions into a branch. + /// + /// \param Group Consecutive CMOV instructions to be converted into a branch.
+ void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const; +}; + +char X86CmovConverterPass::ID = 0; + +void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const { + MachineFunctionPass::getAnalysisUsage(AU); + AU.addRequired<MachineLoopInfo>(); +} + +bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(*MF.getFunction())) + return false; + if (!EnableCmovConverter) + return false; + + DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName() + << "**********\n"); + + bool Changed = false; + MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>(); + const TargetSubtargetInfo &STI = MF.getSubtarget(); + MRI = &MF.getRegInfo(); + TII = STI.getInstrInfo(); + TSchedModel.init(STI.getSchedModel(), &STI, TII); + + //===--------------------------------------------------------------------===// + // Algorithm + // --------- + // For each innermost loop + // collectCmovCandidates() { + // Find all CMOV-group-candidates. + // } + // + // checkForProfitableCmovCandidates() { + // * Calculate both loop-depth and optimized-loop-depth. + // * Use these depths to check for loop transformation profitability. + // * Check for CMOV-group-candidate transformation profitability. + // } + // + // For each profitable CMOV-group-candidate + // convertCmovInstsToBranches() { + // * Create FalseBB, SinkBB, Conditional branch to SinkBB. + // * Replace each CMOV instruction with a PHI instruction in SinkBB. + // } + // + // Note: For more details, see each function description. + //===--------------------------------------------------------------------===// + for (MachineBasicBlock &MBB : MF) { + MachineLoop *CurrLoop = MLI.getLoopFor(&MBB); + + // Optimize only innermost loops. + if (!CurrLoop || CurrLoop->getHeader() != &MBB || + !CurrLoop->getSubLoops().empty()) + continue; + + // List of consecutive CMOV instructions to be processed. + CmovGroups CmovInstGroups; + + if (!collectCmovCandidates(CurrLoop, CmovInstGroups)) + continue; + + if (!checkForProfitableCmovCandidates(CurrLoop, CmovInstGroups)) + continue; + + Changed = true; + for (auto &Group : CmovInstGroups) + convertCmovInstsToBranches(Group); + } + return Changed; +} + +bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop, + CmovGroups &CmovInstGroups) { + //===--------------------------------------------------------------------===// + // Collect all CMOV-group-candidates and add them into CmovInstGroups. + // + // CMOV-group: + // CMOV instructions, in the same MBB, that use the same EFLAGS def instruction. + // + // CMOV-group-candidate: + // CMOV-group where all the CMOV instructions are + // 1. consecutive. + // 2. have the same condition code or the opposite one. + // 3. have only operand registers (X86::CMOVrr). + //===--------------------------------------------------------------------===// + // List of possible improvements (TODOs): + // -------------------------------------- + // TODO: Add support for X86::CMOVrm instructions. + // TODO: Add support for X86::SETcc instructions. + // TODO: Add support for CMOV-groups with non-consecutive CMOV instructions. + //===--------------------------------------------------------------------===// + + // Currently processed CMOV-Group. + CmovGroup Group; + for (auto *MBB : CurrLoop->getBlocks()) { + Group.clear(); + // Condition code of the first CMOV instruction in the currently processed + // range, and its opposite condition code.
+ X86::CondCode FirstCC, FirstOppCC; + // Indicator of a non-CMOVrr instruction in the currently processed range. + bool FoundNonCMOVInst = false; + // Indicator for whether the currently processed CMOV-group should be skipped. + bool SkipGroup = false; + + for (auto &I : *MBB) { + X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode()); + // Check if we found an X86::CMOVrr instruction. + if (CC != X86::COND_INVALID && !I.mayLoad()) { + if (Group.empty()) { + // We found the first CMOV in the range, reset flags. + FirstCC = CC; + FirstOppCC = X86::GetOppositeBranchCondition(CC); + FoundNonCMOVInst = false; + SkipGroup = false; + } + Group.push_back(&I); + // Check if it is a non-consecutive CMOV instruction or it has a different + // condition code than FirstCC or FirstOppCC. + if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC)) + // Set the SkipGroup indicator to skip the currently processed CMOV-Group. + SkipGroup = true; + continue; + } + // If Group is empty, keep looking for the first CMOV in the range. + if (Group.empty()) + continue; + + // We found a non-X86::CMOVrr instruction. + FoundNonCMOVInst = true; + // Check if this instruction defines EFLAGS, to determine the end of the processed + // range, as there would be no more instructions using the current EFLAGS def. + if (I.definesRegister(X86::EFLAGS)) { + // Check if the currently processed CMOV-group should not be skipped and add + // it as a CMOV-group-candidate. + if (!SkipGroup) + CmovInstGroups.push_back(Group); + else + ++NumOfSkippedCmovGroups; + Group.clear(); + } + } + // The end of the basic block is considered the end of the range; check if the + // currently processed CMOV-group should not be skipped and add it as a CMOV-group-candidate. + if (Group.empty()) + continue; + if (!SkipGroup) + CmovInstGroups.push_back(Group); + else + ++NumOfSkippedCmovGroups; + } + + NumOfCmovGroupCandidate += CmovInstGroups.size(); + return !CmovInstGroups.empty(); +} + +/// \returns Depth of the CMOV instruction as if it were converted into a branch. +/// \param TrueOpDepth depth cost of CMOV true value operand. +/// \param FalseOpDepth depth cost of CMOV false value operand. +static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) { + //===--------------------------------------------------------------------===// + // With no info about branch weight, we assume 50% for each value operand. + // Thus, the depth of the optimized CMOV instruction is the rounded-up average + // of its True-Operand-Value-Depth and False-Operand-Value-Depth. + //===--------------------------------------------------------------------===// + return (TrueOpDepth + FalseOpDepth + 1) / 2; +} + +bool X86CmovConverterPass::checkForProfitableCmovCandidates( + MachineLoop *CurrLoop, CmovGroups &CmovInstGroups) { + struct DepthInfo { + /// Depth of original loop. + unsigned Depth; + /// Depth of optimized loop. + unsigned OptDepth; + }; + /// Number of loop iterations for which to calculate instruction depth. + static const unsigned LoopIterations = 2; + DenseMap<MachineInstr *, DepthInfo> DepthMap; + DepthInfo LoopDepth[LoopIterations] = {{0, 0}, {0, 0}}; + enum { PhyRegType = 0, VirRegType = 1, RegTypeNum = 2 }; + /// For each register type, maps the register to its last def instruction. + DenseMap<unsigned, MachineInstr *> RegDefMaps[RegTypeNum]; + /// Maps a register operand to its def instruction, which can be nullptr if it + /// is unknown (e.g., the operand is defined outside the loop). + DenseMap<MachineOperand *, MachineInstr *> OperandToDefMap; + + // Set depth of unknown instruction (i.e., nullptr) to zero.
+ DepthMap[nullptr] = {0, 0}; + + SmallPtrSet<MachineInstr *, 4> CmovInstructions; + for (auto &Group : CmovInstGroups) + CmovInstructions.insert(Group.begin(), Group.end()); + + //===--------------------------------------------------------------------===// + // Step 1: Calculate instruction depth and loop depth. + // Optimized-Loop: + // loop with CMOV-group-candidates converted into branches. + // + // Instruction-Depth: + // instruction latency + max operand depth. + // * For a CMOV instruction in the optimized loop, the depth is calculated as: + // CMOV latency + getDepthOfOptCmov(True-Op-Depth, False-Op-Depth) + // TODO: Find a better way to estimate the latency of the branch instruction + // rather than using the CMOV latency. + // + // Loop-Depth: + // max instruction depth of all instructions in the loop. + // Note: the instruction with max depth represents the critical path in the loop. + // + // Loop-Depth[i]: + // Loop-Depth calculated for the first `i` iterations. + // Note: it is enough to calculate depth for up to two iterations. + // + // Depth-Diff[i]: + // Number of cycles saved in the first `i` iterations by optimizing the loop. + //===--------------------------------------------------------------------===// + for (unsigned I = 0; I < LoopIterations; ++I) { + DepthInfo &MaxDepth = LoopDepth[I]; + for (auto *MBB : CurrLoop->getBlocks()) { + // Clear physical registers Def map. + RegDefMaps[PhyRegType].clear(); + for (MachineInstr &MI : *MBB) { + unsigned MIDepth = 0; + unsigned MIDepthOpt = 0; + bool IsCMOV = CmovInstructions.count(&MI); + for (auto &MO : MI.uses()) { + // Check for "isUse()" since "uses()" also returns implicit definitions. + if (!MO.isReg() || !MO.isUse()) + continue; + unsigned Reg = MO.getReg(); + auto &RDM = RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)]; + if (MachineInstr *DefMI = RDM.lookup(Reg)) { + OperandToDefMap[&MO] = DefMI; + DepthInfo Info = DepthMap.lookup(DefMI); + MIDepth = std::max(MIDepth, Info.Depth); + if (!IsCMOV) + MIDepthOpt = std::max(MIDepthOpt, Info.OptDepth); + } + } + + if (IsCMOV) + MIDepthOpt = getDepthOfOptCmov( + DepthMap[OperandToDefMap.lookup(&MI.getOperand(1))].OptDepth, + DepthMap[OperandToDefMap.lookup(&MI.getOperand(2))].OptDepth); + + // Iterate over all operands to handle implicit definitions as well. + for (auto &MO : MI.operands()) { + if (!MO.isReg() || !MO.isDef()) + continue; + unsigned Reg = MO.getReg(); + RegDefMaps[TargetRegisterInfo::isVirtualRegister(Reg)][Reg] = &MI; + } + + unsigned Latency = TSchedModel.computeInstrLatency(&MI); + DepthMap[&MI] = {MIDepth += Latency, MIDepthOpt += Latency}; + MaxDepth.Depth = std::max(MaxDepth.Depth, MIDepth); + MaxDepth.OptDepth = std::max(MaxDepth.OptDepth, MIDepthOpt); + } + } + } + + unsigned Diff[LoopIterations] = {LoopDepth[0].Depth - LoopDepth[0].OptDepth, + LoopDepth[1].Depth - LoopDepth[1].OptDepth}; + + //===--------------------------------------------------------------------===// + // Step 2: Check if the loop is worth optimizing. + // Worth-Optimize-Loop: + // case 1: Diff[1] == Diff[0] + // The critical path is iteration independent - there is no dependency + // of critical-path instructions on critical-path instructions of the + // previous iteration. + // Thus, it is enough to check the gain percentage of the 1st iteration - + // To be conservative, the optimized loop needs to have a depth at least + // 12.5% smaller than the original loop, per iteration.
+ // + // case 2: Diff[1] > Diff[0] + // The critical path is iteration dependent - there is a dependency of + // critical-path instructions on critical-path instructions of the + // previous iteration. + // Thus, it is required to check the gradient of the gain - the + // change in Depth-Diff compared to the change in Loop-Depth between the + // 1st and 2nd iterations. + // To be conservative, the gradient needs to be at least 50%. + // + // If the loop is not worth optimizing, remove all CMOV-group-candidates. + //===--------------------------------------------------------------------===// + bool WorthOptLoop = false; + if (Diff[1] == Diff[0]) + WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth; + else if (Diff[1] > Diff[0]) + WorthOptLoop = + (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth); + + if (!WorthOptLoop) + return false; + + ++NumOfLoopCandidate; + + //===--------------------------------------------------------------------===// + // Step 3: Check for each CMOV-group-candidate whether it is worth optimizing. + // Worth-Optimize-Group: + // Iff it is worth optimizing all CMOV instructions in the group. + // + // Worth-Optimize-CMOV: + // A predicted branch is faster than a CMOV by the difference between the depth + // of the condition operand and the depth of the taken (predicted) value operand. + // To be conservative, the gain of such a CMOV transformation should cover + // at least 25% of the branch-misprediction penalty. + //===--------------------------------------------------------------------===// + unsigned MispredictPenalty = TSchedModel.getMCSchedModel()->MispredictPenalty; + CmovGroups TempGroups; + std::swap(TempGroups, CmovInstGroups); + for (auto &Group : TempGroups) { + bool WorthOpGroup = true; + for (auto *MI : Group) { + // Avoid a CMOV instruction whose value is used as a pointer to load from. + // This is another conservative check to avoid converting a CMOV instruction + // used in a tree-search-like algorithm, where the branch is unpredictable. + auto UIs = MRI->use_instructions(MI->defs().begin()->getReg()); + if (UIs.begin() != UIs.end() && ++UIs.begin() == UIs.end()) { + unsigned Op = UIs.begin()->getOpcode(); + if (Op == X86::MOV64rm || Op == X86::MOV32rm) { + WorthOpGroup = false; + break; + } + } + + unsigned CondCost = + DepthMap[OperandToDefMap.lookup(&MI->getOperand(3))].Depth; + unsigned ValCost = getDepthOfOptCmov( + DepthMap[OperandToDefMap.lookup(&MI->getOperand(1))].Depth, + DepthMap[OperandToDefMap.lookup(&MI->getOperand(2))].Depth); + if (ValCost > CondCost || (CondCost - ValCost) * 4 < MispredictPenalty) { + WorthOpGroup = false; + break; + } + } + + if (WorthOpGroup) + CmovInstGroups.push_back(Group); + } + + return !CmovInstGroups.empty(); +} + +static bool checkEFLAGSLive(MachineInstr *MI) { + if (MI->killsRegister(X86::EFLAGS)) + return false; + + // The EFLAGS operand of MI might be missing a kill marker. + // Figure out whether the EFLAGS operand should be live after the MI instruction. + MachineBasicBlock *BB = MI->getParent(); + MachineBasicBlock::iterator ItrMI = MI; + + // Scan forward through BB for a use/def of EFLAGS. + for (auto I = std::next(ItrMI), E = BB->end(); I != E; ++I) { + if (I->readsRegister(X86::EFLAGS)) + return true; + if (I->definesRegister(X86::EFLAGS)) + return false; + } + + // We hit the end of the block, check whether EFLAGS is live into a successor.
+ for (auto I = BB->succ_begin(), E = BB->succ_end(); I != E; ++I) { + if ((*I)->isLiveIn(X86::EFLAGS)) + return true; + } + + return false; +} + +void X86CmovConverterPass::convertCmovInstsToBranches( + SmallVectorImpl<MachineInstr *> &Group) const { + assert(!Group.empty() && "No CMOV instructions to convert"); + ++NumOfOptimizedCmovGroups; + + // To convert a CMOVcc instruction, we actually have to insert the diamond + // control-flow pattern. The incoming instruction knows the destination vreg + // to set, the condition code register to branch on, the true/false values to + // select between, and a branch opcode to use. + + // Before + // ----- + // MBB: + // cond = cmp ... + // v1 = CMOVge t1, f1, cond + // v2 = CMOVlt t2, f2, cond + // v3 = CMOVge v1, f3, cond + // + // After + // ----- + // MBB: + // cond = cmp ... + // jge %SinkMBB + // + // FalseMBB: + // jmp %SinkMBB + // + // SinkMBB: + // %v1 = phi[%f1, %FalseMBB], [%t1, %MBB] + // %v2 = phi[%t2, %FalseMBB], [%f2, %MBB] ; For CMOV with OppCC switch + // ; true-value with false-value + // %v3 = phi[%f3, %FalseMBB], [%t1, %MBB] ; Phi instruction cannot use + // ; previous Phi instruction result + + MachineInstr &MI = *Group.front(); + MachineInstr *LastCMOV = Group.back(); + DebugLoc DL = MI.getDebugLoc(); + X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode())); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + MachineBasicBlock *MBB = MI.getParent(); + MachineFunction::iterator It = ++MBB->getIterator(); + MachineFunction *F = MBB->getParent(); + const BasicBlock *BB = MBB->getBasicBlock(); + + MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(BB); + MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(BB); + F->insert(It, FalseMBB); + F->insert(It, SinkMBB); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. + if (checkEFLAGSLive(LastCMOV)) { + FalseMBB->addLiveIn(X86::EFLAGS); + SinkMBB->addLiveIn(X86::EFLAGS); + } + + // Transfer the remainder of BB and its successor edges to SinkMBB. + SinkMBB->splice(SinkMBB->begin(), MBB, + std::next(MachineBasicBlock::iterator(LastCMOV)), MBB->end()); + SinkMBB->transferSuccessorsAndUpdatePHIs(MBB); + + // Add the false and sink blocks as its successors. + MBB->addSuccessor(FalseMBB); + MBB->addSuccessor(SinkMBB); + + // Create the conditional branch instruction. + BuildMI(MBB, DL, TII->get(X86::GetCondBranchFromCond(CC))).addMBB(SinkMBB); + + // Add the sink block to the false block successors. + FalseMBB->addSuccessor(SinkMBB); + + MachineInstrBuilder MIB; + MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); + MachineBasicBlock::iterator MIItEnd = + std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. + // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. 
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; + + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); + + // If this CMOV we are processing is the opposite condition from the jump we + // generated, then we have to swap the operands for the PHI that is going to + // be generated. + if (X86::getCondFromCMovOpc(MIIt->getOpcode()) == OppCC) + std::swap(Op1Reg, Op2Reg); + + auto Op1Itr = RegRewriteTable.find(Op1Reg); + if (Op1Itr != RegRewriteTable.end()) + Op1Reg = Op1Itr->second.first; + + auto Op2Itr = RegRewriteTable.find(Op2Reg); + if (Op2Itr != RegRewriteTable.end()) + Op2Reg = Op2Itr->second.second; + + // SinkMBB: + // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, MBB ] + // ... + MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) + .addReg(Op1Reg) + .addMBB(FalseMBB) + .addReg(Op2Reg) + .addMBB(MBB); + (void)MIB; + DEBUG(dbgs() << "\tFrom: "; MIIt->dump()); + DEBUG(dbgs() << "\tTo: "; MIB->dump()); + + // Add this PHI to the rewrite table. + RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } + + // Now remove the CMOV(s). + MBB->erase(MIItBegin, MIItEnd); +} + +} // End anonymous namespace. + +FunctionPass *llvm::createX86CmovConverterPass() { + return new X86CmovConverterPass(); +} diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index ee9e78146305d..527e5d568ac6f 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -1187,7 +1187,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { CC != CallingConv::X86_StdCall && CC != CallingConv::X86_ThisCall && CC != CallingConv::X86_64_SysV && - CC != CallingConv::X86_64_Win64) + CC != CallingConv::Win64) return false; // Don't handle popping bytes if they don't fit the ret's immediate. @@ -3171,7 +3171,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: case CallingConv::X86_ThisCall: - case CallingConv::X86_64_Win64: + case CallingConv::Win64: case CallingConv::X86_64_SysV: break; } diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index c28746f96439b..95c6f2a3fa342 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -22,7 +22,7 @@ /// instructions and register-to-register moves. It would /// seem like cmov(s) would also be affected, but because of the way cmov is /// really implemented by most machines as reading both the destination and -/// and source regsters, and then "merging" the two based on a condition, +/// and source registers, and then "merging" the two based on a condition, /// it really already should be considered as having a true dependence on the /// destination register as well. /// diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 65486cf7f529e..44eecd664714a 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -1335,6 +1335,13 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CTTZ, VT, Custom); } + // NonVLX sub-targets extend 128/256 vectors to use the 512 version. 
+ for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64, + MVT::v8i64}) { + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + } + // Need to promote to 64-bit even though we have 32-bit masked instructions // because the IR optimizers rearrange bitcasts around logic ops leaving // too many variations to handle if we don't promote them. @@ -1663,10 +1670,11 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores MaxStoresPerMemmoveOptSize = 4; - // TODO: These control memcmp expansion in CGP and are set low to prevent - // altering the vector expansion for 16/32 byte memcmp in SelectionDAGBuilder. - MaxLoadsPerMemcmp = 1; - MaxLoadsPerMemcmpOptSize = 1; + // TODO: These control memcmp expansion in CGP and could be raised higher, but + // that needs to benchmarked and balanced with the potential use of vector + // load/store types (PR33329). + MaxLoadsPerMemcmp = 4; + MaxLoadsPerMemcmpOptSize = 2; // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4). setPrefLoopAlignment(ExperimentalPrefLoopAlignment); @@ -2661,7 +2669,7 @@ static bool mayTailCallThisCC(CallingConv::ID CC) { switch (CC) { // C calling conventions: case CallingConv::C: - case CallingConv::X86_64_Win64: + case CallingConv::Win64: case CallingConv::X86_64_SysV: // Callee pop conventions: case CallingConv::X86_ThisCall: @@ -20188,7 +20196,10 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); EVT MaskVT = Mask.getValueType(); SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other); @@ -20210,7 +20221,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); MVT MaskVT = MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements()); @@ -20235,7 +20249,10 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue Index, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. + if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -20254,7 +20271,10 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG, SDValue ScaleOp, SDValue Chain, const X86Subtarget &Subtarget) { SDLoc dl(Op); - auto *C = cast<ConstantSDNode>(ScaleOp); + auto *C = dyn_cast<ConstantSDNode>(ScaleOp); + // Scale must be constant. 
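The gather/scatter/prefetch changes above all replace an asserting cast<> with dyn_cast<> plus an early return, so a non-constant scale operand simply falls back to generic handling instead of crashing. A small stand-alone sketch of that guard pattern (plain C++ with dynamic_cast and stub types standing in for the LLVM RTTI helpers and SDValue; the names here are hypothetical):

#include <cassert>

struct SDNodeStub { virtual ~SDNodeStub() = default; };
struct ConstantStub : SDNodeStub { long Value = 0; };

// Returns 0 (an "empty" result) when the scale operand is not a constant,
// mirroring the new `if (!C) return SDValue();` early exit; the old cast<>
// form would have asserted on this input instead.
long lowerWithConstantScale(const SDNodeStub *ScaleOp) {
  const auto *C = dynamic_cast<const ConstantStub *>(ScaleOp);
  if (!C)
    return 0;              // bail out, let generic lowering handle it
  return C->Value * 8;     // pretend we emit a node using the constant
}

int main() {
  ConstantStub K; K.Value = 2;
  SDNodeStub NotConstant;
  assert(lowerWithConstantScale(&K) == 16);
  assert(lowerWithConstantScale(&NotConstant) == 0);
  return 0;
}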
+ if (!C) + return SDValue(); SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8); SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); SDValue Segment = DAG.getRegister(0, MVT::i32); @@ -22665,10 +22685,31 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op); SDValue R = Op.getOperand(0); SDValue Amt = Op.getOperand(1); + unsigned Opcode = Op.getOpcode(); + unsigned EltSizeInBits = VT.getScalarSizeInBits(); + + if (Subtarget.hasAVX512()) { + // Attempt to rotate by immediate. + APInt UndefElts; + SmallVector<APInt, 16> EltBits; + if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits)) { + if (!UndefElts && llvm::all_of(EltBits, [EltBits](APInt &V) { + return EltBits[0] == V; + })) { + unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI); + uint64_t RotateAmt = EltBits[0].urem(EltSizeInBits); + return DAG.getNode(Op, DL, VT, R, + DAG.getConstant(RotateAmt, DL, MVT::i8)); + } + } + + // Else, fall-back on VPROLV/VPRORV. + return Op; + } assert(VT.isVector() && "Custom lowering only for vector rotates!"); assert(Subtarget.hasXOP() && "XOP support required for vector rotates!"); - assert((Op.getOpcode() == ISD::ROTL) && "Only ROTL supported"); + assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. // +ve/-ve Amt = rotate left/right. @@ -22683,7 +22724,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (auto *BVAmt = dyn_cast<BuildVectorSDNode>(Amt)) { if (auto *RotateConst = BVAmt->getConstantSplatNode()) { uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); - assert(RotateAmt < VT.getScalarSizeInBits() && "Rotation out of range"); + assert(RotateAmt < EltSizeInBits && "Rotation out of range"); return DAG.getNode(X86ISD::VPROTI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } @@ -24030,7 +24071,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::MULHU: return LowerMULH(Op, Subtarget, DAG); case ISD::UMUL_LOHI: case ISD::SMUL_LOHI: return LowerMUL_LOHI(Op, Subtarget, DAG); - case ISD::ROTL: return LowerRotate(Op, Subtarget, DAG); + case ISD::ROTL: + case ISD::ROTR: return LowerRotate(Op, Subtarget, DAG); case ISD::SRA: case ISD::SRL: case ISD::SHL: return LowerShift(Op, Subtarget, DAG); diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index cc5c09cbf0e5e..705d0f7a5cf7d 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -1759,29 +1759,29 @@ let Predicates = Preds in { (i64 0)), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rr) _.RC:$src1, _.RC:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))), (i64 0)), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rm) _.RC:$src1, addr:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, + (_.KVT (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))), (i64 0)), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrk) _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))))))), (i64 
0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask, + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask, _.RC:$src1, addr:$src2), NewInf.KRC)>; } @@ -1798,7 +1798,7 @@ let Predicates = Preds in { (i64 0)), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmb) _.RC:$src1, addr:$src2), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), (_.KVT (and (_.KVT _.KRCWM:$mask), (_.KVT (OpNode (_.VT _.RC:$src1), @@ -1879,7 +1879,7 @@ defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpeqm, defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpeqm, "VPCMPEQQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm, +defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm, "VPCMPEQQZ", [HasAVX512]>; defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpeqm, "VPCMPEQQZ", [HasAVX512]>; @@ -2127,17 +2127,17 @@ multiclass avx512_icmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo Ne list<Predicate> Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc)), (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), + (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)), (i64 0)), @@ -2145,37 +2145,37 @@ let Predicates = Preds in { addr:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, + (_.KVT (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc))), (i64 0)), (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask, - _.RC:$src1, + _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert + (_.KVT (and (_.KVT _.KRCWM:$mask), + (_.KVT (OpNode (_.VT _.RC:$src1), + (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)))), (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask, + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask, _.RC:$src1, addr:$src2, imm:$cc), NewInf.KRC)>; } } - + multiclass avx512_icmp_cc_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, SDNode OpNode, string InstrStr, - list<Predicate> Preds> + list<Predicate> Preds> : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), @@ -2187,7 +2187,7 @@ let Predicates = Preds in { addr:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), (_.KVT (and (_.KVT _.KRCWM:$mask), (_.KVT (OpNode (_.VT _.RC:$src1), @@ -2447,17 +2447,17 @@ multiclass avx512_fcmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo Ne string InstrStr, list<Predicate> Preds> { let Predicates = Preds in { def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc)), (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, + 
(COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; - + def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), + (_.KVT (X86cmpm (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)), (i64 0)), @@ -2477,19 +2477,19 @@ let Predicates = Preds in { NewInf.KRC)>; } } - + multiclass avx512_fcmp_cc_packed_sae_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, - string InstrStr, list<Predicate> Preds> + string InstrStr, list<Predicate> Preds> : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> { let Predicates = Preds in def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), - (_.VT _.RC:$src2), + (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), + (_.VT _.RC:$src2), imm:$cc, (i32 FROUND_NO_EXC))), (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1, + (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1, _.RC:$src2, imm:$cc), NewInf.KRC)>; @@ -2817,16 +2817,16 @@ let Predicates = [HasAVX512] in { def : Pat<(maskVT (scalar_to_vector GR32:$src)), (COPY_TO_REGCLASS GR32:$src, maskRC)>; - def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; def : Pat<(maskVT (scalar_to_vector GR8:$src)), (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>; - def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>; - def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), + def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; } @@ -3036,7 +3036,7 @@ def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), +def : Pat<(insert_subvector (v16i1 immAllZerosV), (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrr) @@ -3044,8 +3044,8 @@ def : Pat<(insert_subvector (v16i1 immAllZerosV), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), (i8 8)), (i8 8))>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrk) @@ -3063,7 +3063,7 @@ def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2) (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), +def : Pat<(insert_subvector (v16i1 immAllZerosV), (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrri) @@ -3072,8 +3072,8 @@ def : Pat<(insert_subvector (v16i1 immAllZerosV), imm:$cc), (i8 8)), (i8 8))>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, +def : Pat<(insert_subvector (v16i1 immAllZerosV), + (v8i1 (and VK8:$mask, (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))), (i64 0)), (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrik) @@ 
-3379,35 +3379,35 @@ defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, defm VMOVDQA32 : avx512_alignedload_vl<0x6F, "vmovdqa32", avx512vl_i32_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa32", avx512vl_i32_info, - HasAVX512, "VMOVDQA32">, + HasAVX512, "VMOVDQA32">, PD, EVEX_CD8<32, CD8VF>; defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, HasAVX512>, avx512_alignedstore_vl<0x7F, "vmovdqa64", avx512vl_i64_info, - HasAVX512, "VMOVDQA64">, + HasAVX512, "VMOVDQA64">, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, - HasBWI, "VMOVDQU8">, + HasBWI, "VMOVDQU8">, XD, EVEX_CD8<8, CD8VF>; defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, - HasBWI, "VMOVDQU16">, + HasBWI, "VMOVDQU16">, XD, VEX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, - HasAVX512, "VMOVDQU32">, + HasAVX512, "VMOVDQU32">, XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, - HasAVX512, "VMOVDQU64">, + HasAVX512, "VMOVDQU64">, XS, VEX_W, EVEX_CD8<64, CD8VF>; // Special instructions to help with spilling when we don't have VLX. We need @@ -3964,49 +3964,49 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; let hasSideEffects = 0 in { - def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], NoItinerary>, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrr">; let Constraints = "$src0 = $dst" in - def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, + def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrrk">; - - def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + + def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, FoldGenData<"VMOVSSZrrkz">; - def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W, FoldGenData<"VMOVSDZrr">; let Constraints = "$src0 = $dst" in - def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, + def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG, - VEX_W, FoldGenData<"VMOVSDZrrk">; + VEX_W, 
FoldGenData<"VMOVSDZrrk">; - def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f64x_info.KRCWM:$mask, VR128X:$src1, + def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), + (ins f64x_info.KRCWM:$mask, VR128X:$src1, FR64X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, VEX_W, FoldGenData<"VMOVSDZrrkz">; } @@ -5676,6 +5676,109 @@ defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v2i64x_info, [HasVLX]>; defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v4i64x_info, [HasVLX]>; defm : avx512_var_shift_int_lowering_mb<"VPSRAVQ", v8i64_info, [HasAVX512]>; + +// Use 512bit VPROL/VPROLI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. +let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotl (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotli (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPROLQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotli (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotli (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPROLDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + +// Use 512bit VPROR/VPRORI version to implement v2i64/v4i64 + v4i32/v8i32 in case NoVLX. 
+let Predicates = [HasAVX512, NoVLX] in { + def : Pat<(v2i64 (rotr (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORVQZrr + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + sub_xmm)>; + def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORVDZrr + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + sub_ymm)>; + + def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v4i64 (X86vrotri (v4i64 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v8i64 + (VPRORQZri + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; + + def : Pat<(v4i32 (X86vrotri (v4i32 VR128X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), + imm:$src2)), sub_xmm)>; + def : Pat<(v8i32 (X86vrotri (v8i32 VR256X:$src1), (i8 imm:$src2))), + (EXTRACT_SUBREG (v16i32 + (VPRORDZri + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + imm:$src2)), sub_ymm)>; +} + //===-------------------------------------------------------------------===// // 1-src variable permutation VPERMW/D/Q //===-------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 7e4cba1c8345f..343da2573b55c 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -224,7 +224,7 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, const TargetRegisterClass * X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { const Function *F = MF.getFunction(); - if (IsWin64 || (F && F->getCallingConv() == CallingConv::X86_64_Win64)) + if (IsWin64 || (F && F->getCallingConv() == CallingConv::Win64)) return &X86::GR64_TCW64RegClass; else if (Is64Bit) return &X86::GR64_TCRegClass; @@ -334,7 +334,7 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (Is64Bit) return CSR_64_MostRegs_SaveList; break; - case CallingConv::X86_64_Win64: + case CallingConv::Win64: if (!HasSSE) return CSR_Win64_NoSSE_SaveList; return CSR_Win64_SaveList; @@ -450,7 +450,7 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, if (Is64Bit) return CSR_64_MostRegs_RegMask; break; - case CallingConv::X86_64_Win64: + case CallingConv::Win64: return CSR_Win64_RegMask; case CallingConv::X86_64_SysV: return CSR_64_RegMask; diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index a12fa68faf4f1..d831a7974359a 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -663,5 +663,6 @@ include "X86ScheduleAtom.td" include "X86SchedSandyBridge.td" include "X86SchedHaswell.td" 
include "X86ScheduleSLM.td" +include "X86ScheduleZnver1.td" include "X86ScheduleBtVer2.td" diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index ed53893b779ce..9dcc968a1a7af 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -371,6 +371,22 @@ def : WriteRes<WriteFence, [JSAGU]>; def : WriteRes<WriteNop, []>; //////////////////////////////////////////////////////////////////////////////// +// SSE4A instructions. +//////////////////////////////////////////////////////////////////////////////// + +def WriteEXTRQ: SchedWriteRes<[JFPU01]> { + let Latency = 1; + let ResourceCycles = [1]; +} +def : InstRW<[WriteEXTRQ], (instregex "EXTRQ")>; + +def WriteINSERTQ: SchedWriteRes<[JFPU01]> { + let Latency = 2; + let ResourceCycles = [4]; +} +def : InstRW<[WriteINSERTQ], (instregex "INSERTQ")>; + +//////////////////////////////////////////////////////////////////////////////// // AVX instructions. //////////////////////////////////////////////////////////////////////////////// diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td new file mode 100644 index 0000000000000..d5b4cfe2ddee0 --- /dev/null +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -0,0 +1,223 @@ +//=- X86ScheduleZnver1.td - X86 Znver1 Scheduling -------------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Znver1 to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def Znver1Model : SchedMachineModel { + // Zen can decode 4 instructions per cycle. + let IssueWidth = 4; + // Based on the reorder buffer we define MicroOpBufferSize + let MicroOpBufferSize = 192; + let LoadLatency = 4; + let MispredictPenalty = 17; + let HighLatency = 25; + let PostRAScheduler = 1; + + // FIXME: This variable is required for incomplete model. + // We haven't catered all instructions. + // So, we reset the value of this variable so as to + // say that the model is incomplete. + let CompleteModel = 0; +} + +let SchedModel = Znver1Model in { + +// Zen can issue micro-ops to 10 different units in one cycle. +// These are +// * Four integer ALU units (ZALU0, ZALU1, ZALU2, ZALU3) +// * Two AGU units (ZAGU0, ZAGU1) +// * Four FPU units (ZFPU0, ZFPU1, ZFPU2, ZFPU3) +// AGUs feed load store queues @two loads and 1 store per cycle. 
+ +// Four ALU units are defined below +def ZnALU0 : ProcResource<1>; +def ZnALU1 : ProcResource<1>; +def ZnALU2 : ProcResource<1>; +def ZnALU3 : ProcResource<1>; + +// Two AGU units are defined below +def ZnAGU0 : ProcResource<1>; +def ZnAGU1 : ProcResource<1>; + +// Four FPU units are defined below +def ZnFPU0 : ProcResource<1>; +def ZnFPU1 : ProcResource<1>; +def ZnFPU2 : ProcResource<1>; +def ZnFPU3 : ProcResource<1>; + +// FPU grouping +def ZnFPU : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU2, ZnFPU3]>; +def ZnFPU013 : ProcResGroup<[ZnFPU0, ZnFPU1, ZnFPU3]>; +def ZnFPU01 : ProcResGroup<[ZnFPU0, ZnFPU1]>; +def ZnFPU12 : ProcResGroup<[ZnFPU1, ZnFPU2]>; +def ZnFPU13 : ProcResGroup<[ZnFPU1, ZnFPU3]>; +def ZnFPU23 : ProcResGroup<[ZnFPU2, ZnFPU3]>; +def ZnFPU02 : ProcResGroup<[ZnFPU0, ZnFPU2]>; +def ZnFPU03 : ProcResGroup<[ZnFPU0, ZnFPU3]>; + +// Below are the grouping of the units. +// Micro-ops to be issued to multiple units are tackled this way. + +// ALU grouping +// ZnALU03 - 0,3 grouping +def ZnALU03: ProcResGroup<[ZnALU0, ZnALU3]>; + +// 56 Entry (14x4 entries) Int Scheduler +def ZnALU : ProcResGroup<[ZnALU0, ZnALU1, ZnALU2, ZnALU3]> { + let BufferSize=56; +} + +// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations +// but are relevant for some instructions +def ZnAGU : ProcResGroup<[ZnAGU0, ZnAGU1]> { + let BufferSize=28; +} + +// Integer Multiplication issued on ALU1. +def ZnMultiplier : ProcResource<1>; + +// Integer division issued on ALU2. +def ZnDivider : ProcResource<1>; + +// 4 Cycles load-to use Latency is captured +def : ReadAdvance<ReadAfterLd, 4>; + +// (a folded load is an instruction that loads and does some operation) +// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops. +// a. load and +// b. addpd +// This multiclass is for folded loads for integer units. +multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 4 cycles to the latency. + def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> { + let Latency = !add(Lat, 4); + } +} + +// This multiclass is for folded loads for floating point units. +multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant takes 1-cycle on Execution Port. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on ZnAGU + // adds 7 cycles to the latency. 
+ def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> { + let Latency = !add(Lat, 7); + } +} + +// WriteRMW is set for instructions with Memory write +// operation in codegen +def : WriteRes<WriteRMW, [ZnAGU]>; + +def : WriteRes<WriteStore, [ZnAGU]>; +def : WriteRes<WriteMove, [ZnALU]>; +def : WriteRes<WriteLoad, [ZnAGU]> { let Latency = 8; } + +def : WriteRes<WriteZero, []>; +def : WriteRes<WriteLEA, [ZnALU]>; +defm : ZnWriteResPair<WriteALU, ZnALU, 1>; +defm : ZnWriteResPair<WriteShift, ZnALU, 1>; +defm : ZnWriteResPair<WriteJump, ZnALU, 1>; + +// IDIV +def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> { + let Latency = 41; + let ResourceCycles = [1, 41]; +} + +def : WriteRes<WriteIDivLd, [ZnALU2, ZnAGU, ZnDivider]> { + let Latency = 45; + let ResourceCycles = [1, 4, 41]; +} + +// IMUL +def : WriteRes<WriteIMulH, [ZnALU1, ZnMultiplier]>{ + let Latency = 4; +} +def : WriteRes<WriteIMul, [ZnALU1, ZnMultiplier]> { + let Latency = 4; +} + +def : WriteRes<WriteIMulLd,[ZnALU1, ZnMultiplier]> { + let Latency = 8; +} + +// Floating point operations +defm : ZnWriteResFpuPair<WriteFHAdd, ZnFPU0, 3>; +defm : ZnWriteResFpuPair<WriteFAdd, ZnFPU0, 3>; +defm : ZnWriteResFpuPair<WriteFBlend, ZnFPU01, 1>; +defm : ZnWriteResFpuPair<WriteFVarBlend, ZnFPU01, 1>; +defm : ZnWriteResFpuPair<WriteVarBlend, ZnFPU0, 1>; +defm : ZnWriteResFpuPair<WriteCvtI2F, ZnFPU3, 5>; +defm : ZnWriteResFpuPair<WriteCvtF2F, ZnFPU3, 5>; +defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>; +defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>; +defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>; +defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>; +defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>; +defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>; +defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>; + +// Vector integer operations which uses FPU units +defm : ZnWriteResFpuPair<WriteVecShift, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteVecLogic, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WritePHAdd, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteVecALU, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteVecIMul, ZnFPU0, 4>; +defm : ZnWriteResFpuPair<WriteShuffle, ZnFPU, 1>; +defm : ZnWriteResFpuPair<WriteBlend, ZnFPU01, 1>; +defm : ZnWriteResFpuPair<WriteShuffle256, ZnFPU, 2>; + +// Vector Shift Operations +defm : ZnWriteResFpuPair<WriteVarVecShift, ZnFPU12, 1>; + +// AES Instructions. +defm : ZnWriteResFpuPair<WriteAESDecEnc, ZnFPU01, 4>; +defm : ZnWriteResFpuPair<WriteAESIMC, ZnFPU01, 4>; +defm : ZnWriteResFpuPair<WriteAESKeyGen, ZnFPU01, 4>; + +def : WriteRes<WriteFence, [ZnAGU]>; +def : WriteRes<WriteNop, []>; + +// Following instructions with latency=100 are microcoded. +// We set long latency so as to block the entire pipeline. 
+defm : ZnWriteResFpuPair<WriteFShuffle256, ZnFPU, 100>; + +//Microcoded Instructions +let Latency = 100 in { + def : WriteRes<WriteMicrocoded, []>; + def : WriteRes<WriteSystem, []>; + def : WriteRes<WriteMPSAD, []>; + def : WriteRes<WriteMPSADLd, []>; + def : WriteRes<WriteCLMul, []>; + def : WriteRes<WriteCLMulLd, []>; + def : WriteRes<WritePCmpIStrM, []>; + def : WriteRes<WritePCmpIStrMLd, []>; + def : WriteRes<WritePCmpEStrI, []>; + def : WriteRes<WritePCmpEStrILd, []>; + def : WriteRes<WritePCmpEStrM, []>; + def : WriteRes<WritePCmpEStrMLd, []>; + def : WriteRes<WritePCmpIStrI, []>; + def : WriteRes<WritePCmpIStrILd, []>; + } +} diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index fa0afe29586b4..427a0001bef98 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -597,7 +597,7 @@ public: case CallingConv::Intel_OCL_BI: return isTargetWin64(); // This convention allows using the Win64 convention on other targets. - case CallingConv::X86_64_Win64: + case CallingConv::Win64: return true; // This convention allows using the SysV convention on Windows targets. case CallingConv::X86_64_SysV: diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 8d891c983fab0..08c2cdaefe71d 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -375,6 +375,7 @@ bool X86PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); + addPass(createX86CmovConverterPass()); return true; } diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index aaa6d58bd1340..c16207973b393 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -40,6 +40,8 @@ public: ~X86TargetMachine() override; const X86Subtarget *getSubtargetImpl(const Function &F) const override; + // The no argument getSubtargetImpl, while it exists on some targets, is + // deprecated and should not be used. const X86Subtarget *getSubtargetImpl() const = delete; TargetIRAnalysis getTargetIRAnalysis() override; |
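The X86TargetMachine.h hunk above documents why the no-argument getSubtargetImpl is declared deleted: any remaining caller becomes a compile error and is forced onto the per-Function overload. A simplified sketch of that idiom (class and member names are stand-ins, not the real X86TargetMachine interface):

struct Function {};
struct Subtarget {};

class TargetMachineStub {
public:
  const Subtarget *getSubtargetImpl(const Function &F) const {
    (void)F;
    return &ST;
  }
  // Deprecated form: kept visible in the interface, but unusable.
  const Subtarget *getSubtargetImpl() const = delete;

private:
  Subtarget ST;
};

int main() {
  TargetMachineStub TM;
  Function F;
  const Subtarget *S = TM.getSubtargetImpl(F); // OK
  // TM.getSubtargetImpl();                    // error: use of deleted function
  (void)S;
  return 0;
}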