diff options
Diffstat (limited to 'lib/Target')
169 files changed, 6326 insertions, 1410 deletions
diff --git a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp index 4a7e0b2b803e..db1fbe069f4d 100644 --- a/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp +++ b/lib/Target/AArch64/AArch64A57FPLoadBalancing.cpp @@ -509,7 +509,7 @@ int AArch64A57FPLoadBalancing::scavengeRegister(Chain *G, Color C, assert(ChainBegin != ChainEnd && "Chain should contain instructions"); do { --I; - Units.accumulateBackward(*I); + Units.accumulate(*I); } while (I != ChainBegin); // Make sure we allocate in-order, to get the cheapest registers first. diff --git a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp index 6f8dd3e3ac0c..b3b738584b40 100644 --- a/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp +++ b/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp @@ -113,7 +113,7 @@ struct LDTLSCleanup : public MachineFunctionPass { return Copy; } - // Create a virtal register in *TLSBaseAddrReg, and populate it by + // Create a virtual register in *TLSBaseAddrReg, and populate it by // inserting a copy instruction after I. Returns the new instruction. MachineInstr *setRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) { MachineFunction *MF = I.getParent()->getParent(); diff --git a/lib/Target/AArch64/AArch64CondBrTuning.cpp b/lib/Target/AArch64/AArch64CondBrTuning.cpp index 0a948812ff33..51700f905979 100644 --- a/lib/Target/AArch64/AArch64CondBrTuning.cpp +++ b/lib/Target/AArch64/AArch64CondBrTuning.cpp @@ -167,6 +167,7 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, case AArch64::SUBWrs: case AArch64::SUBWrx: IsFlagSetting = false; + LLVM_FALLTHROUGH; case AArch64::ADDSWri: case AArch64::ADDSWrr: case AArch64::ADDSWrs: @@ -226,6 +227,7 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI, case AArch64::SUBXrs: case AArch64::SUBXrx: IsFlagSetting = false; + LLVM_FALLTHROUGH; case AArch64::ADDSXri: case AArch64::ADDSXrr: case AArch64::ADDSXrs: diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 7bf2097c17ce..3682b62d2b84 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -2114,7 +2114,7 @@ bool AArch64FastISel::emitStore(MVT VT, unsigned SrcReg, Address Addr, switch (VT.SimpleTy) { default: llvm_unreachable("Unexpected value type."); - case MVT::i1: VTIsi1 = true; + case MVT::i1: VTIsi1 = true; LLVM_FALLTHROUGH; case MVT::i8: Opc = OpcTable[Idx][0]; break; case MVT::i16: Opc = OpcTable[Idx][1]; break; case MVT::i32: Opc = OpcTable[Idx][2]; break; diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index aaf32a499bc3..60fde5caa339 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -8364,9 +8364,9 @@ static bool findEXTRHalf(SDValue N, SDValue &Src, uint32_t &ShiftAmount, /// EXTR instruction extracts a contiguous chunk of bits from two existing /// registers viewed as a high/low pair. This function looks for the pattern: -/// (or (shl VAL1, #N), (srl VAL2, #RegWidth-N)) and replaces it with an -/// EXTR. Can't quite be done in TableGen because the two immediates aren't -/// independent. +/// <tt>(or (shl VAL1, \#N), (srl VAL2, \#RegWidth-N))</tt> and replaces it +/// with an EXTR. Can't quite be done in TableGen because the two immediates +/// aren't independent. static SDValue tryCombineToEXTR(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) { SelectionDAG &DAG = DCI.DAG; @@ -9531,7 +9531,7 @@ static SDValue performPostLD1Combine(SDNode *N, return SDValue(); } -/// Simplify \Addr given that the top byte of it is ignored by HW during +/// Simplify ``Addr`` given that the top byte of it is ignored by HW during /// address translation. static bool performTBISimplification(SDValue Addr, TargetLowering::DAGCombinerInfo &DCI, diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index 314e89bbca86..dba3e4bdf82f 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1282,6 +1282,7 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { case AArch64CC::HI: // Z clear and C set case AArch64CC::LS: // Z set or C clear UsedFlags.Z = true; + LLVM_FALLTHROUGH; case AArch64CC::HS: // C set case AArch64CC::LO: // C clear UsedFlags.C = true; @@ -1300,6 +1301,7 @@ static UsedNZCV getUsedNZCV(AArch64CC::CondCode CC) { case AArch64CC::GT: // Z clear, N and V the same case AArch64CC::LE: // Z set, N and V differ UsedFlags.Z = true; + LLVM_FALLTHROUGH; case AArch64CC::GE: // N and V the same case AArch64CC::LT: // N and V differ UsedFlags.N = true; @@ -3669,12 +3671,17 @@ enum class FMAInstKind { Default, Indexed, Accumulator }; /// F|MUL I=A,B,0 /// F|ADD R,I,C /// ==> F|MADD R,A,B,C +/// \param MF Containing MachineFunction +/// \param MRI Register information +/// \param TII Target information /// \param Root is the F|ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction /// \param IdxMulOpd is index of operand in Root that is the result of /// the F|MUL. In the example above IdxMulOpd is 1. /// \param MaddOpc the opcode fo the f|madd instruction +/// \param RC Register class of operands +/// \param kind of fma instruction (addressing mode) to be generated static MachineInstr * genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, @@ -3733,6 +3740,9 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, /// ADD R,I,Imm /// ==> ORR V, ZR, Imm /// ==> MADD R,A,B,V +/// \param MF Containing MachineFunction +/// \param MRI Register information +/// \param TII Target information /// \param Root is the ADD instruction /// \param [out] InsInstrs is a vector of machine instructions and will /// contain the generated madd instruction @@ -3741,6 +3751,7 @@ genFusedMultiply(MachineFunction &MF, MachineRegisterInfo &MRI, /// \param MaddOpc the opcode fo the madd instruction /// \param VR is a virtual register that holds the value of an ADD operand /// (V in the example above). +/// \param RC Register class of operands static MachineInstr *genMaddR(MachineFunction &MF, MachineRegisterInfo &MRI, const TargetInstrInfo *TII, MachineInstr &Root, SmallVectorImpl<MachineInstr *> &InsInstrs, @@ -4216,26 +4227,36 @@ void AArch64InstrInfo::genAlternativeCodeSequence( /// \brief Replace csincr-branch sequence by simple conditional branch /// /// Examples: -/// 1. +/// 1. \code /// csinc w9, wzr, wzr, <condition code> /// tbnz w9, #0, 0x44 +/// \endcode /// to +/// \code /// b.<inverted condition code> +/// \endcode /// -/// 2. +/// 2. \code /// csinc w9, wzr, wzr, <condition code> /// tbz w9, #0, 0x44 +/// \endcode /// to +/// \code /// b.<condition code> +/// \endcode /// /// Replace compare and branch sequence by TBZ/TBNZ instruction when the /// compare's constant operand is power of 2. /// /// Examples: +/// \code /// and w8, w8, #0x400 /// cbnz w8, L1 +/// \endcode /// to +/// \code /// tbnz w8, #10, L1 +/// \endcode /// /// \param MI Conditional Branch /// \return True when the simple conditional branch is generated @@ -4409,6 +4430,13 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { return makeArrayRef(TargetFlags); } +ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> +AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const { + static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] = + {{MOSuppressPair, "aarch64-suppress-pair"}}; + return makeArrayRef(TargetFlags); +} + unsigned AArch64InstrInfo::getOutliningBenefit(size_t SequenceSize, size_t Occurrences, bool CanBeTailCall) const { diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h index 58e9ce583d44..0809ede4df2a 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.h +++ b/lib/Target/AArch64/AArch64InstrInfo.h @@ -263,8 +263,8 @@ public: /// \param Pattern - combiner pattern bool isThroughputPattern(MachineCombinerPattern Pattern) const override; /// Return true when there is potentially a faster code sequence - /// for an instruction chain ending in <Root>. All potential patterns are - /// listed in the <Patterns> array. + /// for an instruction chain ending in ``Root``. All potential patterns are + /// listed in the ``Patterns`` array. bool getMachineCombinerPatterns(MachineInstr &Root, SmallVectorImpl<MachineCombinerPattern> &Patterns) const override; @@ -289,6 +289,8 @@ public: getSerializableDirectMachineOperandTargetFlags() const override; ArrayRef<std::pair<unsigned, const char *>> getSerializableBitmaskMachineOperandTargetFlags() const override; + ArrayRef<std::pair<MachineMemOperand::Flags, const char *>> + getSerializableMachineMemOperandTargetFlags() const override; bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override; unsigned getOutliningBenefit(size_t SequenceSize, size_t Occurrences, diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 6cb723d187af..0be14673eb20 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -313,9 +313,6 @@ def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>; //===----------------------------------------------------------------------===// // AArch64 Instruction Predicate Definitions. -def IsDarwin : Predicate<"Subtarget->isTargetDarwin()">; -def IsNotDarwin: Predicate<"!Subtarget->isTargetDarwin()">; - // We could compute these on a per-module basis but doing so requires accessing // the Function object through the <Target>Subtarget and objections were raised // to that (see post-commit review comments for r301750). @@ -714,10 +711,10 @@ def : InstAlias<"negs $dst, $src$shift", defm UDIV : Div<0, "udiv", udiv>; defm SDIV : Div<1, "sdiv", sdiv>; -def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr $Rn, $Rm)>; -def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr $Rn, $Rm)>; -def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr $Rn, $Rm)>; -def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr $Rn, $Rm)>; +def : Pat<(int_aarch64_udiv GPR32:$Rn, GPR32:$Rm), (UDIVWr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(int_aarch64_udiv GPR64:$Rn, GPR64:$Rm), (UDIVXr GPR64:$Rn, GPR64:$Rm)>; +def : Pat<(int_aarch64_sdiv GPR32:$Rn, GPR32:$Rm), (SDIVWr GPR32:$Rn, GPR32:$Rm)>; +def : Pat<(int_aarch64_sdiv GPR64:$Rn, GPR64:$Rm), (SDIVXr GPR64:$Rn, GPR64:$Rm)>; // Variable shift defm ASRV : Shift<0b10, "asr", sra>; diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp index 07ce0e863c5e..7e275e4d2f46 100644 --- a/lib/Target/AArch64/AArch64InstructionSelector.cpp +++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp @@ -33,6 +33,8 @@ #define DEBUG_TYPE "aarch64-isel" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" + using namespace llvm; #ifndef LLVM_BUILD_GLOBAL_ISEL @@ -212,6 +214,7 @@ static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, return GenericOpc; } } + break; case AArch64::FPRRegBankID: switch (OpSize) { case 32: @@ -243,7 +246,8 @@ static unsigned selectBinaryOp(unsigned GenericOpc, unsigned RegBankID, return GenericOpc; } } - }; + break; + } return GenericOpc; } @@ -267,6 +271,7 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, case 64: return isStore ? AArch64::STRXui : AArch64::LDRXui; } + break; case AArch64::FPRRegBankID: switch (OpSize) { case 8: @@ -278,7 +283,8 @@ static unsigned selectLoadStoreUIOp(unsigned GenericOpc, unsigned RegBankID, case 64: return isStore ? AArch64::STRDui : AArch64::LDRDui; } - }; + break; + } return GenericOpc; } @@ -1319,6 +1325,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const { case TargetOpcode::G_VASTART: return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI) : selectVaStartAAPCS(I, MF, MRI); + case TargetOpcode::G_IMPLICIT_DEF: + I.setDesc(TII.get(TargetOpcode::IMPLICIT_DEF)); + return true; } return false; diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp index 4b568f3fba2b..4a0a7c36baf8 100644 --- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp +++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp @@ -291,11 +291,10 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI, unsigned DstPtr; if (Align > PtrSize) { // Realign the list to the actual required alignment. - unsigned AlignMinus1 = MRI.createGenericVirtualRegister(IntPtrTy); - MIRBuilder.buildConstant(AlignMinus1, Align - 1); + auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1); unsigned ListTmp = MRI.createGenericVirtualRegister(PtrTy); - MIRBuilder.buildGEP(ListTmp, List, AlignMinus1); + MIRBuilder.buildGEP(ListTmp, List, AlignMinus1->getOperand(0).getReg()); DstPtr = MRI.createGenericVirtualRegister(PtrTy); MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align)); diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp index f3c8e7e9bdc2..4e65c0ab6011 100644 --- a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp +++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp @@ -163,6 +163,7 @@ AArch64RedundantCopyElimination::knownRegValInBlock( case AArch64::ADDSWri: case AArch64::ADDSXri: IsCMN = true; + LLVM_FALLTHROUGH; // CMP is an alias for SUBS with a dead destination register. case AArch64::SUBSWri: case AArch64::SUBSXri: { diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index a9a9d5ce8429..a3238cf3b60f 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -81,6 +81,7 @@ void AArch64Subtarget::initializeProperties() { break; case CortexA57: MaxInterleaveFactor = 4; + PrefFunctionAlignment = 4; break; case ExynosM1: MaxInterleaveFactor = 4; @@ -130,7 +131,9 @@ void AArch64Subtarget::initializeProperties() { break; case CortexA35: break; case CortexA53: break; - case CortexA72: break; + case CortexA72: + PrefFunctionAlignment = 4; + break; case CortexA73: break; case Others: break; } diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 7933e58c49ee..db53946cbc77 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -218,6 +218,13 @@ public: bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; } bool hasFuseAES() const { return HasFuseAES; } bool hasFuseLiterals() const { return HasFuseLiterals; } + + /// \brief Return true if the CPU supports any kind of instruction fusion. + bool hasFusion() const { + return hasArithmeticBccFusion() || hasArithmeticCbzFusion() || + hasFuseAES() || hasFuseLiterals(); + } + bool useRSqrt() const { return UseRSqrt; } unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; } unsigned getVectorInsertExtractBaseCost() const { diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp index 1252f9403812..6237b8f3e7b9 100644 --- a/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -277,17 +277,19 @@ public: ScheduleDAGInstrs * createMachineScheduler(MachineSchedContext *C) const override { + const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>(); ScheduleDAGMILive *DAG = createGenericSchedLive(C); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); - DAG->addMutation(createAArch64MacroFusionDAGMutation()); + if (ST.hasFusion()) + DAG->addMutation(createAArch64MacroFusionDAGMutation()); return DAG; } ScheduleDAGInstrs * createPostMachineScheduler(MachineSchedContext *C) const override { const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>(); - if (ST.hasFuseAES() || ST.hasFuseLiterals()) { + if (ST.hasFusion()) { // Run the Macro Fusion after RA again since literals are expanded from // pseudos then (v. addPreSched2()). ScheduleDAGMI *DAG = createGenericSchedPostRA(C); diff --git a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp index 0d860a7eef79..7870dce5c9c0 100644 --- a/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ b/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp @@ -756,7 +756,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, // if shift == '11' then ReservedValue() if (shiftHi == 0x3) return Fail; - // Deliberate fallthrough + LLVM_FALLTHROUGH; case AArch64::ANDWrs: case AArch64::ANDSWrs: case AArch64::BICWrs: @@ -780,7 +780,7 @@ static DecodeStatus DecodeThreeAddrSRegInstruction(llvm::MCInst &Inst, // if shift == '11' then ReservedValue() if (shiftHi == 0x3) return Fail; - // Deliberate fallthrough + LLVM_FALLTHROUGH; case AArch64::ANDXrs: case AArch64::ANDSXrs: case AArch64::BICXrs: diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp index 475f91016840..a7a7daf4b4a5 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp @@ -73,7 +73,7 @@ public: void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override; + uint64_t Value, bool IsResolved) const override; bool mayNeedRelaxation(const MCInst &Inst) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, @@ -264,7 +264,7 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, - bool IsPCRel) const { + bool IsResolved) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); if (!Value) return; // Doesn't change encoding. diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index fc808ee0cdd6..c25bd8c8f6cc 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -103,4 +103,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { AArch64MCAsmInfoCOFF::AArch64MCAsmInfoCOFF() { CommentString = ";"; + PrivateGlobalPrefix = ".L"; + PrivateLabelPrefix = ".L"; } diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp index 6f002860044c..ed5370826647 100644 --- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp +++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp @@ -108,10 +108,11 @@ bool AMDGPUAnnotateUniformValues::isClobberedInFunction(LoadInst * Load) { DFS(Start, Checklist); for (auto &BB : Checklist) { BasicBlock::iterator StartIt = (!L && (BB == Load->getParent())) ? - BasicBlock::iterator(Load) : BB->end(); - if (MDR->getPointerDependencyFrom(MemoryLocation(Ptr), - true, StartIt, BB, Load).isClobber()) - return true; + BasicBlock::iterator(Load) : BB->end(); + auto Q = MDR->getPointerDependencyFrom(MemoryLocation(Ptr), true, + StartIt, BB, Load); + if (Q.isClobber() || Q.isUnknown()) + return true; } return false; } diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp index b312dbc8d14d..31ee9206ae27 100644 --- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -380,7 +380,9 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) { FastMathFlags FMF = FPOp->getFastMathFlags(); bool UnsafeDiv = HasUnsafeFPMath || FMF.unsafeAlgebra() || FMF.allowReciprocal(); - if (ST->hasFP32Denormals() && !UnsafeDiv) + + // With UnsafeDiv node will be optimized to just rcp and mul. + if (ST->hasFP32Denormals() || UnsafeDiv) return false; IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath); diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 96f819fd0e68..2553cf4da0fe 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -2651,8 +2651,11 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N, SDValue Shl = DAG.getNode(ISD::SHL, SL, XVT, X, SDValue(RHS, 0)); return DAG.getZExtOrTrunc(Shl, SL, VT); } - case ISD::OR: if (!isOrEquivalentToAdd(DAG, LHS)) break; - case ISD::ADD: { // Fall through from above + case ISD::OR: + if (!isOrEquivalentToAdd(DAG, LHS)) + break; + LLVM_FALLTHROUGH; + case ISD::ADD: { // shl (or|add x, c2), c1 => or|add (shl x, c1), (c2 << c1) if (ConstantSDNode *C2 = dyn_cast<ConstantSDNode>(LHS->getOperand(1))) { SDValue Shl = DAG.getNode(ISD::SHL, SL, VT, LHS->getOperand(0), diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp index 846e7dff5f8c..7e0e9802c0e6 100644 --- a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp +++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp @@ -10,6 +10,7 @@ #include "AMDGPU.h" #include "AMDGPUSubtarget.h" #include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" @@ -34,9 +35,14 @@ public: AMDGPULowerIntrinsics() : ModulePass(ID) {} bool runOnModule(Module &M) override; + bool expandMemIntrinsicUses(Function &F); StringRef getPassName() const override { return "AMDGPU Lower Intrinsics"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired<TargetTransformInfoWrapperPass>(); + } }; } @@ -55,7 +61,7 @@ static bool shouldExpandOperationWithSize(Value *Size) { return !CI || (CI->getZExtValue() > MaxStaticSize); } -static bool expandMemIntrinsicUses(Function &F) { +bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) { Intrinsic::ID ID = F.getIntrinsicID(); bool Changed = false; @@ -67,7 +73,10 @@ static bool expandMemIntrinsicUses(Function &F) { case Intrinsic::memcpy: { auto *Memcpy = cast<MemCpyInst>(Inst); if (shouldExpandOperationWithSize(Memcpy->getLength())) { - expandMemCpyAsLoop(Memcpy); + Function *ParentFunc = Memcpy->getParent()->getParent(); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*ParentFunc); + expandMemCpyAsLoop(Memcpy, TTI); Changed = true; Memcpy->eraseFromParent(); } diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp new file mode 100644 index 000000000000..7263ba73d155 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp @@ -0,0 +1,64 @@ +//===--- AMDGPUMacroFusion.cpp - AMDGPU Macro Fusion ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AMDGPU implementation of the DAG scheduling +/// mutation to pair instructions back to back. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPUMacroFusion.h" +#include "AMDGPUSubtarget.h" +#include "SIInstrInfo.h" + +#include "llvm/CodeGen/MacroFusion.h" + +using namespace llvm; + +namespace { + +/// \brief Check if the instr pair, FirstMI and SecondMI, should be fused +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const SIInstrInfo &TII = static_cast<const SIInstrInfo&>(TII_); + + switch (SecondMI.getOpcode()) { + case AMDGPU::V_ADDC_U32_e64: + case AMDGPU::V_SUBB_U32_e64: + case AMDGPU::V_CNDMASK_B32_e64: { + // Try to cluster defs of condition registers to their uses. This improves + // the chance VCC will be available which will allow shrinking to VOP2 + // encodings. + if (!FirstMI) + return true; + + const MachineOperand *Src2 = TII.getNamedOperand(SecondMI, + AMDGPU::OpName::src2); + return FirstMI->definesRegister(Src2->getReg()); + } + default: + return false; + } + + return false; +} + +} // end namespace + + +namespace llvm { + +std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation () { + return createMacroFusionDAGMutation(shouldScheduleAdjacent); +} + +} // end namespace llvm diff --git a/lib/Target/AMDGPU/AMDGPUMacroFusion.h b/lib/Target/AMDGPU/AMDGPUMacroFusion.h new file mode 100644 index 000000000000..844958580a65 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMacroFusion.h @@ -0,0 +1,19 @@ +//===- AMDGPUMacroFusion.h - AMDGPU Macro Fusion ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +/// Note that you have to add: +/// DAG.addMutation(createAMDGPUMacroFusionDAGMutation()); +/// to AMDGPUPassConfig::createMachineScheduler() to have an effect. +std::unique_ptr<ScheduleDAGMutation> createAMDGPUMacroFusionDAGMutation(); + +} // llvm diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index be47b900c6f0..1bc5a52053ec 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -13,6 +13,14 @@ //===----------------------------------------------------------------------===// #include "AMDGPUSubtarget.h" +#include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL +#include "AMDGPUCallLowering.h" +#include "AMDGPUInstructionSelector.h" +#include "AMDGPULegalizerInfo.h" +#include "AMDGPURegisterBankInfo.h" +#endif #include "SIMachineFunctionInfo.h" #include "llvm/ADT/SmallString.h" #include "llvm/CodeGen/MachineScheduler.h" @@ -72,6 +80,31 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT, return *this; } +#ifdef LLVM_BUILD_GLOBAL_ISEL +namespace { + +struct SIGISelActualAccessor : public GISelAccessor { + std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; + std::unique_ptr<InstructionSelector> InstSelector; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + const AMDGPUCallLowering *getCallLowering() const override { + return CallLoweringInfo.get(); + } + const InstructionSelector *getInstructionSelector() const override { + return InstSelector.get(); + } + const LegalizerInfo *getLegalizerInfo() const override { + return Legalizer.get(); + } + const RegisterBankInfo *getRegBankInfo() const override { + return RegBankInfo.get(); + } +}; + +} // end anonymous namespace +#endif + AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, const TargetMachine &TM) : AMDGPUGenSubtargetInfo(TT, GPU, FS), @@ -265,18 +298,21 @@ bool AMDGPUSubtarget::makeLIDRangeMetadata(Instruction *I) const { case Intrinsic::amdgcn_workitem_id_x: case Intrinsic::r600_read_tidig_x: IdQuery = true; + LLVM_FALLTHROUGH; case Intrinsic::r600_read_local_size_x: Dim = 0; break; case Intrinsic::amdgcn_workitem_id_y: case Intrinsic::r600_read_tidig_y: IdQuery = true; + LLVM_FALLTHROUGH; case Intrinsic::r600_read_local_size_y: Dim = 1; break; case Intrinsic::amdgcn_workitem_id_z: case Intrinsic::r600_read_tidig_z: IdQuery = true; + LLVM_FALLTHROUGH; case Intrinsic::r600_read_local_size_z: Dim = 2; break; @@ -317,11 +353,23 @@ R600Subtarget::R600Subtarget(const Triple &TT, StringRef GPU, StringRef FS, TLInfo(TM, *this) {} SISubtarget::SISubtarget(const Triple &TT, StringRef GPU, StringRef FS, - const TargetMachine &TM) : - AMDGPUSubtarget(TT, GPU, FS, TM), - InstrInfo(*this), - FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), - TLInfo(TM, *this) {} + const TargetMachine &TM) + : AMDGPUSubtarget(TT, GPU, FS, TM), InstrInfo(*this), + FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0), + TLInfo(TM, *this) { +#ifndef LLVM_BUILD_GLOBAL_ISEL + GISelAccessor *GISel = new GISelAccessor(); +#else + SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); + GISel->CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering())); + GISel->Legalizer.reset(new AMDGPULegalizerInfo()); + + GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*getRegisterInfo())); + GISel->InstSelector.reset(new AMDGPUInstructionSelector( + *this, *static_cast<AMDGPURegisterBankInfo *>(GISel->RegBankInfo.get()))); +#endif + setGISelAccessor(*GISel); +} void SISubtarget::overrideSchedPolicy(MachineSchedPolicy &Policy, unsigned NumRegionInstrs) const { diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 425fd35d47de..dc868f010d85 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -19,9 +19,7 @@ #include "AMDGPUCallLowering.h" #include "AMDGPUInstructionSelector.h" #include "AMDGPULegalizerInfo.h" -#ifdef LLVM_BUILD_GLOBAL_ISEL -#include "AMDGPURegisterBankInfo.h" -#endif +#include "AMDGPUMacroFusion.h" #include "AMDGPUTargetObjectFile.h" #include "AMDGPUTargetTransformInfo.h" #include "GCNIterativeScheduler.h" @@ -85,7 +83,7 @@ static cl::opt<bool> EnableLoadStoreVectorizer( static cl::opt<bool> ScalarizeGlobal( "amdgpu-scalarize-global-loads", cl::desc("Enable global load scalarization"), - cl::init(false), + cl::init(true), cl::Hidden); // Option to run internalize pass. @@ -176,6 +174,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) { new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C)); DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI)); DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI)); + DAG->addMutation(createAMDGPUMacroFusionDAGMutation()); return DAG; } @@ -389,31 +388,6 @@ const R600Subtarget *R600TargetMachine::getSubtargetImpl( // GCN Target Machine (SI+) //===----------------------------------------------------------------------===// -#ifdef LLVM_BUILD_GLOBAL_ISEL -namespace { - -struct SIGISelActualAccessor : public GISelAccessor { - std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo; - std::unique_ptr<InstructionSelector> InstSelector; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - const AMDGPUCallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } -}; - -} // end anonymous namespace -#endif - GCNTargetMachine::GCNTargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, TargetOptions Options, @@ -435,21 +409,6 @@ const SISubtarget *GCNTargetMachine::getSubtargetImpl(const Function &F) const { // function that reside in TargetOptions. resetTargetOptions(F); I = llvm::make_unique<SISubtarget>(TargetTriple, GPU, FS, *this); - -#ifndef LLVM_BUILD_GLOBAL_ISEL - GISelAccessor *GISel = new GISelAccessor(); -#else - SIGISelActualAccessor *GISel = new SIGISelActualAccessor(); - GISel->CallLoweringInfo.reset( - new AMDGPUCallLowering(*I->getTargetLowering())); - GISel->Legalizer.reset(new AMDGPULegalizerInfo()); - - GISel->RegBankInfo.reset(new AMDGPURegisterBankInfo(*I->getRegisterInfo())); - GISel->InstSelector.reset(new AMDGPUInstructionSelector(*I, - *static_cast<AMDGPURegisterBankInfo*>(GISel->RegBankInfo.get()))); -#endif - - I->setGISelAccessor(*GISel); } I->setScalarizeGlobalBehavior(ScalarizeGlobal); diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 7b8756050b75..e3c90f250600 100644 --- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1058,17 +1058,13 @@ public: OperandMatchResultTy parseOModOperand(OperandVector &Operands); - void cvtId(MCInst &Inst, const OperandVector &Operands); - void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands); - - void cvtVOP3Impl(MCInst &Inst, - const OperandVector &Operands, - OptionalImmIndexMap &OptionalIdx); + void cvtVOP3(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); - void cvtVOP3OMod(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); - void cvtMIMG(MCInst &Inst, const OperandVector &Operands); + void cvtMIMG(MCInst &Inst, const OperandVector &Operands, + bool IsAtomic = false); void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands); OperandMatchResultTy parseDPPCtrl(OperandVector &Operands); @@ -3870,13 +3866,19 @@ void AMDGPUAsmParser::cvtMtbuf(MCInst &Inst, const OperandVector &Operands) { // mimg //===----------------------------------------------------------------------===// -void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) { +void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands, + bool IsAtomic) { unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } + if (IsAtomic) { + // Add src, same as dst + ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1); + } + OptionalImmIndexMap OptionalIdx; for (unsigned E = Operands.size(); I != E; ++I) { @@ -3904,39 +3906,7 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands) { } void AMDGPUAsmParser::cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands) { - unsigned I = 1; - const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); - for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { - ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); - } - - // Add src, same as dst - ((AMDGPUOperand &)*Operands[I]).addRegOperands(Inst, 1); - - OptionalImmIndexMap OptionalIdx; - - for (unsigned E = Operands.size(); I != E; ++I) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - - // Add the register arguments - if (Op.isRegOrImm()) { - Op.addRegOrImmOperands(Inst, 1); - continue; - } else if (Op.isImmModifier()) { - OptionalIdx[Op.getImmTy()] = I; - } else { - llvm_unreachable("unexpected operand type"); - } - } - - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDMask); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyUNorm); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyDA); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC); + cvtMIMG(Inst, Operands, true); } AMDGPUOperand::Ptr AMDGPUAsmParser::defaultDMask() const { @@ -4118,25 +4088,6 @@ OperandMatchResultTy AMDGPUAsmParser::parseOModOperand(OperandVector &Operands) return MatchOperand_NoMatch; } -void AMDGPUAsmParser::cvtId(MCInst &Inst, const OperandVector &Operands) { - unsigned I = 1; - const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); - for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { - ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); - } - for (unsigned E = Operands.size(); I != E; ++I) - ((AMDGPUOperand &)*Operands[I]).addRegOrImmOperands(Inst, 1); -} - -void AMDGPUAsmParser::cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands) { - uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags; - if (TSFlags & SIInstrFlags::VOP3) { - cvtVOP3(Inst, Operands); - } else { - cvtId(Inst, Operands); - } -} - static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { // 1. This operand is input modifiers return Desc.OpInfo[OpNum].OperandType == AMDGPU::OPERAND_INPUT_MODS @@ -4148,91 +4099,78 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) { && Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1; } -void AMDGPUAsmParser::cvtVOP3Impl(MCInst &Inst, const OperandVector &Operands, - OptionalImmIndexMap &OptionalIdx) { +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands, + OptionalImmIndexMap &OptionalIdx) { + unsigned Opc = Inst.getOpcode(); + unsigned I = 1; const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); } - for (unsigned E = Operands.size(); I != E; ++I) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { - Op.addRegOrImmWithFPInputModsOperands(Inst, 2); - } else if (Op.isImmModifier()) { - OptionalIdx[Op.getImmTy()] = I; - } else if (Op.isRegOrImm()) { - Op.addRegOrImmOperands(Inst, 1); - } else { - llvm_unreachable("unhandled operand type"); + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers) != -1) { + // This instruction has src modifiers + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) { + Op.addRegOrImmWithFPInputModsOperands(Inst, 2); + } else if (Op.isImmModifier()) { + OptionalIdx[Op.getImmTy()] = I; + } else if (Op.isRegOrImm()) { + Op.addRegOrImmOperands(Inst, 1); + } else { + llvm_unreachable("unhandled operand type"); + } + } + } else { + // No src modifiers + for (unsigned E = Operands.size(); I != E; ++I) { + AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); + if (Op.isMod()) { + OptionalIdx[Op.getImmTy()] = I; + } else { + Op.addRegOrImmOperands(Inst, 1); + } } } -} -void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { - OptionalImmIndexMap OptionalIdx; - - cvtVOP3Impl(Inst, Operands, OptionalIdx); + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); + } - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::omod) != -1) { + addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + } // special case v_mac_{f16, f32}: // it has src2 register operand that is tied to dst operand // we don't allow modifiers for this operand in assembler so src2_modifiers // should be 0 - if (Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_si || - Inst.getOpcode() == AMDGPU::V_MAC_F32_e64_vi || - Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi) { + if (Opc == AMDGPU::V_MAC_F32_e64_si || Opc == AMDGPU::V_MAC_F32_e64_vi || + Opc == AMDGPU::V_MAC_F16_e64_vi) { auto it = Inst.begin(); - std::advance( - it, - AMDGPU::getNamedOperandIdx(Inst.getOpcode() == AMDGPU::V_MAC_F16_e64_vi ? - AMDGPU::V_MAC_F16_e64 : - AMDGPU::V_MAC_F32_e64, - AMDGPU::OpName::src2_modifiers)); + std::advance(it, AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2_modifiers)); it = Inst.insert(it, MCOperand::createImm(0)); // no modifiers for src2 ++it; Inst.insert(it, Inst.getOperand(0)); // src2 = dst } } -void AMDGPUAsmParser::cvtVOP3OMod(MCInst &Inst, const OperandVector &Operands) { +void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptionalIdx; - - unsigned I = 1; - const MCInstrDesc &Desc = MII.get(Inst.getOpcode()); - for (unsigned J = 0; J < Desc.getNumDefs(); ++J) { - ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1); - } - - for (unsigned E = Operands.size(); I != E; ++I) { - AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]); - if (Op.isMod()) { - OptionalIdx[Op.getImmTy()] = I; - } else { - Op.addRegOrImmOperands(Inst, 1); - } - } - - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI); - addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI); + cvtVOP3(Inst, Operands, OptionalIdx); } void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { OptionalImmIndexMap OptIdx; - cvtVOP3Impl(Inst, Operands, OptIdx); + cvtVOP3(Inst, Operands, OptIdx); // FIXME: This is messy. Parse the modifiers as if it was a normal VOP3 // instruction, and then figure out where to actually put the modifiers int Opc = Inst.getOpcode(); - if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) { - addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI); - } - addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel); addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, -1); @@ -4284,7 +4222,7 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]); - Inst.getOperand(ModIdx).setImm(ModVal); + Inst.getOperand(ModIdx).setImm(Inst.getOperand(ModIdx).getImm() | ModVal); } } diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 917d9cfa6905..971208c5db84 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -47,6 +47,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUIntrinsicInfo.cpp AMDGPUISelDAGToDAG.cpp AMDGPULowerIntrinsics.cpp + AMDGPUMacroFusion.cpp AMDGPUMCInstLower.cpp AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp index 8ead48067336..2e7641cda375 100644 --- a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp +++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp @@ -17,7 +17,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" namespace llvm { std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots, diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp index d378df674be9..0657f67b217d 100644 --- a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp +++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp @@ -15,7 +15,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" namespace { class GCNMinRegScheduler { diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index 390a8286c76a..1d02c7fdffbf 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -16,7 +16,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 8ec46665daf5..155b400ba022 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -20,7 +20,7 @@ #include "llvm/CodeGen/RegisterClassInfo.h" #include "llvm/Support/MathExtras.h" -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" using namespace llvm; diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h index 3ed3cd5b3b1c..060d2ca72d93 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.h +++ b/lib/Target/AMDGPU/GCNSchedStrategy.h @@ -66,7 +66,7 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive { const SIMachineFunctionInfo &MFI; - // Occupancy target at the begining of function scheduling cycle. + // Occupancy target at the beginning of function scheduling cycle. unsigned StartingOccupancy; // Minimal real occupancy recorder for the function. diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index 2b408ff10caa..a50e3eb8d9ce 100644 --- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -32,7 +32,7 @@ public: void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override; + uint64_t Value, bool IsResolved) const override; bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value, const MCRelaxableFragment *DF, const MCAsmLayout &Layout) const override { @@ -100,7 +100,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, void AMDGPUAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, - bool IsPCRel) const { + bool IsResolved) const { Value = adjustFixupValue(Fixup, Value, &Asm.getContext()); if (!Value) return; // Doesn't change encoding. diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td index a515eecc222a..06e2c11b0193 100644 --- a/lib/Target/AMDGPU/MIMGInstructions.td +++ b/lib/Target/AMDGPU/MIMGInstructions.td @@ -26,6 +26,7 @@ class MIMG_Helper <dag outs, dag ins, string asm, let isAsmParserOnly = !if(!eq(dns,""), 1, 0); let AsmMatchConverter = "cvtMIMG"; let usesCustomInserter = 1; + let SchedRW = [WriteVMEM]; } class MIMG_NoSampler_Helper <bits<7> op, string asm, diff --git a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp index 6993e8a62a9c..00cbd24b84fb 100644 --- a/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp +++ b/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp @@ -555,7 +555,7 @@ public: CFStack.pushBranch(AMDGPU::CF_PUSH_EG); } else CFStack.pushBranch(AMDGPU::CF_ALU_PUSH_BEFORE); - + LLVM_FALLTHROUGH; case AMDGPU::CF_ALU: I = MI; AluClauses.push_back(MakeALUClause(MBB, I)); diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp index 215791f4f92d..69a63b6941ef 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.cpp +++ b/lib/Target/AMDGPU/R600ISelLowering.cpp @@ -1618,7 +1618,8 @@ EVT R600TargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &, return VT.changeVectorElementTypeToInteger(); } -bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const { +bool R600TargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, + const SelectionDAG &DAG) const { // Local and Private addresses do not handle vectors. Limit to i32 if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS)) { return (MemVT.getSizeInBits() <= 32); diff --git a/lib/Target/AMDGPU/R600ISelLowering.h b/lib/Target/AMDGPU/R600ISelLowering.h index d6a0876a6ee7..2a774693f02b 100644 --- a/lib/Target/AMDGPU/R600ISelLowering.h +++ b/lib/Target/AMDGPU/R600ISelLowering.h @@ -44,7 +44,8 @@ public: EVT getSetCCResultType(const DataLayout &DL, LLVMContext &, EVT VT) const override; - bool canMergeStoresTo(unsigned AS, EVT MemVT) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT, + const SelectionDAG &DAG) const override; bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, diff --git a/lib/Target/AMDGPU/R600MachineScheduler.cpp b/lib/Target/AMDGPU/R600MachineScheduler.cpp index 47fda1c8fa82..a7e540f9d14d 100644 --- a/lib/Target/AMDGPU/R600MachineScheduler.cpp +++ b/lib/Target/AMDGPU/R600MachineScheduler.cpp @@ -22,7 +22,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" void R600SchedStrategy::initialize(ScheduleDAGMI *dag) { assert(dag->hasVRegLiveness() && "R600SchedStrategy needs vreg liveness"); diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp index f391f67a241f..3af242d9ea66 100644 --- a/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -137,6 +137,7 @@ static bool isInlineConstantIfFolded(const SIInstrInfo *TII, = TII->get(IsF32 ? AMDGPU::V_MAD_F32 : AMDGPU::V_MAD_F16); return TII->isInlineConstant(OpToFold, MadDesc.OpInfo[OpNo].OperandType); } + return false; } default: return false; diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index d39b345bdf03..2ba570b9ebbb 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -547,7 +547,7 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.align = 0; const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4)); - Info.vol = !Vol || !Vol->isNullValue(); + Info.vol = !Vol || !Vol->isZero(); Info.readMem = true; Info.writeMem = true; return true; @@ -713,7 +713,8 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL, } } -bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT) const { +bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT, + const SelectionDAG &DAG) const { if (AS == AMDGPUASI.GLOBAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) { return (MemVT.getSizeInBits() <= 4 * 32); } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) { @@ -2374,20 +2375,16 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N, } case ISD::INTRINSIC_WO_CHAIN: { unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue(); - switch (IID) { - case Intrinsic::amdgcn_cvt_pkrtz: { + if (IID == Intrinsic::amdgcn_cvt_pkrtz) { SDValue Src0 = N->getOperand(1); SDValue Src1 = N->getOperand(2); SDLoc SL(N); SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32, Src0, Src1); - Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt)); return; } - default: - break; - } + break; } case ISD::SELECT: { SDLoc SL(N); @@ -3736,7 +3733,9 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, SDValue LHS = Op.getOperand(0); SDValue RHS = Op.getOperand(1); EVT VT = Op.getValueType(); - bool Unsafe = DAG.getTarget().Options.UnsafeFPMath; + const SDNodeFlags Flags = Op->getFlags(); + bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || + Flags.hasUnsafeAlgebra() || Flags.hasAllowReciprocal(); if (!Unsafe && VT == MVT::f32 && Subtarget->hasFP32Denormals()) return SDValue(); @@ -3771,15 +3770,11 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op, } } - const SDNodeFlags Flags = Op->getFlags(); - - if (Unsafe || Flags.hasAllowReciprocal()) { + if (Unsafe) { // Turn into multiply by the reciprocal. // x / y -> x * (1.0 / y) - SDNodeFlags NewFlags; - NewFlags.setUnsafeAlgebra(true); SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS); - return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, NewFlags); + return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags); } return SDValue(); @@ -4622,15 +4617,99 @@ SDValue SITargetLowering::performClassCombine(SDNode *N, return SDValue(); } +static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { + if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) + return true; + + return DAG.isKnownNeverNaN(Op); +} + +static bool isCanonicalized(SDValue Op, const SISubtarget *ST, + unsigned MaxDepth=5) { + // If source is a result of another standard FP operation it is already in + // canonical form. + + switch (Op.getOpcode()) { + default: + break; + + // These will flush denorms if required. + case ISD::FADD: + case ISD::FSUB: + case ISD::FMUL: + case ISD::FSQRT: + case ISD::FCEIL: + case ISD::FFLOOR: + case ISD::FMA: + case ISD::FMAD: + + case ISD::FCANONICALIZE: + return true; + + case ISD::FP_ROUND: + return Op.getValueType().getScalarType() != MVT::f16 || + ST->hasFP16Denormals(); + + case ISD::FP_EXTEND: + return Op.getOperand(0).getValueType().getScalarType() != MVT::f16 || + ST->hasFP16Denormals(); + + case ISD::FP16_TO_FP: + case ISD::FP_TO_FP16: + return ST->hasFP16Denormals(); + + // It can/will be lowered or combined as a bit operation. + // Need to check their input recursively to handle. + case ISD::FNEG: + case ISD::FABS: + return (MaxDepth > 0) && + isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1); + + case ISD::FSIN: + case ISD::FCOS: + case ISD::FSINCOS: + return Op.getValueType().getScalarType() != MVT::f16; + + // In pre-GFX9 targets V_MIN_F32 and others do not flush denorms. + // For such targets need to check their input recursively. + // TODO: on GFX9+ we could return true without checking provided no-nan + // mode, since canonicalization is also used to quiet sNaNs. + case ISD::FMINNUM: + case ISD::FMAXNUM: + case ISD::FMINNAN: + case ISD::FMAXNAN: + + return (MaxDepth > 0) && + isCanonicalized(Op.getOperand(0), ST, MaxDepth - 1) && + isCanonicalized(Op.getOperand(1), ST, MaxDepth - 1); + + case ISD::ConstantFP: { + auto F = cast<ConstantFPSDNode>(Op)->getValueAPF(); + return !F.isDenormal() && !(F.isNaN() && F.isSignaling()); + } + } + return false; +} + // Constant fold canonicalize. SDValue SITargetLowering::performFCanonicalizeCombine( SDNode *N, DAGCombinerInfo &DCI) const { + SelectionDAG &DAG = DCI.DAG; ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0)); - if (!CFP) + + if (!CFP) { + SDValue N0 = N->getOperand(0); + + bool IsIEEEMode = Subtarget->enableIEEEBit(DAG.getMachineFunction()); + + if ((IsIEEEMode || isKnownNeverSNan(DAG, N0)) && + isCanonicalized(N0, getSubtarget())) + return N0; + return SDValue(); + } - SelectionDAG &DAG = DCI.DAG; const APFloat &C = CFP->getValueAPF(); // Flush denormals to 0 if not enabled. @@ -4723,13 +4802,6 @@ SDValue SITargetLowering::performIntMed3ImmCombine( return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3); } -static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) { - if (!DAG.getTargetLoweringInfo().hasFloatingPointExceptions()) - return true; - - return DAG.isKnownNeverNaN(Op); -} - SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL, SDValue Op0, diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index 24f88e632d38..83392a7ab1b2 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -153,7 +153,8 @@ public: bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, unsigned AS) const override; - bool canMergeStoresTo(unsigned AS, EVT MemVT) const override; + bool canMergeStoresTo(unsigned AS, EVT MemVT, + const SelectionDAG &DAG) const override; bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align, diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index b6784ec14e9f..160f8837d49c 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2022,10 +2022,12 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB, return nullptr; case AMDGPU::V_MAC_F16_e64: IsF16 = true; + LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e64: break; case AMDGPU::V_MAC_F16_e32: IsF16 = true; + LLVM_FALLTHROUGH; case AMDGPU::V_MAC_F32_e32: { int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); @@ -4320,6 +4322,24 @@ SIInstrInfo::CreateTargetPostRAHazardRecognizer(const MachineFunction &MF) const return new GCNHazardRecognizer(MF); } +std::pair<unsigned, unsigned> +SIInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + return std::make_pair(TF & MO_MASK, TF & ~MO_MASK); +} + +ArrayRef<std::pair<unsigned, const char *>> +SIInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + static const std::pair<unsigned, const char *> TargetFlags[] = { + { MO_GOTPCREL, "amdgpu-gotprel" }, + { MO_GOTPCREL32_LO, "amdgpu-gotprel32-lo" }, + { MO_GOTPCREL32_HI, "amdgpu-gotprel32-hi" }, + { MO_REL32_LO, "amdgpu-rel32-lo" }, + { MO_REL32_HI, "amdgpu-rel32-hi" } + }; + + return makeArrayRef(TargetFlags); +} + bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI); diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 74b48c761808..d00c0d4a7f4e 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -100,6 +100,8 @@ protected: public: enum TargetOperandFlags { + MO_MASK = 0x7, + MO_NONE = 0, // MO_GOTPCREL -> symbol@GOTPCREL -> R_AMDGPU_GOTPCREL. MO_GOTPCREL = 1, @@ -781,9 +783,15 @@ public: void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const; + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + ArrayRef<std::pair<int, const char *>> getSerializableTargetIndices() const override; + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + ScheduleHazardRecognizer * CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II, const ScheduleDAG *DAG) const override; diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td index 4a81fb3b463a..ffb01363e131 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.td +++ b/lib/Target/AMDGPU/SIInstrInfo.td @@ -1502,6 +1502,8 @@ def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>; def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>; def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>; +def VOP_F32_V2F16_V2F16_V2F16 : VOPProfile <[f32, v2f16, v2f16, v2f16]>; + def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>; def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>; diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp index bb17dbbdfbd6..34886c48f461 100644 --- a/lib/Target/AMDGPU/SIMachineScheduler.cpp +++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp @@ -38,7 +38,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" // This scheduler implements a different scheduling algorithm than // GenericScheduler. diff --git a/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 96a18544f02a..874fbadca7f3 100644 --- a/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -110,10 +110,8 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, } const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1); - const MachineOperand *Src1Mod = - TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers); - - if (Src1 && (!isVGPR(Src1, TRI, MRI) || (Src1Mod && Src1Mod->getImm() != 0))) + if (Src1 && (!isVGPR(Src1, TRI, MRI) || + TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers))) return false; // We don't need to check src0, all input types are legal, so just make sure @@ -122,58 +120,64 @@ static bool canShrink(MachineInstr &MI, const SIInstrInfo *TII, return false; // Check output modifiers - if (TII->hasModifiersSet(MI, AMDGPU::OpName::omod)) - return false; - - return !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp); + return !TII->hasModifiersSet(MI, AMDGPU::OpName::omod) && + !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp); } /// \brief This function checks \p MI for operands defined by a move immediate /// instruction and then folds the literal constant into the instruction if it -/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instruction -/// and will only fold literal constants if we are still in SSA. -static void foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, +/// can. This function assumes that \p MI is a VOP1, VOP2, or VOPC instructions. +static bool foldImmediates(MachineInstr &MI, const SIInstrInfo *TII, MachineRegisterInfo &MRI, bool TryToCommute = true) { - - if (!MRI.isSSA()) - return; - assert(TII->isVOP1(MI) || TII->isVOP2(MI) || TII->isVOPC(MI)); int Src0Idx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); - // Only one literal constant is allowed per instruction, so if src0 is a - // literal constant then we can't do any folding. - if (TII->isLiteralConstant(MI, Src0Idx)) - return; - // Try to fold Src0 MachineOperand &Src0 = MI.getOperand(Src0Idx); - if (Src0.isReg() && MRI.hasOneUse(Src0.getReg())) { + if (Src0.isReg()) { unsigned Reg = Src0.getReg(); - MachineInstr *Def = MRI.getUniqueVRegDef(Reg); - if (Def && Def->isMoveImmediate()) { - MachineOperand &MovSrc = Def->getOperand(1); - bool ConstantFolded = false; - - if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) || - isUInt<32>(MovSrc.getImm()))) { - Src0.ChangeToImmediate(MovSrc.getImm()); - ConstantFolded = true; - } - if (ConstantFolded) { - if (MRI.use_empty(Reg)) + if (TargetRegisterInfo::isVirtualRegister(Reg) && MRI.hasOneUse(Reg)) { + MachineInstr *Def = MRI.getUniqueVRegDef(Reg); + if (Def && Def->isMoveImmediate()) { + MachineOperand &MovSrc = Def->getOperand(1); + bool ConstantFolded = false; + + if (MovSrc.isImm() && (isInt<32>(MovSrc.getImm()) || + isUInt<32>(MovSrc.getImm()))) { + // It's possible to have only one component of a super-reg defined by + // a single mov, so we need to clear any subregister flag. + Src0.setSubReg(0); + Src0.ChangeToImmediate(MovSrc.getImm()); + ConstantFolded = true; + } else if (MovSrc.isFI()) { + Src0.setSubReg(0); + Src0.ChangeToFrameIndex(MovSrc.getIndex()); + ConstantFolded = true; + } + + if (ConstantFolded) { + assert(MRI.use_empty(Reg)); Def->eraseFromParent(); - ++NumLiteralConstantsFolded; - return; + ++NumLiteralConstantsFolded; + return true; + } } } } // We have failed to fold src0, so commute the instruction and try again. - if (TryToCommute && MI.isCommutable() && TII->commuteInstruction(MI)) - foldImmediates(MI, TII, MRI, false); + if (TryToCommute && MI.isCommutable()) { + if (TII->commuteInstruction(MI)) { + if (foldImmediates(MI, TII, MRI, false)) + return true; + // Commute back. + TII->commuteInstruction(MI); + } + } + + return false; } // Copy MachineOperand with all flags except setting it as implicit. diff --git a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp index 9908fc003ce7..92fb762ebd73 100644 --- a/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp +++ b/lib/Target/AMDGPU/TargetInfo/AMDGPUTargetInfo.cpp @@ -16,7 +16,7 @@ using namespace llvm; -/// \brief The target which suports all AMD GPUs. This will eventually +/// \brief The target which supports all AMD GPUs. This will eventually /// be deprecated and there will be a R600 target and a GCN target. Target &llvm::getTheAMDGPUTarget() { static Target TheAMDGPUTarget; diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td index 96d343099132..f2de1f995726 100644 --- a/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/lib/Target/AMDGPU/VOP3PInstructions.td @@ -16,12 +16,21 @@ class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> !if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret) >; -// Non-packed instructions that use the VOP3P encoding. i.e. where -// omod/abs are used. +// Non-packed instructions that use the VOP3P encoding. +// VOP3 neg/abs and VOP3P opsel/opsel_hi modifiers are allowed. class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> : - VOP3P_Pseudo<OpName, P, - !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret) ->; + VOP3P_Pseudo<OpName, P> { + let InOperandList = + (ins + FP32InputMods:$src0_modifiers, VCSrc_f32:$src0, + FP32InputMods:$src1_modifiers, VCSrc_f32:$src1, + FP32InputMods:$src2_modifiers, VCSrc_f32:$src2, + clampmod:$clamp, + op_sel:$op_sel, + op_sel_hi:$op_sel_hi); + let AsmOperands = + " $vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp"; +} let isCommutable = 1 in { def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>; @@ -46,9 +55,12 @@ def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I1 def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>; // XXX - Commutable? -def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>; -def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; -def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>; +// These are VOP3a-like opcodes which accept no omod. +// Size of src arguments (16/32) is controlled by op_sel. +// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi. +def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_V2F16_V2F16_V2F16>>; +def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>; +def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>>; multiclass VOP3P_Real_vi<bits<10> op> { diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td index e386f21c2ba4..77b7952b22a8 100644 --- a/lib/Target/AMDGPU/VOPInstructions.td +++ b/lib/Target/AMDGPU/VOPInstructions.td @@ -51,12 +51,8 @@ class VOP3Common <dag outs, dag ins, string asm = "", let VOP3 = 1; - let AsmMatchConverter = - !if(!eq(VOP3Only,1), - "cvtVOP3", - !if(!eq(HasMods,1), "cvtVOP3_2_mod", "")); - let AsmVariantName = AMDGPUAsmVariants.VOP3; + let AsmMatchConverter = !if(!eq(HasMods,1), "cvtVOP3", ""); let isCodeGenOnly = 0; @@ -106,13 +102,11 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [], let AsmVariantName = AMDGPUAsmVariants.VOP3; let AsmMatchConverter = - !if(!eq(VOP3Only,1), - !if(!and(P.IsPacked, isVOP3P), "cvtVOP3P", "cvtVOP3"), - !if(!eq(P.HasModifiers, 1), - "cvtVOP3_2_mod", - !if(!eq(P.HasOMod, 1), "cvtVOP3OMod", "") - ) - ); + !if(!and(P.IsPacked, isVOP3P), + "cvtVOP3P", + !if(!or(P.HasModifiers, P.HasOMod), + "cvtVOP3", + "")); VOPProfile Pfl = P; } diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp index 90f635c81254..582153daebde 100644 --- a/lib/Target/ARM/ARMAsmPrinter.cpp +++ b/lib/Target/ARM/ARMAsmPrinter.cpp @@ -1103,6 +1103,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) { case ARM::tPUSH: // Special case here: no src & dst reg, but two extra imp ops. StartOp = 2; NumOffset = 2; + LLVM_FALLTHROUGH; case ARM::STMDB_UPD: case ARM::t2STMDB_UPD: case ARM::VSTMDDB_UPD: diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 1ec6b24b2ed6..3cf5950a1918 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -1880,6 +1880,9 @@ isProfitableToIfCvt(MachineBasicBlock &TBB, // Diamond: TBB is the block that is branched to, FBB is the fallthrough TUnpredCycles = TCycles + TakenBranchCost; FUnpredCycles = FCycles + NotTakenBranchCost; + // The branch at the end of FBB will disappear when it's predicated, so + // discount it from PredCost. + PredCost -= 1 * ScalingUpFactor; } // The total cost is the cost of each path scaled by their probabilites unsigned TUnpredCost = Probability.scale(TUnpredCycles * ScalingUpFactor); diff --git a/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/lib/Target/ARM/ARMBaseRegisterInfo.cpp index b4fb292c0116..e97a7ce5067f 100644 --- a/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -193,10 +193,11 @@ getReservedRegs(const MachineFunction &MF) const { for (unsigned R = 0; R < 16; ++R) markSuperRegs(Reserved, ARM::D16 + R); } - const TargetRegisterClass *RC = &ARM::GPRPairRegClass; - for(TargetRegisterClass::iterator I = RC->begin(), E = RC->end(); I!=E; ++I) - for (MCSubRegIterator SI(*I, this); SI.isValid(); ++SI) - if (Reserved.test(*SI)) markSuperRegs(Reserved, *I); + const TargetRegisterClass &RC = ARM::GPRPairRegClass; + for (unsigned Reg : RC) + for (MCSubRegIterator SI(Reg, this); SI.isValid(); ++SI) + if (Reserved.test(*SI)) + markSuperRegs(Reserved, Reg); assert(checkAllSuperRegsMarked(Reserved)); return Reserved; @@ -315,8 +316,7 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg, Hints.push_back(PairedPhys); // Then prefer even or odd registers. - for (unsigned I = 0, E = Order.size(); I != E; ++I) { - unsigned Reg = Order[I]; + for (unsigned Reg : Order) { if (Reg == PairedPhys || (getEncodingValue(Reg) & 1) != Odd) continue; // Don't provide hints that are paired to a reserved register. @@ -659,11 +659,8 @@ bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, unsigned Ba const MCInstrDesc &Desc = MI->getDesc(); unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask); unsigned i = 0; - - while (!MI->getOperand(i).isFI()) { - ++i; - assert(i < MI->getNumOperands() &&"Instr doesn't have FrameIndex operand!"); - } + for (; !MI->getOperand(i).isFI(); ++i) + assert(i+1 < MI->getNumOperands() && "Instr doesn't have FrameIndex operand!"); // AddrMode4 and AddrMode6 cannot handle any offset. if (AddrMode == ARMII::AddrMode4 || AddrMode == ARMII::AddrMode6) diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index e498f70b820d..051827a6a6a2 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -321,7 +321,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler { assert(VA.getValVT().getSizeInBits() <= 64 && "Unsupported value size"); assert(VA.getLocVT().getSizeInBits() <= 64 && "Unsupported location size"); - // The necesary extensions are handled on the other side of the ABI + // The necessary extensions are handled on the other side of the ABI // boundary. markPhysRegUsed(PhysReg); MIRBuilder.buildCopy(ValVReg, PhysReg); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index e42514acd76f..6ba7593543a9 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -3398,9 +3398,9 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, static SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) { SDLoc dl(Op); - ConstantSDNode *ScopeN = cast<ConstantSDNode>(Op.getOperand(2)); - auto Scope = static_cast<SynchronizationScope>(ScopeN->getZExtValue()); - if (Scope == SynchronizationScope::SingleThread) + ConstantSDNode *SSIDNode = cast<ConstantSDNode>(Op.getOperand(2)); + auto SSID = static_cast<SyncScope::ID>(SSIDNode->getZExtValue()); + if (SSID == SyncScope::SingleThread) return Op; if (!Subtarget->hasDataBarrier()) { @@ -5356,15 +5356,15 @@ static SDValue LowerVSETCC(SDValue Op, SelectionDAG &DAG) { // Integer comparisons. switch (SetCCOpcode) { default: llvm_unreachable("Illegal integer comparison"); - case ISD::SETNE: Invert = true; + case ISD::SETNE: Invert = true; LLVM_FALLTHROUGH; case ISD::SETEQ: Opc = ARMISD::VCEQ; break; - case ISD::SETLT: Swap = true; + case ISD::SETLT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGT: Opc = ARMISD::VCGT; break; - case ISD::SETLE: Swap = true; + case ISD::SETLE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETGE: Opc = ARMISD::VCGE; break; - case ISD::SETULT: Swap = true; + case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: Opc = ARMISD::VCGTU; break; - case ISD::SETULE: Swap = true; + case ISD::SETULE: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGE: Opc = ARMISD::VCGEU; break; } @@ -13779,7 +13779,9 @@ bool ARMTargetLowering::lowerInterleavedLoad( // Convert the integer vector to pointer vector if the element is pointer. if (EltTy->isPointerTy()) - SubVec = Builder.CreateIntToPtr(SubVec, SV->getType()); + SubVec = Builder.CreateIntToPtr( + SubVec, VectorType::get(SV->getType()->getVectorElementType(), + VecTy->getVectorNumElements())); SubVecs[SV].push_back(SubVec); } diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 5044134f5b1e..f05b14255236 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -510,7 +510,8 @@ class InstrItineraryData; bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx, unsigned &Cost) const override; - bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT) const override { + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, + const SelectionDAG &DAG) const override { // Do not merge to larger than i32. return (MemVT.getSizeInBits() <= 32); } diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 53db5acbe805..42eac12e457b 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -4799,7 +4799,7 @@ def : t2InstAlias<"add${p} $Rd, pc, $imm", // Pseudo instruction ldr Rt, =immediate def t2LDRConstPool : t2AsmPseudo<"ldr${p} $Rt, $immediate", - (ins GPRnopc:$Rt, const_pool_asm_imm:$immediate, pred:$p)>; + (ins GPR:$Rt, const_pool_asm_imm:$immediate, pred:$p)>; // Version w/ the .w suffix. def : t2InstAlias<"ldr${p}.w $Rt, $immediate", (t2LDRConstPool GPRnopc:$Rt, diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 374176d1d737..29ef69ad0010 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -20,6 +20,8 @@ #define DEBUG_TYPE "arm-isel" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" + using namespace llvm; #ifndef LLVM_BUILD_GLOBAL_ISEL @@ -42,13 +44,32 @@ public: private: bool selectImpl(MachineInstr &I) const; - bool selectICmp(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, - MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) const; + struct CmpConstants; + struct InsertInfo; + + bool selectCmp(CmpConstants Helper, MachineInstrBuilder &MIB, + MachineRegisterInfo &MRI) const; + + // Helper for inserting a comparison sequence that sets \p ResReg to either 1 + // if \p LHSReg and \p RHSReg are in the relationship defined by \p Cond, or + // \p PrevRes otherwise. In essence, it computes PrevRes OR (LHS Cond RHS). + bool insertComparison(CmpConstants Helper, InsertInfo I, unsigned ResReg, + ARMCC::CondCodes Cond, unsigned LHSReg, unsigned RHSReg, + unsigned PrevRes) const; + + // Set \p DestReg to \p Constant. + void putConstant(InsertInfo I, unsigned DestReg, unsigned Constant) const; + + bool selectSelect(MachineInstrBuilder &MIB, MachineRegisterInfo &MRI) const; + + // Check if the types match and both operands have the expected size and + // register bank. + bool validOpRegPair(MachineRegisterInfo &MRI, unsigned LHS, unsigned RHS, + unsigned ExpectedSize, unsigned ExpectedRegBankID) const; - bool selectSelect(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, - MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) const; + // Check if the register has the expected size and register bank. + bool validReg(MachineRegisterInfo &MRI, unsigned Reg, unsigned ExpectedSize, + unsigned ExpectedRegBankID) const; const ARMBaseInstrInfo &TII; const ARMBaseRegisterInfo &TRI; @@ -251,120 +272,233 @@ static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank, return Opc; } -static ARMCC::CondCodes getComparePred(CmpInst::Predicate Pred) { +// When lowering comparisons, we sometimes need to perform two compares instead +// of just one. Get the condition codes for both comparisons. If only one is +// needed, the second member of the pair is ARMCC::AL. +static std::pair<ARMCC::CondCodes, ARMCC::CondCodes> +getComparePreds(CmpInst::Predicate Pred) { + std::pair<ARMCC::CondCodes, ARMCC::CondCodes> Preds = {ARMCC::AL, ARMCC::AL}; switch (Pred) { - // Needs two compares... case CmpInst::FCMP_ONE: + Preds = {ARMCC::GT, ARMCC::MI}; + break; case CmpInst::FCMP_UEQ: - default: - // AL is our "false" for now. The other two need more compares. - return ARMCC::AL; + Preds = {ARMCC::EQ, ARMCC::VS}; + break; case CmpInst::ICMP_EQ: case CmpInst::FCMP_OEQ: - return ARMCC::EQ; + Preds.first = ARMCC::EQ; + break; case CmpInst::ICMP_SGT: case CmpInst::FCMP_OGT: - return ARMCC::GT; + Preds.first = ARMCC::GT; + break; case CmpInst::ICMP_SGE: case CmpInst::FCMP_OGE: - return ARMCC::GE; + Preds.first = ARMCC::GE; + break; case CmpInst::ICMP_UGT: case CmpInst::FCMP_UGT: - return ARMCC::HI; + Preds.first = ARMCC::HI; + break; case CmpInst::FCMP_OLT: - return ARMCC::MI; + Preds.first = ARMCC::MI; + break; case CmpInst::ICMP_ULE: case CmpInst::FCMP_OLE: - return ARMCC::LS; + Preds.first = ARMCC::LS; + break; case CmpInst::FCMP_ORD: - return ARMCC::VC; + Preds.first = ARMCC::VC; + break; case CmpInst::FCMP_UNO: - return ARMCC::VS; + Preds.first = ARMCC::VS; + break; case CmpInst::FCMP_UGE: - return ARMCC::PL; + Preds.first = ARMCC::PL; + break; case CmpInst::ICMP_SLT: case CmpInst::FCMP_ULT: - return ARMCC::LT; + Preds.first = ARMCC::LT; + break; case CmpInst::ICMP_SLE: case CmpInst::FCMP_ULE: - return ARMCC::LE; + Preds.first = ARMCC::LE; + break; case CmpInst::FCMP_UNE: case CmpInst::ICMP_NE: - return ARMCC::NE; + Preds.first = ARMCC::NE; + break; case CmpInst::ICMP_UGE: - return ARMCC::HS; + Preds.first = ARMCC::HS; + break; case CmpInst::ICMP_ULT: - return ARMCC::LO; + Preds.first = ARMCC::LO; + break; + default: + break; } + assert(Preds.first != ARMCC::AL && "No comparisons needed?"); + return Preds; } -bool ARMInstructionSelector::selectICmp(MachineInstrBuilder &MIB, - const ARMBaseInstrInfo &TII, - MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) const { - auto &MBB = *MIB->getParent(); - auto InsertBefore = std::next(MIB->getIterator()); - auto &DebugLoc = MIB->getDebugLoc(); - - // Move 0 into the result register. - auto Mov0I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVi)) - .addDef(MRI.createVirtualRegister(&ARM::GPRRegClass)) - .addImm(0) - .add(predOps(ARMCC::AL)) - .add(condCodeOp()); - if (!constrainSelectedInstRegOperands(*Mov0I, TII, TRI, RBI)) +struct ARMInstructionSelector::CmpConstants { + CmpConstants(unsigned CmpOpcode, unsigned FlagsOpcode, unsigned OpRegBank, + unsigned OpSize) + : ComparisonOpcode(CmpOpcode), ReadFlagsOpcode(FlagsOpcode), + OperandRegBankID(OpRegBank), OperandSize(OpSize) {} + + // The opcode used for performing the comparison. + const unsigned ComparisonOpcode; + + // The opcode used for reading the flags set by the comparison. May be + // ARM::INSTRUCTION_LIST_END if we don't need to read the flags. + const unsigned ReadFlagsOpcode; + + // The assumed register bank ID for the operands. + const unsigned OperandRegBankID; + + // The assumed size in bits for the operands. + const unsigned OperandSize; +}; + +struct ARMInstructionSelector::InsertInfo { + InsertInfo(MachineInstrBuilder &MIB) + : MBB(*MIB->getParent()), InsertBefore(std::next(MIB->getIterator())), + DbgLoc(MIB->getDebugLoc()) {} + + MachineBasicBlock &MBB; + const MachineBasicBlock::instr_iterator InsertBefore; + const DebugLoc &DbgLoc; +}; + +void ARMInstructionSelector::putConstant(InsertInfo I, unsigned DestReg, + unsigned Constant) const { + (void)BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(ARM::MOVi)) + .addDef(DestReg) + .addImm(Constant) + .add(predOps(ARMCC::AL)) + .add(condCodeOp()); +} + +bool ARMInstructionSelector::validOpRegPair(MachineRegisterInfo &MRI, + unsigned LHSReg, unsigned RHSReg, + unsigned ExpectedSize, + unsigned ExpectedRegBankID) const { + return MRI.getType(LHSReg) == MRI.getType(RHSReg) && + validReg(MRI, LHSReg, ExpectedSize, ExpectedRegBankID) && + validReg(MRI, RHSReg, ExpectedSize, ExpectedRegBankID); +} + +bool ARMInstructionSelector::validReg(MachineRegisterInfo &MRI, unsigned Reg, + unsigned ExpectedSize, + unsigned ExpectedRegBankID) const { + if (MRI.getType(Reg).getSizeInBits() != ExpectedSize) { + DEBUG(dbgs() << "Unexpected size for register"); return false; + } - // Perform the comparison. - auto LHSReg = MIB->getOperand(2).getReg(); - auto RHSReg = MIB->getOperand(3).getReg(); - assert(MRI.getType(LHSReg) == MRI.getType(RHSReg) && - MRI.getType(LHSReg).getSizeInBits() == 32 && - MRI.getType(RHSReg).getSizeInBits() == 32 && - "Unsupported types for comparison operation"); - auto CmpI = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::CMPrr)) - .addUse(LHSReg) - .addUse(RHSReg) - .add(predOps(ARMCC::AL)); - if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI)) + if (RBI.getRegBank(Reg, MRI, TRI)->getID() != ExpectedRegBankID) { + DEBUG(dbgs() << "Unexpected register bank for register"); return false; + } + + return true; +} + +bool ARMInstructionSelector::selectCmp(CmpConstants Helper, + MachineInstrBuilder &MIB, + MachineRegisterInfo &MRI) const { + const InsertInfo I(MIB); - // Move 1 into the result register if the flags say so. auto ResReg = MIB->getOperand(0).getReg(); + if (!validReg(MRI, ResReg, 1, ARM::GPRRegBankID)) + return false; + auto Cond = static_cast<CmpInst::Predicate>(MIB->getOperand(1).getPredicate()); - auto ARMCond = getComparePred(Cond); - if (ARMCond == ARMCC::AL) + if (Cond == CmpInst::FCMP_TRUE || Cond == CmpInst::FCMP_FALSE) { + putConstant(I, ResReg, Cond == CmpInst::FCMP_TRUE ? 1 : 0); + MIB->eraseFromParent(); + return true; + } + + auto LHSReg = MIB->getOperand(2).getReg(); + auto RHSReg = MIB->getOperand(3).getReg(); + if (!validOpRegPair(MRI, LHSReg, RHSReg, Helper.OperandSize, + Helper.OperandRegBankID)) return false; - auto Mov1I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVCCi)) + auto ARMConds = getComparePreds(Cond); + auto ZeroReg = MRI.createVirtualRegister(&ARM::GPRRegClass); + putConstant(I, ZeroReg, 0); + + if (ARMConds.second == ARMCC::AL) { + // Simple case, we only need one comparison and we're done. + if (!insertComparison(Helper, I, ResReg, ARMConds.first, LHSReg, RHSReg, + ZeroReg)) + return false; + } else { + // Not so simple, we need two successive comparisons. + auto IntermediateRes = MRI.createVirtualRegister(&ARM::GPRRegClass); + if (!insertComparison(Helper, I, IntermediateRes, ARMConds.first, LHSReg, + RHSReg, ZeroReg)) + return false; + if (!insertComparison(Helper, I, ResReg, ARMConds.second, LHSReg, RHSReg, + IntermediateRes)) + return false; + } + + MIB->eraseFromParent(); + return true; +} + +bool ARMInstructionSelector::insertComparison(CmpConstants Helper, InsertInfo I, + unsigned ResReg, + ARMCC::CondCodes Cond, + unsigned LHSReg, unsigned RHSReg, + unsigned PrevRes) const { + // Perform the comparison. + auto CmpI = + BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(Helper.ComparisonOpcode)) + .addUse(LHSReg) + .addUse(RHSReg) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*CmpI, TII, TRI, RBI)) + return false; + + // Read the comparison flags (if necessary). + if (Helper.ReadFlagsOpcode != ARM::INSTRUCTION_LIST_END) { + auto ReadI = BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, + TII.get(Helper.ReadFlagsOpcode)) + .add(predOps(ARMCC::AL)); + if (!constrainSelectedInstRegOperands(*ReadI, TII, TRI, RBI)) + return false; + } + + // Select either 1 or the previous result based on the value of the flags. + auto Mov1I = BuildMI(I.MBB, I.InsertBefore, I.DbgLoc, TII.get(ARM::MOVCCi)) .addDef(ResReg) - .addUse(Mov0I->getOperand(0).getReg()) + .addUse(PrevRes) .addImm(1) - .add(predOps(ARMCond, ARM::CPSR)); + .add(predOps(Cond, ARM::CPSR)); if (!constrainSelectedInstRegOperands(*Mov1I, TII, TRI, RBI)) return false; - MIB->eraseFromParent(); return true; } bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB, - const ARMBaseInstrInfo &TII, - MachineRegisterInfo &MRI, - const TargetRegisterInfo &TRI, - const RegisterBankInfo &RBI) const { + MachineRegisterInfo &MRI) const { auto &MBB = *MIB->getParent(); auto InsertBefore = std::next(MIB->getIterator()); - auto &DebugLoc = MIB->getDebugLoc(); + auto &DbgLoc = MIB->getDebugLoc(); // Compare the condition to 0. auto CondReg = MIB->getOperand(1).getReg(); - assert(MRI.getType(CondReg).getSizeInBits() == 1 && - RBI.getRegBank(CondReg, MRI, TRI)->getID() == ARM::GPRRegBankID && + assert(validReg(MRI, CondReg, 1, ARM::GPRRegBankID) && "Unsupported types for select operation"); - auto CmpI = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::CMPri)) + auto CmpI = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(ARM::CMPri)) .addUse(CondReg) .addImm(0) .add(predOps(ARMCC::AL)); @@ -376,13 +510,10 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB, auto ResReg = MIB->getOperand(0).getReg(); auto TrueReg = MIB->getOperand(2).getReg(); auto FalseReg = MIB->getOperand(3).getReg(); - assert(MRI.getType(ResReg) == MRI.getType(TrueReg) && - MRI.getType(TrueReg) == MRI.getType(FalseReg) && - MRI.getType(FalseReg).getSizeInBits() == 32 && - RBI.getRegBank(TrueReg, MRI, TRI)->getID() == ARM::GPRRegBankID && - RBI.getRegBank(FalseReg, MRI, TRI)->getID() == ARM::GPRRegBankID && + assert(validOpRegPair(MRI, ResReg, TrueReg, 32, ARM::GPRRegBankID) && + validOpRegPair(MRI, TrueReg, FalseReg, 32, ARM::GPRRegBankID) && "Unsupported types for select operation"); - auto Mov1I = BuildMI(MBB, InsertBefore, DebugLoc, TII.get(ARM::MOVCCr)) + auto Mov1I = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(ARM::MOVCCr)) .addDef(ResReg) .addUse(TrueReg) .addUse(FalseReg) @@ -494,10 +625,32 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { I.setDesc(TII.get(COPY)); return selectCopy(I, TII, MRI, TRI, RBI); } - case G_ICMP: - return selectICmp(MIB, TII, MRI, TRI, RBI); case G_SELECT: - return selectSelect(MIB, TII, MRI, TRI, RBI); + return selectSelect(MIB, MRI); + case G_ICMP: { + CmpConstants Helper(ARM::CMPrr, ARM::INSTRUCTION_LIST_END, + ARM::GPRRegBankID, 32); + return selectCmp(Helper, MIB, MRI); + } + case G_FCMP: { + assert(TII.getSubtarget().hasVFP2() && "Can't select fcmp without VFP"); + + unsigned OpReg = I.getOperand(2).getReg(); + unsigned Size = MRI.getType(OpReg).getSizeInBits(); + + if (Size == 64 && TII.getSubtarget().isFPOnlySP()) { + DEBUG(dbgs() << "Subtarget only supports single precision"); + return false; + } + if (Size != 32 && Size != 64) { + DEBUG(dbgs() << "Unsupported size for G_FCMP operand"); + return false; + } + + CmpConstants Helper(Size == 32 ? ARM::VCMPS : ARM::VCMPD, ARM::FMSTAT, + ARM::FPRRegBankID, Size); + return selectCmp(Helper, MIB, MRI); + } case G_GEP: I.setDesc(TII.get(ARM::ADDrr)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); @@ -510,11 +663,10 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { break; case G_CONSTANT: { unsigned Reg = I.getOperand(0).getReg(); - if (MRI.getType(Reg).getSizeInBits() != 32) + + if (!validReg(MRI, Reg, 32, ARM::GPRRegBankID)) return false; - assert(RBI.getRegBank(Reg, MRI, TRI)->getID() == ARM::GPRRegBankID && - "Expected constant to live in a GPR"); I.setDesc(TII.get(ARM::MOVi)); MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index f3e62d09cc30..f23e62595d2e 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -28,6 +28,10 @@ using namespace llvm; #error "You shouldn't build this" #endif +static bool AEABI(const ARMSubtarget &ST) { + return ST.isTargetAEABI() || ST.isTargetGNUAEABI() || ST.isTargetMuslAEABI(); +} + ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { using namespace TargetOpcode; @@ -66,8 +70,7 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { for (unsigned Op : {G_SREM, G_UREM}) if (ST.hasDivideInARMMode()) setAction({Op, s32}, Lower); - else if (ST.isTargetAEABI() || ST.isTargetGNUAEABI() || - ST.isTargetMuslAEABI()) + else if (AEABI(ST)) setAction({Op, s32}, Custom); else setAction({Op, s32}, Libcall); @@ -86,6 +89,8 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_SELECT, 1, s1}, Legal); setAction({G_CONSTANT, s32}, Legal); + for (auto Ty : {s1, s8, s16}) + setAction({G_CONSTANT, Ty}, WidenScalar); setAction({G_ICMP, s1}, Legal); for (auto Ty : {s8, s16}) @@ -99,9 +104,22 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({G_LOAD, s64}, Legal); setAction({G_STORE, s64}, Legal); + + setAction({G_FCMP, s1}, Legal); + setAction({G_FCMP, 1, s32}, Legal); + setAction({G_FCMP, 1, s64}, Legal); } else { for (auto Ty : {s32, s64}) setAction({G_FADD, Ty}, Libcall); + + setAction({G_FCMP, s1}, Legal); + setAction({G_FCMP, 1, s32}, Custom); + setAction({G_FCMP, 1, s64}, Custom); + + if (AEABI(ST)) + setFCmpLibcallsAEABI(); + else + setFCmpLibcallsGNU(); } for (unsigned Op : {G_FREM, G_FPOW}) @@ -111,11 +129,120 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { computeTables(); } +void ARMLegalizerInfo::setFCmpLibcallsAEABI() { + // FCMP_TRUE and FCMP_FALSE don't need libcalls, they should be + // default-initialized. + FCmp32Libcalls.resize(CmpInst::LAST_FCMP_PREDICATE + 1); + FCmp32Libcalls[CmpInst::FCMP_OEQ] = { + {RTLIB::OEQ_F32, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp32Libcalls[CmpInst::FCMP_OGE] = { + {RTLIB::OGE_F32, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp32Libcalls[CmpInst::FCMP_OGT] = { + {RTLIB::OGT_F32, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp32Libcalls[CmpInst::FCMP_OLE] = { + {RTLIB::OLE_F32, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp32Libcalls[CmpInst::FCMP_OLT] = { + {RTLIB::OLT_F32, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F32, CmpInst::ICMP_EQ}}; + FCmp32Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F32, CmpInst::ICMP_EQ}}; + FCmp32Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F32, CmpInst::ICMP_EQ}}; + FCmp32Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F32, CmpInst::ICMP_EQ}}; + FCmp32Libcalls[CmpInst::FCMP_ULT] = {{RTLIB::OGE_F32, CmpInst::ICMP_EQ}}; + FCmp32Libcalls[CmpInst::FCMP_UNE] = {{RTLIB::UNE_F32, CmpInst::ICMP_EQ}}; + FCmp32Libcalls[CmpInst::FCMP_UNO] = { + {RTLIB::UO_F32, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp32Libcalls[CmpInst::FCMP_ONE] = { + {RTLIB::OGT_F32, CmpInst::BAD_ICMP_PREDICATE}, + {RTLIB::OLT_F32, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp32Libcalls[CmpInst::FCMP_UEQ] = { + {RTLIB::OEQ_F32, CmpInst::BAD_ICMP_PREDICATE}, + {RTLIB::UO_F32, CmpInst::BAD_ICMP_PREDICATE}}; + + FCmp64Libcalls.resize(CmpInst::LAST_FCMP_PREDICATE + 1); + FCmp64Libcalls[CmpInst::FCMP_OEQ] = { + {RTLIB::OEQ_F64, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp64Libcalls[CmpInst::FCMP_OGE] = { + {RTLIB::OGE_F64, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp64Libcalls[CmpInst::FCMP_OGT] = { + {RTLIB::OGT_F64, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp64Libcalls[CmpInst::FCMP_OLE] = { + {RTLIB::OLE_F64, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp64Libcalls[CmpInst::FCMP_OLT] = { + {RTLIB::OLT_F64, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F64, CmpInst::ICMP_EQ}}; + FCmp64Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F64, CmpInst::ICMP_EQ}}; + FCmp64Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F64, CmpInst::ICMP_EQ}}; + FCmp64Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F64, CmpInst::ICMP_EQ}}; + FCmp64Libcalls[CmpInst::FCMP_ULT] = {{RTLIB::OGE_F64, CmpInst::ICMP_EQ}}; + FCmp64Libcalls[CmpInst::FCMP_UNE] = {{RTLIB::UNE_F64, CmpInst::ICMP_EQ}}; + FCmp64Libcalls[CmpInst::FCMP_UNO] = { + {RTLIB::UO_F64, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp64Libcalls[CmpInst::FCMP_ONE] = { + {RTLIB::OGT_F64, CmpInst::BAD_ICMP_PREDICATE}, + {RTLIB::OLT_F64, CmpInst::BAD_ICMP_PREDICATE}}; + FCmp64Libcalls[CmpInst::FCMP_UEQ] = { + {RTLIB::OEQ_F64, CmpInst::BAD_ICMP_PREDICATE}, + {RTLIB::UO_F64, CmpInst::BAD_ICMP_PREDICATE}}; +} + +void ARMLegalizerInfo::setFCmpLibcallsGNU() { + // FCMP_TRUE and FCMP_FALSE don't need libcalls, they should be + // default-initialized. + FCmp32Libcalls.resize(CmpInst::LAST_FCMP_PREDICATE + 1); + FCmp32Libcalls[CmpInst::FCMP_OEQ] = {{RTLIB::OEQ_F32, CmpInst::ICMP_EQ}}; + FCmp32Libcalls[CmpInst::FCMP_OGE] = {{RTLIB::OGE_F32, CmpInst::ICMP_SGE}}; + FCmp32Libcalls[CmpInst::FCMP_OGT] = {{RTLIB::OGT_F32, CmpInst::ICMP_SGT}}; + FCmp32Libcalls[CmpInst::FCMP_OLE] = {{RTLIB::OLE_F32, CmpInst::ICMP_SLE}}; + FCmp32Libcalls[CmpInst::FCMP_OLT] = {{RTLIB::OLT_F32, CmpInst::ICMP_SLT}}; + FCmp32Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F32, CmpInst::ICMP_EQ}}; + FCmp32Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F32, CmpInst::ICMP_SGE}}; + FCmp32Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F32, CmpInst::ICMP_SGT}}; + FCmp32Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F32, CmpInst::ICMP_SLE}}; + FCmp32Libcalls[CmpInst::FCMP_ULT] = {{RTLIB::OGE_F32, CmpInst::ICMP_SLT}}; + FCmp32Libcalls[CmpInst::FCMP_UNE] = {{RTLIB::UNE_F32, CmpInst::ICMP_NE}}; + FCmp32Libcalls[CmpInst::FCMP_UNO] = {{RTLIB::UO_F32, CmpInst::ICMP_NE}}; + FCmp32Libcalls[CmpInst::FCMP_ONE] = {{RTLIB::OGT_F32, CmpInst::ICMP_SGT}, + {RTLIB::OLT_F32, CmpInst::ICMP_SLT}}; + FCmp32Libcalls[CmpInst::FCMP_UEQ] = {{RTLIB::OEQ_F32, CmpInst::ICMP_EQ}, + {RTLIB::UO_F32, CmpInst::ICMP_NE}}; + + FCmp64Libcalls.resize(CmpInst::LAST_FCMP_PREDICATE + 1); + FCmp64Libcalls[CmpInst::FCMP_OEQ] = {{RTLIB::OEQ_F64, CmpInst::ICMP_EQ}}; + FCmp64Libcalls[CmpInst::FCMP_OGE] = {{RTLIB::OGE_F64, CmpInst::ICMP_SGE}}; + FCmp64Libcalls[CmpInst::FCMP_OGT] = {{RTLIB::OGT_F64, CmpInst::ICMP_SGT}}; + FCmp64Libcalls[CmpInst::FCMP_OLE] = {{RTLIB::OLE_F64, CmpInst::ICMP_SLE}}; + FCmp64Libcalls[CmpInst::FCMP_OLT] = {{RTLIB::OLT_F64, CmpInst::ICMP_SLT}}; + FCmp64Libcalls[CmpInst::FCMP_ORD] = {{RTLIB::O_F64, CmpInst::ICMP_EQ}}; + FCmp64Libcalls[CmpInst::FCMP_UGE] = {{RTLIB::OLT_F64, CmpInst::ICMP_SGE}}; + FCmp64Libcalls[CmpInst::FCMP_UGT] = {{RTLIB::OLE_F64, CmpInst::ICMP_SGT}}; + FCmp64Libcalls[CmpInst::FCMP_ULE] = {{RTLIB::OGT_F64, CmpInst::ICMP_SLE}}; + FCmp64Libcalls[CmpInst::FCMP_ULT] = {{RTLIB::OGE_F64, CmpInst::ICMP_SLT}}; + FCmp64Libcalls[CmpInst::FCMP_UNE] = {{RTLIB::UNE_F64, CmpInst::ICMP_NE}}; + FCmp64Libcalls[CmpInst::FCMP_UNO] = {{RTLIB::UO_F64, CmpInst::ICMP_NE}}; + FCmp64Libcalls[CmpInst::FCMP_ONE] = {{RTLIB::OGT_F64, CmpInst::ICMP_SGT}, + {RTLIB::OLT_F64, CmpInst::ICMP_SLT}}; + FCmp64Libcalls[CmpInst::FCMP_UEQ] = {{RTLIB::OEQ_F64, CmpInst::ICMP_EQ}, + {RTLIB::UO_F64, CmpInst::ICMP_NE}}; +} + +ARMLegalizerInfo::FCmpLibcallsList +ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate, + unsigned Size) const { + assert(CmpInst::isFPPredicate(Predicate) && "Unsupported FCmp predicate"); + if (Size == 32) + return FCmp32Libcalls[Predicate]; + if (Size == 64) + return FCmp64Libcalls[Predicate]; + llvm_unreachable("Unsupported size for FCmp predicate"); +} + bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const { using namespace TargetOpcode; + MIRBuilder.setInstr(MI); + switch (MI.getOpcode()) { default: return false; @@ -137,9 +264,9 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, auto RetVal = MRI.createGenericVirtualRegister( getLLTForType(*RetTy, MIRBuilder.getMF().getDataLayout())); - auto Status = replaceWithLibcall(MI, MIRBuilder, Libcall, {RetVal, RetTy}, - {{MI.getOperand(1).getReg(), ArgTy}, - {MI.getOperand(2).getReg(), ArgTy}}); + auto Status = createLibcall(MIRBuilder, Libcall, {RetVal, RetTy}, + {{MI.getOperand(1).getReg(), ArgTy}, + {MI.getOperand(2).getReg(), ArgTy}}); if (Status != LegalizerHelper::Legalized) return false; @@ -149,8 +276,76 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI, MIRBuilder.buildUnmerge( {MRI.createGenericVirtualRegister(LLT::scalar(32)), OriginalResult}, RetVal); + break; + } + case G_FCMP: { + assert(MRI.getType(MI.getOperand(2).getReg()) == + MRI.getType(MI.getOperand(3).getReg()) && + "Mismatched operands for G_FCMP"); + auto OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits(); + + auto OriginalResult = MI.getOperand(0).getReg(); + auto Predicate = + static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate()); + auto Libcalls = getFCmpLibcalls(Predicate, OpSize); + + if (Libcalls.empty()) { + assert((Predicate == CmpInst::FCMP_TRUE || + Predicate == CmpInst::FCMP_FALSE) && + "Predicate needs libcalls, but none specified"); + MIRBuilder.buildConstant(OriginalResult, + Predicate == CmpInst::FCMP_TRUE ? 1 : 0); + MI.eraseFromParent(); + return true; + } + + auto &Ctx = MIRBuilder.getMF().getFunction()->getContext(); + assert((OpSize == 32 || OpSize == 64) && "Unsupported operand size"); + auto *ArgTy = OpSize == 32 ? Type::getFloatTy(Ctx) : Type::getDoubleTy(Ctx); + auto *RetTy = Type::getInt32Ty(Ctx); + + SmallVector<unsigned, 2> Results; + for (auto Libcall : Libcalls) { + auto LibcallResult = MRI.createGenericVirtualRegister(LLT::scalar(32)); + auto Status = + createLibcall(MIRBuilder, Libcall.LibcallID, {LibcallResult, RetTy}, + {{MI.getOperand(2).getReg(), ArgTy}, + {MI.getOperand(3).getReg(), ArgTy}}); + + if (Status != LegalizerHelper::Legalized) + return false; - return LegalizerHelper::Legalized; + auto ProcessedResult = + Libcalls.size() == 1 + ? OriginalResult + : MRI.createGenericVirtualRegister(MRI.getType(OriginalResult)); + + // We have a result, but we need to transform it into a proper 1-bit 0 or + // 1, taking into account the different peculiarities of the values + // returned by the comparison functions. + CmpInst::Predicate ResultPred = Libcall.Predicate; + if (ResultPred == CmpInst::BAD_ICMP_PREDICATE) { + // We have a nice 0 or 1, and we just need to truncate it back to 1 bit + // to keep the types consistent. + MIRBuilder.buildTrunc(ProcessedResult, LibcallResult); + } else { + // We need to compare against 0. + assert(CmpInst::isIntPredicate(ResultPred) && "Unsupported predicate"); + auto Zero = MRI.createGenericVirtualRegister(LLT::scalar(32)); + MIRBuilder.buildConstant(Zero, 0); + MIRBuilder.buildICmp(ResultPred, ProcessedResult, LibcallResult, Zero); + } + Results.push_back(ProcessedResult); + } + + if (Results.size() != 1) { + assert(Results.size() == 2 && "Unexpected number of results"); + MIRBuilder.buildOr(OriginalResult, Results[0], Results[1]); + } + break; } } + + MI.eraseFromParent(); + return true; } diff --git a/lib/Target/ARM/ARMLegalizerInfo.h b/lib/Target/ARM/ARMLegalizerInfo.h index a9bdd367737e..78ab9412c04b 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.h +++ b/lib/Target/ARM/ARMLegalizerInfo.h @@ -14,7 +14,10 @@ #ifndef LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H #define LLVM_LIB_TARGET_ARM_ARMMACHINELEGALIZER_H +#include "llvm/ADT/IndexedMap.h" #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/RuntimeLibcalls.h" +#include "llvm/IR/Instructions.h" namespace llvm { @@ -27,6 +30,36 @@ public: bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &MIRBuilder) const override; + +private: + void setFCmpLibcallsGNU(); + void setFCmpLibcallsAEABI(); + + struct FCmpLibcallInfo { + // Which libcall this is. + RTLIB::Libcall LibcallID; + + // The predicate to be used when comparing the value returned by the + // function with a relevant constant (currently hard-coded to zero). This is + // necessary because often the libcall will return e.g. a value greater than + // 0 to represent 'true' and anything negative to represent 'false', or + // maybe 0 to represent 'true' and non-zero for 'false'. If no comparison is + // needed, this should be CmpInst::BAD_ICMP_PREDICATE. + CmpInst::Predicate Predicate; + }; + using FCmpLibcallsList = SmallVector<FCmpLibcallInfo, 2>; + + // Map from each FCmp predicate to the corresponding libcall infos. A FCmp + // instruction may be lowered to one or two libcalls, which is why we need a + // list. If two libcalls are needed, their results will be OR'ed. + using FCmpLibcallsMapTy = IndexedMap<FCmpLibcallsList>; + + FCmpLibcallsMapTy FCmp32Libcalls; + FCmpLibcallsMapTy FCmp64Libcalls; + + // Get the libcall(s) corresponding to \p Predicate for operands of \p Size + // bits. + FCmpLibcallsList getFCmpLibcalls(CmpInst::Predicate, unsigned Size) const; }; } // End llvm namespace. #endif diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index 11fb81a4f9fe..c0c09e8c15af 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -212,8 +212,6 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); - LLT Ty = MRI.getType(MI.getOperand(0).getReg()); - unsigned NumOperands = MI.getNumOperands(); const ValueMapping *OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx]; @@ -236,26 +234,31 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx]; break; case G_LOAD: - case G_STORE: + case G_STORE: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); OperandsMapping = Ty.getSizeInBits() == 64 ? getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx], &ARM::ValueMappings[ARM::GPR3OpsIdx]}) : &ARM::ValueMappings[ARM::GPR3OpsIdx]; break; - case G_FADD: + } + case G_FADD: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); assert((Ty.getSizeInBits() == 32 || Ty.getSizeInBits() == 64) && "Unsupported size for G_FADD"); OperandsMapping = Ty.getSizeInBits() == 64 ? &ARM::ValueMappings[ARM::DPR3OpsIdx] : &ARM::ValueMappings[ARM::SPR3OpsIdx]; break; + } case G_CONSTANT: case G_FRAME_INDEX: OperandsMapping = getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr}); break; case G_SELECT: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); LLT Ty2 = MRI.getType(MI.getOperand(1).getReg()); (void)Ty2; assert(Ty.getSizeInBits() == 32 && "Unsupported size for G_SELECT"); @@ -277,9 +280,29 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { &ARM::ValueMappings[ARM::GPR3OpsIdx]}); break; } + case G_FCMP: { + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); + LLT Ty1 = MRI.getType(MI.getOperand(2).getReg()); + LLT Ty2 = MRI.getType(MI.getOperand(3).getReg()); + (void)Ty2; + assert(Ty.getSizeInBits() == 1 && "Unsupported size for G_FCMP"); + assert(Ty1.getSizeInBits() == Ty2.getSizeInBits() && + "Mismatched operand sizes for G_FCMP"); + + unsigned Size = Ty1.getSizeInBits(); + assert((Size == 32 || Size == 64) && "Unsupported size for G_FCMP"); + + auto FPRValueMapping = Size == 32 ? &ARM::ValueMappings[ARM::SPR3OpsIdx] + : &ARM::ValueMappings[ARM::DPR3OpsIdx]; + OperandsMapping = + getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr, + FPRValueMapping, FPRValueMapping}); + break; + } case G_MERGE_VALUES: { // We only support G_MERGE_VALUES for creating a double precision floating // point value out of two GPRs. + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); LLT Ty1 = MRI.getType(MI.getOperand(1).getReg()); LLT Ty2 = MRI.getType(MI.getOperand(2).getReg()); if (Ty.getSizeInBits() != 64 || Ty1.getSizeInBits() != 32 || @@ -294,6 +317,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_UNMERGE_VALUES: { // We only support G_UNMERGE_VALUES for splitting a double precision // floating point value into two GPRs. + LLT Ty = MRI.getType(MI.getOperand(0).getReg()); LLT Ty1 = MRI.getType(MI.getOperand(1).getReg()); LLT Ty2 = MRI.getType(MI.getOperand(2).getReg()); if (Ty.getSizeInBits() != 32 || Ty1.getSizeInBits() != 32 || diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp index 8eb9dbf5f9de..51b0fedd2b54 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -15,6 +15,24 @@ using namespace llvm; #define DEBUG_TYPE "armtti" +bool ARMTTIImpl::areInlineCompatible(const Function *Caller, + const Function *Callee) const { + const TargetMachine &TM = getTLI()->getTargetMachine(); + const FeatureBitset &CallerBits = + TM.getSubtargetImpl(*Caller)->getFeatureBits(); + const FeatureBitset &CalleeBits = + TM.getSubtargetImpl(*Callee)->getFeatureBits(); + + // To inline a callee, all features not in the whitelist must match exactly. + bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) == + (CalleeBits & ~InlineFeatureWhitelist); + // For features in the whitelist, the callee's features must be a subset of + // the callers'. + bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) == + (CalleeBits & InlineFeatureWhitelist); + return MatchExact && MatchSubset; +} + int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) { assert(Ty->isIntegerTy()); diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h index 8a1a37863877..0695a4e63346 100644 --- a/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/lib/Target/ARM/ARMTargetTransformInfo.h @@ -33,6 +33,39 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> { const ARMSubtarget *ST; const ARMTargetLowering *TLI; + // Currently the following features are excluded from InlineFeatureWhitelist. + // ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureVFPOnlySP, FeatureD16 + // Depending on whether they are set or unset, different + // instructions/registers are available. For example, inlining a callee with + // -thumb-mode in a caller with +thumb-mode, may cause the assembler to + // fail if the callee uses ARM only instructions, e.g. in inline asm. + const FeatureBitset InlineFeatureWhitelist = { + ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2, + ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8, + ARM::FeatureFullFP16, ARM::FeatureHWDivThumb, + ARM::FeatureHWDivARM, ARM::FeatureDB, ARM::FeatureV7Clrex, + ARM::FeatureAcquireRelease, ARM::FeatureSlowFPBrcc, + ARM::FeaturePerfMon, ARM::FeatureTrustZone, ARM::Feature8MSecExt, + ARM::FeatureCrypto, ARM::FeatureCRC, ARM::FeatureRAS, + ARM::FeatureFPAO, ARM::FeatureFuseAES, ARM::FeatureZCZeroing, + ARM::FeatureProfUnpredicate, ARM::FeatureSlowVGETLNi32, + ARM::FeatureSlowVDUP32, ARM::FeaturePreferVMOVSR, + ARM::FeaturePrefISHSTBarrier, ARM::FeatureMuxedUnits, + ARM::FeatureSlowOddRegister, ARM::FeatureSlowLoadDSubreg, + ARM::FeatureDontWidenVMOVS, ARM::FeatureExpandMLx, + ARM::FeatureHasVMLxHazards, ARM::FeatureNEONForFPMovs, + ARM::FeatureNEONForFP, ARM::FeatureCheckVLDnAlign, + ARM::FeatureHasSlowFPVMLx, ARM::FeatureVMLxForwarding, + ARM::FeaturePref32BitThumb, ARM::FeatureAvoidPartialCPSR, + ARM::FeatureCheapPredicableCPSR, ARM::FeatureAvoidMOVsShOp, + ARM::FeatureHasRetAddrStack, ARM::FeatureHasNoBranchPredictor, + ARM::FeatureDSP, ARM::FeatureMP, ARM::FeatureVirtualization, + ARM::FeatureMClass, ARM::FeatureRClass, ARM::FeatureAClass, + ARM::FeatureNaClTrap, ARM::FeatureStrictAlign, ARM::FeatureLongCalls, + ARM::FeatureExecuteOnly, ARM::FeatureReserveR9, ARM::FeatureNoMovt, + ARM::FeatureNoNegativeImmediates + }; + const ARMSubtarget *getST() const { return ST; } const ARMTargetLowering *getTLI() const { return TLI; } @@ -41,6 +74,9 @@ public: : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + bool areInlineCompatible(const Function *Caller, + const Function *Callee) const; + bool enableInterleavedAccessVectorization() { return true; } /// Floating-point computation using ARMv8 AArch32 Advanced diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 891b5c60e1fd..1129826f21f6 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -5249,6 +5249,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { // Fall though for the Identifier case that is not a register or a // special name. + LLVM_FALLTHROUGH; } case AsmToken::LParen: // parenthesized expressions like (_strcmp-4) case AsmToken::Integer: // things like 1f and 2b as a branch targets @@ -8992,6 +8993,8 @@ unsigned ARMAsmParser::MatchInstruction(OperandVector &Operands, MCInst &Inst, return PlainMatchResult; } +std::string ARMMnemonicSpellCheck(StringRef S, uint64_t FBS); + static const char *getSubtargetFeatureName(uint64_t Val); bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, @@ -9085,9 +9088,13 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, return Error(ErrorLoc, "invalid operand for instruction"); } - case Match_MnemonicFail: - return Error(IDLoc, "invalid instruction", + case Match_MnemonicFail: { + uint64_t FBS = ComputeAvailableFeatures(getSTI().getFeatureBits()); + std::string Suggestion = ARMMnemonicSpellCheck( + ((ARMOperand &)*Operands[0]).getToken(), FBS); + return Error(IDLoc, "invalid instruction" + Suggestion, ((ARMOperand &)*Operands[0]).getLocRange()); + } case Match_RequiresNotITBlock: return Error(IDLoc, "flag setting instruction only valid outside IT block"); case Match_RequiresITBlock: diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 22de728fe06e..a77df7a2598f 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -361,9 +361,8 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf, unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, uint64_t Value, - bool IsPCRel, MCContext &Ctx, - bool IsLittleEndian, - bool IsResolved) const { + bool IsResolved, MCContext &Ctx, + bool IsLittleEndian) const { unsigned Kind = Fixup.getKind(); // MachO tries to make .o files that look vaguely pre-linked, so for MOVW/MOVT @@ -392,7 +391,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, case FK_SecRel_4: return Value; case ARM::fixup_arm_movt_hi16: - if (!IsPCRel) + if (IsResolved || !STI->getTargetTriple().isOSBinFormatELF()) Value >>= 16; LLVM_FALLTHROUGH; case ARM::fixup_arm_movw_lo16: { @@ -404,7 +403,7 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm, return Value; } case ARM::fixup_t2_movt_hi16: - if (!IsPCRel) + if (IsResolved || !STI->getTargetTriple().isOSBinFormatELF()) Value >>= 16; LLVM_FALLTHROUGH; case ARM::fixup_t2_movw_lo16: { @@ -885,11 +884,11 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) { void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, - bool IsPCRel) const { + bool IsResolved) const { unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind()); MCContext &Ctx = Asm.getContext(); - Value = adjustFixupValue(Asm, Fixup, Target, Value, IsPCRel, Ctx, - IsLittleEndian, true); + Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx, + IsLittleEndian); if (!Value) return; // Doesn't change encoding. diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h index 84b54bbb9a49..02374966dafe 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h @@ -42,13 +42,13 @@ public: const MCValue &Target) override; unsigned adjustFixupValue(const MCAssembler &Asm, const MCFixup &Fixup, - const MCValue &Target, uint64_t Value, bool IsPCRel, - MCContext &Ctx, bool IsLittleEndian, - bool IsResolved) const; + const MCValue &Target, uint64_t Value, + bool IsResolved, MCContext &Ctx, + bool IsLittleEndian) const; void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override; + uint64_t Value, bool IsResolved) const override; unsigned getRelaxedOpcode(unsigned Op) const; diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp index 0b6574c37de1..5709b4e61798 100644 --- a/lib/Target/ARM/Thumb1FrameLowering.cpp +++ b/lib/Target/ARM/Thumb1FrameLowering.cpp @@ -236,7 +236,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF, case ARM::R12: if (STI.splitFramePushPop(MF)) break; - // fallthough + LLVM_FALLTHROUGH; case ARM::R0: case ARM::R1: case ARM::R2: diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp index f0c7b11895b4..c058c9e1f534 100644 --- a/lib/Target/AVR/AVRAsmPrinter.cpp +++ b/lib/Target/AVR/AVRAsmPrinter.cpp @@ -149,7 +149,10 @@ bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, (void)MO; assert(MO.isReg() && "Unexpected inline asm memory operand"); - // TODO: We can look up the alternative name for the register if it's given. + // TODO: We should be able to look up the alternative name for + // the register if it's given. + // TableGen doesn't expose a way of getting retrieving names + // for registers. if (MI->getOperand(OpNum).getReg() == AVR::R31R30) { O << "Z"; } else { diff --git a/lib/Target/AVR/AVRDevices.td b/lib/Target/AVR/AVRDevices.td index 9224af613d14..62def4574437 100644 --- a/lib/Target/AVR/AVRDevices.td +++ b/lib/Target/AVR/AVRDevices.td @@ -6,7 +6,6 @@ // :TODO: We define all devices with SRAM to have all variants of LD/ST/LDD/STD. // In reality, avr1 (no SRAM) has one variant each of `LD` and `ST`. // avr2 (with SRAM) adds the rest of the variants. -// :TODO: s/AVRTiny/Tiny // A feature set aggregates features, grouping them. We don't want to create a @@ -136,7 +135,7 @@ def ELFArchAVR4 : ELFArch<"EF_AVR_ARCH_AVR4">; def ELFArchAVR5 : ELFArch<"EF_AVR_ARCH_AVR5">; def ELFArchAVR51 : ELFArch<"EF_AVR_ARCH_AVR51">; def ELFArchAVR6 : ELFArch<"EF_AVR_ARCH_AVR6">; -def ELFArchAVRTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">; +def ELFArchTiny : ELFArch<"EF_AVR_ARCH_AVRTINY">; def ELFArchXMEGA1 : ELFArch<"EF_AVR_ARCH_XMEGA1">; def ELFArchXMEGA2 : ELFArch<"EF_AVR_ARCH_XMEGA2">; def ELFArchXMEGA3 : ELFArch<"EF_AVR_ARCH_XMEGA3">; @@ -189,7 +188,7 @@ def FamilyAVR51 : Family<"avr51", def FamilyAVR6 : Family<"avr6", [FamilyAVR51]>; -def FamilyAVRTiny : Family<"avrtiny", +def FamilyTiny : Family<"avrtiny", [FamilyAVR0, FeatureBREAK, FeatureSRAM, FeatureTinyEncoding]>; @@ -240,7 +239,7 @@ def : Device<"avrxmega4", FamilyXMEGA, ELFArchXMEGA4>; def : Device<"avrxmega5", FamilyXMEGA, ELFArchXMEGA5>; def : Device<"avrxmega6", FamilyXMEGA, ELFArchXMEGA6>; def : Device<"avrxmega7", FamilyXMEGA, ELFArchXMEGA7>; -def : Device<"avrtiny", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"avrtiny", FamilyTiny, ELFArchTiny>; // Specific MCUs def : Device<"at90s1200", FamilyAVR0, ELFArchAVR1>; @@ -480,12 +479,12 @@ def : Device<"atxmega384d3", FamilyXMEGA, ELFArchXMEGA6>; def : Device<"atxmega128a1", FamilyXMEGA, ELFArchXMEGA7>; def : Device<"atxmega128a1u", FamilyXMEGAU, ELFArchXMEGA7>; def : Device<"atxmega128a4u", FamilyXMEGAU, ELFArchXMEGA7>; -def : Device<"attiny4", FamilyAVRTiny, ELFArchAVRTiny>; -def : Device<"attiny5", FamilyAVRTiny, ELFArchAVRTiny>; -def : Device<"attiny9", FamilyAVRTiny, ELFArchAVRTiny>; -def : Device<"attiny10", FamilyAVRTiny, ELFArchAVRTiny>; -def : Device<"attiny20", FamilyAVRTiny, ELFArchAVRTiny>; -def : Device<"attiny40", FamilyAVRTiny, ELFArchAVRTiny>; -def : Device<"attiny102", FamilyAVRTiny, ELFArchAVRTiny>; -def : Device<"attiny104", FamilyAVRTiny, ELFArchAVRTiny>; +def : Device<"attiny4", FamilyTiny, ELFArchTiny>; +def : Device<"attiny5", FamilyTiny, ELFArchTiny>; +def : Device<"attiny9", FamilyTiny, ELFArchTiny>; +def : Device<"attiny10", FamilyTiny, ELFArchTiny>; +def : Device<"attiny20", FamilyTiny, ELFArchTiny>; +def : Device<"attiny40", FamilyTiny, ELFArchTiny>; +def : Device<"attiny102", FamilyTiny, ELFArchTiny>; +def : Device<"attiny104", FamilyTiny, ELFArchTiny>; diff --git a/lib/Target/AVR/AVRInstrInfo.cpp b/lib/Target/AVR/AVRInstrInfo.cpp index afba66b2e69b..744aa723c416 100644 --- a/lib/Target/AVR/AVRInstrInfo.cpp +++ b/lib/Target/AVR/AVRInstrInfo.cpp @@ -402,7 +402,7 @@ unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB, ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded) const { - assert(!BytesAdded && "code size not handled"); + if (BytesAdded) *BytesAdded = 0; // Shouldn't be a fall through. assert(TBB && "insertBranch must not be told to insert a fallthrough"); @@ -411,19 +411,24 @@ unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB, if (Cond.empty()) { assert(!FBB && "Unconditional branch with multiple successors!"); - BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(TBB); + auto &MI = *BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(TBB); + if (BytesAdded) + *BytesAdded += getInstSizeInBytes(MI); return 1; } // Conditional branch. unsigned Count = 0; AVRCC::CondCodes CC = (AVRCC::CondCodes)Cond[0].getImm(); - BuildMI(&MBB, DL, getBrCond(CC)).addMBB(TBB); + auto &CondMI = *BuildMI(&MBB, DL, getBrCond(CC)).addMBB(TBB); + + if (BytesAdded) *BytesAdded += getInstSizeInBytes(CondMI); ++Count; if (FBB) { // Two-way Conditional branch. Insert the second branch. - BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(FBB); + auto &MI = *BuildMI(&MBB, DL, get(AVR::RJMPk)).addMBB(FBB); + if (BytesAdded) *BytesAdded += getInstSizeInBytes(MI); ++Count; } @@ -432,7 +437,7 @@ unsigned AVRInstrInfo::insertBranch(MachineBasicBlock &MBB, unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB, int *BytesRemoved) const { - assert(!BytesRemoved && "code size not handled"); + if (BytesRemoved) *BytesRemoved = 0; MachineBasicBlock::iterator I = MBB.end(); unsigned Count = 0; @@ -450,6 +455,7 @@ unsigned AVRInstrInfo::removeBranch(MachineBasicBlock &MBB, } // Remove the branch. + if (BytesRemoved) *BytesRemoved += getInstSizeInBytes(*I); I->eraseFromParent(); I = MBB.end(); ++Count; @@ -494,5 +500,61 @@ unsigned AVRInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const { } } +MachineBasicBlock * +AVRInstrInfo::getBranchDestBlock(const MachineInstr &MI) const { + switch (MI.getOpcode()) { + default: + llvm_unreachable("unexpected opcode!"); + case AVR::JMPk: + case AVR::CALLk: + case AVR::RCALLk: + case AVR::RJMPk: + case AVR::BREQk: + case AVR::BRNEk: + case AVR::BRSHk: + case AVR::BRLOk: + case AVR::BRMIk: + case AVR::BRPLk: + case AVR::BRGEk: + case AVR::BRLTk: + return MI.getOperand(0).getMBB(); + case AVR::BRBSsk: + case AVR::BRBCsk: + return MI.getOperand(1).getMBB(); + case AVR::SBRCRrB: + case AVR::SBRSRrB: + case AVR::SBICAb: + case AVR::SBISAb: + llvm_unreachable("unimplemented branch instructions"); + } +} + +bool AVRInstrInfo::isBranchOffsetInRange(unsigned BranchOp, + int64_t BrOffset) const { + + switch (BranchOp) { + default: + llvm_unreachable("unexpected opcode!"); + case AVR::JMPk: + case AVR::CALLk: + assert(BrOffset >= 0 && "offset must be absolute address"); + return isUIntN(16, BrOffset); + case AVR::RCALLk: + case AVR::RJMPk: + return isIntN(13, BrOffset); + case AVR::BRBSsk: + case AVR::BRBCsk: + case AVR::BREQk: + case AVR::BRNEk: + case AVR::BRSHk: + case AVR::BRLOk: + case AVR::BRMIk: + case AVR::BRPLk: + case AVR::BRGEk: + case AVR::BRLTk: + return isIntN(7, BrOffset); + } +} + } // end of namespace llvm diff --git a/lib/Target/AVR/AVRInstrInfo.h b/lib/Target/AVR/AVRInstrInfo.h index c5105dafe5eb..f42d34fb2848 100644 --- a/lib/Target/AVR/AVRInstrInfo.h +++ b/lib/Target/AVR/AVRInstrInfo.h @@ -103,6 +103,10 @@ public: bool reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override; + MachineBasicBlock *getBranchDestBlock(const MachineInstr &MI) const override; + + bool isBranchOffsetInRange(unsigned BranchOpc, + int64_t BrOffset) const override; private: const AVRRegisterInfo RI; }; diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td index 5dd8b2c27b21..184e4d53f7c8 100644 --- a/lib/Target/AVR/AVRInstrInfo.td +++ b/lib/Target/AVR/AVRInstrInfo.td @@ -1411,17 +1411,11 @@ hasSideEffects = 0 in def LPMRdZ : FLPMX<0, 0, (outs GPR8:$dst), - (ins ZREGS:$z), + (ins ZREG:$z), "lpm\t$dst, $z", []>, Requires<[HasLPMX]>; - def LPMWRdZ : Pseudo<(outs DREGS:$dst), - (ins ZREGS:$z), - "lpmw\t$dst, $z", - []>, - Requires<[HasLPMX]>; - // Load program memory, while postincrementing the Z register. let mayLoad = 1, Defs = [R31R30] in @@ -1429,13 +1423,19 @@ hasSideEffects = 0 in def LPMRdZPi : FLPMX<0, 1, (outs GPR8:$dst), - (ins ZREGS:$z), + (ins ZREG:$z), "lpm\t$dst, $z+", []>, Requires<[HasLPMX]>; + def LPMWRdZ : Pseudo<(outs DREGS:$dst), + (ins ZREG:$z), + "lpmw\t$dst, $z", + []>, + Requires<[HasLPMX]>; + def LPMWRdZPi : Pseudo<(outs DREGS:$dst), - (ins ZREGS:$z), + (ins ZREG:$z), "lpmw\t$dst, $z+", []>, Requires<[HasLPMX]>; @@ -1458,7 +1458,7 @@ hasSideEffects = 0 in def ELPMRdZ : FLPMX<1, 0, (outs GPR8:$dst), - (ins ZREGS:$z), + (ins ZREG:$z), "elpm\t$dst, $z", []>, Requires<[HasELPMX]>; @@ -1467,7 +1467,7 @@ hasSideEffects = 0 in def ELPMRdZPi : FLPMX<1, 1, (outs GPR8:$dst), - (ins ZREGS: $z), + (ins ZREG: $z), "elpm\t$dst, $z+", []>, Requires<[HasELPMX]>; @@ -1487,7 +1487,7 @@ let Uses = [R1, R0] in let Defs = [R31R30] in def SPMZPi : F16<0b1001010111111000, (outs), - (ins ZREGS:$z), + (ins ZREG:$z), "spm $z+", []>, Requires<[HasSPMX]>; @@ -1564,28 +1564,28 @@ hasSideEffects = 0 in // Read-Write-Modify (RMW) instructions. def XCHZRd : FZRd<0b100, (outs GPR8:$rd), - (ins ZREGS:$z), + (ins ZREG:$z), "xch\t$z, $rd", []>, Requires<[SupportsRMW]>; def LASZRd : FZRd<0b101, (outs GPR8:$rd), - (ins ZREGS:$z), + (ins ZREG:$z), "las\t$z, $rd", []>, Requires<[SupportsRMW]>; def LACZRd : FZRd<0b110, (outs GPR8:$rd), - (ins ZREGS:$z), + (ins ZREG:$z), "lac\t$z, $rd", []>, Requires<[SupportsRMW]>; def LATZRd : FZRd<0b111, (outs GPR8:$rd), - (ins ZREGS:$z), + (ins ZREG:$z), "lat\t$z, $rd", []>, Requires<[SupportsRMW]>; diff --git a/lib/Target/AVR/AVRMCInstLower.cpp b/lib/Target/AVR/AVRMCInstLower.cpp index 475dda420e89..dfefd09bc4b8 100644 --- a/lib/Target/AVR/AVRMCInstLower.cpp +++ b/lib/Target/AVR/AVRMCInstLower.cpp @@ -37,10 +37,22 @@ MCOperand AVRMCInstLower::lowerSymbolOperand(const MachineOperand &MO, Expr, MCConstantExpr::create(MO.getOffset(), Ctx), Ctx); } + bool IsFunction = MO.isGlobal() && isa<Function>(MO.getGlobal()); + if (TF & AVRII::MO_LO) { - Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_LO8, Expr, IsNegated, Ctx); + if (IsFunction) { + // N.B. Should we use _GS fixups here to cope with >128k progmem? + Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_PM_LO8, Expr, IsNegated, Ctx); + } else { + Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_LO8, Expr, IsNegated, Ctx); + } } else if (TF & AVRII::MO_HI) { - Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_HI8, Expr, IsNegated, Ctx); + if (IsFunction) { + // N.B. Should we use _GS fixups here to cope with >128k progmem? + Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_PM_HI8, Expr, IsNegated, Ctx); + } else { + Expr = AVRMCExpr::create(AVRMCExpr::VK_AVR_HI8, Expr, IsNegated, Ctx); + } } else if (TF != 0) { llvm_unreachable("Unknown target flag on symbol operand"); } diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index 55f3f5cf428a..249dc5512c28 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -95,7 +95,8 @@ AVRRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC, } /// Fold a frame offset shared between two add instructions into a single one. -static void foldFrameOffset(MachineInstr &MI, int &Offset, unsigned DstReg) { +static void foldFrameOffset(MachineBasicBlock::iterator &II, int &Offset, unsigned DstReg) { + MachineInstr &MI = *II; int Opcode = MI.getOpcode(); // Don't bother trying if the next instruction is not an add or a sub. @@ -120,6 +121,7 @@ static void foldFrameOffset(MachineInstr &MI, int &Offset, unsigned DstReg) { } // Finally remove the instruction. + II++; MI.eraseFromParent(); } @@ -158,6 +160,8 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, unsigned DstReg = MI.getOperand(0).getReg(); assert(DstReg != AVR::R29R28 && "Dest reg cannot be the frame pointer"); + II++; // Skip over the FRMIDX (and now MOVW) instruction. + // Generally, to load a frame address two add instructions are emitted that // could get folded into a single one: // movw r31:r30, r29:r28 @@ -166,7 +170,8 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, // to: // movw r31:r30, r29:r28 // adiw r31:r30, 45 - foldFrameOffset(*std::next(II), Offset, DstReg); + if (II != MBB.end()) + foldFrameOffset(II, Offset, DstReg); // Select the best opcode based on DstReg and the offset size. switch (DstReg) { @@ -187,7 +192,7 @@ void AVRRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II, } } - MachineInstr *New = BuildMI(MBB, std::next(II), dl, TII.get(Opcode), DstReg) + MachineInstr *New = BuildMI(MBB, II, dl, TII.get(Opcode), DstReg) .addReg(DstReg, RegState::Kill) .addImm(Offset); New->getOperand(3).setIsDead(); diff --git a/lib/Target/AVR/AVRRegisterInfo.td b/lib/Target/AVR/AVRRegisterInfo.td index 32650fc66751..8162f12052be 100644 --- a/lib/Target/AVR/AVRRegisterInfo.td +++ b/lib/Target/AVR/AVRRegisterInfo.td @@ -110,8 +110,6 @@ CoveredBySubRegs = 1 in // Register Classes //===----------------------------------------------------------------------===// -//:TODO: use proper set instructions instead of using always "add" - // Main 8-bit register class. def GPR8 : RegisterClass<"AVR", [i8], 8, ( @@ -199,14 +197,11 @@ def PTRDISPREGS : RegisterClass<"AVR", [i16], 8, // We have a bunch of instructions with an explicit Z register argument. We // model this using a register class containing only the Z register. -// :TODO: Rename to 'ZREG'. -def ZREGS : RegisterClass<"AVR", [i16], 8, (add R31R30)>; +def ZREG : RegisterClass<"AVR", [i16], 8, (add R31R30)>; // Register class used for the stack read pseudo instruction. def GPRSP: RegisterClass<"AVR", [i16], 8, (add SP)>; -//:TODO: if we remove this we get an error in tablegen -//:TODO: this is just a hack, remove it once add16 works! // Status register. def SREG : AVRReg<14, "FLAGS">, DwarfRegNum<[88]>; def CCR : RegisterClass<"AVR", [i8], 8, (add SREG)> diff --git a/lib/Target/AVR/AVRTargetMachine.cpp b/lib/Target/AVR/AVRTargetMachine.cpp index 91d2a8737b87..a9d61ffc952c 100644 --- a/lib/Target/AVR/AVRTargetMachine.cpp +++ b/lib/Target/AVR/AVRTargetMachine.cpp @@ -66,6 +66,7 @@ public: bool addInstSelector() override; void addPreSched2() override; + void addPreEmitPass() override; void addPreRegAlloc() override; }; } // namespace @@ -115,4 +116,9 @@ void AVRPassConfig::addPreSched2() { addPass(createAVRExpandPseudoPass()); } +void AVRPassConfig::addPreEmitPass() { + // Must run branch selection immediately preceding the asm printer. + addPass(&BranchRelaxationPassID); +} + } // end of namespace llvm diff --git a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp index cf52e552978f..5004736365c7 100644 --- a/lib/Target/AVR/AsmParser/AVRAsmParser.cpp +++ b/lib/Target/AVR/AsmParser/AVRAsmParser.cpp @@ -466,6 +466,7 @@ bool AVRAsmParser::parseOperand(OperandVector &Operands) { if (!tryParseRegisterOperand(Operands)) { return false; } + LLVM_FALLTHROUGH; case AsmToken::LParen: case AsmToken::Integer: case AsmToken::Dot: diff --git a/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp b/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp index 316b7836df0d..0f34b8e18ff9 100644 --- a/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp +++ b/lib/Target/AVR/InstPrinter/AVRInstPrinter.cpp @@ -106,7 +106,7 @@ void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, if (Op.isReg()) { bool isPtrReg = (MOI.RegClass == AVR::PTRREGSRegClassID) || (MOI.RegClass == AVR::PTRDISPREGSRegClassID) || - (MOI.RegClass == AVR::ZREGSRegClassID); + (MOI.RegClass == AVR::ZREGRegClassID); if (isPtrReg) { O << getRegisterName(Op.getReg(), AVR::ptr); diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp index 1e61eccf775f..6d126ed622aa 100644 --- a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp +++ b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp @@ -33,7 +33,7 @@ static unsigned getEFlagsForFeatureSet(const FeatureBitset &Features) { EFlags |= ELF::EF_AVR_ARCH_AVR51; else if (Features[AVR::ELFArchAVR6]) EFlags |= ELF::EF_AVR_ARCH_AVR6; - else if (Features[AVR::ELFArchAVRTiny]) + else if (Features[AVR::ELFArchTiny]) EFlags |= ELF::EF_AVR_ARCH_AVRTINY; else if (Features[AVR::ELFArchXMEGA1]) EFlags |= ELF::EF_AVR_ARCH_XMEGA1; diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp index 15e89fb2a261..9fc812cdef14 100644 --- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp +++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp @@ -29,7 +29,7 @@ public: void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override; + uint64_t Value, bool IsResolved) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -65,7 +65,7 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { void BPFAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, - bool IsPCRel) const { + bool IsResolved) const { if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) { assert(Value == 0); } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) { diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp index c19e636d79ca..d901abbd1692 100644 --- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp +++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp @@ -1413,6 +1413,7 @@ int HexagonAsmParser::processInstruction(MCInst &Inst, // Translate a "$Rx = CONST32(#imm)" to "$Rx = memw(gp+#LABEL) " case Hexagon::CONST32: is32bit = true; + LLVM_FALLTHROUGH; // Translate a "$Rx:y = CONST64(#imm)" to "$Rx:y = memd(gp+#LABEL) " case Hexagon::CONST64: // FIXME: need better way to detect AsmStreamer (upstream removed getKind()) diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp index 14c682c6df4b..b064778c4bbd 100644 --- a/lib/Target/Hexagon/HexagonBitSimplify.cpp +++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp @@ -1947,8 +1947,10 @@ bool BitSimplification::genStoreImmediate(MachineInstr *MI) { switch (Opc) { case Hexagon::S2_storeri_io: Align++; + LLVM_FALLTHROUGH; case Hexagon::S2_storerh_io: Align++; + LLVM_FALLTHROUGH; case Hexagon::S2_storerb_io: break; default: diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp index 730026121d3b..3de531088240 100644 --- a/lib/Target/Hexagon/HexagonBitTracker.cpp +++ b/lib/Target/Hexagon/HexagonBitTracker.cpp @@ -937,6 +937,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr &BI, case Hexagon::J2_jumpfnew: case Hexagon::J2_jumpfnewpt: Negated = true; + LLVM_FALLTHROUGH; case Hexagon::J2_jumpt: case Hexagon::J2_jumptpt: case Hexagon::J2_jumptnew: diff --git a/lib/Target/Hexagon/HexagonConstPropagation.cpp b/lib/Target/Hexagon/HexagonConstPropagation.cpp index aa68f6cfdfc1..49ddd6961f8a 100644 --- a/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -2244,6 +2244,7 @@ bool HexagonConstEvaluator::evaluate(const MachineInstr &BrI, case Hexagon::J2_jumpfnew: case Hexagon::J2_jumpfnewpt: Negated = true; + LLVM_FALLTHROUGH; case Hexagon::J2_jumpt: case Hexagon::J2_jumptnew: case Hexagon::J2_jumptnewpt: diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp index 97a53dcbaed7..c790579ccebc 100644 --- a/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -979,18 +979,6 @@ bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { if (MFI.hasCalls() || HMFI.hasClobberLR()) return true; - // Frame pointer elimination is a possiblility at this point, but - // to know if FP is necessary we need to know if spill/restore - // functions will be used (they require FP to be valid). - // This means that hasFP shouldn't really be called before CSI is - // calculated, and some measures are taken to make sure of that - // (e.g. default implementations of virtual functions that call it - // are overridden apropriately). - assert(MFI.isCalleeSavedInfoValid() && "Need to know CSI"); - const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo(); - if (useSpillFunction(MF, CSI) || useRestoreFunction(MF, CSI)) - return true; - return false; } @@ -2437,6 +2425,8 @@ bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF, const CSIVect &CSI) const { if (MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn()) return true; + if (!hasFP(MF)) + return true; if (!isOptSize(MF) && !isMinSize(MF)) if (MF.getTarget().getOptLevel() > CodeGenOpt::Default) return true; diff --git a/lib/Target/Hexagon/HexagonGenPredicate.cpp b/lib/Target/Hexagon/HexagonGenPredicate.cpp index f14c733dcf51..3470480d607d 100644 --- a/lib/Target/Hexagon/HexagonGenPredicate.cpp +++ b/lib/Target/Hexagon/HexagonGenPredicate.cpp @@ -334,6 +334,7 @@ bool HexagonGenPredicate::isScalarPred(Register PredReg) { if (MRI->getRegClass(PR.R) != PredRC) return false; // If it is a copy between two predicate registers, fall through. + LLVM_FALLTHROUGH; } case Hexagon::C2_and: case Hexagon::C2_andn: diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index e5f49ca77a91..0163b2e2bdc4 100644 --- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -241,22 +241,31 @@ void HexagonDAGToDAGISel::SelectIndexedLoad(LoadSDNode *LD, const SDLoc &dl) { case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: - if (isAlignedMemNode(LD)) - Opcode = IsValidInc ? Hexagon::V6_vL32b_pi : Hexagon::V6_vL32b_ai; - else + if (isAlignedMemNode(LD)) { + if (LD->isNonTemporal()) + Opcode = IsValidInc ? Hexagon::V6_vL32b_nt_pi : Hexagon::V6_vL32b_nt_ai; + else + Opcode = IsValidInc ? Hexagon::V6_vL32b_pi : Hexagon::V6_vL32b_ai; + } else { Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi : Hexagon::V6_vL32Ub_ai; + } break; // 128B case MVT::v128i8: case MVT::v64i16: case MVT::v32i32: case MVT::v16i64: - if (isAlignedMemNode(LD)) - Opcode = IsValidInc ? Hexagon::V6_vL32b_pi_128B - : Hexagon::V6_vL32b_ai_128B; - else + if (isAlignedMemNode(LD)) { + if (LD->isNonTemporal()) + Opcode = IsValidInc ? Hexagon::V6_vL32b_nt_pi_128B + : Hexagon::V6_vL32b_nt_ai_128B; + else + Opcode = IsValidInc ? Hexagon::V6_vL32b_pi_128B + : Hexagon::V6_vL32b_ai_128B; + } else { Opcode = IsValidInc ? Hexagon::V6_vL32Ub_pi_128B : Hexagon::V6_vL32Ub_ai_128B; + } break; default: llvm_unreachable("Unexpected memory type in indexed load"); @@ -529,22 +538,31 @@ void HexagonDAGToDAGISel::SelectIndexedStore(StoreSDNode *ST, const SDLoc &dl) { case MVT::v32i16: case MVT::v16i32: case MVT::v8i64: - if (isAlignedMemNode(ST)) - Opcode = IsValidInc ? Hexagon::V6_vS32b_pi : Hexagon::V6_vS32b_ai; - else + if (isAlignedMemNode(ST)) { + if (ST->isNonTemporal()) + Opcode = IsValidInc ? Hexagon::V6_vS32b_nt_pi : Hexagon::V6_vS32b_nt_ai; + else + Opcode = IsValidInc ? Hexagon::V6_vS32b_pi : Hexagon::V6_vS32b_ai; + } else { Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi : Hexagon::V6_vS32Ub_ai; + } break; // 128B case MVT::v128i8: case MVT::v64i16: case MVT::v32i32: case MVT::v16i64: - if (isAlignedMemNode(ST)) - Opcode = IsValidInc ? Hexagon::V6_vS32b_pi_128B - : Hexagon::V6_vS32b_ai_128B; - else + if (isAlignedMemNode(ST)) { + if (ST->isNonTemporal()) + Opcode = IsValidInc ? Hexagon::V6_vS32b_nt_pi_128B + : Hexagon::V6_vS32b_nt_ai_128B; + else + Opcode = IsValidInc ? Hexagon::V6_vS32b_pi_128B + : Hexagon::V6_vS32b_ai_128B; + } else { Opcode = IsValidInc ? Hexagon::V6_vS32Ub_pi_128B : Hexagon::V6_vS32Ub_ai_128B; + } break; default: llvm_unreachable("Unexpected memory type in indexed store"); diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 2daacf795555..67242764d453 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -716,6 +716,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, bool IsStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet(); MachineFunction &MF = DAG.getMachineFunction(); + MachineFrameInfo &MFI = MF.getFrameInfo(); auto PtrVT = getPointerTy(MF.getDataLayout()); // Check for varargs. @@ -832,7 +833,6 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, if (NeedsArgAlign && Subtarget.hasV60TOps()) { DEBUG(dbgs() << "Function needs byte stack align due to call args\n"); - MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo(); // V6 vectors passed by value have 64 or 128 byte alignment depending // on whether we are 64 byte vector mode or 128 byte. bool UseHVXDbl = Subtarget.useHVXDblOps(); @@ -916,10 +916,15 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Ops.push_back(Glue); if (IsTailCall) { - MF.getFrameInfo().setHasTailCall(); + MFI.setHasTailCall(); return DAG.getNode(HexagonISD::TC_RETURN, dl, NodeTys, Ops); } + // Set this here because we need to know this for "hasFP" in frame lowering. + // The target-independent code calls getFrameRegister before setting it, and + // getFrameRegister uses hasFP to determine whether the function has FP. + MFI.setHasCalls(true); + unsigned OpCode = DoesNotReturn ? HexagonISD::CALLnr : HexagonISD::CALL; Chain = DAG.getNode(OpCode, dl, NodeTys, Ops); Glue = Chain.getValue(1); @@ -1284,11 +1289,9 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { // Creates a SPLAT instruction for a constant value VAL. static SDValue createSplat(SelectionDAG &DAG, const SDLoc &dl, EVT VT, SDValue Val) { - if (VT.getSimpleVT() == MVT::v4i8) - return DAG.getNode(HexagonISD::VSPLATB, dl, VT, Val); - - if (VT.getSimpleVT() == MVT::v4i16) - return DAG.getNode(HexagonISD::VSPLATH, dl, VT, Val); + EVT T = VT.getVectorElementType(); + if (T == MVT::i8 || T == MVT::i16) + return DAG.getNode(HexagonISD::VSPLAT, dl, VT, Val); return SDValue(); } @@ -2296,32 +2299,13 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const { case HexagonISD::JT: return "HexagonISD::JT"; case HexagonISD::PACKHL: return "HexagonISD::PACKHL"; case HexagonISD::RET_FLAG: return "HexagonISD::RET_FLAG"; - case HexagonISD::SHUFFEB: return "HexagonISD::SHUFFEB"; - case HexagonISD::SHUFFEH: return "HexagonISD::SHUFFEH"; - case HexagonISD::SHUFFOB: return "HexagonISD::SHUFFOB"; - case HexagonISD::SHUFFOH: return "HexagonISD::SHUFFOH"; case HexagonISD::TC_RETURN: return "HexagonISD::TC_RETURN"; - case HexagonISD::VCMPBEQ: return "HexagonISD::VCMPBEQ"; - case HexagonISD::VCMPBGT: return "HexagonISD::VCMPBGT"; - case HexagonISD::VCMPBGTU: return "HexagonISD::VCMPBGTU"; - case HexagonISD::VCMPHEQ: return "HexagonISD::VCMPHEQ"; - case HexagonISD::VCMPHGT: return "HexagonISD::VCMPHGT"; - case HexagonISD::VCMPHGTU: return "HexagonISD::VCMPHGTU"; - case HexagonISD::VCMPWEQ: return "HexagonISD::VCMPWEQ"; - case HexagonISD::VCMPWGT: return "HexagonISD::VCMPWGT"; - case HexagonISD::VCMPWGTU: return "HexagonISD::VCMPWGTU"; case HexagonISD::VCOMBINE: return "HexagonISD::VCOMBINE"; case HexagonISD::VPACK: return "HexagonISD::VPACK"; - case HexagonISD::VSHLH: return "HexagonISD::VSHLH"; - case HexagonISD::VSHLW: return "HexagonISD::VSHLW"; - case HexagonISD::VSPLATB: return "HexagonISD::VSPLTB"; - case HexagonISD::VSPLATH: return "HexagonISD::VSPLATH"; - case HexagonISD::VSRAH: return "HexagonISD::VSRAH"; - case HexagonISD::VSRAW: return "HexagonISD::VSRAW"; - case HexagonISD::VSRLH: return "HexagonISD::VSRLH"; - case HexagonISD::VSRLW: return "HexagonISD::VSRLW"; - case HexagonISD::VSXTBH: return "HexagonISD::VSXTBH"; - case HexagonISD::VSXTBW: return "HexagonISD::VSXTBW"; + case HexagonISD::VASL: return "HexagonISD::VASL"; + case HexagonISD::VASR: return "HexagonISD::VASR"; + case HexagonISD::VLSR: return "HexagonISD::VLSR"; + case HexagonISD::VSPLAT: return "HexagonISD::VSPLAT"; case HexagonISD::READCYCLE: return "HexagonISD::READCYCLE"; case HexagonISD::OP_END: break; } @@ -2503,13 +2487,13 @@ HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const { if (VT.getSimpleVT() == MVT::v4i16) { switch (Op.getOpcode()) { case ISD::SRA: - Result = DAG.getNode(HexagonISD::VSRAH, dl, VT, V3, CommonSplat); + Result = DAG.getNode(HexagonISD::VASR, dl, VT, V3, CommonSplat); break; case ISD::SHL: - Result = DAG.getNode(HexagonISD::VSHLH, dl, VT, V3, CommonSplat); + Result = DAG.getNode(HexagonISD::VASL, dl, VT, V3, CommonSplat); break; case ISD::SRL: - Result = DAG.getNode(HexagonISD::VSRLH, dl, VT, V3, CommonSplat); + Result = DAG.getNode(HexagonISD::VLSR, dl, VT, V3, CommonSplat); break; default: return SDValue(); @@ -2517,13 +2501,13 @@ HexagonTargetLowering::LowerVECTOR_SHIFT(SDValue Op, SelectionDAG &DAG) const { } else if (VT.getSimpleVT() == MVT::v2i32) { switch (Op.getOpcode()) { case ISD::SRA: - Result = DAG.getNode(HexagonISD::VSRAW, dl, VT, V3, CommonSplat); + Result = DAG.getNode(HexagonISD::VASR, dl, VT, V3, CommonSplat); break; case ISD::SHL: - Result = DAG.getNode(HexagonISD::VSHLW, dl, VT, V3, CommonSplat); + Result = DAG.getNode(HexagonISD::VASL, dl, VT, V3, CommonSplat); break; case ISD::SRL: - Result = DAG.getNode(HexagonISD::VSRLW, dl, VT, V3, CommonSplat); + Result = DAG.getNode(HexagonISD::VLSR, dl, VT, V3, CommonSplat); break; default: return SDValue(); diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h index 1415156487c0..bfd2c94eeaba 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.h +++ b/lib/Target/Hexagon/HexagonISelLowering.h @@ -52,29 +52,10 @@ namespace HexagonISD { COMBINE, PACKHL, - VSPLATB, - VSPLATH, - SHUFFEB, - SHUFFEH, - SHUFFOB, - SHUFFOH, - VSXTBH, - VSXTBW, - VSRAW, - VSRAH, - VSRLW, - VSRLH, - VSHLW, - VSHLH, - VCMPBEQ, - VCMPBGT, - VCMPBGTU, - VCMPHEQ, - VCMPHGT, - VCMPHGTU, - VCMPWEQ, - VCMPWGT, - VCMPWGTU, + VSPLAT, + VASL, + VASR, + VLSR, INSERT, INSERTRP, diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp index 1eac2d3dd8e2..c77c669f4ca7 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.cpp +++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp @@ -250,15 +250,19 @@ unsigned HexagonInstrInfo::isLoadFromStackSlot(const MachineInstr &MI, case Hexagon::L2_loadri_io: case Hexagon::L2_loadrd_io: case Hexagon::V6_vL32b_ai: + case Hexagon::V6_vL32b_nt_ai: case Hexagon::V6_vL32b_ai_128B: + case Hexagon::V6_vL32b_nt_ai_128B: case Hexagon::V6_vL32Ub_ai: case Hexagon::V6_vL32Ub_ai_128B: case Hexagon::LDriw_pred: case Hexagon::LDriw_mod: case Hexagon::PS_vloadrq_ai: case Hexagon::PS_vloadrw_ai: + case Hexagon::PS_vloadrw_nt_ai: case Hexagon::PS_vloadrq_ai_128B: - case Hexagon::PS_vloadrw_ai_128B: { + case Hexagon::PS_vloadrw_ai_128B: + case Hexagon::PS_vloadrw_nt_ai_128B: { const MachineOperand OpFI = MI.getOperand(1); if (!OpFI.isFI()) return 0; @@ -1726,6 +1730,39 @@ bool HexagonInstrInfo::getIncrementValue(const MachineInstr &MI, return false; } +std::pair<unsigned, unsigned> +HexagonInstrInfo::decomposeMachineOperandsTargetFlags(unsigned TF) const { + return std::make_pair(TF & ~HexagonII::MO_Bitmasks, + TF & HexagonII::MO_Bitmasks); +} + +ArrayRef<std::pair<unsigned, const char*>> +HexagonInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { + using namespace HexagonII; + static const std::pair<unsigned, const char*> Flags[] = { + {MO_PCREL, "hexagon-pcrel"}, + {MO_GOT, "hexagon-got"}, + {MO_LO16, "hexagon-lo16"}, + {MO_HI16, "hexagon-hi16"}, + {MO_GPREL, "hexagon-gprel"}, + {MO_GDGOT, "hexagon-gdgot"}, + {MO_GDPLT, "hexagon-gdplt"}, + {MO_IE, "hexagon-ie"}, + {MO_IEGOT, "hexagon-iegot"}, + {MO_TPREL, "hexagon-tprel"} + }; + return makeArrayRef(Flags); +} + +ArrayRef<std::pair<unsigned, const char*>> +HexagonInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { + using namespace HexagonII; + static const std::pair<unsigned, const char*> Flags[] = { + {HMOTF_ConstExtended, "hexagon-ext"} + }; + return makeArrayRef(Flags); +} + unsigned HexagonInstrInfo::createVR(MachineFunction *MF, MVT VT) const { MachineRegisterInfo &MRI = MF->getRegInfo(); const TargetRegisterClass *TRC; @@ -1797,7 +1834,7 @@ bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const { const MachineOperand &MO = MI.getOperand(ExtOpNum); // Use MO operand flags to determine if MO // has the HMOTF_ConstExtended flag set. - if (MO.getTargetFlags() && HexagonII::HMOTF_ConstExtended) + if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended) return true; // If this is a Machine BB address we are talking about, and it is // not marked as extended, say so. @@ -1807,9 +1844,6 @@ bool HexagonInstrInfo::isConstExtended(const MachineInstr &MI) const { // We could be using an instruction with an extendable immediate and shoehorn // a global address into it. If it is a global address it will be constant // extended. We do this for COMBINE. - // We currently only handle isGlobal() because it is the only kind of - // object we are going to end up with here for now. - // In the future we probably should add isSymbol(), etc. if (MO.isGlobal() || MO.isSymbol() || MO.isBlockAddress() || MO.isJTI() || MO.isCPI() || MO.isFPImm()) return true; @@ -1961,11 +1995,9 @@ bool HexagonInstrInfo::isExtended(const MachineInstr &MI) const { return true; // Use MO operand flags to determine if one of MI's operands // has HMOTF_ConstExtended flag set. - for (MachineInstr::const_mop_iterator I = MI.operands_begin(), - E = MI.operands_end(); I != E; ++I) { - if (I->getTargetFlags() && HexagonII::HMOTF_ConstExtended) + for (const MachineOperand &MO : MI.operands()) + if (MO.getTargetFlags() & HexagonII::HMOTF_ConstExtended) return true; - } return false; } @@ -2445,20 +2477,28 @@ bool HexagonInstrInfo::isValidOffset(unsigned Opcode, int Offset, switch (Opcode) { case Hexagon::PS_vstorerq_ai: case Hexagon::PS_vstorerw_ai: + case Hexagon::PS_vstorerw_nt_ai: case Hexagon::PS_vloadrq_ai: case Hexagon::PS_vloadrw_ai: + case Hexagon::PS_vloadrw_nt_ai: case Hexagon::V6_vL32b_ai: case Hexagon::V6_vS32b_ai: + case Hexagon::V6_vL32b_nt_ai: + case Hexagon::V6_vS32b_nt_ai: case Hexagon::V6_vL32Ub_ai: case Hexagon::V6_vS32Ub_ai: return isShiftedInt<4,6>(Offset); case Hexagon::PS_vstorerq_ai_128B: case Hexagon::PS_vstorerw_ai_128B: + case Hexagon::PS_vstorerw_nt_ai_128B: case Hexagon::PS_vloadrq_ai_128B: case Hexagon::PS_vloadrw_ai_128B: + case Hexagon::PS_vloadrw_nt_ai_128B: case Hexagon::V6_vL32b_ai_128B: case Hexagon::V6_vS32b_ai_128B: + case Hexagon::V6_vL32b_nt_ai_128B: + case Hexagon::V6_vS32b_nt_ai_128B: case Hexagon::V6_vL32Ub_ai_128B: case Hexagon::V6_vS32Ub_ai_128B: return isShiftedInt<4,7>(Offset); @@ -3170,11 +3210,19 @@ int HexagonInstrInfo::getDotCurOp(const MachineInstr &MI) const { return Hexagon::V6_vL32b_cur_pi; case Hexagon::V6_vL32b_ai: return Hexagon::V6_vL32b_cur_ai; + case Hexagon::V6_vL32b_nt_pi: + return Hexagon::V6_vL32b_nt_cur_pi; + case Hexagon::V6_vL32b_nt_ai: + return Hexagon::V6_vL32b_nt_cur_ai; //128B case Hexagon::V6_vL32b_pi_128B: return Hexagon::V6_vL32b_cur_pi_128B; case Hexagon::V6_vL32b_ai_128B: return Hexagon::V6_vL32b_cur_ai_128B; + case Hexagon::V6_vL32b_nt_pi_128B: + return Hexagon::V6_vL32b_nt_cur_pi_128B; + case Hexagon::V6_vL32b_nt_ai_128B: + return Hexagon::V6_vL32b_nt_cur_ai_128B; } return 0; } @@ -3187,11 +3235,19 @@ int HexagonInstrInfo::getNonDotCurOp(const MachineInstr &MI) const { return Hexagon::V6_vL32b_pi; case Hexagon::V6_vL32b_cur_ai: return Hexagon::V6_vL32b_ai; + case Hexagon::V6_vL32b_nt_cur_pi: + return Hexagon::V6_vL32b_nt_pi; + case Hexagon::V6_vL32b_nt_cur_ai: + return Hexagon::V6_vL32b_nt_ai; //128B case Hexagon::V6_vL32b_cur_pi_128B: return Hexagon::V6_vL32b_pi_128B; case Hexagon::V6_vL32b_cur_ai_128B: return Hexagon::V6_vL32b_ai_128B; + case Hexagon::V6_vL32b_nt_cur_pi_128B: + return Hexagon::V6_vL32b_nt_pi_128B; + case Hexagon::V6_vL32b_nt_cur_ai_128B: + return Hexagon::V6_vL32b_nt_ai_128B; } return 0; } diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h index 944d0161a7c8..0436ce3ac475 100644 --- a/lib/Target/Hexagon/HexagonInstrInfo.h +++ b/lib/Target/Hexagon/HexagonInstrInfo.h @@ -301,6 +301,27 @@ public: const MachineInstr &UseMI, unsigned UseIdx) const override; + /// Decompose the machine operand's target flags into two values - the direct + /// target flag value and any of bit flags that are applied. + std::pair<unsigned, unsigned> + decomposeMachineOperandsTargetFlags(unsigned TF) const override; + + /// Return an array that contains the direct target flag values and their + /// names. + /// + /// MIR Serialization is able to serialize only the target flags that are + /// defined by this method. + ArrayRef<std::pair<unsigned, const char *>> + getSerializableDirectMachineOperandTargetFlags() const override; + + /// Return an array that contains the bitmask target flag values and their + /// names. + /// + /// MIR Serialization is able to serialize only the target flags that are + /// defined by this method. + ArrayRef<std::pair<unsigned, const char *>> + getSerializableBitmaskMachineOperandTargetFlags() const override; + bool isTailCall(const MachineInstr &MI) const override; /// HexagonInstrInfo specifics. diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp index 4602de979024..1a26805d190d 100644 --- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp +++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp @@ -49,7 +49,7 @@ static cl::opt<bool> CheckEarlyAvail("check-early-avail", cl::Hidden, using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" namespace { class HexagonCallMutation : public ScheduleDAGMutation { diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index 689419638f54..ba98b8994937 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -2770,6 +2770,9 @@ def unalignedstore : PatFrag<(ops node:$val, node:$addr), (store $val, $addr), [ multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { // Aligned stores + def : Pat<(alignednontemporalstore (VTSgl VectorRegs:$src1), IntRegs:$addr), + (V6_vS32b_nt_ai IntRegs:$addr, 0, (VTSgl VectorRegs:$src1))>, + Requires<[UseHVXSgl]>; def : Pat<(alignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr), (V6_vS32b_ai IntRegs:$addr, 0, (VTSgl VectorRegs:$src1))>, Requires<[UseHVXSgl]>; @@ -2778,6 +2781,9 @@ multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { Requires<[UseHVXSgl]>; // 128B Aligned stores + def : Pat<(alignednontemporalstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr), + (V6_vS32b_nt_ai_128B IntRegs:$addr, 0, (VTDbl VectorRegs128B:$src1))>, + Requires<[UseHVXDbl]>; def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1), IntRegs:$addr), (V6_vS32b_ai_128B IntRegs:$addr, 0, (VTDbl VectorRegs128B:$src1))>, Requires<[UseHVXDbl]>; @@ -2787,6 +2793,11 @@ multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { // Fold Add R+OFF into vector store. let AddedComplexity = 10 in { + def : Pat<(alignednontemporalstore (VTSgl VectorRegs:$src1), + (add IntRegs:$src2, Iss4_6:$offset)), + (V6_vS32b_nt_ai IntRegs:$src2, Iss4_6:$offset, + (VTSgl VectorRegs:$src1))>, + Requires<[UseHVXSgl]>; def : Pat<(alignedstore (VTSgl VectorRegs:$src1), (add IntRegs:$src2, Iss4_6:$offset)), (V6_vS32b_ai IntRegs:$src2, Iss4_6:$offset, @@ -2799,6 +2810,11 @@ multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { Requires<[UseHVXSgl]>; // Fold Add R+OFF into vector store 128B. + def : Pat<(alignednontemporalstore (VTDbl VectorRegs128B:$src1), + (add IntRegs:$src2, Iss4_7:$offset)), + (V6_vS32b_nt_ai_128B IntRegs:$src2, Iss4_7:$offset, + (VTDbl VectorRegs128B:$src1))>, + Requires<[UseHVXDbl]>; def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1), (add IntRegs:$src2, Iss4_7:$offset)), (V6_vS32b_ai_128B IntRegs:$src2, Iss4_7:$offset, @@ -2820,6 +2836,9 @@ defm : vS32b_ai_pats <v8i64, v16i64>; multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { // Aligned loads + def : Pat < (VTSgl (alignednontemporalload IntRegs:$addr)), + (V6_vL32b_nt_ai IntRegs:$addr, 0) >, + Requires<[UseHVXSgl]>; def : Pat < (VTSgl (alignedload IntRegs:$addr)), (V6_vL32b_ai IntRegs:$addr, 0) >, Requires<[UseHVXSgl]>; @@ -2828,6 +2847,9 @@ multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { Requires<[UseHVXSgl]>; // 128B Load + def : Pat < (VTDbl (alignednontemporalload IntRegs:$addr)), + (V6_vL32b_nt_ai_128B IntRegs:$addr, 0) >, + Requires<[UseHVXDbl]>; def : Pat < (VTDbl (alignedload IntRegs:$addr)), (V6_vL32b_ai_128B IntRegs:$addr, 0) >, Requires<[UseHVXDbl]>; @@ -2837,6 +2859,9 @@ multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { // Fold Add R+OFF into vector load. let AddedComplexity = 10 in { + def : Pat<(VTDbl (alignednontemporalload (add IntRegs:$src2, Iss4_7:$offset))), + (V6_vL32b_nt_ai_128B IntRegs:$src2, Iss4_7:$offset)>, + Requires<[UseHVXDbl]>; def : Pat<(VTDbl (alignedload (add IntRegs:$src2, Iss4_7:$offset))), (V6_vL32b_ai_128B IntRegs:$src2, Iss4_7:$offset)>, Requires<[UseHVXDbl]>; @@ -2844,6 +2869,9 @@ multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> { (V6_vL32Ub_ai_128B IntRegs:$src2, Iss4_7:$offset)>, Requires<[UseHVXDbl]>; + def : Pat<(VTSgl (alignednontemporalload (add IntRegs:$src2, Iss4_6:$offset))), + (V6_vL32b_nt_ai IntRegs:$src2, Iss4_6:$offset)>, + Requires<[UseHVXSgl]>; def : Pat<(VTSgl (alignedload (add IntRegs:$src2, Iss4_6:$offset))), (V6_vL32b_ai IntRegs:$src2, Iss4_6:$offset)>, Requires<[UseHVXSgl]>; @@ -2859,6 +2887,9 @@ defm : vL32b_ai_pats <v16i32, v32i32>; defm : vL32b_ai_pats <v8i64, v16i64>; multiclass STrivv_pats <ValueType VTSgl, ValueType VTDbl> { + def : Pat<(alignednontemporalstore (VTSgl VecDblRegs:$src1), IntRegs:$addr), + (PS_vstorerw_nt_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>, + Requires<[UseHVXSgl]>; def : Pat<(alignedstore (VTSgl VecDblRegs:$src1), IntRegs:$addr), (PS_vstorerw_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>, Requires<[UseHVXSgl]>; @@ -2866,6 +2897,10 @@ multiclass STrivv_pats <ValueType VTSgl, ValueType VTDbl> { (PS_vstorerwu_ai IntRegs:$addr, 0, (VTSgl VecDblRegs:$src1))>, Requires<[UseHVXSgl]>; + def : Pat<(alignednontemporalstore (VTDbl VecDblRegs128B:$src1), IntRegs:$addr), + (PS_vstorerw_nt_ai_128B IntRegs:$addr, 0, + (VTDbl VecDblRegs128B:$src1))>, + Requires<[UseHVXDbl]>; def : Pat<(alignedstore (VTDbl VecDblRegs128B:$src1), IntRegs:$addr), (PS_vstorerw_ai_128B IntRegs:$addr, 0, (VTDbl VecDblRegs128B:$src1))>, @@ -2882,6 +2917,9 @@ defm : STrivv_pats <v32i32, v64i32>; defm : STrivv_pats <v16i64, v32i64>; multiclass LDrivv_pats <ValueType VTSgl, ValueType VTDbl> { + def : Pat<(VTSgl (alignednontemporalload I32:$addr)), + (PS_vloadrw_nt_ai I32:$addr, 0)>, + Requires<[UseHVXSgl]>; def : Pat<(VTSgl (alignedload I32:$addr)), (PS_vloadrw_ai I32:$addr, 0)>, Requires<[UseHVXSgl]>; @@ -2889,6 +2927,9 @@ multiclass LDrivv_pats <ValueType VTSgl, ValueType VTDbl> { (PS_vloadrwu_ai I32:$addr, 0)>, Requires<[UseHVXSgl]>; + def : Pat<(VTDbl (alignednontemporalload I32:$addr)), + (PS_vloadrw_nt_ai_128B I32:$addr, 0)>, + Requires<[UseHVXDbl]>; def : Pat<(VTDbl (alignedload I32:$addr)), (PS_vloadrw_ai_128B I32:$addr, 0)>, Requires<[UseHVXDbl]>; @@ -3021,16 +3062,16 @@ def : Pat<(v2i16 (add (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))), def : Pat<(v2i16 (sub (v2i16 IntRegs:$src1), (v2i16 IntRegs:$src2))), (A2_svsubh IntRegs:$src1, IntRegs:$src2)>; -def HexagonVSPLATB: SDNode<"HexagonISD::VSPLATB", SDTUnaryOp>; -def HexagonVSPLATH: SDNode<"HexagonISD::VSPLATH", SDTUnaryOp>; +def SDTHexagonVSPLAT: SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; +def HexagonVSPLAT: SDNode<"HexagonISD::VSPLAT", SDTHexagonVSPLAT>; // Replicate the low 8-bits from 32-bits input register into each of the // four bytes of 32-bits destination register. -def: Pat<(v4i8 (HexagonVSPLATB I32:$Rs)), (S2_vsplatrb I32:$Rs)>; +def: Pat<(v4i8 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrb I32:$Rs)>; // Replicate the low 16-bits from 32-bits input register into each of the // four halfwords of 64-bits destination register. -def: Pat<(v4i16 (HexagonVSPLATH I32:$Rs)), (S2_vsplatrh I32:$Rs)>; +def: Pat<(v4i16 (HexagonVSPLAT I32:$Rs)), (S2_vsplatrh I32:$Rs)>; class VArith_pat <InstHexagon MI, SDNode Op, PatFrag Type> @@ -3068,84 +3109,44 @@ def: Pat<(v2i32 (shl V2I32:$b, (i64 (HexagonCOMBINE (i32 u5_0ImmPred:$c), (i32 u5_0ImmPred:$c))))), (S2_asl_i_vw V2I32:$b, imm:$c)>; -def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))), +def: Pat<(v4i16 (sra V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))), (S2_asr_i_vh V4I16:$b, imm:$c)>; -def: Pat<(v4i16 (srl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))), +def: Pat<(v4i16 (srl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))), (S2_lsr_i_vh V4I16:$b, imm:$c)>; -def: Pat<(v4i16 (shl V4I16:$b, (v4i16 (HexagonVSPLATH (i32 (u4_0ImmPred:$c)))))), +def: Pat<(v4i16 (shl V4I16:$b, (v4i16 (HexagonVSPLAT u4_0ImmPred:$c)))), (S2_asl_i_vh V4I16:$b, imm:$c)>; -def SDTHexagon_v2i32_v2i32_i32 : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisVT<0, v2i32>, SDTCisInt<2>]>; -def SDTHexagon_v4i16_v4i16_i32 : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisVT<0, v4i16>, SDTCisInt<2>]>; +def SDTHexagonVShift + : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>, SDTCisVec<0>, SDTCisVT<2, i32>]>; -def HexagonVSRAW: SDNode<"HexagonISD::VSRAW", SDTHexagon_v2i32_v2i32_i32>; -def HexagonVSRAH: SDNode<"HexagonISD::VSRAH", SDTHexagon_v4i16_v4i16_i32>; -def HexagonVSRLW: SDNode<"HexagonISD::VSRLW", SDTHexagon_v2i32_v2i32_i32>; -def HexagonVSRLH: SDNode<"HexagonISD::VSRLH", SDTHexagon_v4i16_v4i16_i32>; -def HexagonVSHLW: SDNode<"HexagonISD::VSHLW", SDTHexagon_v2i32_v2i32_i32>; -def HexagonVSHLH: SDNode<"HexagonISD::VSHLH", SDTHexagon_v4i16_v4i16_i32>; +def HexagonVASL: SDNode<"HexagonISD::VASL", SDTHexagonVShift>; +def HexagonVASR: SDNode<"HexagonISD::VASR", SDTHexagonVShift>; +def HexagonVLSR: SDNode<"HexagonISD::VLSR", SDTHexagonVShift>; -def: Pat<(v2i32 (HexagonVSRAW V2I32:$Rs, u5_0ImmPred:$u5)), +def: Pat<(v2i32 (HexagonVASL V2I32:$Rs, u5_0ImmPred:$u5)), + (S2_asl_i_vw V2I32:$Rs, imm:$u5)>; +def: Pat<(v4i16 (HexagonVASL V4I16:$Rs, u4_0ImmPred:$u4)), + (S2_asl_i_vh V4I16:$Rs, imm:$u4)>; +def: Pat<(v2i32 (HexagonVASR V2I32:$Rs, u5_0ImmPred:$u5)), (S2_asr_i_vw V2I32:$Rs, imm:$u5)>; -def: Pat<(v4i16 (HexagonVSRAH V4I16:$Rs, u4_0ImmPred:$u4)), +def: Pat<(v4i16 (HexagonVASR V4I16:$Rs, u4_0ImmPred:$u4)), (S2_asr_i_vh V4I16:$Rs, imm:$u4)>; -def: Pat<(v2i32 (HexagonVSRLW V2I32:$Rs, u5_0ImmPred:$u5)), +def: Pat<(v2i32 (HexagonVLSR V2I32:$Rs, u5_0ImmPred:$u5)), (S2_lsr_i_vw V2I32:$Rs, imm:$u5)>; -def: Pat<(v4i16 (HexagonVSRLH V4I16:$Rs, u4_0ImmPred:$u4)), +def: Pat<(v4i16 (HexagonVLSR V4I16:$Rs, u4_0ImmPred:$u4)), (S2_lsr_i_vh V4I16:$Rs, imm:$u4)>; -def: Pat<(v2i32 (HexagonVSHLW V2I32:$Rs, u5_0ImmPred:$u5)), - (S2_asl_i_vw V2I32:$Rs, imm:$u5)>; -def: Pat<(v4i16 (HexagonVSHLH V4I16:$Rs, u4_0ImmPred:$u4)), - (S2_asl_i_vh V4I16:$Rs, imm:$u4)>; class vshift_rr_pat<InstHexagon MI, SDNode Op, PatFrag Value> : Pat <(Op Value:$Rs, I32:$Rt), (MI Value:$Rs, I32:$Rt)>; -def: vshift_rr_pat <S2_asr_r_vw, HexagonVSRAW, V2I32>; -def: vshift_rr_pat <S2_asr_r_vh, HexagonVSRAH, V4I16>; -def: vshift_rr_pat <S2_lsr_r_vw, HexagonVSRLW, V2I32>; -def: vshift_rr_pat <S2_lsr_r_vh, HexagonVSRLH, V4I16>; -def: vshift_rr_pat <S2_asl_r_vw, HexagonVSHLW, V2I32>; -def: vshift_rr_pat <S2_asl_r_vh, HexagonVSHLH, V4I16>; - - -def SDTHexagonVecCompare_v8i8 : SDTypeProfile<1, 2, - [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v8i8>]>; -def SDTHexagonVecCompare_v4i16 : SDTypeProfile<1, 2, - [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v4i16>]>; -def SDTHexagonVecCompare_v2i32 : SDTypeProfile<1, 2, - [SDTCisSameAs<1, 2>, SDTCisVT<0, i1>, SDTCisVT<1, v2i32>]>; - -def HexagonVCMPBEQ: SDNode<"HexagonISD::VCMPBEQ", SDTHexagonVecCompare_v8i8>; -def HexagonVCMPBGT: SDNode<"HexagonISD::VCMPBGT", SDTHexagonVecCompare_v8i8>; -def HexagonVCMPBGTU: SDNode<"HexagonISD::VCMPBGTU", SDTHexagonVecCompare_v8i8>; -def HexagonVCMPHEQ: SDNode<"HexagonISD::VCMPHEQ", SDTHexagonVecCompare_v4i16>; -def HexagonVCMPHGT: SDNode<"HexagonISD::VCMPHGT", SDTHexagonVecCompare_v4i16>; -def HexagonVCMPHGTU: SDNode<"HexagonISD::VCMPHGTU", SDTHexagonVecCompare_v4i16>; -def HexagonVCMPWEQ: SDNode<"HexagonISD::VCMPWEQ", SDTHexagonVecCompare_v2i32>; -def HexagonVCMPWGT: SDNode<"HexagonISD::VCMPWGT", SDTHexagonVecCompare_v2i32>; -def HexagonVCMPWGTU: SDNode<"HexagonISD::VCMPWGTU", SDTHexagonVecCompare_v2i32>; - - -class vcmp_i1_pat<InstHexagon MI, SDNode Op, PatFrag Value> - : Pat <(i1 (Op Value:$Rs, Value:$Rt)), - (MI Value:$Rs, Value:$Rt)>; - -def: vcmp_i1_pat<A2_vcmpbeq, HexagonVCMPBEQ, V8I8>; -def: vcmp_i1_pat<A4_vcmpbgt, HexagonVCMPBGT, V8I8>; -def: vcmp_i1_pat<A2_vcmpbgtu, HexagonVCMPBGTU, V8I8>; - -def: vcmp_i1_pat<A2_vcmpheq, HexagonVCMPHEQ, V4I16>; -def: vcmp_i1_pat<A2_vcmphgt, HexagonVCMPHGT, V4I16>; -def: vcmp_i1_pat<A2_vcmphgtu, HexagonVCMPHGTU, V4I16>; - -def: vcmp_i1_pat<A2_vcmpweq, HexagonVCMPWEQ, V2I32>; -def: vcmp_i1_pat<A2_vcmpwgt, HexagonVCMPWGT, V2I32>; -def: vcmp_i1_pat<A2_vcmpwgtu, HexagonVCMPWGTU, V2I32>; +def: vshift_rr_pat <S2_asl_r_vw, HexagonVASL, V2I32>; +def: vshift_rr_pat <S2_asl_r_vh, HexagonVASL, V4I16>; +def: vshift_rr_pat <S2_asr_r_vw, HexagonVASR, V2I32>; +def: vshift_rr_pat <S2_asr_r_vh, HexagonVASR, V4I16>; +def: vshift_rr_pat <S2_lsr_r_vw, HexagonVLSR, V2I32>; +def: vshift_rr_pat <S2_lsr_r_vh, HexagonVLSR, V4I16>; class vcmp_vi1_pat<InstHexagon MI, PatFrag Op, PatFrag InVal, ValueType OutTy> @@ -3255,13 +3256,6 @@ def: Pat<(v4i8 (trunc V4I16:$Rs)), def: Pat<(v2i16 (trunc V2I32:$Rs)), (LoReg (S2_packhl (HiReg $Rs), (LoReg $Rs)))>; - -def HexagonVSXTBH : SDNode<"HexagonISD::VSXTBH", SDTUnaryOp>; -def HexagonVSXTBW : SDNode<"HexagonISD::VSXTBW", SDTUnaryOp>; - -def: Pat<(i64 (HexagonVSXTBH I32:$Rs)), (S2_vsxtbh I32:$Rs)>; -def: Pat<(i64 (HexagonVSXTBW I32:$Rs)), (S2_vsxthw I32:$Rs)>; - def: Pat<(v4i16 (zext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>; def: Pat<(v2i32 (zext V2I16:$Rs)), (S2_vzxthw V2I16:$Rs)>; def: Pat<(v4i16 (anyext V4I8:$Rs)), (S2_vzxtbh V4I8:$Rs)>; @@ -3322,31 +3316,6 @@ def: Pat<(v8i8 (mul V8I8:$Rs, V8I8:$Rt)), (A2_combinew (S2_vtrunehb (VMPYB_no_V5 (HiReg $Rs), (HiReg $Rt))), (S2_vtrunehb (VMPYB_no_V5 (LoReg $Rs), (LoReg $Rt))))>; -def SDTHexagonBinOp64 : SDTypeProfile<1, 2, - [SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisVT<0, i64>]>; - -def HexagonSHUFFEB: SDNode<"HexagonISD::SHUFFEB", SDTHexagonBinOp64>; -def HexagonSHUFFEH: SDNode<"HexagonISD::SHUFFEH", SDTHexagonBinOp64>; -def HexagonSHUFFOB: SDNode<"HexagonISD::SHUFFOB", SDTHexagonBinOp64>; -def HexagonSHUFFOH: SDNode<"HexagonISD::SHUFFOH", SDTHexagonBinOp64>; - -class ShufflePat<InstHexagon MI, SDNode Op> - : Pat<(i64 (Op DoubleRegs:$src1, DoubleRegs:$src2)), - (i64 (MI DoubleRegs:$src1, DoubleRegs:$src2))>; - -// Shuffles even bytes for i=0..3: A[2*i].b = C[2*i].b; A[2*i+1].b = B[2*i].b -def: ShufflePat<S2_shuffeb, HexagonSHUFFEB>; - -// Shuffles odd bytes for i=0..3: A[2*i].b = C[2*i+1].b; A[2*i+1].b = B[2*i+1].b -def: ShufflePat<S2_shuffob, HexagonSHUFFOB>; - -// Shuffles even half for i=0,1: A[2*i].h = C[2*i].h; A[2*i+1].h = B[2*i].h -def: ShufflePat<S2_shuffeh, HexagonSHUFFEH>; - -// Shuffles odd half for i=0,1: A[2*i].h = C[2*i+1].h; A[2*i+1].h = B[2*i+1].h -def: ShufflePat<S2_shuffoh, HexagonSHUFFOH>; - - // Truncated store from v4i16 to v4i8. def truncstorev4i8: PatFrag<(ops node:$val, node:$ptr), (truncstore node:$val, node:$ptr), diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td index 93fb688fc1c0..b42c1ab975a8 100644 --- a/lib/Target/Hexagon/HexagonPseudo.td +++ b/lib/Target/Hexagon/HexagonPseudo.td @@ -407,6 +407,11 @@ def PS_vstorerw_ai: STrivv_template<VecDblRegs, V6_vS32b_ai>, def PS_vstorerw_ai_128B: STrivv_template<VecDblRegs128B, V6_vS32b_ai_128B>, Requires<[HasV60T,UseHVXDbl]>; +def PS_vstorerw_nt_ai: STrivv_template<VecDblRegs, V6_vS32b_nt_ai>, + Requires<[HasV60T,UseHVXSgl]>; +def PS_vstorerw_nt_ai_128B: STrivv_template<VecDblRegs128B, V6_vS32b_nt_ai_128B>, + Requires<[HasV60T,UseHVXDbl]>; + def PS_vstorerwu_ai: STrivv_template<VecDblRegs, V6_vS32Ub_ai>, Requires<[HasV60T,UseHVXSgl]>; def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B, V6_vS32Ub_ai_128B>, @@ -433,6 +438,11 @@ def PS_vloadrw_ai: LDrivv_template<VecDblRegs, V6_vL32b_ai>, def PS_vloadrw_ai_128B: LDrivv_template<VecDblRegs128B, V6_vL32b_ai_128B>, Requires<[HasV60T,UseHVXDbl]>; +def PS_vloadrw_nt_ai: LDrivv_template<VecDblRegs, V6_vL32b_nt_ai>, + Requires<[HasV60T,UseHVXSgl]>; +def PS_vloadrw_nt_ai_128B: LDrivv_template<VecDblRegs128B, V6_vL32b_nt_ai_128B>, + Requires<[HasV60T,UseHVXDbl]>; + def PS_vloadrwu_ai: LDrivv_template<VecDblRegs, V6_vL32Ub_ai>, Requires<[HasV60T,UseHVXSgl]>; def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B, V6_vL32Ub_ai_128B>, diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp index db268b78cd73..4fa929a20810 100644 --- a/lib/Target/Hexagon/HexagonSplitDouble.cpp +++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp @@ -350,6 +350,8 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const { MI->getOperand(2).getImm()); case Hexagon::A4_combineri: ImmX++; + // Fall through into A4_combineir. + LLVM_FALLTHROUGH; case Hexagon::A4_combineir: { ImmX++; int64_t V = MI->getOperand(ImmX).getImm(); diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp index 76d9b31b005f..7d88b51f32dd 100644 --- a/lib/Target/Hexagon/HexagonTargetMachine.cpp +++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp @@ -110,10 +110,11 @@ SchedCustomRegistry("hexagon", "Run Hexagon's custom scheduler", namespace llvm { extern char &HexagonExpandCondsetsID; void initializeHexagonExpandCondsetsPass(PassRegistry&); - void initializeHexagonLoopIdiomRecognizePass(PassRegistry&); void initializeHexagonGenMuxPass(PassRegistry&); - void initializeHexagonOptAddrModePass(PassRegistry&); + void initializeHexagonLoopIdiomRecognizePass(PassRegistry&); void initializeHexagonNewValueJumpPass(PassRegistry&); + void initializeHexagonOptAddrModePass(PassRegistry&); + void initializeHexagonPacketizerPass(PassRegistry&); Pass *createHexagonLoopIdiomPass(); FunctionPass *createHexagonBitSimplify(); @@ -156,10 +157,11 @@ extern "C" void LLVMInitializeHexagonTarget() { RegisterTargetMachine<HexagonTargetMachine> X(getTheHexagonTarget()); PassRegistry &PR = *PassRegistry::getPassRegistry(); - initializeHexagonLoopIdiomRecognizePass(PR); initializeHexagonGenMuxPass(PR); - initializeHexagonOptAddrModePass(PR); + initializeHexagonLoopIdiomRecognizePass(PR); initializeHexagonNewValueJumpPass(PR); + initializeHexagonOptAddrModePass(PR); + initializeHexagonPacketizerPass(PR); } HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT, diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp index 7667bfb7a0eb..a3021e3dfe43 100644 --- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp +++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp @@ -60,9 +60,7 @@ namespace { class HexagonPacketizer : public MachineFunctionPass { public: static char ID; - HexagonPacketizer() : MachineFunctionPass(ID) { - initializeHexagonPacketizerPass(*PassRegistry::getPassRegistry()); - } + HexagonPacketizer() : MachineFunctionPass(ID) {} void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); @@ -89,14 +87,14 @@ namespace { char HexagonPacketizer::ID = 0; } -INITIALIZE_PASS_BEGIN(HexagonPacketizer, "packets", "Hexagon Packetizer", - false, false) +INITIALIZE_PASS_BEGIN(HexagonPacketizer, "hexagon-packetizer", + "Hexagon Packetizer", false, false) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) -INITIALIZE_PASS_END(HexagonPacketizer, "packets", "Hexagon Packetizer", - false, false) +INITIALIZE_PASS_END(HexagonPacketizer, "hexagon-packetizer", + "Hexagon Packetizer", false, false) HexagonPacketizerList::HexagonPacketizerList(MachineFunction &MF, MachineLoopInfo &MLI, AliasAnalysis *AA, diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp index 34d0b55aa22a..2a0edda8dcee 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp @@ -412,7 +412,7 @@ public: /// fixup kind as appropriate. void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t FixupValue, bool IsPCRel) const override { + uint64_t FixupValue, bool IsResolved) const override { // When FixupValue is 0 the relocation is external and there // is nothing for us to do. @@ -442,6 +442,7 @@ public: case fixup_Hexagon_B7_PCREL: if (!(isIntN(7, sValue))) HandleFixupError(7, 2, (int64_t)FixupValue, "B7_PCREL"); + LLVM_FALLTHROUGH; case fixup_Hexagon_B7_PCREL_X: InstMask = 0x00001f18; // Word32_B7 Reloc = (((Value >> 2) & 0x1f) << 8) | // Value 6-2 = Target 12-8 @@ -451,6 +452,7 @@ public: case fixup_Hexagon_B9_PCREL: if (!(isIntN(9, sValue))) HandleFixupError(9, 2, (int64_t)FixupValue, "B9_PCREL"); + LLVM_FALLTHROUGH; case fixup_Hexagon_B9_PCREL_X: InstMask = 0x003000fe; // Word32_B9 Reloc = (((Value >> 7) & 0x3) << 20) | // Value 8-7 = Target 21-20 @@ -462,6 +464,7 @@ public: case fixup_Hexagon_B13_PCREL: if (!(isIntN(13, sValue))) HandleFixupError(13, 2, (int64_t)FixupValue, "B13_PCREL"); + LLVM_FALLTHROUGH; case fixup_Hexagon_B13_PCREL_X: InstMask = 0x00202ffe; // Word32_B13 Reloc = (((Value >> 12) & 0x1) << 21) | // Value 12 = Target 21 @@ -472,6 +475,7 @@ public: case fixup_Hexagon_B15_PCREL: if (!(isIntN(15, sValue))) HandleFixupError(15, 2, (int64_t)FixupValue, "B15_PCREL"); + LLVM_FALLTHROUGH; case fixup_Hexagon_B15_PCREL_X: InstMask = 0x00df20fe; // Word32_B15 Reloc = (((Value >> 13) & 0x3) << 22) | // Value 14-13 = Target 23-22 @@ -483,6 +487,7 @@ public: case fixup_Hexagon_B22_PCREL: if (!(isIntN(22, sValue))) HandleFixupError(22, 2, (int64_t)FixupValue, "B22_PCREL"); + LLVM_FALLTHROUGH; case fixup_Hexagon_B22_PCREL_X: InstMask = 0x01ff3ffe; // Word32_B22 Reloc = (((Value >> 13) & 0x1ff) << 16) | // Value 21-13 = Target 24-16 diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h index d8009c5da08e..7f90e83fc8e9 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h @@ -169,8 +169,11 @@ namespace HexagonII { // Hexagon specific MO operand flag mask. enum HexagonMOTargetFlagVal { - //===------------------------------------------------------------------===// - // Hexagon Specific MachineOperand flags. + // Hexagon-specific MachineOperand target flags. + // + // When chaning these, make sure to update + // getSerializableDirectMachineOperandTargetFlags and + // getSerializableBitmaskMachineOperandTargetFlags if needed. MO_NO_FLAG, /// MO_PCREL - On a symbol operand, indicates a PC-relative relocation @@ -207,10 +210,12 @@ namespace HexagonII { MO_TPREL, // HMOTF_ConstExtended - // Addendum to abovem, indicates a const extended op + // Addendum to above, indicates a const extended op // Can be used as a mask. - HMOTF_ConstExtended = 0x80 + HMOTF_ConstExtended = 0x80, + // Union of all bitmasks (currently only HMOTF_ConstExtended). + MO_Bitmasks = HMOTF_ConstExtended }; // Hexagon Sub-instruction classes. diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp index 564d43b45cb8..1604e7c8dc54 100644 --- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp +++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp @@ -259,6 +259,7 @@ bool HexagonShuffler::check() { break; case HexagonII::TypeCVI_VM_VP_LDU: ++onlyNo1; + LLVM_FALLTHROUGH; case HexagonII::TypeCVI_VM_LD: case HexagonII::TypeCVI_VM_TMP_LD: case HexagonII::TypeLD: @@ -274,6 +275,7 @@ bool HexagonShuffler::check() { break; case HexagonII::TypeCVI_VM_STU: ++onlyNo1; + LLVM_FALLTHROUGH; case HexagonII::TypeCVI_VM_ST: case HexagonII::TypeCVI_VM_NEW_ST: case HexagonII::TypeST: diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp index 72e471f5766e..1394ac7210f2 100644 --- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp +++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp @@ -787,6 +787,7 @@ std::unique_ptr<LanaiOperand> LanaiAsmParser::parseImmediate() { case AsmToken::Dot: if (!Parser.parseExpression(ExprVal)) return LanaiOperand::createImm(ExprVal, Start, End); + LLVM_FALLTHROUGH; default: return nullptr; } diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp index c212726113ab..bbce5f670c99 100644 --- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp +++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp @@ -51,7 +51,7 @@ public: void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override; + uint64_t Value, bool IsResolved) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -92,7 +92,7 @@ bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { void LanaiAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, - bool /*IsPCRel*/) const { + bool /*IsResolved*/) const { MCFixupKind Kind = Fixup.getKind(); Value = adjustFixupValue(static_cast<unsigned>(Kind), Value); diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp index 69b1ba1528d0..b72c9d534478 100644 --- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp +++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp @@ -304,6 +304,9 @@ class MipsAsmParser : public MCTargetAsmParser { bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, const MCSubtargetInfo *STI); + bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI); + bool reportParseError(Twine ErrorMsg); bool reportParseError(SMLoc Loc, Twine ErrorMsg); @@ -343,6 +346,8 @@ class MipsAsmParser : public MCTargetAsmParser { bool parseSetPushDirective(); bool parseSetSoftFloatDirective(); bool parseSetHardFloatDirective(); + bool parseSetMtDirective(); + bool parseSetNoMtDirective(); bool parseSetAssignment(); @@ -628,6 +633,9 @@ public: bool useSoftFloat() const { return getSTI().getFeatureBits()[Mips::FeatureSoftFloat]; } + bool hasMT() const { + return getSTI().getFeatureBits()[Mips::FeatureMT]; + } /// Warn if RegIndex is the same as the current AT. void warnIfRegIndexIsAT(unsigned RegIndex, SMLoc Loc); @@ -1966,6 +1974,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, case Mips::SDIV_MM: FirstOp = 0; SecondOp = 1; + LLVM_FALLTHROUGH; case Mips::SDivMacro: case Mips::DSDivMacro: case Mips::UDivMacro: @@ -2505,6 +2514,16 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; case Mips::SEQIMacro: return expandSeqI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; + case Mips::MFTC0: case Mips::MTTC0: + case Mips::MFTGPR: case Mips::MTTGPR: + case Mips::MFTLO: case Mips::MTTLO: + case Mips::MFTHI: case Mips::MTTHI: + case Mips::MFTACX: case Mips::MTTACX: + case Mips::MFTDSP: case Mips::MTTDSP: + case Mips::MFTC1: case Mips::MTTC1: + case Mips::MFTHC1: case Mips::MTTHC1: + case Mips::CFTC1: case Mips::CTTC1: + return expandMXTRAlias(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success; } } @@ -4876,6 +4895,212 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, return false; } +// Map the DSP accumulator and control register to the corresponding gpr +// operand. Unlike the other alias, the m(f|t)t(lo|hi|acx) instructions +// do not map the DSP registers contigously to gpr registers. +static unsigned getRegisterForMxtrDSP(MCInst &Inst, bool IsMFDSP) { + switch (Inst.getOpcode()) { + case Mips::MFTLO: + case Mips::MTTLO: + switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { + case Mips::AC0: + return Mips::ZERO; + case Mips::AC1: + return Mips::A0; + case Mips::AC2: + return Mips::T0; + case Mips::AC3: + return Mips::T4; + default: + llvm_unreachable("Unknown register for 'mttr' alias!"); + } + case Mips::MFTHI: + case Mips::MTTHI: + switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { + case Mips::AC0: + return Mips::AT; + case Mips::AC1: + return Mips::A1; + case Mips::AC2: + return Mips::T1; + case Mips::AC3: + return Mips::T5; + default: + llvm_unreachable("Unknown register for 'mttr' alias!"); + } + case Mips::MFTACX: + case Mips::MTTACX: + switch (Inst.getOperand(IsMFDSP ? 1 : 0).getReg()) { + case Mips::AC0: + return Mips::V0; + case Mips::AC1: + return Mips::A2; + case Mips::AC2: + return Mips::T2; + case Mips::AC3: + return Mips::T6; + default: + llvm_unreachable("Unknown register for 'mttr' alias!"); + } + case Mips::MFTDSP: + case Mips::MTTDSP: + return Mips::S0; + default: + llvm_unreachable("Unknown instruction for 'mttr' dsp alias!"); + } +} + +// Map the floating point register operand to the corresponding register +// operand. +static unsigned getRegisterForMxtrFP(MCInst &Inst, bool IsMFTC1) { + switch (Inst.getOperand(IsMFTC1 ? 1 : 0).getReg()) { + case Mips::F0: return Mips::ZERO; + case Mips::F1: return Mips::AT; + case Mips::F2: return Mips::V0; + case Mips::F3: return Mips::V1; + case Mips::F4: return Mips::A0; + case Mips::F5: return Mips::A1; + case Mips::F6: return Mips::A2; + case Mips::F7: return Mips::A3; + case Mips::F8: return Mips::T0; + case Mips::F9: return Mips::T1; + case Mips::F10: return Mips::T2; + case Mips::F11: return Mips::T3; + case Mips::F12: return Mips::T4; + case Mips::F13: return Mips::T5; + case Mips::F14: return Mips::T6; + case Mips::F15: return Mips::T7; + case Mips::F16: return Mips::S0; + case Mips::F17: return Mips::S1; + case Mips::F18: return Mips::S2; + case Mips::F19: return Mips::S3; + case Mips::F20: return Mips::S4; + case Mips::F21: return Mips::S5; + case Mips::F22: return Mips::S6; + case Mips::F23: return Mips::S7; + case Mips::F24: return Mips::T8; + case Mips::F25: return Mips::T9; + case Mips::F26: return Mips::K0; + case Mips::F27: return Mips::K1; + case Mips::F28: return Mips::GP; + case Mips::F29: return Mips::SP; + case Mips::F30: return Mips::FP; + case Mips::F31: return Mips::RA; + default: llvm_unreachable("Unknown register for mttc1 alias!"); + } +} + +// Map the coprocessor operand the corresponding gpr register operand. +static unsigned getRegisterForMxtrC0(MCInst &Inst, bool IsMFTC0) { + switch (Inst.getOperand(IsMFTC0 ? 1 : 0).getReg()) { + case Mips::COP00: return Mips::ZERO; + case Mips::COP01: return Mips::AT; + case Mips::COP02: return Mips::V0; + case Mips::COP03: return Mips::V1; + case Mips::COP04: return Mips::A0; + case Mips::COP05: return Mips::A1; + case Mips::COP06: return Mips::A2; + case Mips::COP07: return Mips::A3; + case Mips::COP08: return Mips::T0; + case Mips::COP09: return Mips::T1; + case Mips::COP010: return Mips::T2; + case Mips::COP011: return Mips::T3; + case Mips::COP012: return Mips::T4; + case Mips::COP013: return Mips::T5; + case Mips::COP014: return Mips::T6; + case Mips::COP015: return Mips::T7; + case Mips::COP016: return Mips::S0; + case Mips::COP017: return Mips::S1; + case Mips::COP018: return Mips::S2; + case Mips::COP019: return Mips::S3; + case Mips::COP020: return Mips::S4; + case Mips::COP021: return Mips::S5; + case Mips::COP022: return Mips::S6; + case Mips::COP023: return Mips::S7; + case Mips::COP024: return Mips::T8; + case Mips::COP025: return Mips::T9; + case Mips::COP026: return Mips::K0; + case Mips::COP027: return Mips::K1; + case Mips::COP028: return Mips::GP; + case Mips::COP029: return Mips::SP; + case Mips::COP030: return Mips::FP; + case Mips::COP031: return Mips::RA; + default: llvm_unreachable("Unknown register for mttc0 alias!"); + } +} + +/// Expand an alias of 'mftr' or 'mttr' into the full instruction, by producing +/// an mftr or mttr with the correctly mapped gpr register, u, sel and h bits. +bool MipsAsmParser::expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out, + const MCSubtargetInfo *STI) { + MipsTargetStreamer &TOut = getTargetStreamer(); + unsigned rd = 0; + unsigned u = 1; + unsigned sel = 0; + unsigned h = 0; + bool IsMFTR = false; + switch (Inst.getOpcode()) { + case Mips::MFTC0: + IsMFTR = true; + LLVM_FALLTHROUGH; + case Mips::MTTC0: + u = 0; + rd = getRegisterForMxtrC0(Inst, IsMFTR); + sel = Inst.getOperand(2).getImm(); + break; + case Mips::MFTGPR: + IsMFTR = true; + LLVM_FALLTHROUGH; + case Mips::MTTGPR: + rd = Inst.getOperand(IsMFTR ? 1 : 0).getReg(); + break; + case Mips::MFTLO: + case Mips::MFTHI: + case Mips::MFTACX: + case Mips::MFTDSP: + IsMFTR = true; + LLVM_FALLTHROUGH; + case Mips::MTTLO: + case Mips::MTTHI: + case Mips::MTTACX: + case Mips::MTTDSP: + rd = getRegisterForMxtrDSP(Inst, IsMFTR); + sel = 1; + break; + case Mips::MFTHC1: + h = 1; + LLVM_FALLTHROUGH; + case Mips::MFTC1: + IsMFTR = true; + rd = getRegisterForMxtrFP(Inst, IsMFTR); + sel = 2; + break; + case Mips::MTTHC1: + h = 1; + LLVM_FALLTHROUGH; + case Mips::MTTC1: + rd = getRegisterForMxtrFP(Inst, IsMFTR); + sel = 2; + break; + case Mips::CFTC1: + IsMFTR = true; + LLVM_FALLTHROUGH; + case Mips::CTTC1: + rd = getRegisterForMxtrFP(Inst, IsMFTR); + sel = 3; + break; + } + unsigned Op0 = IsMFTR ? Inst.getOperand(0).getReg() : rd; + unsigned Op1 = + IsMFTR ? rd + : (Inst.getOpcode() != Mips::MTTDSP ? Inst.getOperand(1).getReg() + : Inst.getOperand(0).getReg()); + + TOut.emitRRIII(IsMFTR ? Mips::MFTR : Mips::MTTR, Op0, Op1, u, sel, h, IDLoc, + STI); + return false; +} + unsigned MipsAsmParser::checkEarlyTargetMatchPredicate(MCInst &Inst, const OperandVector &Operands) { @@ -6329,6 +6554,39 @@ bool MipsAsmParser::parseSetNoOddSPRegDirective() { return false; } +bool MipsAsmParser::parseSetMtDirective() { + MCAsmParser &Parser = getParser(); + Parser.Lex(); // Eat "mt". + + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + + setFeatureBits(Mips::FeatureMT, "mt"); + getTargetStreamer().emitDirectiveSetMt(); + Parser.Lex(); // Consume the EndOfStatement. + return false; +} + +bool MipsAsmParser::parseSetNoMtDirective() { + MCAsmParser &Parser = getParser(); + Parser.Lex(); // Eat "nomt". + + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + + clearFeatureBits(Mips::FeatureMT, "mt"); + + getTargetStreamer().emitDirectiveSetNoMt(); + Parser.Lex(); // Consume the EndOfStatement. + return false; +} + bool MipsAsmParser::parseSetPopDirective() { MCAsmParser &Parser = getParser(); SMLoc Loc = getLexer().getLoc(); @@ -6829,6 +7087,10 @@ bool MipsAsmParser::parseDirectiveSet() { return parseSetMsaDirective(); } else if (Tok.getString() == "nomsa") { return parseSetNoMsaDirective(); + } else if (Tok.getString() == "mt") { + return parseSetMtDirective(); + } else if (Tok.getString() == "nomt") { + return parseSetNoMtDirective(); } else if (Tok.getString() == "softfloat") { return parseSetSoftFloatDirective(); } else if (Tok.getString() == "hardfloat") { @@ -7078,6 +7340,7 @@ bool MipsAsmParser::parseSSectionDirective(StringRef Section, unsigned Type) { /// ::= .module fp=value /// ::= .module softfloat /// ::= .module hardfloat +/// ::= .module mt bool MipsAsmParser::parseDirectiveModule() { MCAsmParser &Parser = getParser(); MCAsmLexer &Lexer = getLexer(); @@ -7177,6 +7440,25 @@ bool MipsAsmParser::parseDirectiveModule() { } return false; // parseDirectiveModule has finished successfully. + } else if (Option == "mt") { + setModuleFeatureBits(Mips::FeatureMT, "mt"); + + // Synchronize the ABI Flags information with the FeatureBits information we + // updated above. + getTargetStreamer().updateABIInfo(*this); + + // If printing assembly, use the recently updated ABI Flags information. + // If generating ELF, don't do anything (the .MIPS.abiflags section gets + // emitted later). + getTargetStreamer().emitDirectiveModuleMT(); + + // If this is not the end of the statement, report an error. + if (getLexer().isNot(AsmToken::EndOfStatement)) { + reportParseError("unexpected token, expected end of statement"); + return false; + } + + return false; // parseDirectiveModule has finished successfully. } else { return Error(L, "'" + Twine(Option) + "' is not a valid .module option."); } diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h index f38541027023..9abd4f1d6b08 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h +++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h @@ -159,6 +159,8 @@ public: ASESet |= Mips::AFL_ASE_MICROMIPS; if (P.inMips16Mode()) ASESet |= Mips::AFL_ASE_MIPS16; + if (P.hasMT()) + ASESet |= Mips::AFL_ASE_MT; } template <class PredicateLibrary> diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index ae48d6e38fa0..a1ed0ea4d7f3 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -238,7 +238,7 @@ static unsigned calculateMMLEIndex(unsigned i) { void MipsAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, - bool IsPCRel) const { + bool IsResolved) const { MCFixupKind Kind = Fixup.getKind(); MCContext &Ctx = Asm.getContext(); Value = adjustFixupValue(Fixup, Value, Ctx); diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h index bf3b290b7ed5..8ebde3b9b7a4 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h @@ -40,7 +40,7 @@ public: void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override; + uint64_t Value, bool IsResolved) const override; Optional<MCFixupKind> getFixupKind(StringRef Name) const override; const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp index 0cd4aebe4d16..7caeb08589af 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp @@ -50,6 +50,8 @@ void MipsTargetStreamer::emitDirectiveSetMacro() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoMacro() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetMsa() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetNoMsa() { forbidModuleDirective(); } +void MipsTargetStreamer::emitDirectiveSetMt() {} +void MipsTargetStreamer::emitDirectiveSetNoMt() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetAt() { forbidModuleDirective(); } void MipsTargetStreamer::emitDirectiveSetAtWithArg(unsigned RegNo) { forbidModuleDirective(); @@ -118,6 +120,7 @@ void MipsTargetStreamer::emitDirectiveModuleOddSPReg() { } void MipsTargetStreamer::emitDirectiveModuleSoftFloat() {} void MipsTargetStreamer::emitDirectiveModuleHardFloat() {} +void MipsTargetStreamer::emitDirectiveModuleMT() {} void MipsTargetStreamer::emitDirectiveSetFp( MipsABIFlagsSection::FpABIKind Value) { forbidModuleDirective(); @@ -190,6 +193,21 @@ void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, emitRRX(Opcode, Reg0, Reg1, MCOperand::createImm(Imm), IDLoc, STI); } +void MipsTargetStreamer::emitRRIII(unsigned Opcode, unsigned Reg0, + unsigned Reg1, int16_t Imm0, int16_t Imm1, + int16_t Imm2, SMLoc IDLoc, + const MCSubtargetInfo *STI) { + MCInst TmpInst; + TmpInst.setOpcode(Opcode); + TmpInst.addOperand(MCOperand::createReg(Reg0)); + TmpInst.addOperand(MCOperand::createReg(Reg1)); + TmpInst.addOperand(MCOperand::createImm(Imm0)); + TmpInst.addOperand(MCOperand::createImm(Imm1)); + TmpInst.addOperand(MCOperand::createImm(Imm2)); + TmpInst.setLoc(IDLoc); + getStreamer().EmitInstruction(TmpInst, *STI); +} + void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI) { @@ -392,6 +410,16 @@ void MipsTargetAsmStreamer::emitDirectiveSetNoMsa() { MipsTargetStreamer::emitDirectiveSetNoMsa(); } +void MipsTargetAsmStreamer::emitDirectiveSetMt() { + OS << "\t.set\tmt\n"; + MipsTargetStreamer::emitDirectiveSetMt(); +} + +void MipsTargetAsmStreamer::emitDirectiveSetNoMt() { + OS << "\t.set\tnomt\n"; + MipsTargetStreamer::emitDirectiveSetNoMt(); +} + void MipsTargetAsmStreamer::emitDirectiveSetAt() { OS << "\t.set\tat\n"; MipsTargetStreamer::emitDirectiveSetAt(); @@ -656,6 +684,10 @@ void MipsTargetAsmStreamer::emitDirectiveModuleHardFloat() { OS << "\t.module\thardfloat\n"; } +void MipsTargetAsmStreamer::emitDirectiveModuleMT() { + OS << "\t.module\tmt\n"; +} + // This part is for ELF object output. MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI) diff --git a/lib/Target/Mips/Mips.td b/lib/Target/Mips/Mips.td index f24761d7d101..d2f0fdcc6cc1 100644 --- a/lib/Target/Mips/Mips.td +++ b/lib/Target/Mips/Mips.td @@ -188,6 +188,8 @@ def FeatureUseTCCInDIV : SubtargetFeature< def FeatureMadd4 : SubtargetFeature<"nomadd4", "DisableMadd4", "true", "Disable 4-operand madd.fmt and related instructions">; +def FeatureMT : SubtargetFeature<"mt", "HasMT", "true", "Mips MT ASE">; + //===----------------------------------------------------------------------===// // Mips processors supported. //===----------------------------------------------------------------------===// diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 40078fb77144..89a5854bede0 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -240,7 +240,8 @@ def HasMSA : Predicate<"Subtarget->hasMSA()">, AssemblerPredicate<"FeatureMSA">; def HasMadd4 : Predicate<"!Subtarget->disableMadd4()">, AssemblerPredicate<"!FeatureMadd4">; - +def HasMT : Predicate<"Subtarget->hasMT()">, + AssemblerPredicate<"FeatureMT">; //===----------------------------------------------------------------------===// // Mips GPR size adjectives. @@ -382,6 +383,10 @@ class ASE_MSA64 { list<Predicate> InsnPredicates = [HasMSA, HasMips64]; } +class ASE_MT { + list <Predicate> InsnPredicates = [HasMT]; +} + // Class used for separating microMIPSr6 and microMIPS (r3) instruction. // It can be used only on instructions that doesn't inherit PredicateControl. class ISA_MICROMIPS_NOT_32R6_64R6 : PredicateControl { @@ -2919,6 +2924,10 @@ include "MipsMSAInstrInfo.td" include "MipsEVAInstrFormats.td" include "MipsEVAInstrInfo.td" +// MT +include "MipsMTInstrFormats.td" +include "MipsMTInstrInfo.td" + // Micromips include "MicroMipsInstrFormats.td" include "MicroMipsInstrInfo.td" diff --git a/lib/Target/Mips/MipsMTInstrFormats.td b/lib/Target/Mips/MipsMTInstrFormats.td new file mode 100644 index 000000000000..edc0981e6278 --- /dev/null +++ b/lib/Target/Mips/MipsMTInstrFormats.td @@ -0,0 +1,99 @@ +//===-- MipsMTInstrFormats.td - Mips Instruction Formats ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Describe the MIPS MT instructions format +// +// opcode - operation code. +// rt - destination register +// +//===----------------------------------------------------------------------===// + +class MipsMTInst : MipsInst<(outs), (ins), "", [], NoItinerary, FrmOther>, + PredicateControl { + let DecoderNamespace = "Mips"; + let EncodingPredicates = [HasStdEnc]; +} + +class OPCODE1<bits<1> Val> { + bits<1> Value = Val; +} + +def OPCODE_SC_D : OPCODE1<0b0>; +def OPCODE_SC_E : OPCODE1<0b1>; + +class FIELD5<bits<5> Val> { + bits<5> Value = Val; +} + +def FIELD5_1_DMT_EMT : FIELD5<0b00001>; +def FIELD5_2_DMT_EMT : FIELD5<0b01111>; +def FIELD5_1_2_DVPE_EVPE : FIELD5<0b00000>; +def FIELD5_MFTR : FIELD5<0b01000>; +def FIELD5_MTTR : FIELD5<0b01100>; + +class COP0_MFMC0_MT<FIELD5 Op1, FIELD5 Op2, OPCODE1 sc> : MipsMTInst { + bits<32> Inst; + + bits<5> rt; + let Inst{31-26} = 0b010000; // COP0 + let Inst{25-21} = 0b01011; // MFMC0 + let Inst{20-16} = rt; + let Inst{15-11} = Op1.Value; + let Inst{10-6} = Op2.Value; + let Inst{5} = sc.Value; + let Inst{4-3} = 0b00; + let Inst{2-0} = 0b001; +} + +class COP0_MFTTR_MT<FIELD5 Op> : MipsMTInst { + bits<32> Inst; + + bits<5> rt; + bits<5> rd; + bits<1> u; + bits<1> h; + bits<3> sel; + let Inst{31-26} = 0b010000; // COP0 + let Inst{25-21} = Op.Value; // MFMC0 + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-6} = 0b00000; // rx - currently unsupported. + let Inst{5} = u; + let Inst{4} = h; + let Inst{3} = 0b0; + let Inst{2-0} = sel; +} + +class SPECIAL3_MT_FORK : MipsMTInst { + bits<32> Inst; + + bits<5> rs; + bits<5> rt; + bits<5> rd; + let Inst{31-26} = 0b011111; // SPECIAL3 + let Inst{25-21} = rs; + let Inst{20-16} = rt; + let Inst{15-11} = rd; + let Inst{10-6} = 0b00000; + let Inst{5-0} = 0b001000; // FORK +} + +class SPECIAL3_MT_YIELD : MipsMTInst { + bits<32> Inst; + + bits<5> rs; + bits<5> rd; + let Inst{31-26} = 0b011111; // SPECIAL3 + let Inst{25-21} = rs; + let Inst{20-16} = 0b00000; + let Inst{15-11} = rd; + let Inst{10-6} = 0b00000; + let Inst{5-0} = 0b001001; // FORK +} diff --git a/lib/Target/Mips/MipsMTInstrInfo.td b/lib/Target/Mips/MipsMTInstrInfo.td new file mode 100644 index 000000000000..72e626cbec40 --- /dev/null +++ b/lib/Target/Mips/MipsMTInstrInfo.td @@ -0,0 +1,208 @@ +//===-- MipsMTInstrInfo.td - Mips MT Instruction Infos -----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the MIPS MT ASE as defined by MD00378 1.12. +// +// TODO: Add support for the microMIPS encodings for the MT ASE and add the +// instruction mappings. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// MIPS MT Instruction Encodings +//===----------------------------------------------------------------------===// + +class DMT_ENC : COP0_MFMC0_MT<FIELD5_1_DMT_EMT, FIELD5_2_DMT_EMT, + OPCODE_SC_D>; + +class EMT_ENC : COP0_MFMC0_MT<FIELD5_1_DMT_EMT, FIELD5_2_DMT_EMT, + OPCODE_SC_E>; + +class DVPE_ENC : COP0_MFMC0_MT<FIELD5_1_2_DVPE_EVPE, FIELD5_1_2_DVPE_EVPE, + OPCODE_SC_D>; + +class EVPE_ENC : COP0_MFMC0_MT<FIELD5_1_2_DVPE_EVPE, FIELD5_1_2_DVPE_EVPE, + OPCODE_SC_E>; + +class FORK_ENC : SPECIAL3_MT_FORK; + +class YIELD_ENC : SPECIAL3_MT_YIELD; + +class MFTR_ENC : COP0_MFTTR_MT<FIELD5_MFTR>; + +class MTTR_ENC : COP0_MFTTR_MT<FIELD5_MTTR>; + +//===----------------------------------------------------------------------===// +// MIPS MT Instruction Descriptions +//===----------------------------------------------------------------------===// + +class MT_1R_DESC_BASE<string instr_asm, InstrItinClass Itin = NoItinerary> { + dag OutOperandList = (outs GPR32Opnd:$rt); + dag InOperandList = (ins); + string AsmString = !strconcat(instr_asm, "\t$rt"); + list<dag> Pattern = []; + InstrItinClass Itinerary = Itin; +} + +class MFTR_DESC { + dag OutOperandList = (outs GPR32Opnd:$rd); + dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); + string AsmString = "mftr\t$rd, $rt, $u, $sel, $h"; + list<dag> Pattern = []; + InstrItinClass Itinerary = II_MFTR; +} + +class MTTR_DESC { + dag OutOperandList = (outs GPR32Opnd:$rd); + dag InOperandList = (ins GPR32Opnd:$rt, uimm1:$u, uimm3:$sel, uimm1:$h); + string AsmString = "mttr\t$rt, $rd, $u, $sel, $h"; + list<dag> Pattern = []; + InstrItinClass Itinerary = II_MTTR; +} + +class FORK_DESC { + dag OutOperandList = (outs GPR32Opnd:$rs, GPR32Opnd:$rd); + dag InOperandList = (ins GPR32Opnd:$rt); + string AsmString = "fork\t$rd, $rs, $rt"; + list<dag> Pattern = []; + InstrItinClass Itinerary = II_FORK; +} + +class YIELD_DESC { + dag OutOperandList = (outs GPR32Opnd:$rd); + dag InOperandList = (ins GPR32Opnd:$rs); + string AsmString = "yield\t$rd, $rs"; + list<dag> Pattern = []; + InstrItinClass Itinerary = II_YIELD; +} + +class DMT_DESC : MT_1R_DESC_BASE<"dmt", II_DMT>; + +class EMT_DESC : MT_1R_DESC_BASE<"emt", II_EMT>; + +class DVPE_DESC : MT_1R_DESC_BASE<"dvpe", II_DVPE>; + +class EVPE_DESC : MT_1R_DESC_BASE<"evpe", II_EVPE>; + +//===----------------------------------------------------------------------===// +// MIPS MT Instruction Definitions +//===----------------------------------------------------------------------===// +let hasSideEffects = 1, isNotDuplicable = 1, + AdditionalPredicates = [NotInMicroMips] in { + def DMT : DMT_ENC, DMT_DESC, ASE_MT; + + def EMT : EMT_ENC, EMT_DESC, ASE_MT; + + def DVPE : DVPE_ENC, DVPE_DESC, ASE_MT; + + def EVPE : EVPE_ENC, EVPE_DESC, ASE_MT; + + def FORK : FORK_ENC, FORK_DESC, ASE_MT; + + def YIELD : YIELD_ENC, YIELD_DESC, ASE_MT; + + def MFTR : MFTR_ENC, MFTR_DESC, ASE_MT; + + def MTTR : MTTR_ENC, MTTR_DESC, ASE_MT; +} + +//===----------------------------------------------------------------------===// +// MIPS MT Pseudo Instructions - used to support mtfr & mttr aliases. +//===----------------------------------------------------------------------===// +def MFTC0 : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins COP0Opnd:$rt, + uimm3:$sel), + "mftc0 $rd, $rt, $sel">, ASE_MT; + +def MFTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rt, + uimm3:$sel), + "mftgpr $rd, $rt">, ASE_MT; + +def MFTLO : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), + "mftlo $rt, $ac">, ASE_MT; + +def MFTHI : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), + "mfthi $rt, $ac">, ASE_MT; + +def MFTACX : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins ACC64DSPOpnd:$ac), + "mftacx $rt, $ac">, ASE_MT; + +def MFTDSP : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins), + "mftdsp $rt">, ASE_MT; + +def MFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), + "mftc1 $rt, $ft">, ASE_MT; + +def MFTHC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGR32Opnd:$ft), + "mfthc1 $rt, $ft">, ASE_MT; + +def CFTC1 : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins FGRCCOpnd:$ft), + "cftc1 $rt, $ft">, ASE_MT; + + +def MTTC0 : MipsAsmPseudoInst<(outs COP0Opnd:$rd), (ins GPR32Opnd:$rt, + uimm3:$sel), + "mttc0 $rt, $rd, $sel">, ASE_MT; + +def MTTGPR : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins GPR32Opnd:$rd), + "mttgpr $rd, $rt">, ASE_MT; + +def MTTLO : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), + "mttlo $rt, $ac">, ASE_MT; + +def MTTHI : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), + "mtthi $rt, $ac">, ASE_MT; + +def MTTACX : MipsAsmPseudoInst<(outs ACC64DSPOpnd:$ac), (ins GPR32Opnd:$rt), + "mttacx $rt, $ac">, ASE_MT; + +def MTTDSP : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rt), + "mttdsp $rt">, ASE_MT; + +def MTTC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), + "mttc1 $rt, $ft">, ASE_MT; + +def MTTHC1 : MipsAsmPseudoInst<(outs FGR32Opnd:$ft), (ins GPR32Opnd:$rt), + "mtthc1 $rt, $ft">, ASE_MT; + +def CTTC1 : MipsAsmPseudoInst<(outs FGRCCOpnd:$ft), (ins GPR32Opnd:$rt), + "cttc1 $rt, $ft">, ASE_MT; + +//===----------------------------------------------------------------------===// +// MIPS MT Instruction Definitions +//===----------------------------------------------------------------------===// + +let AdditionalPredicates = [NotInMicroMips] in { + def : MipsInstAlias<"dmt", (DMT ZERO), 1>, ASE_MT; + + def : MipsInstAlias<"emt", (EMT ZERO), 1>, ASE_MT; + + def : MipsInstAlias<"dvpe", (DVPE ZERO), 1>, ASE_MT; + + def : MipsInstAlias<"evpe", (EVPE ZERO), 1>, ASE_MT; + + def : MipsInstAlias<"yield $rs", (YIELD ZERO, GPR32Opnd:$rs), 1>, ASE_MT; + + def : MipsInstAlias<"mftc0 $rd, $rt", (MFTC0 GPR32Opnd:$rd, COP0Opnd:$rt, 0), + 1>, ASE_MT; + + def : MipsInstAlias<"mftlo $rt", (MFTLO GPR32Opnd:$rt, AC0), 1>, ASE_MT; + + def : MipsInstAlias<"mfthi $rt", (MFTHI GPR32Opnd:$rt, AC0), 1>, ASE_MT; + + def : MipsInstAlias<"mftacx $rt", (MFTACX GPR32Opnd:$rt, AC0), 1>, ASE_MT; + + def : MipsInstAlias<"mttc0 $rd, $rt", (MTTC0 COP0Opnd:$rt, GPR32Opnd:$rd, 0), + 1>, ASE_MT; + + def : MipsInstAlias<"mttlo $rt", (MTTLO AC0, GPR32Opnd:$rt), 1>, ASE_MT; + + def : MipsInstAlias<"mtthi $rt", (MTTHI AC0, GPR32Opnd:$rt), 1>, ASE_MT; + + def : MipsInstAlias<"mttacx $rt", (MTTACX AC0, GPR32Opnd:$rt), 1>, ASE_MT; +} diff --git a/lib/Target/Mips/MipsSchedule.td b/lib/Target/Mips/MipsSchedule.td index c0de59ba15f5..8ec55ab6284d 100644 --- a/lib/Target/Mips/MipsSchedule.td +++ b/lib/Target/Mips/MipsSchedule.td @@ -84,6 +84,7 @@ def II_DIVU : InstrItinClass; def II_DIV_D : InstrItinClass; def II_DIV_S : InstrItinClass; def II_DMFC0 : InstrItinClass; +def II_DMT : InstrItinClass; def II_DMTC0 : InstrItinClass; def II_DMFC1 : InstrItinClass; def II_DMTC1 : InstrItinClass; @@ -113,8 +114,12 @@ def II_DSBH : InstrItinClass; def II_DSHD : InstrItinClass; def II_DSUBU : InstrItinClass; def II_DSUB : InstrItinClass; +def II_DVPE : InstrItinClass; +def II_EMT : InstrItinClass; +def II_EVPE : InstrItinClass; def II_EXT : InstrItinClass; // Any EXT instruction def II_FLOOR : InstrItinClass; +def II_FORK : InstrItinClass; def II_INS : InstrItinClass; // Any INS instruction def II_IndirectBranchPseudo : InstrItinClass; // Indirect branch pseudo. def II_J : InstrItinClass; @@ -221,6 +226,7 @@ def II_MFC1 : InstrItinClass; def II_MFHC1 : InstrItinClass; def II_MFC2 : InstrItinClass; def II_MFHI_MFLO : InstrItinClass; // mfhi and mflo +def II_MFTR : InstrItinClass; def II_MOD : InstrItinClass; def II_MODU : InstrItinClass; def II_MOVE : InstrItinClass; @@ -250,6 +256,7 @@ def II_MTC1 : InstrItinClass; def II_MTHC1 : InstrItinClass; def II_MTC2 : InstrItinClass; def II_MTHI_MTLO : InstrItinClass; // mthi and mtlo +def II_MTTR : InstrItinClass; def II_MUL : InstrItinClass; def II_MUH : InstrItinClass; def II_MUHU : InstrItinClass; @@ -345,6 +352,7 @@ def II_WRPGPR : InstrItinClass; def II_RDPGPR : InstrItinClass; def II_DVP : InstrItinClass; def II_EVP : InstrItinClass; +def II_YIELD : InstrItinClass; //===----------------------------------------------------------------------===// // Mips Generic instruction itineraries. @@ -386,6 +394,7 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_DCLZ , [InstrStage<1, [ALU]>]>, InstrItinData<II_DMOD , [InstrStage<17, [IMULDIV]>]>, InstrItinData<II_DMODU , [InstrStage<17, [IMULDIV]>]>, + InstrItinData<II_DMT , [InstrStage<2, [ALU]>]>, InstrItinData<II_DSLL , [InstrStage<1, [ALU]>]>, InstrItinData<II_DSLL32 , [InstrStage<1, [ALU]>]>, InstrItinData<II_DSRL , [InstrStage<1, [ALU]>]>, @@ -404,7 +413,11 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_DSHD , [InstrStage<1, [ALU]>]>, InstrItinData<II_DCLO , [InstrStage<1, [ALU]>]>, InstrItinData<II_DCLZ , [InstrStage<1, [ALU]>]>, + InstrItinData<II_DVPE , [InstrStage<2, [ALU]>]>, + InstrItinData<II_EMT , [InstrStage<2, [ALU]>]>, + InstrItinData<II_EVPE , [InstrStage<2, [ALU]>]>, InstrItinData<II_EXT , [InstrStage<1, [ALU]>]>, + InstrItinData<II_FORK , [InstrStage<1, [ALU]>]>, InstrItinData<II_INS , [InstrStage<1, [ALU]>]>, InstrItinData<II_LUI , [InstrStage<1, [ALU]>]>, InstrItinData<II_MOVE , [InstrStage<1, [ALU]>]>, @@ -653,12 +666,14 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_MFHC0 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MFC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MFC2 , [InstrStage<2, [ALU]>]>, + InstrItinData<II_MFTR , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTC0 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTHC0 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTC2 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MFHC1 , [InstrStage<2, [ALU]>]>, InstrItinData<II_MTHC1 , [InstrStage<2, [ALU]>]>, + InstrItinData<II_MTTR , [InstrStage<2, [ALU]>]>, InstrItinData<II_CACHE , [InstrStage<1, [ALU]>]>, InstrItinData<II_PREF , [InstrStage<1, [ALU]>]>, InstrItinData<II_CACHEE , [InstrStage<1, [ALU]>]>, @@ -670,5 +685,6 @@ def MipsGenericItineraries : ProcessorItineraries<[ALU, IMULDIV], [], [ InstrItinData<II_WRPGPR , [InstrStage<1, [ALU]>]>, InstrItinData<II_RDPGPR , [InstrStage<1, [ALU]>]>, InstrItinData<II_DVP , [InstrStage<1, [ALU]>]>, - InstrItinData<II_EVP , [InstrStage<1, [ALU]>]> + InstrItinData<II_EVP , [InstrStage<1, [ALU]>]>, + InstrItinData<II_YIELD , [InstrStage<5, [ALU]>]> ]>; diff --git a/lib/Target/Mips/MipsScheduleGeneric.td b/lib/Target/Mips/MipsScheduleGeneric.td index 15a0401b781e..89cda676441e 100644 --- a/lib/Target/Mips/MipsScheduleGeneric.td +++ b/lib/Target/Mips/MipsScheduleGeneric.td @@ -187,7 +187,11 @@ def GenericIssueCOP0 : ProcResource<1> { let Super = GenericCOP0; } def GenericWriteCOP0TLB : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 4; } def GenericWriteCOP0 : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 3; } def GenericReadCOP0 : SchedWriteRes<[GenericIssueCOP0]> { let Latency = 2; } -def GnereicReadWritePGPR : SchedWriteRes<[GenericIssueCOP0]>; +def GenericReadWritePGPR : SchedWriteRes<[GenericIssueCOP0]>; +def GenericReadWriteCOP0Long : SchedWriteRes<[GenericIssueCOP0]> { + let Latency = 5; +} +def GenericWriteCOP0Short : SchedWriteRes<[GenericIssueCOP0]>; def : ItinRW<[GenericWriteCOP0TLB], [II_TLBP, II_TLBR, II_TLBWI, II_TLBWR]>; def : ItinRW<[GenericWriteCOP0TLB], [II_TLBINV, II_TLBINVF]>; @@ -261,6 +265,14 @@ def : ItinRW<[GenericWriteLoad], [II_LBE, II_LBUE, II_LHE, II_LHUE, II_LWE, def : ItinRW<[GenericWriteLoad], [II_LWLE, II_LWRE]>; +// MIPS MT instructions +// ==================== + +def : ItinRW<[GenericWriteMove], [II_DMT, II_DVPE, II_EMT, II_EVPE]>; + +def : ItinRW<[GenericReadWriteCOP0Long], [II_YIELD]>; +def : ItinRW<[GenericWriteCOP0Short], [II_FORK]>; + // MIPS32R6 and MIPS16e // ==================== diff --git a/lib/Target/Mips/MipsScheduleP5600.td b/lib/Target/Mips/MipsScheduleP5600.td index 882a241d1426..fedfac24e4e7 100644 --- a/lib/Target/Mips/MipsScheduleP5600.td +++ b/lib/Target/Mips/MipsScheduleP5600.td @@ -19,7 +19,7 @@ def MipsP5600Model : SchedMachineModel { HasMips64, HasMips64r2, HasCnMips, InMicroMips, InMips16Mode, HasMicroMips32r6, HasMicroMips64r6, - HasDSP, HasDSPR2]; + HasDSP, HasDSPR2, HasMT]; } diff --git a/lib/Target/Mips/MipsSubtarget.cpp b/lib/Target/Mips/MipsSubtarget.cpp index 154d5825427b..eba21e0a1c67 100644 --- a/lib/Target/Mips/MipsSubtarget.cpp +++ b/lib/Target/Mips/MipsSubtarget.cpp @@ -70,7 +70,8 @@ MipsSubtarget::MipsSubtarget(const Triple &TT, StringRef CPU, StringRef FS, InMips16HardFloat(Mips16HardFloat), InMicroMipsMode(false), HasDSP(false), HasDSPR2(false), HasDSPR3(false), AllowMixed16_32(Mixed16_32 | Mips_Os16), Os16(Mips_Os16), HasMSA(false), UseTCCInDIV(false), HasSym32(false), - HasEVA(false), DisableMadd4(false), TM(TM), TargetTriple(TT), TSInfo(), + HasEVA(false), DisableMadd4(false), HasMT(false), TM(TM), + TargetTriple(TT), TSInfo(), InstrInfo( MipsInstrInfo::create(initializeSubtargetDependencies(CPU, FS, TM))), FrameLowering(MipsFrameLowering::create(*this)), diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h index ccd47f00c0d3..7619e7b08612 100644 --- a/lib/Target/Mips/MipsSubtarget.h +++ b/lib/Target/Mips/MipsSubtarget.h @@ -149,6 +149,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo { // related instructions. bool DisableMadd4; + // HasMT -- support MT ASE. + bool HasMT; + InstrItineraryData InstrItins; // We can override the determination of whether we are in mips16 mode @@ -259,6 +262,7 @@ public: bool hasMSA() const { return HasMSA; } bool disableMadd4() const { return DisableMadd4; } bool hasEVA() const { return HasEVA; } + bool hasMT() const { return HasMT; } bool useSmallSection() const { return UseSmallSection; } bool hasStandardEncoding() const { return !inMips16Mode(); } diff --git a/lib/Target/Mips/MipsTargetStreamer.h b/lib/Target/Mips/MipsTargetStreamer.h index 41ebe411b98d..af24838665e1 100644 --- a/lib/Target/Mips/MipsTargetStreamer.h +++ b/lib/Target/Mips/MipsTargetStreamer.h @@ -40,6 +40,8 @@ public: virtual void emitDirectiveSetNoMacro(); virtual void emitDirectiveSetMsa(); virtual void emitDirectiveSetNoMsa(); + virtual void emitDirectiveSetMt(); + virtual void emitDirectiveSetNoMt(); virtual void emitDirectiveSetAt(); virtual void emitDirectiveSetAtWithArg(unsigned RegNo); virtual void emitDirectiveSetNoAt(); @@ -96,6 +98,7 @@ public: virtual void emitDirectiveModuleOddSPReg(); virtual void emitDirectiveModuleSoftFloat(); virtual void emitDirectiveModuleHardFloat(); + virtual void emitDirectiveModuleMT(); virtual void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value); virtual void emitDirectiveSetOddSPReg(); virtual void emitDirectiveSetNoOddSPReg(); @@ -116,6 +119,9 @@ public: SMLoc IDLoc, const MCSubtargetInfo *STI); void emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm, SMLoc IDLoc, const MCSubtargetInfo *STI); + void emitRRIII(unsigned Opcode, unsigned Reg0, unsigned Reg1, int16_t Imm0, + int16_t Imm1, int16_t Imm2, SMLoc IDLoc, + const MCSubtargetInfo *STI); void emitAddu(unsigned DstReg, unsigned SrcReg, unsigned TrgReg, bool Is64Bit, const MCSubtargetInfo *STI); void emitDSLL(unsigned DstReg, unsigned SrcReg, int16_t ShiftAmount, @@ -204,6 +210,8 @@ public: void emitDirectiveSetNoMacro() override; void emitDirectiveSetMsa() override; void emitDirectiveSetNoMsa() override; + void emitDirectiveSetMt() override; + void emitDirectiveSetNoMt() override; void emitDirectiveSetAt() override; void emitDirectiveSetAtWithArg(unsigned RegNo) override; void emitDirectiveSetNoAt() override; @@ -267,6 +275,7 @@ public: void emitDirectiveModuleOddSPReg() override; void emitDirectiveModuleSoftFloat() override; void emitDirectiveModuleHardFloat() override; + void emitDirectiveModuleMT() override; void emitDirectiveSetFp(MipsABIFlagsSection::FpABIKind Value) override; void emitDirectiveSetOddSPReg() override; void emitDirectiveSetNoOddSPReg() override; diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index f26b9a7cb8dd..f800d91f4093 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -62,7 +62,6 @@ #include <utility>
#include <vector>
-#undef DEBUG_TYPE
#define DEBUG_TYPE "nvptx-lower"
using namespace llvm;
@@ -2456,7 +2455,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments( // v2f16 was loaded as an i32. Now we must bitcast it back.
else if (EltVT == MVT::v2f16)
Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
- // Extend the element if necesary (e.g. an i8 is loaded
+ // Extend the element if necessary (e.g. an i8 is loaded
// into an i16 register)
if (Ins[InsIdx].VT.isInteger() &&
Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp index 3be291b48b8f..989f0a3aba2f 100644 --- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp +++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp @@ -14,6 +14,7 @@ //===----------------------------------------------------------------------===// #include "NVPTXLowerAggrCopies.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/StackProtector.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -42,6 +43,7 @@ struct NVPTXLowerAggrCopies : public FunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addPreserved<StackProtector>(); + AU.addRequired<TargetTransformInfoWrapperPass>(); } bool runOnFunction(Function &F) override; @@ -61,6 +63,8 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { const DataLayout &DL = F.getParent()->getDataLayout(); LLVMContext &Context = F.getParent()->getContext(); + const TargetTransformInfo &TTI = + getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F); // Collect all aggregate loads and mem* calls. for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) { @@ -104,15 +108,26 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { Value *SrcAddr = LI->getOperand(0); Value *DstAddr = SI->getOperand(1); unsigned NumLoads = DL.getTypeStoreSize(LI->getType()); - Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads); - - createMemCpyLoop(/* ConvertedInst */ SI, - /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr, - /* CopyLen */ CopyLen, - /* SrcAlign */ LI->getAlignment(), - /* DestAlign */ SI->getAlignment(), - /* SrcIsVolatile */ LI->isVolatile(), - /* DstIsVolatile */ SI->isVolatile()); + ConstantInt *CopyLen = + ConstantInt::get(Type::getInt32Ty(Context), NumLoads); + + if (!TTI.useWideIRMemcpyLoopLowering()) { + createMemCpyLoop(/* ConvertedInst */ SI, + /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr, + /* CopyLen */ CopyLen, + /* SrcAlign */ LI->getAlignment(), + /* DestAlign */ SI->getAlignment(), + /* SrcIsVolatile */ LI->isVolatile(), + /* DstIsVolatile */ SI->isVolatile()); + } else { + createMemCpyLoopKnownSize(/* ConvertedInst */ SI, + /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr, + /* CopyLen */ CopyLen, + /* SrcAlign */ LI->getAlignment(), + /* DestAlign */ SI->getAlignment(), + /* SrcIsVolatile */ LI->isVolatile(), + /* DstIsVolatile */ SI->isVolatile(), TTI); + } SI->eraseFromParent(); LI->eraseFromParent(); @@ -121,7 +136,7 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) { // Transform mem* intrinsic calls. for (MemIntrinsic *MemCall : MemCalls) { if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) { - expandMemCpyAsLoop(Memcpy); + expandMemCpyAsLoop(Memcpy, TTI); } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) { expandMemMoveAsLoop(Memmove); } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) { diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp index 7393f3d7a08a..bdad2fe8714f 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp @@ -115,7 +115,7 @@ public: void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override { + uint64_t Value, bool IsResolved) const override { Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp index 094d3e6a61b5..53f33ac1fc0e 100644 --- a/lib/Target/PowerPC/PPCCTRLoops.cpp +++ b/lib/Target/PowerPC/PPCCTRLoops.cpp @@ -607,7 +607,10 @@ bool PPCCTRLoops::convertToCTRLoop(Loop *L) { // The old condition may be dead now, and may have even created a dead PHI // (the original induction variable). RecursivelyDeleteTriviallyDeadInstructions(OldCond); - DeleteDeadPHIs(CountedExitBlock); + // Run through the basic blocks of the loop and see if any of them have dead + // PHIs that can be removed. + for (auto I : L->blocks()) + DeleteDeadPHIs(I); ++NumCTRLoops; return MadeChange; diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp index c2c115cb6daf..b49c3345a17d 100644 --- a/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -435,22 +435,19 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF, const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo(); - // If we are a leaf function, and use up to 224 bytes of stack space, - // don't have a frame pointer, calls, or dynamic alloca then we do not need - // to adjust the stack pointer (we fit in the Red Zone). - // The 32-bit SVR4 ABI has no Red Zone. However, it can still generate - // stackless code if all local vars are reg-allocated. - bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); unsigned LR = RegInfo->getRARegister(); - if (!DisableRedZone && - (Subtarget.isPPC64() || // 32-bit SVR4, no stack- - !Subtarget.isSVR4ABI() || // allocated locals. - FrameSize == 0) && - FrameSize <= 224 && // Fits in red zone. - !MFI.hasVarSizedObjects() && // No dynamic alloca. - !MFI.adjustsStack() && // No calls. - !MustSaveLR(MF, LR) && - !RegInfo->hasBasePointer(MF)) { // No special alignment. + bool DisableRedZone = MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); + bool CanUseRedZone = !MFI.hasVarSizedObjects() && // No dynamic alloca. + !MFI.adjustsStack() && // No calls. + !MustSaveLR(MF, LR) && // No need to save LR. + !RegInfo->hasBasePointer(MF); // No special alignment. + + // Note: for PPC32 SVR4ABI (Non-DarwinABI), we can still generate stackless + // code if all local vars are reg-allocated. + bool FitsInRedZone = FrameSize <= Subtarget.getRedZoneSize(); + + // Check whether we can skip adjusting the stack pointer (by using red zone) + if (!DisableRedZone && CanUseRedZone && FitsInRedZone) { // No need for frame if (UpdateMF) MFI.setStackSize(0); @@ -1869,8 +1866,13 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF, } if (HasVRSaveArea) { - // Insert alignment padding, we need 16-byte alignment. - LowerBound = (LowerBound - 15) & ~(15); + // Insert alignment padding, we need 16-byte alignment. Note: for postive + // number the alignment formula is : y = (x + (n-1)) & (~(n-1)). But since + // we are using negative number here (the stack grows downward). We should + // use formula : y = x & (~(n-1)). Where x is the size before aligning, n + // is the alignment size ( n = 16 here) and y is the size after aligning. + assert(LowerBound <= 0 && "Expect LowerBound have a non-positive value!"); + LowerBound &= ~(15); for (unsigned i = 0, e = VRegs.size(); i != e; ++i) { int FI = VRegs[i].getFrameIdx(); diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 535b9deaefac..3aaf7ef2c2a0 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -419,25 +419,6 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() { .getNode(); } -/// isIntS16Immediate - This method tests to see if the node is either a 32-bit -/// or 64-bit immediate, and if the value can be accurately represented as a -/// sign extension from a 16-bit value. If so, this returns true and the -/// immediate. -static bool isIntS16Immediate(SDNode *N, short &Imm) { - if (N->getOpcode() != ISD::Constant) - return false; - - Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); - if (N->getValueType(0) == MVT::i32) - return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); - else - return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); -} - -static bool isIntS16Immediate(SDValue Op, short &Imm) { - return isIntS16Immediate(Op.getNode(), Imm); -} - /// isInt32Immediate - This method tests to see if the node is a 32-bit constant /// operand. If so Imm will receive the 32-bit value. static bool isInt32Immediate(SDNode *N, unsigned &Imm) { @@ -728,7 +709,10 @@ static uint64_t Rot64(uint64_t Imm, unsigned R) { static unsigned getInt64Count(int64_t Imm) { unsigned Count = getInt64CountDirect(Imm); - if (Count == 1) + + // If the instruction count is 1 or 2, we do not need further analysis + // since rotate + load constant requires at least 2 instructions. + if (Count <= 2) return Count; for (unsigned r = 1; r < 63; ++r) { @@ -838,7 +822,10 @@ static SDNode *getInt64Direct(SelectionDAG *CurDAG, const SDLoc &dl, static SDNode *getInt64(SelectionDAG *CurDAG, const SDLoc &dl, int64_t Imm) { unsigned Count = getInt64CountDirect(Imm); - if (Count == 1) + + // If the instruction count is 1 or 2, we do not need further analysis + // since rotate + load constant requires at least 2 instructions. + if (Count <= 2) return getInt64Direct(CurDAG, dl, Imm); unsigned RMin = 0; @@ -2126,7 +2113,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, getI32Imm(Imm & 0xFFFF, dl)), 0); Opc = PPC::CMPLW; } else { - short SImm; + int16_t SImm; if (isIntS16Immediate(RHS, SImm)) return SDValue(CurDAG->getMachineNode(PPC::CMPWI, dl, MVT::i32, LHS, getI32Imm((int)SImm & 0xFFFF, @@ -2173,7 +2160,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC, getI64Imm(Imm & 0xFFFF, dl)), 0); Opc = PPC::CMPLD; } else { - short SImm; + int16_t SImm; if (isIntS16Immediate(RHS, SImm)) return SDValue(CurDAG->getMachineNode(PPC::CMPDI, dl, MVT::i64, LHS, getI64Imm(SImm & 0xFFFF, dl)), @@ -3323,7 +3310,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { if (tryLogicOpOfCompares(N)) return; - short Imm; + int16_t Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && isIntS16Immediate(N->getOperand(1), Imm)) { KnownBits LHSKnown; @@ -3346,7 +3333,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) { break; } case ISD::ADD: { - short Imm; + int16_t Imm; if (N->getOperand(0)->getOpcode() == ISD::FrameIndex && isIntS16Immediate(N->getOperand(1), Imm)) { selectFrameIndex(N, N->getOperand(0).getNode(), (int)Imm); @@ -4034,11 +4021,13 @@ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) { O0.getNode(), O1.getNode()); }; + // FIXME: When the semantics of the interaction between select and undef + // are clearly defined, it may turn out to be unnecessary to break here. SDValue TrueRes = TryFold(ConstTrue); - if (!TrueRes) + if (!TrueRes || TrueRes.isUndef()) break; SDValue FalseRes = TryFold(ConstFalse); - if (!FalseRes) + if (!FalseRes || FalseRes.isUndef()) break; // For us to materialize these using one instruction, we must be able to diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 72f14e969138..0e069ec1665f 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -136,6 +136,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, addRegisterClass(MVT::f64, &PPC::F8RCRegClass); } + // Match BITREVERSE to customized fast code sequence in the td file. + setOperationAction(ISD::BITREVERSE, MVT::i32, Legal); + setOperationAction(ISD::BITREVERSE, MVT::i64, Legal); + // PowerPC has an i16 but no i8 (or i1) SEXTLOAD. for (MVT VT : MVT::integer_valuetypes()) { setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote); @@ -1168,6 +1172,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const { case PPCISD::LXSIZX: return "PPCISD::LXSIZX"; case PPCISD::STXSIX: return "PPCISD::STXSIX"; case PPCISD::VEXTS: return "PPCISD::VEXTS"; + case PPCISD::SExtVElems: return "PPCISD::SExtVElems"; case PPCISD::LXVD2X: return "PPCISD::LXVD2X"; case PPCISD::STXVD2X: return "PPCISD::STXVD2X"; case PPCISD::COND_BRANCH: return "PPCISD::COND_BRANCH"; @@ -2028,17 +2033,17 @@ int PPC::isQVALIGNIShuffleMask(SDNode *N) { /// or 64-bit immediate, and if the value can be accurately represented as a /// sign extension from a 16-bit value. If so, this returns true and the /// immediate. -static bool isIntS16Immediate(SDNode *N, short &Imm) { +bool llvm::isIntS16Immediate(SDNode *N, int16_t &Imm) { if (!isa<ConstantSDNode>(N)) return false; - Imm = (short)cast<ConstantSDNode>(N)->getZExtValue(); + Imm = (int16_t)cast<ConstantSDNode>(N)->getZExtValue(); if (N->getValueType(0) == MVT::i32) return Imm == (int32_t)cast<ConstantSDNode>(N)->getZExtValue(); else return Imm == (int64_t)cast<ConstantSDNode>(N)->getZExtValue(); } -static bool isIntS16Immediate(SDValue Op, short &Imm) { +bool llvm::isIntS16Immediate(SDValue Op, int16_t &Imm) { return isIntS16Immediate(Op.getNode(), Imm); } @@ -2048,7 +2053,7 @@ static bool isIntS16Immediate(SDValue Op, short &Imm) { bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG) const { - short imm = 0; + int16_t imm = 0; if (N.getOpcode() == ISD::ADD) { if (isIntS16Immediate(N.getOperand(1), imm)) return false; // r+i @@ -2138,7 +2143,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, return false; if (N.getOpcode() == ISD::ADD) { - short imm = 0; + int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && (!Aligned || (imm & 3) == 0)) { Disp = DAG.getTargetConstant(imm, dl, N.getValueType()); @@ -2162,7 +2167,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, return true; // [&g+r] } } else if (N.getOpcode() == ISD::OR) { - short imm = 0; + int16_t imm = 0; if (isIntS16Immediate(N.getOperand(1), imm) && (!Aligned || (imm & 3) == 0)) { // If this is an or of disjoint bitfields, we can codegen this as an add @@ -2190,7 +2195,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp, // If this address fits entirely in a 16-bit sext immediate field, codegen // this as "d, 0" - short Imm; + int16_t Imm; if (isIntS16Immediate(CN, Imm) && (!Aligned || (Imm & 3) == 0)) { Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0)); Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO, @@ -2235,10 +2240,15 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base, if (SelectAddressRegReg(N, Base, Index, DAG)) return true; - // If the operand is an addition, always emit this as [r+r], since this is - // better (for code size, and execution, as the memop does the add for free) - // than emitting an explicit add. - if (N.getOpcode() == ISD::ADD) { + // If the address is the result of an add, we will utilize the fact that the + // address calculation includes an implicit add. However, we can reduce + // register pressure if we do not materialize a constant just for use as the + // index register. We only get rid of the add if it is not an add of a + // value and a 16-bit signed constant and both have a single use. + int16_t imm = 0; + if (N.getOpcode() == ISD::ADD && + (!isIntS16Immediate(N.getOperand(1), imm) || + !N.getOperand(1).hasOneUse() || !N.getOperand(0).hasOneUse())) { Base = N.getOperand(0); Index = N.getOperand(1); return true; @@ -6422,7 +6432,7 @@ PPCTargetLowering::LowerGET_DYNAMIC_AREA_OFFSET(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); - // Get the corect type for integers. + // Get the correct type for integers. EVT IntVT = Op.getValueType(); // Get the inputs. @@ -6439,7 +6449,7 @@ SDValue PPCTargetLowering::LowerSTACKRESTORE(SDValue Op, // When we pop the dynamic allocation we need to restore the SP link. SDLoc dl(Op); - // Get the corect type for pointers. + // Get the correct type for pointers. EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Construct the stack pointer operand. @@ -6514,7 +6524,7 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SDValue Size = Op.getOperand(1); SDLoc dl(Op); - // Get the corect type for pointers. + // Get the correct type for pointers. EVT PtrVT = getPointerTy(DAG.getDataLayout()); // Negate the size. SDValue NegSize = DAG.getNode(ISD::SUB, dl, PtrVT, @@ -6645,6 +6655,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { default: break; // SETUO etc aren't handled by fsel. case ISD::SETNE: std::swap(TV, FV); + LLVM_FALLTHROUGH; case ISD::SETEQ: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits LHS = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f64, LHS); @@ -6656,6 +6667,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETULT: case ISD::SETLT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt + LLVM_FALLTHROUGH; case ISD::SETOGE: case ISD::SETGE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits @@ -6664,6 +6676,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { case ISD::SETUGT: case ISD::SETGT: std::swap(TV, FV); // fsel is natively setge, swap operands for setlt + LLVM_FALLTHROUGH; case ISD::SETOLE: case ISD::SETLE: if (LHS.getValueType() == MVT::f32) // Comparison is always 64-bits @@ -6677,6 +6690,7 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const { default: break; // SETUO etc aren't handled by fsel. case ISD::SETNE: std::swap(TV, FV); + LLVM_FALLTHROUGH; case ISD::SETEQ: Cmp = DAG.getNode(ISD::FSUB, dl, CmpVT, LHS, RHS, Flags); if (Cmp.getValueType() == MVT::f32) // Comparison is always 64-bits @@ -11311,6 +11325,132 @@ static SDValue combineBVOfConsecutiveLoads(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +// This function adds the required vector_shuffle needed to get +// the elements of the vector extract in the correct position +// as specified by the CorrectElems encoding. +static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG, + SDValue Input, uint64_t Elems, + uint64_t CorrectElems) { + SDLoc dl(N); + + unsigned NumElems = Input.getValueType().getVectorNumElements(); + SmallVector<int, 16> ShuffleMask(NumElems, -1); + + // Knowing the element indices being extracted from the original + // vector and the order in which they're being inserted, just put + // them at element indices required for the instruction. + for (unsigned i = 0; i < N->getNumOperands(); i++) { + if (DAG.getDataLayout().isLittleEndian()) + ShuffleMask[CorrectElems & 0xF] = Elems & 0xF; + else + ShuffleMask[(CorrectElems & 0xF0) >> 4] = (Elems & 0xF0) >> 4; + CorrectElems = CorrectElems >> 8; + Elems = Elems >> 8; + } + + SDValue Shuffle = + DAG.getVectorShuffle(Input.getValueType(), dl, Input, + DAG.getUNDEF(Input.getValueType()), ShuffleMask); + + EVT Ty = N->getValueType(0); + SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle); + return BV; +} + +// Look for build vector patterns where input operands come from sign +// extended vector_extract elements of specific indices. If the correct indices +// aren't used, add a vector shuffle to fix up the indices and create a new +// PPCISD:SExtVElems node which selects the vector sign extend instructions +// during instruction selection. +static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) { + // This array encodes the indices that the vector sign extend instructions + // extract from when extending from one type to another for both BE and LE. + // The right nibble of each byte corresponds to the LE incides. + // and the left nibble of each byte corresponds to the BE incides. + // For example: 0x3074B8FC byte->word + // For LE: the allowed indices are: 0x0,0x4,0x8,0xC + // For BE: the allowed indices are: 0x3,0x7,0xB,0xF + // For example: 0x000070F8 byte->double word + // For LE: the allowed indices are: 0x0,0x8 + // For BE: the allowed indices are: 0x7,0xF + uint64_t TargetElems[] = { + 0x3074B8FC, // b->w + 0x000070F8, // b->d + 0x10325476, // h->w + 0x00003074, // h->d + 0x00001032, // w->d + }; + + uint64_t Elems = 0; + int Index; + SDValue Input; + + auto isSExtOfVecExtract = [&](SDValue Op) -> bool { + if (!Op) + return false; + if (Op.getOpcode() != ISD::SIGN_EXTEND) + return false; + + SDValue Extract = Op.getOperand(0); + if (Extract.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return false; + + ConstantSDNode *ExtOp = dyn_cast<ConstantSDNode>(Extract.getOperand(1)); + if (!ExtOp) + return false; + + Index = ExtOp->getZExtValue(); + if (Input && Input != Extract.getOperand(0)) + return false; + + if (!Input) + Input = Extract.getOperand(0); + + Elems = Elems << 8; + Index = DAG.getDataLayout().isLittleEndian() ? Index : Index << 4; + Elems |= Index; + + return true; + }; + + // If the build vector operands aren't sign extended vector extracts, + // of the same input vector, then return. + for (unsigned i = 0; i < N->getNumOperands(); i++) { + if (!isSExtOfVecExtract(N->getOperand(i))) { + return SDValue(); + } + } + + // If the vector extract indicies are not correct, add the appropriate + // vector_shuffle. + int TgtElemArrayIdx; + int InputSize = Input.getValueType().getScalarSizeInBits(); + int OutputSize = N->getValueType(0).getScalarSizeInBits(); + if (InputSize + OutputSize == 40) + TgtElemArrayIdx = 0; + else if (InputSize + OutputSize == 72) + TgtElemArrayIdx = 1; + else if (InputSize + OutputSize == 48) + TgtElemArrayIdx = 2; + else if (InputSize + OutputSize == 80) + TgtElemArrayIdx = 3; + else if (InputSize + OutputSize == 96) + TgtElemArrayIdx = 4; + else + return SDValue(); + + uint64_t CorrectElems = TargetElems[TgtElemArrayIdx]; + CorrectElems = DAG.getDataLayout().isLittleEndian() + ? CorrectElems & 0x0F0F0F0F0F0F0F0F + : CorrectElems & 0xF0F0F0F0F0F0F0F0; + if (Elems != CorrectElems) { + return addShuffleForVecExtend(N, DAG, Input, Elems, CorrectElems); + } + + // Regular lowering will catch cases where a shuffle is not needed. + return SDValue(); +} + SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const { assert(N->getOpcode() == ISD::BUILD_VECTOR && @@ -11338,6 +11478,15 @@ SDValue PPCTargetLowering::DAGCombineBuildVector(SDNode *N, if (Reduced) return Reduced; + // If we're building a vector out of extended elements from another vector + // we have P9 vector integer extend instructions. + if (Subtarget.hasP9Altivec()) { + Reduced = combineBVOfVecSExt(N, DAG); + if (Reduced) + return Reduced; + } + + if (N->getValueType(0) != MVT::v2f64) return SDValue(); diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index a5108727bb4b..821927d3b157 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -67,6 +67,10 @@ namespace llvm { /// VSFRC that is sign-extended from ByteWidth to a 64-byte integer. VEXTS, + /// SExtVElems, takes an input vector of a smaller type and sign + /// extends to an output vector of a larger type. + SExtVElems, + /// Reciprocal estimate instructions (unary FP ops). FRE, FRSQRTE, @@ -1092,6 +1096,9 @@ namespace llvm { ISD::ArgFlagsTy &ArgFlags, CCState &State); + bool isIntS16Immediate(SDNode *N, int16_t &Imm); + bool isIntS16Immediate(SDValue Op, int16_t &Imm); + } // end namespace llvm #endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index 47d59c25392a..6d9f55206b6a 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -32,6 +32,9 @@ def SDT_PPCstxsix : SDTypeProfile<0, 3, [ def SDT_PPCVexts : SDTypeProfile<1, 2, [ SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2> ]>; +def SDT_PPCSExtVElems : SDTypeProfile<1, 1, [ + SDTCisVec<0>, SDTCisVec<1> +]>; def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; @@ -131,6 +134,7 @@ def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx, def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix, [SDNPHasChain, SDNPMayStore]>; def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>; +def PPCSExtVElems : SDNode<"PPCISD::SExtVElems", SDT_PPCSExtVElems, []>; // Extract FPSCR (not modeled at the DAG level). def PPCmffs : SDNode<"PPCISD::MFFS", @@ -4450,3 +4454,190 @@ def MSGSYNC : XForm_0<31, 886, (outs), (ins), "msgsync", IIC_SprMSGSYNC, []>; def STOP : XForm_0<19, 370, (outs), (ins), "stop", IIC_SprSTOP, []>; } // IsISA3_0 + +// Fast 32-bit reverse bits algorithm: +// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit): +// n = ((n >> 1) & 0x55555555) | ((n << 1) & 0xAAAAAAAA); +// Step 2: 2-bit swap (swap odd 2-bit and even 2-bit): +// n = ((n >> 2) & 0x33333333) | ((n << 2) & 0xCCCCCCCC); +// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit): +// n = ((n >> 4) & 0x0F0F0F0F) | ((n << 4) & 0xF0F0F0F0); +// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4]): +// Step 4.1: Put B4,B2 in the right position (rotate left 3 bytes): +// n' = (n rotl 24); After which n' = [B4, B1, B2, B3] +// Step 4.2: Insert B3 to the right position: +// n' = rlwimi n', n, 8, 8, 15; After which n' = [B4, B3, B2, B3] +// Step 4.3: Insert B1 to the right position: +// n' = rlwimi n', n, 8, 24, 31; After which n' = [B4, B3, B2, B1] +def MaskValues { + dag Lo1 = (ORI (LIS 0x5555), 0x5555); + dag Hi1 = (ORI (LIS 0xAAAA), 0xAAAA); + dag Lo2 = (ORI (LIS 0x3333), 0x3333); + dag Hi2 = (ORI (LIS 0xCCCC), 0xCCCC); + dag Lo4 = (ORI (LIS 0x0F0F), 0x0F0F); + dag Hi4 = (ORI (LIS 0xF0F0), 0xF0F0); +} + +def Shift1 { + dag Right = (RLWINM $A, 31, 1, 31); + dag Left = (RLWINM $A, 1, 0, 30); +} + +def Swap1 { + dag Bit = (OR (AND Shift1.Right, MaskValues.Lo1), + (AND Shift1.Left, MaskValues.Hi1)); +} + +def Shift2 { + dag Right = (RLWINM Swap1.Bit, 30, 2, 31); + dag Left = (RLWINM Swap1.Bit, 2, 0, 29); +} + +def Swap2 { + dag Bits = (OR (AND Shift2.Right, MaskValues.Lo2), + (AND Shift2.Left, MaskValues.Hi2)); +} + +def Shift4 { + dag Right = (RLWINM Swap2.Bits, 28, 4, 31); + dag Left = (RLWINM Swap2.Bits, 4, 0, 27); +} + +def Swap4 { + dag Bits = (OR (AND Shift4.Right, MaskValues.Lo4), + (AND Shift4.Left, MaskValues.Hi4)); +} + +def Rotate { + dag Left3Bytes = (RLWINM Swap4.Bits, 24, 0, 31); +} + +def RotateInsertByte3 { + dag Left = (RLWIMI Rotate.Left3Bytes, Swap4.Bits, 8, 8, 15); +} + +def RotateInsertByte1 { + dag Left = (RLWIMI RotateInsertByte3.Left, Swap4.Bits, 8, 24, 31); +} + +def : Pat<(i32 (bitreverse i32:$A)), + (RLDICL_32 RotateInsertByte1.Left, 0, 32)>; + +// Fast 64-bit reverse bits algorithm: +// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit): +// n = ((n >> 1) & 0x5555555555555555) | ((n << 1) & 0xAAAAAAAAAAAAAAAA); +// Step 2: 2-bit swap (swap odd 2-bit and even 2-bit): +// n = ((n >> 2) & 0x3333333333333333) | ((n << 2) & 0xCCCCCCCCCCCCCCCC); +// Step 3: 4-bit swap (swap odd 4-bit and even 4-bit): +// n = ((n >> 4) & 0x0F0F0F0F0F0F0F0F) | ((n << 4) & 0xF0F0F0F0F0F0F0F0); +// Step 4: byte reverse (Suppose n = [B1,B2,B3,B4,B5,B6,B7,B8]): +// Apply the same byte reverse algorithm mentioned above for the fast 32-bit +// reverse to both the high 32 bit and low 32 bit of the 64 bit value. And +// then OR them together to get the final result. +def MaskValues64 { + dag Lo1 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo1, sub_32)); + dag Hi1 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi1, sub_32)); + dag Lo2 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo2, sub_32)); + dag Hi2 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi2, sub_32)); + dag Lo4 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Lo4, sub_32)); + dag Hi4 = (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), MaskValues.Hi4, sub_32)); +} + +def DWMaskValues { + dag Lo1 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo1, 32, 31), 0x5555), 0x5555); + dag Hi1 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi1, 32, 31), 0xAAAA), 0xAAAA); + dag Lo2 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo2, 32, 31), 0x3333), 0x3333); + dag Hi2 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi2, 32, 31), 0xCCCC), 0xCCCC); + dag Lo4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Lo4, 32, 31), 0x0F0F), 0x0F0F); + dag Hi4 = (ORI8 (ORIS8 (RLDICR MaskValues64.Hi4, 32, 31), 0xF0F0), 0xF0F0); +} + +def DWShift1 { + dag Right = (RLDICL $A, 63, 1); + dag Left = (RLDICR $A, 1, 62); +} + +def DWSwap1 { + dag Bit = (OR8 (AND8 DWShift1.Right, DWMaskValues.Lo1), + (AND8 DWShift1.Left, DWMaskValues.Hi1)); +} + +def DWShift2 { + dag Right = (RLDICL DWSwap1.Bit, 62, 2); + dag Left = (RLDICR DWSwap1.Bit, 2, 61); +} + +def DWSwap2 { + dag Bits = (OR8 (AND8 DWShift2.Right, DWMaskValues.Lo2), + (AND8 DWShift2.Left, DWMaskValues.Hi2)); +} + +def DWShift4 { + dag Right = (RLDICL DWSwap2.Bits, 60, 4); + dag Left = (RLDICR DWSwap2.Bits, 4, 59); +} + +def DWSwap4 { + dag Bits = (OR8 (AND8 DWShift4.Right, DWMaskValues.Lo4), + (AND8 DWShift4.Left, DWMaskValues.Hi4)); +} + +// Bit swap is done, now start byte swap. +def DWExtractLo32 { + dag SubReg = (i32 (EXTRACT_SUBREG DWSwap4.Bits, sub_32)); +} + +def DWRotateLo32 { + dag Left24 = (RLWINM DWExtractLo32.SubReg, 24, 0, 31); +} + +def DWLo32RotateInsertByte3 { + dag Left = (RLWIMI DWRotateLo32.Left24, DWExtractLo32.SubReg, 8, 8, 15); +} + +// Lower 32 bits in the right order +def DWLo32RotateInsertByte1 { + dag Left = + (RLWIMI DWLo32RotateInsertByte3.Left, DWExtractLo32.SubReg, 8, 24, 31); +} + +def ExtendLo32 { + dag To64Bit = + (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + DWLo32RotateInsertByte1.Left, sub_32)); +} + +def DWShiftHi32 { // SRDI DWSwap4.Bits, 32) + dag ToLo32 = (RLDICL DWSwap4.Bits, 32, 32); +} + +def DWExtractHi32 { + dag SubReg = (i32 (EXTRACT_SUBREG DWShiftHi32.ToLo32, sub_32)); +} + +def DWRotateHi32 { + dag Left24 = (RLWINM DWExtractHi32.SubReg, 24, 0, 31); +} + +def DWHi32RotateInsertByte3 { + dag Left = (RLWIMI DWRotateHi32.Left24, DWExtractHi32.SubReg, 8, 8, 15); +} + +// High 32 bits in the right order, but in the low 32-bit position +def DWHi32RotateInsertByte1 { + dag Left = + (RLWIMI DWHi32RotateInsertByte3.Left, DWExtractHi32.SubReg, 8, 24, 31); +} + +def ExtendHi32 { + dag To64Bit = + (i64 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + DWHi32RotateInsertByte1.Left, sub_32)); +} + +def DWShiftLo32 { // SLDI ExtendHi32.To64Bit, 32 + dag ToHi32 = (RLDICR ExtendHi32.To64Bit, 32, 31); +} + +def : Pat<(i64 (bitreverse i64:$A)), + (OR8 DWShiftLo32.ToHi32, ExtendLo32.To64Bit)>; diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 9cfc897cdb3f..43635a8919e2 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -1901,6 +1901,98 @@ let Predicates = [IsLittleEndian, HasVSX] in def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>; def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>; +// Variable index unsigned vector_extract on Power9 +let Predicates = [HasP9Altivec, IsLittleEndian] in { + def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))), + (VEXTUBRX $Idx, $S)>; + + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))), + (VEXTUHRX (RLWINM8 $Idx, 1, 28, 30), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))), + (VEXTUHRX (LI8 0), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))), + (VEXTUHRX (LI8 2), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))), + (VEXTUHRX (LI8 4), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))), + (VEXTUHRX (LI8 6), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))), + (VEXTUHRX (LI8 8), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))), + (VEXTUHRX (LI8 10), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))), + (VEXTUHRX (LI8 12), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))), + (VEXTUHRX (LI8 14), $S)>; + + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))), + (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))), + (VEXTUWRX (LI8 0), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))), + (VEXTUWRX (LI8 4), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))), + (VEXTUWRX (LI8 8), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))), + (VEXTUWRX (LI8 12), $S)>; + + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))), + (EXTSW (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))), + (EXTSW (VEXTUWRX (LI8 0), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))), + (EXTSW (VEXTUWRX (LI8 4), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))), + (EXTSW (VEXTUWRX (LI8 8), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))), + (EXTSW (VEXTUWRX (LI8 12), $S))>; +} +let Predicates = [HasP9Altivec, IsBigEndian] in { + def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))), + (VEXTUBLX $Idx, $S)>; + + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))), + (VEXTUHLX (RLWINM8 $Idx, 1, 28, 30), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))), + (VEXTUHLX (LI8 0), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))), + (VEXTUHLX (LI8 2), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))), + (VEXTUHLX (LI8 4), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))), + (VEXTUHLX (LI8 6), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))), + (VEXTUHLX (LI8 8), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))), + (VEXTUHLX (LI8 10), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))), + (VEXTUHLX (LI8 12), $S)>; + def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))), + (VEXTUHLX (LI8 14), $S)>; + + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))), + (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))), + (VEXTUWLX (LI8 0), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))), + (VEXTUWLX (LI8 4), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))), + (VEXTUWLX (LI8 8), $S)>; + def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))), + (VEXTUWLX (LI8 12), $S)>; + + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))), + (EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))), + (EXTSW (VEXTUWLX (LI8 0), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))), + (EXTSW (VEXTUWLX (LI8 4), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))), + (EXTSW (VEXTUWLX (LI8 8), $S))>; + def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))), + (EXTSW (VEXTUWLX (LI8 12), $S))>; +} + let Predicates = [IsLittleEndian, HasDirectMove] in { // v16i8 scalar <-> vector conversions (LE) def : Pat<(v16i8 (scalar_to_vector i32:$A)), @@ -2729,36 +2821,54 @@ def DblToFlt { } def ByteToWord { - dag A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8)); - dag A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8)); - dag A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8)); - dag A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8)); + dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 0)), i8)); + dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 4)), i8)); + dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 8)), i8)); + dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 12)), i8)); + dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 3)), i8)); + dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 7)), i8)); + dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 11)), i8)); + dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v16i8:$A, 15)), i8)); } def ByteToDWord { - dag A0 = (i64 (sext_inreg - (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8)); - dag A1 = (i64 (sext_inreg - (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8)); + dag LE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 0)))), i8)); + dag LE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 8)))), i8)); + dag BE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 7)))), i8)); + dag BE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v16i8:$A, 15)))), i8)); } def HWordToWord { - dag A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16)); - dag A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16)); - dag A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16)); - dag A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16)); + dag LE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 0)), i16)); + dag LE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 2)), i16)); + dag LE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 4)), i16)); + dag LE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 6)), i16)); + dag BE_A0 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 1)), i16)); + dag BE_A1 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 3)), i16)); + dag BE_A2 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 5)), i16)); + dag BE_A3 = (i32 (sext_inreg (i32 (vector_extract v8i16:$A, 7)), i16)); } def HWordToDWord { - dag A0 = (i64 (sext_inreg - (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16)); - dag A1 = (i64 (sext_inreg - (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16)); + dag LE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 0)))), i16)); + dag LE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 4)))), i16)); + dag BE_A0 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 3)))), i16)); + dag BE_A1 = (i64 (sext_inreg + (i64 (anyext (i32 (vector_extract v8i16:$A, 7)))), i16)); } def WordToDWord { - dag A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0)))); - dag A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2)))); + dag LE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 0)))); + dag LE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 2)))); + dag BE_A0 = (i64 (sext (i32 (vector_extract v4i32:$A, 1)))); + dag BE_A1 = (i64 (sext (i32 (vector_extract v4i32:$A, 3)))); } def FltToIntLoad { @@ -3016,18 +3126,46 @@ let AddedComplexity = 400 in { // P9 Altivec instructions that can be used to build vectors. // Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td to compete // with complexities of existing build vector patterns in this file. - let Predicates = [HasP9Altivec] in { - def : Pat<(v2i64 (build_vector WordToDWord.A0, WordToDWord.A1)), + let Predicates = [HasP9Altivec, IsLittleEndian] in { + def : Pat<(v2i64 (build_vector WordToDWord.LE_A0, WordToDWord.LE_A1)), (v2i64 (VEXTSW2D $A))>; - def : Pat<(v2i64 (build_vector HWordToDWord.A0, HWordToDWord.A1)), + def : Pat<(v2i64 (build_vector HWordToDWord.LE_A0, HWordToDWord.LE_A1)), (v2i64 (VEXTSH2D $A))>; - def : Pat<(v4i32 (build_vector HWordToWord.A0, HWordToWord.A1, - HWordToWord.A2, HWordToWord.A3)), + def : Pat<(v4i32 (build_vector HWordToWord.LE_A0, HWordToWord.LE_A1, + HWordToWord.LE_A2, HWordToWord.LE_A3)), (v4i32 (VEXTSH2W $A))>; - def : Pat<(v4i32 (build_vector ByteToWord.A0, ByteToWord.A1, - ByteToWord.A2, ByteToWord.A3)), + def : Pat<(v4i32 (build_vector ByteToWord.LE_A0, ByteToWord.LE_A1, + ByteToWord.LE_A2, ByteToWord.LE_A3)), (v4i32 (VEXTSB2W $A))>; - def : Pat<(v2i64 (build_vector ByteToDWord.A0, ByteToDWord.A1)), + def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)), (v2i64 (VEXTSB2D $A))>; } + + let Predicates = [HasP9Altivec, IsBigEndian] in { + def : Pat<(v2i64 (build_vector WordToDWord.BE_A0, WordToDWord.BE_A1)), + (v2i64 (VEXTSW2D $A))>; + def : Pat<(v2i64 (build_vector HWordToDWord.BE_A0, HWordToDWord.BE_A1)), + (v2i64 (VEXTSH2D $A))>; + def : Pat<(v4i32 (build_vector HWordToWord.BE_A0, HWordToWord.BE_A1, + HWordToWord.BE_A2, HWordToWord.BE_A3)), + (v4i32 (VEXTSH2W $A))>; + def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1, + ByteToWord.BE_A2, ByteToWord.BE_A3)), + (v4i32 (VEXTSB2W $A))>; + def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)), + (v2i64 (VEXTSB2D $A))>; + } + + let Predicates = [HasP9Altivec] in { + def: Pat<(v2i64 (PPCSExtVElems v16i8:$A)), + (v2i64 (VEXTSB2D $A))>; + def: Pat<(v2i64 (PPCSExtVElems v8i16:$A)), + (v2i64 (VEXTSH2D $A))>; + def: Pat<(v2i64 (PPCSExtVElems v4i32:$A)), + (v2i64 (VEXTSW2D $A))>; + def: Pat<(v4i32 (PPCSExtVElems v16i8:$A)), + (v4i32 (VEXTSB2W $A))>; + def: Pat<(v4i32 (PPCSExtVElems v8i16:$A)), + (v4i32 (VEXTSH2W $A))>; + } } diff --git a/lib/Target/PowerPC/PPCScheduleP9.td b/lib/Target/PowerPC/PPCScheduleP9.td index a9c1bd78b05e..a01995a629c2 100644 --- a/lib/Target/PowerPC/PPCScheduleP9.td +++ b/lib/Target/PowerPC/PPCScheduleP9.td @@ -260,8 +260,8 @@ let SchedModel = P9Model in { // ***************** Defining Itinerary Class Resources ***************** - def : ItinRW<[P9_DFU_76C, IP_EXEC_1C, DISP_1C, DISP_1C], [IIC_IntSimple, - IIC_IntGeneral]>; + def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C], + [IIC_IntSimple, IIC_IntGeneral]>; def : ItinRW<[P9_ALU_2C, IP_EXEC_1C, DISP_1C, DISP_1C, DISP_1C], [IIC_IntISEL, IIC_IntRotate, IIC_IntShift]>; diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h index 5a97f595ad8c..90d11f46a384 100644 --- a/lib/Target/PowerPC/PPCSubtarget.h +++ b/lib/Target/PowerPC/PPCSubtarget.h @@ -272,6 +272,13 @@ public: return 16; } + + // DarwinABI has a 224-byte red zone. PPC32 SVR4ABI(Non-DarwinABI) has no + // red zone and PPC64 SVR4ABI has a 288-byte red zone. + unsigned getRedZoneSize() const { + return isDarwinABI() ? 224 : (isPPC64() ? 288 : 0); + } + bool hasHTM() const { return HasHTM; } bool hasFusion() const { return HasFusion; } bool hasFloat128() const { return HasFloat128; } diff --git a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp index 491eaf326a50..7d34efd4af3e 100644 --- a/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp +++ b/lib/Target/PowerPC/PPCVSXSwapRemoval.cpp @@ -195,8 +195,10 @@ public: return false; // If we don't have VSX on the subtarget, don't do anything. + // Also, on Power 9 the load and store ops preserve element order and so + // the swaps are not required. const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>(); - if (!STI.hasVSX()) + if (!STI.hasVSX() || !STI.needsSwapsForVSXMemOps()) return false; bool Changed = false; diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index f85c0cf111c4..be83efc02d27 100644 --- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -34,7 +34,7 @@ public: void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override; + uint64_t Value, bool IsResolved) const override; MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override; @@ -73,7 +73,7 @@ bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const { void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, - bool IsPCRel) const { + bool IsResolved) const { return; } diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp index d4454c271f5a..0d021d67033e 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp @@ -211,6 +211,7 @@ namespace { case Sparc::fixup_sparc_wplt30: if (Target.getSymA()->getSymbol().isTemporary()) return false; + LLVM_FALLTHROUGH; case Sparc::fixup_sparc_tls_gd_hi22: case Sparc::fixup_sparc_tls_gd_lo10: case Sparc::fixup_sparc_tls_gd_add: @@ -275,7 +276,7 @@ namespace { void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override { + uint64_t Value, bool IsResolved) const override { Value = adjustFixupValue(Fixup.getKind(), Value); if (!Value) return; // Doesn't change encoding. diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp index 6b32a7926437..51ac410a9c81 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp @@ -52,7 +52,7 @@ public: const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override; void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override; + uint64_t Value, bool IsResolved) const override; bool mayNeedRelaxation(const MCInst &Inst) const override { return false; } @@ -94,7 +94,7 @@ void SystemZMCAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, uint64_t Value, - bool IsPCRel) const { + bool IsResolved) const { MCFixupKind Kind = Fixup.getKind(); unsigned Offset = Fixup.getOffset(); unsigned BitSize = getFixupKindInfo(Kind).TargetSize; diff --git a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp index fe4b52b515e0..73a1036f88e0 100644 --- a/lib/Target/SystemZ/SystemZHazardRecognizer.cpp +++ b/lib/Target/SystemZ/SystemZHazardRecognizer.cpp @@ -26,7 +26,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" // This is the limit of processor resource usage at which the // scheduler should try to look for other instructions (not using the diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index fef4a8c92a36..2801141cd951 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -2224,15 +2224,12 @@ static void lowerMUL_LOHI32(SelectionDAG &DAG, const SDLoc &DL, unsigned Extend, // Lower a binary operation that produces two VT results, one in each // half of a GR128 pair. Op0 and Op1 are the VT operands to the operation, -// Extend extends Op0 to a GR128, and Opcode performs the GR128 operation -// on the extended Op0 and (unextended) Op1. Store the even register result +// and Opcode performs the GR128 operation. Store the even register result // in Even and the odd register result in Odd. static void lowerGR128Binary(SelectionDAG &DAG, const SDLoc &DL, EVT VT, - unsigned Extend, unsigned Opcode, SDValue Op0, - SDValue Op1, SDValue &Even, SDValue &Odd) { - SDNode *In128 = DAG.getMachineNode(Extend, DL, MVT::Untyped, Op0); - SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped, - SDValue(In128, 0), Op1); + unsigned Opcode, SDValue Op0, SDValue Op1, + SDValue &Even, SDValue &Odd) { + SDValue Result = DAG.getNode(Opcode, DL, MVT::Untyped, Op0, Op1); bool Is32Bit = is32Bit(VT); Even = DAG.getTargetExtractSubreg(SystemZ::even128(Is32Bit), DL, VT, Result); Odd = DAG.getTargetExtractSubreg(SystemZ::odd128(Is32Bit), DL, VT, Result); @@ -2347,6 +2344,7 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT, // Handle tests for order using (or (ogt y x) (oge x y)). case ISD::SETUO: Invert = true; + LLVM_FALLTHROUGH; case ISD::SETO: { assert(IsFP && "Unexpected integer comparison"); SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0); @@ -2358,6 +2356,7 @@ static SDValue lowerVectorSETCC(SelectionDAG &DAG, const SDLoc &DL, EVT VT, // Handle <> tests using (or (ogt y x) (ogt x y)). case ISD::SETUEQ: Invert = true; + LLVM_FALLTHROUGH; case ISD::SETONE: { assert(IsFP && "Unexpected integer comparison"); SDValue LT = getVectorCmp(DAG, SystemZISD::VFCMPH, DL, VT, CmpOp1, CmpOp0); @@ -2962,7 +2961,7 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op, lowerMUL_LOHI32(DAG, DL, ISD::SIGN_EXTEND, Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); else { - // Do a full 128-bit multiplication based on UMUL_LOHI64: + // Do a full 128-bit multiplication based on SystemZISD::UMUL_LOHI: // // (ll * rl) + ((lh * rl) << 64) + ((ll * rh) << 64) // @@ -2980,10 +2979,10 @@ SDValue SystemZTargetLowering::lowerSMUL_LOHI(SDValue Op, SDValue RL = Op.getOperand(1); SDValue LH = DAG.getNode(ISD::SRA, DL, VT, LL, C63); SDValue RH = DAG.getNode(ISD::SRA, DL, VT, RL, C63); - // UMUL_LOHI64 returns the low result in the odd register and the high - // result in the even register. SMUL_LOHI is defined to return the - // low half first, so the results are in reverse order. - lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64, + // SystemZISD::UMUL_LOHI returns the low result in the odd register and + // the high result in the even register. ISD::SMUL_LOHI is defined to + // return the low half first, so the results are in reverse order. + lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI, LL, RL, Ops[1], Ops[0]); SDValue NegLLTimesRH = DAG.getNode(ISD::AND, DL, VT, LL, RH); SDValue NegLHTimesRL = DAG.getNode(ISD::AND, DL, VT, LH, RL); @@ -3004,10 +3003,10 @@ SDValue SystemZTargetLowering::lowerUMUL_LOHI(SDValue Op, lowerMUL_LOHI32(DAG, DL, ISD::ZERO_EXTEND, Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); else - // UMUL_LOHI64 returns the low result in the odd register and the high - // result in the even register. UMUL_LOHI is defined to return the - // low half first, so the results are in reverse order. - lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, SystemZISD::UMUL_LOHI64, + // SystemZISD::UMUL_LOHI returns the low result in the odd register and + // the high result in the even register. ISD::UMUL_LOHI is defined to + // return the low half first, so the results are in reverse order. + lowerGR128Binary(DAG, DL, VT, SystemZISD::UMUL_LOHI, Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); return DAG.getMergeValues(Ops, DL); } @@ -3018,24 +3017,19 @@ SDValue SystemZTargetLowering::lowerSDIVREM(SDValue Op, SDValue Op1 = Op.getOperand(1); EVT VT = Op.getValueType(); SDLoc DL(Op); - unsigned Opcode; - // We use DSGF for 32-bit division. - if (is32Bit(VT)) { + // We use DSGF for 32-bit division. This means the first operand must + // always be 64-bit, and the second operand should be 32-bit whenever + // that is possible, to improve performance. + if (is32Bit(VT)) Op0 = DAG.getNode(ISD::SIGN_EXTEND, DL, MVT::i64, Op0); - Opcode = SystemZISD::SDIVREM32; - } else if (DAG.ComputeNumSignBits(Op1) > 32) { + else if (DAG.ComputeNumSignBits(Op1) > 32) Op1 = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, Op1); - Opcode = SystemZISD::SDIVREM32; - } else - Opcode = SystemZISD::SDIVREM64; - // DSG(F) takes a 64-bit dividend, so the even register in the GR128 - // input is "don't care". The instruction returns the remainder in - // the even register and the quotient in the odd register. + // DSG(F) returns the remainder in the even register and the + // quotient in the odd register. SDValue Ops[2]; - lowerGR128Binary(DAG, DL, VT, SystemZ::AEXT128_64, Opcode, - Op0, Op1, Ops[1], Ops[0]); + lowerGR128Binary(DAG, DL, VT, SystemZISD::SDIVREM, Op0, Op1, Ops[1], Ops[0]); return DAG.getMergeValues(Ops, DL); } @@ -3044,16 +3038,11 @@ SDValue SystemZTargetLowering::lowerUDIVREM(SDValue Op, EVT VT = Op.getValueType(); SDLoc DL(Op); - // DL(G) uses a double-width dividend, so we need to clear the even - // register in the GR128 input. The instruction returns the remainder - // in the even register and the quotient in the odd register. + // DL(G) returns the remainder in the even register and the + // quotient in the odd register. SDValue Ops[2]; - if (is32Bit(VT)) - lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_32, SystemZISD::UDIVREM32, - Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); - else - lowerGR128Binary(DAG, DL, VT, SystemZ::ZEXT128_64, SystemZISD::UDIVREM64, - Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); + lowerGR128Binary(DAG, DL, VT, SystemZISD::UDIVREM, + Op.getOperand(0), Op.getOperand(1), Ops[1], Ops[0]); return DAG.getMergeValues(Ops, DL); } @@ -3193,13 +3182,13 @@ SDValue SystemZTargetLowering::lowerATOMIC_FENCE(SDValue Op, SDLoc DL(Op); AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); - SynchronizationScope FenceScope = static_cast<SynchronizationScope>( + SyncScope::ID FenceSSID = static_cast<SyncScope::ID>( cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && - FenceScope == CrossThread) { + FenceSSID == SyncScope::System) { return SDValue(DAG.getMachineNode(SystemZ::Serialize, DL, MVT::Other, Op.getOperand(0)), 0); @@ -4669,11 +4658,9 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const { OPCODE(SELECT_CCMASK); OPCODE(ADJDYNALLOC); OPCODE(POPCNT); - OPCODE(UMUL_LOHI64); - OPCODE(SDIVREM32); - OPCODE(SDIVREM64); - OPCODE(UDIVREM32); - OPCODE(UDIVREM64); + OPCODE(UMUL_LOHI); + OPCODE(SDIVREM); + OPCODE(UDIVREM); OPCODE(MVC); OPCODE(MVC_LOOP); OPCODE(NC); @@ -5778,14 +5765,12 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI, return DoneMBB; } -// Emit an extension from a GR32 or GR64 to a GR128. ClearEven is true +// Emit an extension from a GR64 to a GR128. ClearEven is true // if the high register of the GR128 value must be cleared or false if -// it's "don't care". SubReg is subreg_l32 when extending a GR32 -// and subreg_l64 when extending a GR64. +// it's "don't care". MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI, MachineBasicBlock *MBB, - bool ClearEven, - unsigned SubReg) const { + bool ClearEven) const { MachineFunction &MF = *MBB->getParent(); const SystemZInstrInfo *TII = static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo()); @@ -5808,7 +5793,7 @@ MachineBasicBlock *SystemZTargetLowering::emitExt128(MachineInstr &MI, In128 = NewIn128; } BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::INSERT_SUBREG), Dest) - .addReg(In128).addReg(Src).addImm(SubReg); + .addReg(In128).addReg(Src).addImm(SystemZ::subreg_l64); MI.eraseFromParent(); return MBB; @@ -6172,12 +6157,10 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( case SystemZ::CondStoreF64Inv: return emitCondStore(MI, MBB, SystemZ::STD, 0, true); - case SystemZ::AEXT128_64: - return emitExt128(MI, MBB, false, SystemZ::subreg_l64); - case SystemZ::ZEXT128_32: - return emitExt128(MI, MBB, true, SystemZ::subreg_l32); - case SystemZ::ZEXT128_64: - return emitExt128(MI, MBB, true, SystemZ::subreg_l64); + case SystemZ::AEXT128: + return emitExt128(MI, MBB, false); + case SystemZ::ZEXT128: + return emitExt128(MI, MBB, true); case SystemZ::ATOMIC_SWAPW: return emitAtomicLoadBinary(MI, MBB, 0, 0); diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 5dcb19c0a35d..6c9c404816f0 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -86,14 +86,11 @@ enum NodeType : unsigned { // Count number of bits set in operand 0 per byte. POPCNT, - // Wrappers around the ISD opcodes of the same name. The output and - // first input operands are GR128s. The trailing numbers are the - // widths of the second operand in bits. - UMUL_LOHI64, - SDIVREM32, - SDIVREM64, - UDIVREM32, - UDIVREM64, + // Wrappers around the ISD opcodes of the same name. The output is GR128. + // Input operands may be GR64 or GR32, depending on the instruction. + UMUL_LOHI, + SDIVREM, + UDIVREM, // Use a series of MVCs to copy bytes from one memory location to another. // The operands are: @@ -562,7 +559,7 @@ private: unsigned StoreOpcode, unsigned STOCOpcode, bool Invert) const; MachineBasicBlock *emitExt128(MachineInstr &MI, MachineBasicBlock *MBB, - bool ClearEven, unsigned SubReg) const; + bool ClearEven) const; MachineBasicBlock *emitAtomicLoadBinary(MachineInstr &MI, MachineBasicBlock *BB, unsigned BinOpcode, unsigned BitSize, diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index 98f66c29ae64..4569be7602e4 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -677,6 +677,22 @@ let Predicates = [FeatureLoadAndTrap] in { def LLGTAT : UnaryRXY<"llgtat", 0xE39C, null_frag, GR64, 4>; } +// Extend GR64s to GR128s. +let usesCustomInserter = 1 in + def ZEXT128 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>; + +//===----------------------------------------------------------------------===// +// "Any" extensions +//===----------------------------------------------------------------------===// + +// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext. +def : Pat<(i64 (anyext GR32:$src)), + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>; + +// Extend GR64s to GR128s. +let usesCustomInserter = 1 in + def AEXT128 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>; + //===----------------------------------------------------------------------===// // Truncations //===----------------------------------------------------------------------===// @@ -1216,13 +1232,17 @@ def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>; // Multiplication of a register, producing two results. def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>; -def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>; +def MLGR : BinaryRRE<"mlgr", 0xB986, null_frag, GR128, GR64>; +def : Pat<(z_umul_lohi GR64:$src1, GR64:$src2), + (MLGR (AEXT128 GR64:$src1), GR64:$src2)>; // Multiplication of memory, producing two results. def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>; def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>; def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>; -def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>; +def MLG : BinaryRXY<"mlg", 0xE386, null_frag, GR128, load, 8>; +def : Pat<(z_umul_lohi GR64:$src1, (i64 (load bdxaddr20only:$src2))), + (MLG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; //===----------------------------------------------------------------------===// // Division and remainder @@ -1230,19 +1250,38 @@ def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>; let hasSideEffects = 1 in { // Do not speculatively execute. // Division and remainder, from registers. - def DR : BinaryRR <"dr", 0x1D, null_frag, GR128, GR32>; - def DSGFR : BinaryRRE<"dsgfr", 0xB91D, z_sdivrem32, GR128, GR32>; - def DSGR : BinaryRRE<"dsgr", 0xB90D, z_sdivrem64, GR128, GR64>; - def DLR : BinaryRRE<"dlr", 0xB997, z_udivrem32, GR128, GR32>; - def DLGR : BinaryRRE<"dlgr", 0xB987, z_udivrem64, GR128, GR64>; + def DR : BinaryRR <"dr", 0x1D, null_frag, GR128, GR32>; + def DSGFR : BinaryRRE<"dsgfr", 0xB91D, null_frag, GR128, GR32>; + def DSGR : BinaryRRE<"dsgr", 0xB90D, null_frag, GR128, GR64>; + def DLR : BinaryRRE<"dlr", 0xB997, null_frag, GR128, GR32>; + def DLGR : BinaryRRE<"dlgr", 0xB987, null_frag, GR128, GR64>; // Division and remainder, from memory. - def D : BinaryRX <"d", 0x5D, null_frag, GR128, load, 4>; - def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>; - def DSG : BinaryRXY<"dsg", 0xE30D, z_sdivrem64, GR128, load, 8>; - def DL : BinaryRXY<"dl", 0xE397, z_udivrem32, GR128, load, 4>; - def DLG : BinaryRXY<"dlg", 0xE387, z_udivrem64, GR128, load, 8>; -} + def D : BinaryRX <"d", 0x5D, null_frag, GR128, load, 4>; + def DSGF : BinaryRXY<"dsgf", 0xE31D, null_frag, GR128, load, 4>; + def DSG : BinaryRXY<"dsg", 0xE30D, null_frag, GR128, load, 8>; + def DL : BinaryRXY<"dl", 0xE397, null_frag, GR128, load, 4>; + def DLG : BinaryRXY<"dlg", 0xE387, null_frag, GR128, load, 8>; +} +def : Pat<(z_sdivrem GR64:$src1, GR32:$src2), + (DSGFR (AEXT128 GR64:$src1), GR32:$src2)>; +def : Pat<(z_sdivrem GR64:$src1, (i32 (load bdxaddr20only:$src2))), + (DSGF (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; +def : Pat<(z_sdivrem GR64:$src1, GR64:$src2), + (DSGR (AEXT128 GR64:$src1), GR64:$src2)>; +def : Pat<(z_sdivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))), + (DSG (AEXT128 GR64:$src1), bdxaddr20only:$src2)>; + +def : Pat<(z_udivrem GR32:$src1, GR32:$src2), + (DLR (ZEXT128 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src1, + subreg_l32)), GR32:$src2)>; +def : Pat<(z_udivrem GR32:$src1, (i32 (load bdxaddr20only:$src2))), + (DL (ZEXT128 (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src1, + subreg_l32)), bdxaddr20only:$src2)>; +def : Pat<(z_udivrem GR64:$src1, GR64:$src2), + (DLGR (ZEXT128 GR64:$src1), GR64:$src2)>; +def : Pat<(z_udivrem GR64:$src1, (i64 (load bdxaddr20only:$src2))), + (DLG (ZEXT128 GR64:$src1), bdxaddr20only:$src2)>; //===----------------------------------------------------------------------===// // Shifts @@ -1894,17 +1933,6 @@ def : Pat<(ctlz GR64:$src), let Predicates = [FeaturePopulationCount], Defs = [CC] in def POPCNT : UnaryRRE<"popcnt", 0xB9E1, z_popcnt, GR64, GR64>; -// Use subregs to populate the "don't care" bits in a 32-bit to 64-bit anyext. -def : Pat<(i64 (anyext GR32:$src)), - (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, subreg_l32)>; - -// Extend GR32s and GR64s to GR128s. -let usesCustomInserter = 1 in { - def AEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>; - def ZEXT128_32 : Pseudo<(outs GR128:$dst), (ins GR32:$src), []>; - def ZEXT128_64 : Pseudo<(outs GR128:$dst), (ins GR64:$src), []>; -} - // Search a block of memory for a character. let mayLoad = 1, Defs = [CC] in defm SRST : StringRRE<"srst", 0xB25E, z_search_string>; diff --git a/lib/Target/SystemZ/SystemZLDCleanup.cpp b/lib/Target/SystemZ/SystemZLDCleanup.cpp index 3a0e01da42f0..d4cd89ce590f 100644 --- a/lib/Target/SystemZ/SystemZLDCleanup.cpp +++ b/lib/Target/SystemZ/SystemZLDCleanup.cpp @@ -127,7 +127,7 @@ MachineInstr *SystemZLDCleanup::ReplaceTLSCall(MachineInstr *I, return Copy; } -// Create a virtal register in *TLSBaseAddrReg, and populate it by +// Create a virtual register in *TLSBaseAddrReg, and populate it by // inserting a copy instruction after I. Returns the new instruction. MachineInstr *SystemZLDCleanup::SetRegister(MachineInstr *I, unsigned *TLSBaseAddrReg) { diff --git a/lib/Target/SystemZ/SystemZMachineScheduler.cpp b/lib/Target/SystemZ/SystemZMachineScheduler.cpp index b6feaa49d858..8342463c1086 100644 --- a/lib/Target/SystemZ/SystemZMachineScheduler.cpp +++ b/lib/Target/SystemZ/SystemZMachineScheduler.cpp @@ -18,7 +18,7 @@ using namespace llvm; -#define DEBUG_TYPE "misched" +#define DEBUG_TYPE "machine-scheduler" #ifndef NDEBUG // Print the set of SUs diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index ab2392809f3b..9c6d5819f8a7 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -36,14 +36,10 @@ def SDT_ZWrapOffset : SDTypeProfile<1, 2, SDTCisSameAs<0, 2>, SDTCisPtrTy<0>]>; def SDT_ZAdjDynAlloc : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>; -def SDT_ZGR128Binary32 : SDTypeProfile<1, 2, +def SDT_ZGR128Binary : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, - SDTCisVT<1, untyped>, - SDTCisVT<2, i32>]>; -def SDT_ZGR128Binary64 : SDTypeProfile<1, 2, - [SDTCisVT<0, untyped>, - SDTCisVT<1, untyped>, - SDTCisVT<2, i64>]>; + SDTCisInt<1>, + SDTCisInt<2>]>; def SDT_ZAtomicLoadBinaryW : SDTypeProfile<1, 5, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, @@ -185,11 +181,9 @@ def z_select_ccmask : SDNode<"SystemZISD::SELECT_CCMASK", SDT_ZSelectCCMask, [SDNPInGlue]>; def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>; def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>; -def z_umul_lohi64 : SDNode<"SystemZISD::UMUL_LOHI64", SDT_ZGR128Binary64>; -def z_sdivrem32 : SDNode<"SystemZISD::SDIVREM32", SDT_ZGR128Binary32>; -def z_sdivrem64 : SDNode<"SystemZISD::SDIVREM64", SDT_ZGR128Binary64>; -def z_udivrem32 : SDNode<"SystemZISD::UDIVREM32", SDT_ZGR128Binary32>; -def z_udivrem64 : SDNode<"SystemZISD::UDIVREM64", SDT_ZGR128Binary64>; +def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>; +def z_sdivrem : SDNode<"SystemZISD::SDIVREM", SDT_ZGR128Binary>; +def z_udivrem : SDNode<"SystemZISD::UDIVREM", SDT_ZGR128Binary>; def z_membarrier : SDNode<"SystemZISD::MEMBARRIER", SDTNone, [SDNPHasChain, SDNPSideEffect]>; diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td index adc9f2976f87..72543c1eaee2 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -15,7 +15,7 @@ def Z13Model : SchedMachineModel { let UnsupportedFeatures = Arch11UnsupportedFeatures.List; - + let IssueWidth = 8; let MicroOpBufferSize = 60; // Issue queues let LoadLatency = 1; // Optimistic load latency. @@ -159,7 +159,7 @@ def : InstRW<[FXb], (instregex "CondReturn$")>; // Select instructions //===----------------------------------------------------------------------===// -// Select pseudo +// Select pseudo def : InstRW<[FXa], (instregex "Select(32|64|32Mux)$")>; // CondStore pseudos @@ -226,7 +226,7 @@ def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVST$")>; def : InstRW<[FXa, Lat2], (instregex "LOCRMux$")>; def : InstRW<[FXa, Lat2], (instregex "LOC(G|FH)?R(Asm.*)?$")>; -def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Asm.*)?$")>; +def : InstRW<[FXa, Lat2], (instregex "LOC(G|H)?HI(Mux|(Asm.*))?$")>; def : InstRW<[FXa, LSU, Lat6], (instregex "LOC(G|FH|Mux)?(Asm.*)?$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "STOC(G|FH|Mux)?(Asm.*)?$")>; @@ -282,7 +282,7 @@ def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; // Load multiple disjoint -def : InstRW<[FXb, Lat30, GroupAlone], (instregex "LMD$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; // Store multiple (estimated average of ceil(5/2) FXb ops) def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10, @@ -446,13 +446,13 @@ def : InstRW<[FXa, Lat6], (instregex "MS(R|FI)$")>; def : InstRW<[FXa, LSU, Lat12], (instregex "MSG$")>; def : InstRW<[FXa, Lat8], (instregex "MSGR$")>; def : InstRW<[FXa, Lat6], (instregex "MSGF(I|R)$")>; -def : InstRW<[FXa, LSU, Lat15, GroupAlone], (instregex "MLG$")>; -def : InstRW<[FXa, Lat9, GroupAlone], (instregex "MLGR$")>; +def : InstRW<[FXa2, LSU, Lat15, GroupAlone], (instregex "MLG$")>; +def : InstRW<[FXa2, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXa, Lat5], (instregex "MGHI$")>; def : InstRW<[FXa, Lat5], (instregex "MHI$")>; def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>; -def : InstRW<[FXa, Lat7, GroupAlone], (instregex "M(L)?R$")>; -def : InstRW<[FXa, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; +def : InstRW<[FXa2, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXa2, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder @@ -460,8 +460,8 @@ def : InstRW<[FXa, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>; def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>; -def : InstRW<[FXa, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; -def : InstRW<[LSU, FXa, Lat30, GroupAlone], (instregex "DSG(F)?$")>; +def : InstRW<[FXa2, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; +def : InstRW<[LSU, FXa2, Lat30, GroupAlone], (instregex "DSG(F)?$")>; def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>; def : InstRW<[FXa2, FXa2, Lat30, GroupAlone], (instregex "DLGR$")>; def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>; @@ -474,7 +474,8 @@ def : InstRW<[FXa], (instregex "SLL(G|K)?$")>; def : InstRW<[FXa], (instregex "SRL(G|K)?$")>; def : InstRW<[FXa], (instregex "SRA(G|K)?$")>; def : InstRW<[FXa], (instregex "SLA(G|K)?$")>; -def : InstRW<[FXa, FXa, FXa, FXa, Lat8], (instregex "S(L|R)D(A|L)$")>; +def : InstRW<[FXa, FXa, FXa, FXa, LSU, Lat8, GroupAlone], + (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -537,7 +538,7 @@ def : InstRW<[FXb], (instregex "TMLH(64)?$")>; def : InstRW<[FXb], (instregex "TMLL(64)?$")>; // Compare logical characters under mask -def : InstRW<[FXb, LSU, Lat5], (instregex "CLM(H|Y)?$")>; +def : InstRW<[FXb, LSU, Lat6], (instregex "CLM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Prefetch and execution hint @@ -573,7 +574,7 @@ def : InstRW<[FXa, FXa, FXb, FXb, LSU, FXb, FXb, LSU, LSU, Lat20, GroupAlone], (instregex "CDSG$")>; // Compare and swap and store -def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CSST$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "CSST$")>; // Perform locked operation def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PLO$")>; @@ -589,36 +590,45 @@ def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; // Translate and convert //===----------------------------------------------------------------------===// -def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; -def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; -def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; -def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "TR$")>; +def : InstRW<[FXa, FXa, FXa, LSU, LSU, Lat30, GroupAlone], (instregex "TRT$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "TRTR$")>; +def : InstRW<[FXa, Lat30], (instregex "TR(TR)?(T)?(E|EOpt)?$")>; +def : InstRW<[LSU, Lat30], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXa, Lat30], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXa, Lat30], (instregex "(CUUTF|CUTFU)(Opt)?$")>; //===----------------------------------------------------------------------===// // Message-security assist //===----------------------------------------------------------------------===// -def : InstRW<[FXa, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; -def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>; +def : InstRW<[FXa, Lat30], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXa, Lat30], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>; //===----------------------------------------------------------------------===// // Decimal arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; -def : InstRW<[FXb, VecDF, FXb, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[FXb, VecDF, VecDF, LSU, LSU, Lat30, GroupAlone], + (instregex "CVBG$")>; +def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y)?$")>; +def : InstRW<[FXb, FXb, FXb, VecDF2, VecDF2, LSU, Lat30, GroupAlone], + (instregex "CVDG$")>; +def : InstRW<[FXb, VecDF, FXb, LSU, Lat30, GroupAlone], (instregex "CVD(Y)?$")>; +def : InstRW<[LSU, Lat10, GroupAlone], (instregex "MVO$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z)$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; +def : InstRW<[LSU, Lat12, GroupAlone], (instregex "UNPK(A|U)$")>; +def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "UNPK$")>; -def : InstRW<[FXb, VecDFX, LSU, LSU, Lat9, GroupAlone], +def : InstRW<[FXb, VecDFX, LSU, LSU, LSU, Lat9, GroupAlone], (instregex "(A|S|ZA)P$")>; -def : InstRW<[FXb, VecDFX2, LSU, LSU, Lat30, GroupAlone], +def : InstRW<[FXb, VecDFX2, VecDFX2, LSU, LSU, LSU, Lat30, GroupAlone], (instregex "(M|D)P$")>; -def : InstRW<[FXb, FXb, VecDFX2, LSU, LSU, LSU, Lat15, GroupAlone], +def : InstRW<[FXb, VecDFX, VecDFX, LSU, LSU, Lat15, GroupAlone], (instregex "SRP$")>; def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>; -def : InstRW<[VecDFX, LSU, Lat4, GroupAlone], (instregex "TP$")>; +def : InstRW<[VecDFX, LSU, Lat4, BeginGroup], (instregex "TP$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; //===----------------------------------------------------------------------===// @@ -688,25 +698,25 @@ def : InstRW<[FXb], (instregex "PPA$")>; //===----------------------------------------------------------------------===// // Find leftmost one -def : InstRW<[FXa, Lat6, GroupAlone], (instregex "FLOGR$")>; +def : InstRW<[FXa, FXa, Lat6, GroupAlone], (instregex "FLOGR$")>; // Population count def : InstRW<[FXa, Lat3], (instregex "POPCNT$")>; // Extend -def : InstRW<[FXa], (instregex "AEXT128_64$")>; -def : InstRW<[FXa], (instregex "ZEXT128_(32|64)$")>; +def : InstRW<[FXa], (instregex "AEXT128$")>; +def : InstRW<[FXa], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>; -def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[FXa, Lat30], (instregex "SRSTU$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; // Various complex instructions -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; +def : InstRW<[LSU, Lat30], (instregex "CFC$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30], (instregex "CKSM$")>; +def : InstRW<[FXa, Lat30], (instregex "CMPSC$")>; // Execute def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>; @@ -833,7 +843,7 @@ def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXBR(A)?$")>; // Addition def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D)B$")>; def : InstRW<[VecBF], (instregex "A(E|D)BR$")>; -def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "AXBR$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXBR$")>; // Subtraction def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D)B$")>; @@ -848,9 +858,9 @@ def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDBR$")>; def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXBR$")>; // Multiply and add / subtract -def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)EB$")>; def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)EBR$")>; -def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)DB$")>; def : InstRW<[VecBF], (instregex "M(A|S)DBR$")>; // Division @@ -859,7 +869,7 @@ def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>; def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>; // Divide to integer -def : InstRW<[VecFPd, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; +def : InstRW<[VecFPd, Lat30], (instregex "DI(E|D)BR$")>; //===----------------------------------------------------------------------===// // FP: Comparisons @@ -882,8 +892,8 @@ def : InstRW<[FXa, LSU, Lat4, GroupAlone], (instregex "EFPC$")>; def : InstRW<[FXb, LSU, Lat5, GroupAlone], (instregex "STFPC$")>; def : InstRW<[LSU, Lat3, GroupAlone], (instregex "SFPC$")>; def : InstRW<[LSU, LSU, Lat6, GroupAlone], (instregex "LFPC$")>; -def : InstRW<[FXa, Lat30, GroupAlone], (instregex "SFASR$")>; -def : InstRW<[FXa, LSU, Lat30, GroupAlone], (instregex "LFAS$")>; +def : InstRW<[FXa, Lat30], (instregex "SFASR$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "LFAS$")>; def : InstRW<[FXb, Lat3, GroupAlone], (instregex "SRNM(B|T)?$")>; @@ -904,7 +914,7 @@ def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "LTXR$")>; // Load rounded def : InstRW<[VecBF], (instregex "(LEDR|LRER)$")>; def : InstRW<[VecBF], (instregex "LEXR$")>; -def : InstRW<[VecDF2, VecDF2], (instregex "(LDXR|LRDR)$")>; +def : InstRW<[VecDF2], (instregex "(LDXR|LRDR)$")>; // Load lengthened def : InstRW<[LSU], (instregex "LDE$")>; @@ -955,7 +965,7 @@ def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "FIXR$")>; // Addition def : InstRW<[VecBF, LSU, Lat12], (instregex "A(E|D|U|W)$")>; def : InstRW<[VecBF], (instregex "A(E|D|U|W)R$")>; -def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "AXR$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXR$")>; // Subtraction def : InstRW<[VecBF, LSU, Lat12], (instregex "S(E|D|U|W)$")>; @@ -968,16 +978,20 @@ def : InstRW<[VecBF], (instregex "M(D|DE|E|EE)R$")>; def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MXD$")>; def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MXDR$")>; def : InstRW<[VecDF2, VecDF2, Lat20, GroupAlone], (instregex "MXR$")>; -def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)?$")>; -def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MY(H|L)?R$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MY$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MY(H|L)$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MYR$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "MY(H|L)R$")>; // Multiply and add / subtract -def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)E$")>; def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)ER$")>; -def : InstRW<[VecBF, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; -def : InstRW<[VecBF], (instregex "M(A|S)DR$")>; -def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)?$")>; -def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAY(H|L)?R$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "M(A|S)D$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "M(A|S)DR$")>; +def : InstRW<[VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY(H|L)$")>; +def : InstRW<[VecBF2, VecBF2, LSU, Lat12, GroupAlone], (instregex "MAY$")>; +def : InstRW<[VecBF, GroupAlone], (instregex "MAY(H|L)R$")>; +def : InstRW<[VecBF2, VecBF2, GroupAlone], (instregex "MAYR$")>; // Division def : InstRW<[VecFPd, LSU], (instregex "D(E|D)$")>; @@ -989,8 +1003,8 @@ def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXR$")>; //===----------------------------------------------------------------------===// // Compare -def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)$")>; -def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)R$")>; +def : InstRW<[VecBF, LSU, Lat12], (instregex "C(E|D)$")>; +def : InstRW<[VecBF], (instregex "C(E|D)R$")>; def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXR$")>; @@ -1032,7 +1046,7 @@ def : InstRW<[FXb, VecDF, VecDF, Lat30, BeginGroup], (instregex "CL(F|G)XTR$")>; def : InstRW<[FXb, VecDF, Lat9, BeginGroup], (instregex "CD(S|U)TR$")>; def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "CX(S|U)TR$")>; def : InstRW<[FXb, VecDF, Lat12, BeginGroup], (instregex "C(S|U)DTR$")>; -def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, BeginGroup], (instregex "C(S|U)XTR$")>; +def : InstRW<[FXb, FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "C(S|U)XTR$")>; // Convert from / to zoned def : InstRW<[LSU, VecDF, Lat11, BeginGroup], (instregex "CDZT$")>; @@ -1047,7 +1061,7 @@ def : InstRW<[FXb, LSU, VecDF, Lat11, BeginGroup], (instregex "CPDT$")>; def : InstRW<[FXb, LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "CPXT$")>; // Perform floating-point operation -def : InstRW<[LSU, Lat30, GroupAlone], (instregex "PFPO$")>; +def : InstRW<[FXb, Lat30], (instregex "PFPO$")>; //===----------------------------------------------------------------------===// // DFP: Unary arithmetic @@ -1071,7 +1085,7 @@ def : InstRW<[FXb, VecDF, VecDF, Lat15, BeginGroup], (instregex "ESXTR$")>; // Addition def : InstRW<[VecDF], (instregex "ADTR(A)?$")>; -def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "AXTR(A)?$")>; +def : InstRW<[VecDF2, VecDF2, Lat10, GroupAlone], (instregex "AXTR(A)?$")>; // Subtraction def : InstRW<[VecDF], (instregex "SDTR(A)?$")>; @@ -1090,15 +1104,15 @@ def : InstRW<[VecDF], (instregex "QADTR$")>; def : InstRW<[VecDF2, VecDF2, Lat11, GroupAlone], (instregex "QAXTR$")>; // Reround -def : InstRW<[FXb, VecDF, Lat11], (instregex "RRDTR$")>; +def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "RRDTR$")>; def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "RRXTR$")>; // Shift significand left/right -def : InstRW<[LSU, VecDF, Lat11], (instregex "S(L|R)DT$")>; +def : InstRW<[LSU, VecDF, Lat11, GroupAlone], (instregex "S(L|R)DT$")>; def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "S(L|R)XT$")>; // Insert biased exponent -def : InstRW<[FXb, VecDF, Lat11], (instregex "IEDTR$")>; +def : InstRW<[FXb, VecDF, Lat11, BeginGroup], (instregex "IEDTR$")>; def : InstRW<[FXb, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "IEXTR$")>; //===----------------------------------------------------------------------===// @@ -1115,7 +1129,7 @@ def : InstRW<[VecDF], (instregex "CEXTR$")>; // Test Data Class/Group def : InstRW<[LSU, VecDF, Lat11], (instregex "TD(C|G)(E|D)T$")>; -def : InstRW<[LSU, VecDF2, VecDF2, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; +def : InstRW<[LSU, VecDF, VecDF, Lat15, GroupAlone], (instregex "TD(C|G)XT$")>; // --------------------------------- Vector --------------------------------- // @@ -1271,32 +1285,43 @@ def : InstRW<[VecStr, Lat5], (instregex "VTM$")>; // Vector: Floating-point arithmetic //===----------------------------------------------------------------------===// -def : InstRW<[VecBF2], (instregex "VCD(G|GB|LG|LGB)$")>; -def : InstRW<[VecBF], (instregex "WCD(GB|LGB)$")>; +// Conversion and rounding +def : InstRW<[VecBF2], (instregex "VCD(L)?G$")>; +def : InstRW<[VecBF2], (instregex "VCD(L)?GB$")>; +def : InstRW<[VecBF], (instregex "WCD(L)?GB$")>; def : InstRW<[VecBF2], (instregex "VC(L)?GD$")>; -def : InstRW<[VecBF2], (instregex "VFADB$")>; -def : InstRW<[VecBF], (instregex "WFADB$")>; -def : InstRW<[VecBF2], (instregex "VCGDB$")>; -def : InstRW<[VecBF], (instregex "WCGDB$")>; -def : InstRW<[VecBF2], (instregex "VF(I|M|A|S)$")>; -def : InstRW<[VecBF2], (instregex "VF(I|M|S)DB$")>; -def : InstRW<[VecBF], (instregex "WF(I|M|S)DB$")>; -def : InstRW<[VecBF2], (instregex "VCLGDB$")>; -def : InstRW<[VecBF], (instregex "WCLGDB$")>; -def : InstRW<[VecXsPm], (instregex "VFL(C|N|P)DB$")>; -def : InstRW<[VecXsPm], (instregex "WFL(C|N|P)DB$")>; -def : InstRW<[VecBF2], (instregex "VFM(A|S)$")>; -def : InstRW<[VecBF2], (instregex "VFM(A|S)DB$")>; -def : InstRW<[VecBF], (instregex "WFM(A|S)DB$")>; -def : InstRW<[VecXsPm], (instregex "VFPSO$")>; -def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>; -def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI(DB)?$")>; -def : InstRW<[VecXsPm, Lat4], (instregex "WFTCIDB$")>; +def : InstRW<[VecBF2], (instregex "VC(L)?GDB$")>; +def : InstRW<[VecBF], (instregex "WC(L)?GDB$")>; def : InstRW<[VecBF2], (instregex "VL(DE|ED)$")>; def : InstRW<[VecBF2], (instregex "VL(DE|ED)B$")>; def : InstRW<[VecBF], (instregex "WL(DE|ED)B$")>; +def : InstRW<[VecBF2], (instregex "VFI$")>; +def : InstRW<[VecBF2], (instregex "VFIDB$")>; +def : InstRW<[VecBF], (instregex "WFIDB$")>; + +// Sign operations +def : InstRW<[VecXsPm], (instregex "VFPSO$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FPSODB$")>; +def : InstRW<[VecXsPm], (instregex "(V|W)FL(C|N|P)DB$")>; -// divide / square root +// Test data class +def : InstRW<[VecXsPm, Lat4], (instregex "VFTCI$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(V|W)FTCIDB$")>; + +// Add / subtract +def : InstRW<[VecBF2], (instregex "VF(A|S)$")>; +def : InstRW<[VecBF2], (instregex "VF(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WF(A|S)DB$")>; + +// Multiply / multiply-and-add/subtract +def : InstRW<[VecBF2], (instregex "VFM$")>; +def : InstRW<[VecBF2], (instregex "VFMDB$")>; +def : InstRW<[VecBF], (instregex "WFMDB$")>; +def : InstRW<[VecBF2], (instregex "VFM(A|S)$")>; +def : InstRW<[VecBF2], (instregex "VFM(A|S)DB$")>; +def : InstRW<[VecBF], (instregex "WFM(A|S)DB$")>; + +// Divide / square root def : InstRW<[VecFPd], (instregex "VFD$")>; def : InstRW<[VecFPd], (instregex "(V|W)FDDB$")>; def : InstRW<[VecFPd], (instregex "VFSQ$")>; @@ -1308,10 +1333,10 @@ def : InstRW<[VecFPd], (instregex "(V|W)FSQDB$")>; def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)$")>; def : InstRW<[VecXsPm], (instregex "VFC(E|H|HE)DB$")>; -def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>; def : InstRW<[VecXsPm], (instregex "WFC(E|H|HE)DB$")>; def : InstRW<[VecXsPm, Lat4], (instregex "VFC(E|H|HE)DBS$")>; def : InstRW<[VecXsPm, Lat4], (instregex "WFC(E|H|HE)DBS$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)$")>; def : InstRW<[VecXsPm, Lat4], (instregex "WF(C|K)DB$")>; //===----------------------------------------------------------------------===// @@ -1351,12 +1376,12 @@ def : InstRW<[VecStr, Lat5], (instregex "VSTRCZ(B|F|H)S$")>; def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; def : InstRW<[FXb, LSU, Lat30], (instregex "LPSW(E)?$")>; -def : InstRW<[FXa, Lat3], (instregex "IPK$")>; -def : InstRW<[LSU], (instregex "SPKA$")>; -def : InstRW<[LSU], (instregex "SSM$")>; -def : InstRW<[FXb], (instregex "ST(N|O)SM$")>; +def : InstRW<[FXa, Lat3, GroupAlone], (instregex "IPK$")>; +def : InstRW<[LSU, EndGroup], (instregex "SPKA$")>; +def : InstRW<[LSU, EndGroup], (instregex "SSM$")>; +def : InstRW<[FXb, LSU, GroupAlone], (instregex "ST(N|O)SM$")>; def : InstRW<[FXa, Lat3], (instregex "IAC$")>; -def : InstRW<[LSU], (instregex "SAC(F)?$")>; +def : InstRW<[LSU, EndGroup], (instregex "SAC(F)?$")>; //===----------------------------------------------------------------------===// // System: Control Register Instructions @@ -1411,14 +1436,14 @@ def : InstRW<[FXb, LSU, Lat30], (instregex "TPROT$")>; def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVC(K|P|S)$")>; def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "MVC(S|D)K$")>; def : InstRW<[FXb, LSU, Lat30], (instregex "MVCOS$")>; -def : InstRW<[FXb, LSU, Lat30], (instregex "MVPG$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVPG$")>; //===----------------------------------------------------------------------===// // System: Address-Space Instructions //===----------------------------------------------------------------------===// def : InstRW<[FXb, LSU, Lat30], (instregex "LASP$")>; -def : InstRW<[LSU], (instregex "PALB$")>; +def : InstRW<[LSU, GroupAlone], (instregex "PALB$")>; def : InstRW<[FXb, LSU, Lat30], (instregex "PC$")>; def : InstRW<[FXb, Lat30], (instregex "PR$")>; def : InstRW<[FXb, Lat30], (instregex "PT(I)?$")>; @@ -1430,7 +1455,7 @@ def : InstRW<[FXb, Lat20], (instregex "TAR$")>; // System: Linkage-Stack Instructions //===----------------------------------------------------------------------===// -def : InstRW<[FXb, Lat30], (instregex "BAKR$")>; +def : InstRW<[FXb, Lat30, EndGroup], (instregex "BAKR$")>; def : InstRW<[FXb, Lat30], (instregex "EREG(G)?$")>; def : InstRW<[FXb, Lat30], (instregex "(E|M)STA$")>; @@ -1442,13 +1467,13 @@ def : InstRW<[FXb, Lat30], (instregex "PTFF$")>; def : InstRW<[FXb, LSU, Lat20], (instregex "SCK$")>; def : InstRW<[FXb, Lat30], (instregex "SCKPF$")>; def : InstRW<[FXb, LSU, Lat20], (instregex "SCKC$")>; -def : InstRW<[LSU, GroupAlone], (instregex "SPT$")>; +def : InstRW<[LSU, LSU, GroupAlone], (instregex "SPT$")>; def : InstRW<[LSU, LSU, LSU, FXa, FXa, FXb, Lat9, GroupAlone], (instregex "STCK(F)?$")>; def : InstRW<[LSU, LSU, LSU, LSU, FXa, FXa, FXb, FXb, Lat11, GroupAlone], (instregex "STCKE$")>; def : InstRW<[FXb, LSU, Lat9], (instregex "STCKC$")>; -def : InstRW<[LSU, LSU, FXb, Lat3], (instregex "STPT$")>; +def : InstRW<[LSU, LSU, FXb, Lat5, BeginGroup], (instregex "STPT$")>; //===----------------------------------------------------------------------===// // System: CPU-Related Instructions @@ -1459,7 +1484,7 @@ def : InstRW<[FXb, LSU, Lat30], (instregex "STIDP$")>; def : InstRW<[FXb, LSU, Lat30], (instregex "STSI$")>; def : InstRW<[FXb, LSU, Lat30], (instregex "STFL(E)?$")>; def : InstRW<[FXb, LSU, Lat30], (instregex "ECAG$")>; -def : InstRW<[FXb, LSU, Lat30], (instregex "ECTG$")>; +def : InstRW<[FXa, LSU, Lat30], (instregex "ECTG$")>; def : InstRW<[FXb, Lat30], (instregex "PTF$")>; def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>; @@ -1468,7 +1493,7 @@ def : InstRW<[FXb, Lat30], (instregex "PCKMO$")>; //===----------------------------------------------------------------------===// def : InstRW<[FXb, Lat30], (instregex "SVC$")>; -def : InstRW<[FXb], (instregex "MC$")>; +def : InstRW<[FXb, GroupAlone], (instregex "MC$")>; def : InstRW<[FXb, Lat30], (instregex "DIAG$")>; def : InstRW<[FXb], (instregex "TRAC(E|G)$")>; def : InstRW<[FXb, Lat30], (instregex "TRAP(2|4)$")>; @@ -1483,7 +1508,8 @@ def : InstRW<[FXb, LSU, Lat30], (instregex "SIE$")>; def : InstRW<[FXb], (instregex "LPP$")>; def : InstRW<[FXb, Lat30], (instregex "ECPGA$")>; def : InstRW<[FXb, Lat30], (instregex "E(C|P)CTR$")>; -def : InstRW<[FXb, LSU, Lat30], (instregex "L(C|P|S)CTL$")>; +def : InstRW<[FXb, Lat30], (instregex "LCCTL$")>; +def : InstRW<[FXb, LSU, Lat30], (instregex "L(P|S)CTL$")>; def : InstRW<[FXb, LSU, Lat30], (instregex "Q(S|CTR)I$")>; def : InstRW<[FXb, Lat30], (instregex "S(C|P)CTR$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index 128049a09086..e3e1999d8ad8 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -627,8 +627,8 @@ def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>; // Extend -def : InstRW<[FXU], (instregex "AEXT128_64$")>; -def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; +def : InstRW<[FXU], (instregex "AEXT128$")>; +def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index 76b378454631..59f37205f412 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -665,8 +665,8 @@ def : InstRW<[FXU, Lat7, GroupAlone], (instregex "FLOGR$")>; def : InstRW<[FXU, Lat3], (instregex "POPCNT$")>; // Extend -def : InstRW<[FXU], (instregex "AEXT128_64$")>; -def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; +def : InstRW<[FXU], (instregex "AEXT128$")>; +def : InstRW<[FXU], (instregex "ZEXT128$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index ce5c57e0f519..9ac768b2189d 100644 --- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -779,15 +779,14 @@ int SystemZTTIImpl:: getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) { // vlvgp will insert two grs into a vector register, so only count half the // number of instructions. - if (Opcode == Instruction::InsertElement && - Val->getScalarType()->isIntegerTy(64)) + if (Opcode == Instruction::InsertElement && Val->isIntOrIntVectorTy(64)) return ((Index % 2 == 0) ? 1 : 0); if (Opcode == Instruction::ExtractElement) { int Cost = ((Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1); // Give a slight penalty for moving out of vector pipeline to FXU unit. - if (Index == 0 && Val->getScalarType()->isIntegerTy()) + if (Index == 0 && Val->isIntOrIntVectorTy()) Cost += 1; return Cost; diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp index ad59f2f40587..00bf02469bdd 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp @@ -115,8 +115,8 @@ void WebAssemblyTargetAsmStreamer::emitStackPointer(uint32_t Index) { void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; } void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType( - StringRef name, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) { - OS << "\t.functype\t" << name; + MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) { + OS << "\t.functype\t" << Symbol->getName(); if (Results.empty()) OS << ", void"; else { @@ -171,7 +171,7 @@ void WebAssemblyTargetELFStreamer::emitIndIdx(const MCExpr *Value) { } void WebAssemblyTargetELFStreamer::emitIndirectFunctionType( - StringRef name, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) { + MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) { // Nothing to emit here. TODO: Re-design how linking works and re-evaluate // whether it's necessary for .o files to declare indirect function types. } @@ -255,9 +255,25 @@ void WebAssemblyTargetWasmStreamer::emitIndIdx(const MCExpr *Value) { } void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType( - StringRef name, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) { - // Nothing to emit here. TODO: Re-design how linking works and re-evaluate - // whether it's necessary for .o files to declare indirect function types. + MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, + SmallVectorImpl<MVT> &Results) { + MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Symbol); + if (WasmSym->isFunction()) { + // Symbol already has its arguments and result set. + return; + } + + SmallVector<wasm::ValType, 4> ValParams; + for (MVT Ty : Params) + ValParams.push_back(WebAssembly::toValType(Ty)); + + SmallVector<wasm::ValType, 1> ValResults; + for (MVT Ty : Results) + ValResults.push_back(WebAssembly::toValType(Ty)); + + WasmSym->setParams(std::move(ValParams)); + WasmSym->setReturns(std::move(ValResults)); + WasmSym->setIsFunction(true); } void WebAssemblyTargetWasmStreamer::emitGlobalImport(StringRef name) { diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h index 5ad147e5e596..102d7219a1e7 100644 --- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h +++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h @@ -44,7 +44,7 @@ public: /// .endfunc virtual void emitEndFunc() = 0; /// .functype - virtual void emitIndirectFunctionType(StringRef name, + virtual void emitIndirectFunctionType(MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) = 0; /// .indidx @@ -69,7 +69,7 @@ public: void emitGlobal(ArrayRef<wasm::Global> Globals) override; void emitStackPointer(uint32_t Index) override; void emitEndFunc() override; - void emitIndirectFunctionType(StringRef name, + void emitIndirectFunctionType(MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) override; void emitIndIdx(const MCExpr *Value) override; @@ -87,7 +87,7 @@ public: void emitGlobal(ArrayRef<wasm::Global> Globals) override; void emitStackPointer(uint32_t Index) override; void emitEndFunc() override; - void emitIndirectFunctionType(StringRef name, + void emitIndirectFunctionType(MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) override; void emitIndIdx(const MCExpr *Value) override; @@ -105,7 +105,7 @@ public: void emitGlobal(ArrayRef<wasm::Global> Globals) override; void emitStackPointer(uint32_t Index) override; void emitEndFunc() override; - void emitIndirectFunctionType(StringRef name, + void emitIndirectFunctionType(MCSymbol *Symbol, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) override; void emitIndIdx(const MCExpr *Value) override; diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp index f51585a10ca1..211358ad66cd 100644 --- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp +++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp @@ -84,7 +84,7 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) { SmallVector<MVT, 4> Results; SmallVector<MVT, 4> Params; ComputeSignatureVTs(F, TM, Params, Results); - getTargetStreamer()->emitIndirectFunctionType(F.getName(), Params, + getTargetStreamer()->emitIndirectFunctionType(getSymbol(&F), Params, Results); } } @@ -214,11 +214,8 @@ void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) { const MCExpr *WebAssemblyAsmPrinter::lowerConstant(const Constant *CV) { if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) if (GV->getValueType()->isFunctionTy()) { - MCSymbol* Sym = getSymbol(GV); - if (!isa<MCSymbolELF>(Sym)) - cast<MCSymbolWasm>(Sym)->setIsFunction(true); return MCSymbolRefExpr::create( - Sym, MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext); + getSymbol(GV), MCSymbolRefExpr::VK_WebAssembly_FUNCTION, OutContext); } return AsmPrinter::lowerConstant(CV); } diff --git a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp index 1691808d05a0..700111743ee8 100644 --- a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp +++ b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp @@ -132,7 +132,7 @@ static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI, // no blocks not dominated by the loop header. // - It's desirable to preserve the original block order when possible. // We use two ready lists; Preferred and Ready. Preferred has recently - // processed sucessors, to help preserve block sequences from the original + // processed successors, to help preserve block sequences from the original // order. Ready has the remaining ready blocks. PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>, CompareBlockNumbers> diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp index ff186eb91503..8880539804ca 100644 --- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp +++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp @@ -112,8 +112,6 @@ MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym, MCSymbolRefExpr::VariantKind VK = IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION : MCSymbolRefExpr::VK_None; - if (!isa<MCSymbolELF>(Sym)) - cast<MCSymbolWasm>(Sym)->setIsFunction(IsFunc); const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx); diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index c02ef4a1c399..2599064334ee 100644 --- a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -394,11 +394,22 @@ RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = { /* MEMMOVE */ iPTR_func_iPTR_iPTR_iPTR, // ELEMENT-WISE ATOMIC MEMORY -/* MEMCPY_ELEMENT_ATOMIC_1 */ iPTR_func_iPTR_iPTR_iPTR, -/* MEMCPY_ELEMENT_ATOMIC_2 */ iPTR_func_iPTR_iPTR_iPTR, -/* MEMCPY_ELEMENT_ATOMIC_4 */ iPTR_func_iPTR_iPTR_iPTR, -/* MEMCPY_ELEMENT_ATOMIC_8 */ iPTR_func_iPTR_iPTR_iPTR, -/* MEMCPY_ELEMENT_ATOMIC_16 */ iPTR_func_iPTR_iPTR_iPTR, +/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported, +/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported, +/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported, +/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported, +/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported, +/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported, +/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported, +/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported, +/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported, +/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported, + +/* MEMSET_ELEMENT_UNORDERED_ATOMIC_1 */ unsupported, +/* MEMSET_ELEMENT_UNORDERED_ATOMIC_2 */ unsupported, +/* MEMSET_ELEMENT_UNORDERED_ATOMIC_4 */ unsupported, +/* MEMSET_ELEMENT_UNORDERED_ATOMIC_8 */ unsupported, +/* MEMSET_ELEMENT_UNORDERED_ATOMIC_16 */ unsupported, // EXCEPTION HANDLING /* UNWIND_RESUME */ unsupported, @@ -839,11 +850,21 @@ RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = { /* MEMCPY */ "memcpy", /* MEMMOVE */ "memset", /* MEMSET */ "memmove", -/* MEMCPY_ELEMENT_ATOMIC_1 */ "MEMCPY_ELEMENT_ATOMIC_1", -/* MEMCPY_ELEMENT_ATOMIC_2 */ "MEMCPY_ELEMENT_ATOMIC_2", -/* MEMCPY_ELEMENT_ATOMIC_4 */ "MEMCPY_ELEMENT_ATOMIC_4", -/* MEMCPY_ELEMENT_ATOMIC_8 */ "MEMCPY_ELEMENT_ATOMIC_8", -/* MEMCPY_ELEMENT_ATOMIC_16 */ "MEMCPY_ELEMENT_ATOMIC_16", +/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr, +/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr, +/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr, +/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr, +/* MEMCPY_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr, +/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr, +/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr, +/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr, +/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr, +/* MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr, +/* MEMSET_ELEMENT_UNORDERED_ATOMIC_1 */ nullptr, +/* MEMSET_ELEMENT_UNORDERED_ATOMIC_2 */ nullptr, +/* MEMSET_ELEMENT_UNORDERED_ATOMIC_4 */ nullptr, +/* MEMSET_ELEMENT_UNORDERED_ATOMIC_8 */ nullptr, +/* MEMSET_ELEMENT_UNORDERED_ATOMIC_16 */ nullptr, /* UNWIND_RESUME */ "_Unwind_Resume", /* SYNC_VAL_COMPARE_AND_SWAP_1 */ "__sync_val_compare_and_swap_1", /* SYNC_VAL_COMPARE_AND_SWAP_2 */ "__sync_val_compare_and_swap_2", diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index 825f23dc52d9..c1d216c8b7af 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -2453,8 +2453,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, break; } - // In MS inline asm curly braces mark the begining/end of a block, therefore - // they should be interepreted as end of statement + // In MS inline asm curly braces mark the beginning/end of a block, + // therefore they should be interepreted as end of statement CurlyAsEndOfStatement = isParsingIntelSyntax() && isParsingInlineAsm() && (getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly)); diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 5e809c34325e..f5f3a4cc83dc 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -1038,7 +1038,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::EXTRQI: if (MI->getOperand(2).isImm() && MI->getOperand(3).isImm()) - DecodeEXTRQIMask(MI->getOperand(2).getImm(), + DecodeEXTRQIMask(MVT::v16i8, MI->getOperand(2).getImm(), MI->getOperand(3).getImm(), ShuffleMask); @@ -1049,7 +1049,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::INSERTQI: if (MI->getOperand(3).isImm() && MI->getOperand(4).isImm()) - DecodeINSERTQIMask(MI->getOperand(3).getImm(), + DecodeINSERTQIMask(MVT::v16i8, MI->getOperand(3).getImm(), MI->getOperand(4).getImm(), ShuffleMask); diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 914fb36f91a7..733eac7c0321 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -110,7 +110,7 @@ public: void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, MutableArrayRef<char> Data, - uint64_t Value, bool IsPCRel) const override { + uint64_t Value, bool IsResolved) const override { unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind()); assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!"); diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index 1be5aec849fc..8a0fbfb45b22 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -452,15 +452,20 @@ void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &Mask) { Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i); } -void DecodeEXTRQIMask(int Len, int Idx, +void DecodeEXTRQIMask(MVT VT, int Len, int Idx, SmallVectorImpl<int> &ShuffleMask) { + assert(VT.is128BitVector() && "Expected 128-bit vector"); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSize = VT.getScalarSizeInBits(); + unsigned HalfElts = NumElts / 2; + // Only the bottom 6 bits are valid for each immediate. Len &= 0x3F; Idx &= 0x3F; // We can only decode this bit extraction instruction as a shuffle if both the - // length and index work with whole bytes. - if (0 != (Len % 8) || 0 != (Idx % 8)) + // length and index work with whole elements. + if (0 != (Len % EltSize) || 0 != (Idx % EltSize)) return; // A length of zero is equivalent to a bit length of 64. @@ -469,33 +474,38 @@ void DecodeEXTRQIMask(int Len, int Idx, // If the length + index exceeds the bottom 64 bits the result is undefined. if ((Len + Idx) > 64) { - ShuffleMask.append(16, SM_SentinelUndef); + ShuffleMask.append(NumElts, SM_SentinelUndef); return; } - // Convert index and index to work with bytes. - Len /= 8; - Idx /= 8; + // Convert index and index to work with elements. + Len /= EltSize; + Idx /= EltSize; - // EXTRQ: Extract Len bytes starting from Idx. Zero pad the remaining bytes - // of the lower 64-bits. The upper 64-bits are undefined. + // EXTRQ: Extract Len elements starting from Idx. Zero pad the remaining + // elements of the lower 64-bits. The upper 64-bits are undefined. for (int i = 0; i != Len; ++i) ShuffleMask.push_back(i + Idx); - for (int i = Len; i != 8; ++i) + for (int i = Len; i != (int)HalfElts; ++i) ShuffleMask.push_back(SM_SentinelZero); - for (int i = 8; i != 16; ++i) + for (int i = HalfElts; i != (int)NumElts; ++i) ShuffleMask.push_back(SM_SentinelUndef); } -void DecodeINSERTQIMask(int Len, int Idx, +void DecodeINSERTQIMask(MVT VT, int Len, int Idx, SmallVectorImpl<int> &ShuffleMask) { + assert(VT.is128BitVector() && "Expected 128-bit vector"); + unsigned NumElts = VT.getVectorNumElements(); + unsigned EltSize = VT.getScalarSizeInBits(); + unsigned HalfElts = NumElts / 2; + // Only the bottom 6 bits are valid for each immediate. Len &= 0x3F; Idx &= 0x3F; // We can only decode this bit insertion instruction as a shuffle if both the - // length and index work with whole bytes. - if (0 != (Len % 8) || 0 != (Idx % 8)) + // length and index work with whole elements. + if (0 != (Len % EltSize) || 0 != (Idx % EltSize)) return; // A length of zero is equivalent to a bit length of 64. @@ -504,24 +514,24 @@ void DecodeINSERTQIMask(int Len, int Idx, // If the length + index exceeds the bottom 64 bits the result is undefined. if ((Len + Idx) > 64) { - ShuffleMask.append(16, SM_SentinelUndef); + ShuffleMask.append(NumElts, SM_SentinelUndef); return; } - // Convert index and index to work with bytes. - Len /= 8; - Idx /= 8; + // Convert index and index to work with elements. + Len /= EltSize; + Idx /= EltSize; - // INSERTQ: Extract lowest Len bytes from lower half of second source and - // insert over first source starting at Idx byte. The upper 64-bits are + // INSERTQ: Extract lowest Len elements from lower half of second source and + // insert over first source starting at Idx element. The upper 64-bits are // undefined. for (int i = 0; i != Idx; ++i) ShuffleMask.push_back(i); for (int i = 0; i != Len; ++i) - ShuffleMask.push_back(i + 16); - for (int i = Idx + Len; i != 8; ++i) + ShuffleMask.push_back(i + NumElts); + for (int i = Idx + Len; i != (int)HalfElts; ++i) ShuffleMask.push_back(i); - for (int i = 8; i != 16; ++i) + for (int i = HalfElts; i != (int)NumElts; ++i) ShuffleMask.push_back(SM_SentinelUndef); } diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 17619d09d059..251c9f7558ec 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -134,12 +134,12 @@ void DecodeZeroMoveLowMask(MVT VT, SmallVectorImpl<int> &ShuffleMask); void DecodeScalarMoveMask(MVT VT, bool IsLoad, SmallVectorImpl<int> &ShuffleMask); -/// Decode a SSE4A EXTRQ instruction as a v16i8 shuffle mask. -void DecodeEXTRQIMask(int Len, int Idx, +/// Decode a SSE4A EXTRQ instruction as a shuffle mask. +void DecodeEXTRQIMask(MVT VT, int Len, int Idx, SmallVectorImpl<int> &ShuffleMask); -/// Decode a SSE4A INSERTQ instruction as a v16i8 shuffle mask. -void DecodeINSERTQIMask(int Len, int Idx, +/// Decode a SSE4A INSERTQ instruction as a shuffle mask. +void DecodeINSERTQIMask(MVT VT, int Len, int Idx, SmallVectorImpl<int> &ShuffleMask); /// Decode a VPERMILPD/VPERMILPS variable mask from a raw array of constants. diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 7437ebacfac3..4ca57fe9fb00 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -451,6 +451,7 @@ class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [ FeatureLAHFSAHF, FeatureMPX, FeatureSHA, + FeatureRDRAND, FeatureRDSEED, FeatureXSAVE, FeatureXSAVEOPT, diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp index 161bfa7b5474..99aeec67c326 100644 --- a/lib/Target/X86/X86CallLowering.cpp +++ b/lib/Target/X86/X86CallLowering.cpp @@ -19,6 +19,7 @@ #include "X86InstrInfo.h" #include "X86TargetMachine.h" +#include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineValueType.h" @@ -35,7 +36,7 @@ using namespace llvm; X86CallLowering::X86CallLowering(const X86TargetLowering &TLI) : CallLowering(&TLI) {} -void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, +bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, @@ -43,14 +44,24 @@ void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, const X86TargetLowering &TLI = *getTLI<X86TargetLowering>(); LLVMContext &Context = OrigArg.Ty->getContext(); - EVT VT = TLI.getValueType(DL, OrigArg.Ty); + + SmallVector<EVT, 4> SplitVTs; + SmallVector<uint64_t, 4> Offsets; + ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0); + + if (SplitVTs.size() != 1) { + // TODO: support struct/array split + return false; + } + + EVT VT = SplitVTs[0]; unsigned NumParts = TLI.getNumRegisters(Context, VT); if (NumParts == 1) { // replace the original type ( pointer -> GPR ). SplitArgs.emplace_back(OrigArg.Reg, VT.getTypeForEVT(Context), OrigArg.Flags, OrigArg.IsFixed); - return; + return true; } SmallVector<unsigned, 8> SplitRegs; @@ -67,6 +78,7 @@ void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, } PerformArgSplit(SplitRegs); + return true; } namespace { @@ -113,9 +125,11 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); SmallVector<ArgInfo, 8> SplitArgs; - splitToValueTypes( - OrigArg, SplitArgs, DL, MRI, - [&](ArrayRef<unsigned> Regs) { MIRBuilder.buildUnmerge(Regs, VReg); }); + if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI, + [&](ArrayRef<unsigned> Regs) { + MIRBuilder.buildUnmerge(Regs, VReg); + })) + return false; FuncReturnHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86); if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) @@ -181,12 +195,23 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, SmallVector<ArgInfo, 8> SplitArgs; unsigned Idx = 0; for (auto &Arg : F.args()) { + + // TODO: handle not simple cases. + if (Arg.hasAttribute(Attribute::ByVal) || + Arg.hasAttribute(Attribute::InReg) || + Arg.hasAttribute(Attribute::StructRet) || + Arg.hasAttribute(Attribute::SwiftSelf) || + Arg.hasAttribute(Attribute::SwiftError) || + Arg.hasAttribute(Attribute::Nest)) + return false; + ArgInfo OrigArg(VRegs[Idx], Arg.getType()); - setArgFlags(OrigArg, Idx + 1, DL, F); - splitToValueTypes(OrigArg, SplitArgs, DL, MRI, - [&](ArrayRef<unsigned> Regs) { - MIRBuilder.buildMerge(VRegs[Idx], Regs); - }); + setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F); + if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI, + [&](ArrayRef<unsigned> Regs) { + MIRBuilder.buildMerge(VRegs[Idx], Regs); + })) + return false; Idx++; } diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h index 8a8afb568298..6a5dabf33a0a 100644 --- a/lib/Target/X86/X86CallLowering.h +++ b/lib/Target/X86/X86CallLowering.h @@ -39,7 +39,7 @@ private: /// A function of this type is used to perform value split action. typedef std::function<void(ArrayRef<unsigned>)> SplitArgTy; - void splitToValueTypes(const ArgInfo &OrigArgInfo, + bool splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, SplitArgTy SplitArg) const; diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 7d146d050a5c..6decb550ad5f 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -651,7 +651,15 @@ def CC_X86_64_GHC : CallingConv<[ // Pass in STG registers: F1, F2, F3, F4, D1, D2 CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", - CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>> + CCAssignToReg<[XMM1, XMM2, XMM3, XMM4, XMM5, XMM6]>>>, + // AVX + CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], + CCIfSubtarget<"hasAVX()", + CCAssignToReg<[YMM1, YMM2, YMM3, YMM4, YMM5, YMM6]>>>, + // AVX-512 + CCIfType<[v64i8, v32i16, v16i32, v8i64, v16f32, v8f64], + CCIfSubtarget<"hasAVX512()", + CCAssignToReg<[ZMM1, ZMM2, ZMM3, ZMM4, ZMM5, ZMM6]>>> ]>; def CC_X86_64_HiPE : CallingConv<[ diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 621505aaded9..ee9e78146305 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -3039,6 +3039,9 @@ bool X86FastISel::fastLowerArguments() { if (!Subtarget->is64Bit()) return false; + if (Subtarget->useSoftFloat()) + return false; + // Only handle simple cases. i.e. Up to 6 i32/i64 scalar arguments. unsigned GPRCnt = 0; unsigned FPRCnt = 0; diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index e3aa227702be..f294e819090b 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -972,7 +972,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->setCalleeSavedFrameSize( X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta); - bool UseRedZone = false; bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty(); // The default stack probe size is 4096 if the function has no stackprobesize @@ -1011,7 +1010,6 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, X86FI->setUsesRedZone(MinSize > 0 || StackSize > 0); StackSize = std::max(MinSize, StackSize > 128 ? StackSize - 128 : 0); MFI.setStackSize(StackSize); - UseRedZone = true; } // Insert stack pointer adjustment for later moving of return addr. Only @@ -1189,7 +1187,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF)) AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign); if (AlignedNumBytes >= StackProbeSize && UseStackProbe) { - assert(!UseRedZone && "The Red Zone is not accounted for in stack probes"); + assert(!X86FI->getUsesRedZone() && + "The Red Zone is not accounted for in stack probes"); // Check whether EAX is livein for this block. bool isEAXAlive = isEAXLiveIn(MBB); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index b89914f8893e..65486cf7f529 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -4217,6 +4217,8 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::PSHUFLW: case X86ISD::SHUFP: case X86ISD::INSERTPS: + case X86ISD::EXTRQI: + case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: @@ -5554,6 +5556,24 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; + case X86ISD::EXTRQI: + if (isa<ConstantSDNode>(N->getOperand(1)) && + isa<ConstantSDNode>(N->getOperand(2))) { + int BitLen = N->getConstantOperandVal(1); + int BitIdx = N->getConstantOperandVal(2); + DecodeEXTRQIMask(VT, BitLen, BitIdx, Mask); + IsUnary = true; + } + break; + case X86ISD::INSERTQI: + if (isa<ConstantSDNode>(N->getOperand(2)) && + isa<ConstantSDNode>(N->getOperand(3))) { + int BitLen = N->getConstantOperandVal(2); + int BitIdx = N->getConstantOperandVal(3); + DecodeINSERTQIMask(VT, BitLen, BitIdx, Mask); + IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + } + break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); @@ -9317,11 +9337,11 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1, return DAG.getBitcast(VT, V); } -/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. -static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, - SDValue V2, ArrayRef<int> Mask, - const APInt &Zeroable, - SelectionDAG &DAG) { +// EXTRQ: Extract Len elements from lower half of source, starting at Idx. +// Remainder of lower half result is zero and upper half is all undef. +static bool matchVectorShuffleAsEXTRQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask, uint64_t &BitLen, + uint64_t &BitIdx, const APInt &Zeroable) { int Size = Mask.size(); int HalfSize = Size / 2; assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); @@ -9329,120 +9349,133 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, // Upper half must be undefined. if (!isUndefInRange(Mask, HalfSize, HalfSize)) - return SDValue(); + return false; - // EXTRQ: Extract Len elements from lower half of source, starting at Idx. - // Remainder of lower half result is zero and upper half is all undef. - auto LowerAsEXTRQ = [&]() { - // Determine the extraction length from the part of the - // lower half that isn't zeroable. - int Len = HalfSize; - for (; Len > 0; --Len) - if (!Zeroable[Len - 1]) - break; - assert(Len > 0 && "Zeroable shuffle mask"); + // Determine the extraction length from the part of the + // lower half that isn't zeroable. + int Len = HalfSize; + for (; Len > 0; --Len) + if (!Zeroable[Len - 1]) + break; + assert(Len > 0 && "Zeroable shuffle mask"); - // Attempt to match first Len sequential elements from the lower half. - SDValue Src; - int Idx = -1; - for (int i = 0; i != Len; ++i) { - int M = Mask[i]; - if (M < 0) - continue; - SDValue &V = (M < Size ? V1 : V2); - M = M % Size; + // Attempt to match first Len sequential elements from the lower half. + SDValue Src; + int Idx = -1; + for (int i = 0; i != Len; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + SDValue &V = (M < Size ? V1 : V2); + M = M % Size; - // The extracted elements must start at a valid index and all mask - // elements must be in the lower half. - if (i > M || M >= HalfSize) - return SDValue(); + // The extracted elements must start at a valid index and all mask + // elements must be in the lower half. + if (i > M || M >= HalfSize) + return false; - if (Idx < 0 || (Src == V && Idx == (M - i))) { - Src = V; - Idx = M - i; - continue; - } - return SDValue(); + if (Idx < 0 || (Src == V && Idx == (M - i))) { + Src = V; + Idx = M - i; + continue; } + return false; + } - if (Idx < 0) - return SDValue(); + if (!Src || Idx < 0) + return false; - assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); - int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; - int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; - return DAG.getNode(X86ISD::EXTRQI, DL, VT, Src, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); - }; + assert((Idx + Len) <= HalfSize && "Illegal extraction mask"); + BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + V1 = Src; + return true; +} + +// INSERTQ: Extract lowest Len elements from lower half of second source and +// insert over first source, starting at Idx. +// { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } +static bool matchVectorShuffleAsINSERTQ(MVT VT, SDValue &V1, SDValue &V2, + ArrayRef<int> Mask, uint64_t &BitLen, + uint64_t &BitIdx) { + int Size = Mask.size(); + int HalfSize = Size / 2; + assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size"); + + // Upper half must be undefined. + if (!isUndefInRange(Mask, HalfSize, HalfSize)) + return false; + + for (int Idx = 0; Idx != HalfSize; ++Idx) { + SDValue Base; + + // Attempt to match first source from mask before insertion point. + if (isUndefInRange(Mask, 0, Idx)) { + /* EMPTY */ + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { + Base = V1; + } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { + Base = V2; + } else { + continue; + } - if (SDValue ExtrQ = LowerAsEXTRQ()) - return ExtrQ; + // Extend the extraction length looking to match both the insertion of + // the second source and the remaining elements of the first. + for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { + SDValue Insert; + int Len = Hi - Idx; - // INSERTQ: Extract lowest Len elements from lower half of second source and - // insert over first source, starting at Idx. - // { A[0], .., A[Idx-1], B[0], .., B[Len-1], A[Idx+Len], .., UNDEF, ... } - auto LowerAsInsertQ = [&]() { - for (int Idx = 0; Idx != HalfSize; ++Idx) { - SDValue Base; + // Match insertion. + if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { + Insert = V1; + } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { + Insert = V2; + } else { + continue; + } - // Attempt to match first source from mask before insertion point. - if (isUndefInRange(Mask, 0, Idx)) { + // Match the remaining elements of the lower half. + if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { /* EMPTY */ - } else if (isSequentialOrUndefInRange(Mask, 0, Idx, 0)) { + } else if ((!Base || (Base == V1)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { Base = V1; - } else if (isSequentialOrUndefInRange(Mask, 0, Idx, Size)) { + } else if ((!Base || (Base == V2)) && + isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, + Size + Hi)) { Base = V2; } else { continue; } - // Extend the extraction length looking to match both the insertion of - // the second source and the remaining elements of the first. - for (int Hi = Idx + 1; Hi <= HalfSize; ++Hi) { - SDValue Insert; - int Len = Hi - Idx; - - // Match insertion. - if (isSequentialOrUndefInRange(Mask, Idx, Len, 0)) { - Insert = V1; - } else if (isSequentialOrUndefInRange(Mask, Idx, Len, Size)) { - Insert = V2; - } else { - continue; - } - - // Match the remaining elements of the lower half. - if (isUndefInRange(Mask, Hi, HalfSize - Hi)) { - /* EMPTY */ - } else if ((!Base || (Base == V1)) && - isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, Hi)) { - Base = V1; - } else if ((!Base || (Base == V2)) && - isSequentialOrUndefInRange(Mask, Hi, HalfSize - Hi, - Size + Hi)) { - Base = V2; - } else { - continue; - } - - // We may not have a base (first source) - this can safely be undefined. - if (!Base) - Base = DAG.getUNDEF(VT); - - int BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; - int BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; - return DAG.getNode(X86ISD::INSERTQI, DL, VT, Base, Insert, - DAG.getConstant(BitLen, DL, MVT::i8), - DAG.getConstant(BitIdx, DL, MVT::i8)); - } + BitLen = (Len * VT.getScalarSizeInBits()) & 0x3f; + BitIdx = (Idx * VT.getScalarSizeInBits()) & 0x3f; + V1 = Base; + V2 = Insert; + return true; } + } - return SDValue(); - }; + return false; +} + +/// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ. +static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1, + SDValue V2, ArrayRef<int> Mask, + const APInt &Zeroable, + SelectionDAG &DAG) { + uint64_t BitLen, BitIdx; + if (matchVectorShuffleAsEXTRQ(VT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) + return DAG.getNode(X86ISD::EXTRQI, DL, VT, V1, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); - if (SDValue InsertQ = LowerAsInsertQ()) - return InsertQ; + if (matchVectorShuffleAsINSERTQ(VT, V1, V2, Mask, BitLen, BitIdx)) + return DAG.getNode(X86ISD::INSERTQI, DL, VT, V1 ? V1 : DAG.getUNDEF(VT), + V2 ? V2 : DAG.getUNDEF(VT), + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); return SDValue(); } @@ -22817,7 +22850,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { auto Builder = IRBuilder<>(AI); Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - auto SynchScope = AI->getSynchScope(); + auto SSID = AI->getSyncScopeID(); // We must restrict the ordering to avoid generating loads with Release or // ReleaseAcquire orderings. auto Order = AtomicCmpXchgInst::getStrongestFailureOrdering(AI->getOrdering()); @@ -22839,7 +22872,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // otherwise, we might be able to be more aggressive on relaxed idempotent // rmw. In practice, they do not look useful, so we don't try to be // especially clever. - if (SynchScope == SingleThread) + if (SSID == SyncScope::SingleThread) // FIXME: we could just insert an X86ISD::MEMBARRIER here, except we are at // the IR level, so we must wrap it in an intrinsic. return nullptr; @@ -22858,7 +22891,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const { // Finally we can emit the atomic load. LoadInst *Loaded = Builder.CreateAlignedLoad(Ptr, AI->getType()->getPrimitiveSizeInBits()); - Loaded->setAtomic(Order, SynchScope); + Loaded->setAtomic(Order, SSID); AI->replaceAllUsesWith(Loaded); AI->eraseFromParent(); return Loaded; @@ -22869,13 +22902,13 @@ static SDValue LowerATOMIC_FENCE(SDValue Op, const X86Subtarget &Subtarget, SDLoc dl(Op); AtomicOrdering FenceOrdering = static_cast<AtomicOrdering>( cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue()); - SynchronizationScope FenceScope = static_cast<SynchronizationScope>( + SyncScope::ID FenceSSID = static_cast<SyncScope::ID>( cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue()); // The only fence that needs an instruction is a sequentially-consistent // cross-thread fence. if (FenceOrdering == AtomicOrdering::SequentiallyConsistent && - FenceScope == CrossThread) { + FenceSSID == SyncScope::System) { if (Subtarget.hasMFence()) return DAG.getNode(X86ISD::MFENCE, dl, MVT::Other, Op.getOperand(0)); @@ -23203,6 +23236,20 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, SDLoc DL(Op.getNode()); SDValue Op0 = Op.getOperand(0); + // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions. + if (Subtarget.hasVPOPCNTDQ()) { + if (VT == MVT::v8i16) { + Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0); + Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op); + } + if (VT == MVT::v16i8 || VT == MVT::v16i16) { + Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0); + Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op); + return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op); + } + } + if (!Subtarget.hasSSSE3()) { // We can't use the fast LUT approach, so fall back on vectorized bitmath. assert(VT.is128BitVector() && "Only 128-bit vectors supported in SSE!"); @@ -27101,6 +27148,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // permute instructions. // TODO: Investigate sharing more of this with shuffle lowering. static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, const X86Subtarget &Subtarget, @@ -27111,38 +27159,67 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts; MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits); - bool ContainsZeros = false; - APInt Zeroable(NumMaskElts, false); - for (unsigned i = 0; i != NumMaskElts; ++i) { - int M = Mask[i]; - if (isUndefOrZero(M)) - Zeroable.setBit(i); - ContainsZeros |= (M == SM_SentinelZero); - } + bool ContainsZeros = + llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }); - // Attempt to match against byte/bit shifts. - // FIXME: Add 512-bit support. - if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || - (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { - int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, - MaskScalarSizeInBits, Mask, - 0, Zeroable, Subtarget); - if (0 < ShiftAmt) { - PermuteImm = (unsigned)ShiftAmt; + // Handle VPERMI/VPERMILPD vXi64/vXi64 patterns. + if (!ContainsZeros && MaskScalarSizeInBits == 64) { + // Check for lane crossing permutes. + if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { + // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). + if (Subtarget.hasAVX2() && MaskVT.is256BitVector()) { + Shuffle = X86ISD::VPERMI; + ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); + PermuteImm = getV4X86ShuffleImm(Mask); + return true; + } + if (Subtarget.hasAVX512() && MaskVT.is512BitVector()) { + SmallVector<int, 4> RepeatedMask; + if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { + Shuffle = X86ISD::VPERMI; + ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); + PermuteImm = getV4X86ShuffleImm(RepeatedMask); + return true; + } + } + } else if (AllowFloatDomain && Subtarget.hasAVX()) { + // VPERMILPD can permute with a non-repeating shuffle. + Shuffle = X86ISD::VPERMILPI; + ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); + PermuteImm = 0; + for (int i = 0, e = Mask.size(); i != e; ++i) { + int M = Mask[i]; + if (M == SM_SentinelUndef) + continue; + assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); + PermuteImm |= (M & 1) << i; + } return true; } } - // Ensure we don't contain any zero elements. - if (ContainsZeros) - return false; - - assert(llvm::all_of(Mask, [&](int M) { - return SM_SentinelUndef <= M && M < (int)NumMaskElts; - }) && "Expected unary shuffle"); + // Handle PSHUFD/VPERMILPI vXi32/vXf32 repeated patterns. + // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we + // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). + if ((MaskScalarSizeInBits == 64 || MaskScalarSizeInBits == 32) && + !ContainsZeros && (AllowIntDomain || Subtarget.hasAVX())) { + SmallVector<int, 4> RepeatedMask; + if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { + // Narrow the repeated mask to create 32-bit element permutes. + SmallVector<int, 4> WordMask = RepeatedMask; + if (MaskScalarSizeInBits == 64) + scaleShuffleMask(2, RepeatedMask, WordMask); + + Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); + ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); + ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); + PermuteImm = getV4X86ShuffleImm(WordMask); + return true; + } + } - // Handle PSHUFLW/PSHUFHW repeated patterns. - if (MaskScalarSizeInBits == 16) { + // Handle PSHUFLW/PSHUFHW vXi16 repeated patterns. + if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits == 16) { SmallVector<int, 4> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) { ArrayRef<int> LoMask(Mask.data() + 0, 4); @@ -27170,78 +27247,23 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, PermuteImm = getV4X86ShuffleImm(OffsetHiMask); return true; } - - return false; } - return false; - } - - // We only support permutation of 32/64 bit elements after this. - if (MaskScalarSizeInBits != 32 && MaskScalarSizeInBits != 64) - return false; - - // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we - // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here). - if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX()) - return false; - - // Pre-AVX2 we must use float shuffles on 256-bit vectors. - if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) { - AllowFloatDomain = true; - AllowIntDomain = false; } - // Check for lane crossing permutes. - if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) { - // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+). - if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) { - Shuffle = X86ISD::VPERMI; - ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64); - PermuteImm = getV4X86ShuffleImm(Mask); + // Attempt to match against byte/bit shifts. + // FIXME: Add 512-bit support. + if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) || + (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) { + int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle, + MaskScalarSizeInBits, Mask, + 0, Zeroable, Subtarget); + if (0 < ShiftAmt) { + PermuteImm = (unsigned)ShiftAmt; return true; } - if (Subtarget.hasAVX512() && MaskVT.is512BitVector() && Mask.size() == 8) { - SmallVector<int, 4> RepeatedMask; - if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) { - Shuffle = X86ISD::VPERMI; - ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64); - PermuteImm = getV4X86ShuffleImm(RepeatedMask); - return true; - } - } - return false; } - // VPERMILPD can permute with a non-repeating shuffle. - if (AllowFloatDomain && MaskScalarSizeInBits == 64) { - Shuffle = X86ISD::VPERMILPI; - ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size()); - PermuteImm = 0; - for (int i = 0, e = Mask.size(); i != e; ++i) { - int M = Mask[i]; - if (M == SM_SentinelUndef) - continue; - assert(((M / 2) == (i / 2)) && "Out of range shuffle mask index"); - PermuteImm |= (M & 1) << i; - } - return true; - } - - // We need a repeating shuffle mask for VPERMILPS/PSHUFD. - SmallVector<int, 4> RepeatedMask; - if (!is128BitLaneRepeatedShuffleMask(MaskEltVT, Mask, RepeatedMask)) - return false; - - // Narrow the repeated mask for 32-bit element permutes. - SmallVector<int, 4> WordMask = RepeatedMask; - if (MaskScalarSizeInBits == 64) - scaleShuffleMask(2, RepeatedMask, WordMask); - - Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD); - ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32); - ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32); - PermuteImm = getV4X86ShuffleImm(WordMask); - return true; + return false; } // Attempt to match a combined unary shuffle mask against supported binary @@ -27303,6 +27325,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, + const APInt &Zeroable, bool AllowFloatDomain, bool AllowIntDomain, SDValue &V1, SDValue &V2, SDLoc &DL, @@ -27388,11 +27411,6 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // Attempt to combine to INSERTPS. if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() && MaskVT.is128BitVector()) { - APInt Zeroable(4, 0); - for (unsigned i = 0; i != NumMaskElts; ++i) - if (Mask[i] < 0) - Zeroable.setBit(i); - if (Zeroable.getBoolValue() && matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) { Shuffle = X86ISD::INSERTPS; @@ -27578,7 +27596,14 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. bool AllowFloatDomain = FloatDomain || (Depth > 3); - bool AllowIntDomain = !FloatDomain || (Depth > 3); + bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && + (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); + + // Determine zeroable mask elements. + APInt Zeroable(NumMaskElts, 0); + for (unsigned i = 0; i != NumMaskElts; ++i) + if (isUndefOrZero(Mask[i])) + Zeroable.setBit(i); if (UnaryShuffle) { // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load @@ -27612,7 +27637,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain, + if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) @@ -27648,7 +27673,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } - if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain, + if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm)) { @@ -27668,6 +27693,45 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, return true; } + // Typically from here on, we need an integer version of MaskVT. + MVT IntMaskVT = MVT::getIntegerVT(MaskEltSizeInBits); + IntMaskVT = MVT::getVectorVT(IntMaskVT, NumMaskElts); + + // Annoyingly, SSE4A instructions don't map into the above match helpers. + if (Subtarget.hasSSE4A() && AllowIntDomain && RootSizeInBits == 128) { + uint64_t BitLen, BitIdx; + if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, + Zeroable)) { + if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) + return false; // Nothing to do! + V1 = DAG.getBitcast(IntMaskVT, V1); + DCI.AddToWorklist(V1.getNode()); + Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + + if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { + if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) + return false; // Nothing to do! + V1 = DAG.getBitcast(IntMaskVT, V1); + DCI.AddToWorklist(V1.getNode()); + V2 = DAG.getBitcast(IntMaskVT, V2); + DCI.AddToWorklist(V2.getNode()); + Res = DAG.getNode(X86ISD::INSERTQI, DL, IntMaskVT, V1, V2, + DAG.getConstant(BitLen, DL, MVT::i8), + DAG.getConstant(BitIdx, DL, MVT::i8)); + DCI.AddToWorklist(Res.getNode()); + DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), + /*AddTo*/ true); + return true; + } + } + // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 2) @@ -27688,9 +27752,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { - MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); - MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); - SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); @@ -27719,9 +27781,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (Mask[i] == SM_SentinelZero) Mask[i] = NumMaskElts + i; - MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); - MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); - SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); @@ -27746,9 +27806,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, (Subtarget.hasBWI() && Subtarget.hasVLX() && MaskVT == MVT::v16i16) || (Subtarget.hasVBMI() && MaskVT == MVT::v64i8) || (Subtarget.hasVBMI() && Subtarget.hasVLX() && MaskVT == MVT::v32i8))) { - MVT VPermMaskSVT = MVT::getIntegerVT(MaskEltSizeInBits); - MVT VPermMaskVT = MVT::getVectorVT(VPermMaskSVT, NumMaskElts); - SDValue VPermMask = getConstVector(Mask, VPermMaskVT, DAG, DL, true); + SDValue VPermMask = getConstVector(Mask, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPermMask.getNode()); V1 = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(V1.getNode()); @@ -27807,8 +27865,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, M < 0 ? DAG.getUNDEF(MVT::i32) : DAG.getConstant(M % 4, DL, MVT::i32); VPermIdx.push_back(Idx); } - MVT VPermMaskVT = MVT::getVectorVT(MVT::i32, NumMaskElts); - SDValue VPermMask = DAG.getBuildVector(VPermMaskVT, DL, VPermIdx); + SDValue VPermMask = DAG.getBuildVector(IntMaskVT, DL, VPermIdx); DCI.AddToWorklist(VPermMask.getNode()); Res = DAG.getBitcast(MaskVT, V1); DCI.AddToWorklist(Res.getNode()); @@ -27831,8 +27888,6 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, unsigned NumLanes = MaskVT.getSizeInBits() / 128; unsigned NumEltsPerLane = NumMaskElts / NumLanes; SmallVector<int, 8> VPerm2Idx; - MVT MaskIdxSVT = MVT::getIntegerVT(MaskVT.getScalarSizeInBits()); - MVT MaskIdxVT = MVT::getVectorVT(MaskIdxSVT, NumMaskElts); unsigned M2ZImm = 0; for (int M : Mask) { if (M == SM_SentinelUndef) { @@ -27852,7 +27907,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(MaskVT, V2); DCI.AddToWorklist(V2.getNode()); - SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, MaskIdxVT, DAG, DL, true); + SDValue VPerm2MaskOp = getConstVector(VPerm2Idx, IntMaskVT, DAG, DL, true); DCI.AddToWorklist(VPerm2MaskOp.getNode()); Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, DAG.getConstant(M2ZImm, DL, MVT::i8)); @@ -29163,9 +29218,9 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // v8i16 and v16i16. // For these two cases, we can shuffle the upper element bytes to a // consecutive sequence at the start of the vector and treat the results as - // v16i8 or v32i8, and for v61i8 this is the prefferable solution. However, + // v16i8 or v32i8, and for v61i8 this is the preferable solution. However, // for v16i16 this is not the case, because the shuffle is expensive, so we - // avoid sign-exteding to this type entirely. + // avoid sign-extending to this type entirely. // For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: // (v16i8 shuffle <0,2,4,6,8,10,12,14,u,u,...,u> (v16i8 bitcast t0), undef) MVT SExtVT; @@ -29207,7 +29262,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, SExtVT = MVT::v16i8; // For the case (i16 bitcast (v16i1 setcc v16i16 v1, v2)), // it is not profitable to sign-extend to 256-bit because this will - // require an extra cross-lane shuffle which is more exprensive than + // require an extra cross-lane shuffle which is more expensive than // truncating the result of the compare to 128-bits. break; case MVT::v32i1: @@ -29580,8 +29635,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, // (extends the sign bit which is zero). // So it is correct to skip the sign/zero extend instruction. if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND || - Root.getOpcode() == ISD::ZERO_EXTEND || - Root.getOpcode() == ISD::ANY_EXTEND)) + Root.getOpcode() == ISD::ZERO_EXTEND || + Root.getOpcode() == ISD::ANY_EXTEND)) Root = Root.getOperand(0); // If there was a match, we want Root to be a select that is the root of an @@ -34950,6 +35005,40 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0); + // If X is -1 or 0, then we have an opportunity to avoid constants required in + // the general case below. + auto *ConstantX = dyn_cast<ConstantSDNode>(X); + if (ConstantX) { + if ((!IsSub && CC == X86::COND_AE && ConstantX->isAllOnesValue()) || + (IsSub && CC == X86::COND_B && ConstantX->isNullValue())) { + // This is a complicated way to get -1 or 0 from the carry flag: + // -1 + SETAE --> -1 + (!CF) --> CF ? -1 : 0 --> SBB %eax, %eax + // 0 - SETB --> 0 - (CF) --> CF ? -1 : 0 --> SBB %eax, %eax + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, MVT::i8), + Y.getOperand(1)); + } + + if ((!IsSub && CC == X86::COND_BE && ConstantX->isAllOnesValue()) || + (IsSub && CC == X86::COND_A && ConstantX->isNullValue())) { + SDValue EFLAGS = Y->getOperand(1); + if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() && + EFLAGS.getValueType().isInteger() && + !isa<ConstantSDNode>(EFLAGS.getOperand(1))) { + // Swap the operands of a SUB, and we have the same pattern as above. + // -1 + SETBE (SUB A, B) --> -1 + SETAE (SUB B, A) --> SUB + SBB + // 0 - SETA (SUB A, B) --> 0 - SETB (SUB B, A) --> SUB + SBB + SDValue NewSub = DAG.getNode( + X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(), + EFLAGS.getOperand(1), EFLAGS.getOperand(0)); + SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo()); + return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, + DAG.getConstant(X86::COND_B, DL, MVT::i8), + NewEFLAGS); + } + } + } + if (CC == X86::COND_B) { // X + SETB Z --> X + (mask SBB Z, Z) // X - SETB Z --> X - (mask SBB Z, Z) @@ -34996,7 +35085,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { // If X is -1 or 0, then we have an opportunity to avoid constants required in // the general case below. - if (auto *ConstantX = dyn_cast<ConstantSDNode>(X)) { + if (ConstantX) { // 'neg' sets the carry flag when Z != 0, so create 0 or -1 using 'sbb' with // fake operands: // 0 - (Z != 0) --> sbb %eax, %eax, (neg Z) @@ -35549,6 +35638,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget); case X86ISD::SHUFP: // Handle all target specific shuffles case X86ISD::INSERTPS: + case X86ISD::EXTRQI: + case X86ISD::INSERTQI: case X86ISD::PALIGNR: case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index e1ade92979dc..dbbc2bbba6a4 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -767,6 +767,19 @@ namespace llvm { SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; + // Return true if it is profitable to combine a BUILD_VECTOR to a TRUNCATE + // for given operand and result types. + // Example of such a combine: + // v4i32 build_vector((extract_elt V, 0), + // (extract_elt V, 2), + // (extract_elt V, 4), + // (extract_elt V, 6)) + // --> + // v4i32 truncate (bitcast V to v4i64) + bool isDesirableToCombineBuildVectorToTruncate() const override { + return true; + } + /// Return true if the target has native support for /// the specified value type and it is 'desirable' to use the type for the /// given node type. e.g. On x86 i16 is legal, but undesirable since i16 diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index f3094b781c49..34d4816a2518 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -10488,7 +10488,7 @@ namespace { return Copy; } - // Create a virtal register in *TLSBaseAddrReg, and populate it by + // Create a virtual register in *TLSBaseAddrReg, and populate it by // inserting a copy instruction after I. Returns the new instruction. MachineInstr *SetRegister(MachineInstr &I, unsigned *TLSBaseAddrReg) { MachineFunction *MF = I.getParent()->getParent(); diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index e34a90e975b8..859d3288db89 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -32,6 +32,8 @@ #define DEBUG_TYPE "X86-isel" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" + using namespace llvm; #ifndef LLVM_BUILD_GLOBAL_ISEL @@ -56,7 +58,7 @@ private: /// the patterns that don't require complex C++. bool selectImpl(MachineInstr &I) const; - // TODO: remove after suported by Tablegen-erated instruction selection. + // TODO: remove after supported by Tablegen-erated instruction selection. unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc, uint64_t Alignment) const; @@ -64,6 +66,8 @@ private: MachineFunction &MF) const; bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; bool selectConstant(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI, @@ -75,6 +79,8 @@ private: bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; + bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, @@ -262,6 +268,8 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectFrameIndexOrGep(I, MRI, MF)) return true; + if (selectGlobalValue(I, MRI, MF)) + return true; if (selectConstant(I, MRI, MF)) return true; if (selectTrunc(I, MRI, MF)) @@ -272,6 +280,8 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectUadde(I, MRI, MF)) return true; + if (selectUnmergeValues(I, MRI, MF)) + return true; if (selectMergeValues(I, MRI, MF)) return true; if (selectExtract(I, MRI, MF)) @@ -423,6 +433,15 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } +static unsigned getLeaOP(LLT Ty, const X86Subtarget &STI) { + if (Ty == LLT::pointer(0, 64)) + return X86::LEA64r; + else if (Ty == LLT::pointer(0, 32)) + return STI.isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r; + else + llvm_unreachable("Can't get LEA opcode. Unsupported type."); +} + bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { @@ -435,14 +454,7 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I, LLT Ty = MRI.getType(DefReg); // Use LEA to calculate frame index and GEP - unsigned NewOpc; - if (Ty == LLT::pointer(0, 64)) - NewOpc = X86::LEA64r; - else if (Ty == LLT::pointer(0, 32)) - NewOpc = STI.isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r; - else - llvm_unreachable("Can't select G_FRAME_INDEX/G_GEP, unsupported type."); - + unsigned NewOpc = getLeaOP(Ty, STI); I.setDesc(TII.get(NewOpc)); MachineInstrBuilder MIB(MF, I); @@ -458,6 +470,54 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } +bool X86InstructionSelector::selectGlobalValue(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + unsigned Opc = I.getOpcode(); + + if (Opc != TargetOpcode::G_GLOBAL_VALUE) + return false; + + auto GV = I.getOperand(1).getGlobal(); + if (GV->isThreadLocal()) { + return false; // TODO: we don't support TLS yet. + } + + // Can't handle alternate code models yet. + if (TM.getCodeModel() != CodeModel::Small) + return 0; + + X86AddressMode AM; + AM.GV = GV; + AM.GVOpFlags = STI.classifyGlobalReference(GV); + + // TODO: The ABI requires an extra load. not supported yet. + if (isGlobalStubReference(AM.GVOpFlags)) + return false; + + // TODO: This reference is relative to the pic base. not supported yet. + if (isGlobalRelativeToPICBase(AM.GVOpFlags)) + return false; + + if (STI.isPICStyleRIPRel()) { + // Use rip-relative addressing. + assert(AM.Base.Reg == 0 && AM.IndexReg == 0); + AM.Base.Reg = X86::RIP; + } + + const unsigned DefReg = I.getOperand(0).getReg(); + LLT Ty = MRI.getType(DefReg); + unsigned NewOpc = getLeaOP(Ty, STI); + + I.setDesc(TII.get(NewOpc)); + MachineInstrBuilder MIB(MF, I); + + I.RemoveOperand(1); + addFullAddress(MIB, AM); + + return constrainSelectedInstRegOperands(I, TII, TRI, RBI); +} + bool X86InstructionSelector::selectConstant(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { @@ -467,7 +527,8 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I, const unsigned DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); - assert(Ty.isScalar() && "invalid element type."); + if (RBI.getRegBank(DefReg, MRI, TRI)->getID() != X86::GPRRegBankID) + return false; uint64_t Val = 0; if (I.getOperand(1).isCImm()) { @@ -576,37 +637,40 @@ bool X86InstructionSelector::selectZext(MachineInstr &I, const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); - if (SrcTy == LLT::scalar(1)) { - - unsigned AndOpc; - if (DstTy == LLT::scalar(32)) - AndOpc = X86::AND32ri8; - else if (DstTy == LLT::scalar(64)) - AndOpc = X86::AND64ri8; - else - return false; + if (SrcTy != LLT::scalar(1)) + return false; - unsigned DefReg = - MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI)); + unsigned AndOpc; + if (DstTy == LLT::scalar(8)) + AndOpc = X86::AND8ri; + else if (DstTy == LLT::scalar(16)) + AndOpc = X86::AND16ri8; + else if (DstTy == LLT::scalar(32)) + AndOpc = X86::AND32ri8; + else if (DstTy == LLT::scalar(64)) + AndOpc = X86::AND64ri8; + else + return false; + unsigned DefReg = SrcReg; + if (DstTy != LLT::scalar(8)) { + DefReg = MRI.createVirtualRegister(getRegClass(DstTy, DstReg, MRI)); BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::SUBREG_TO_REG), DefReg) .addImm(0) .addReg(SrcReg) .addImm(X86::sub_8bit); + } - MachineInstr &AndInst = - *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg) - .addReg(DefReg) - .addImm(1); - - constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI); + MachineInstr &AndInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg) + .addReg(DefReg) + .addImm(1); - I.eraseFromParent(); - return true; - } + constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI); - return false; + I.eraseFromParent(); + return true; } bool X86InstructionSelector::selectCmp(MachineInstr &I, @@ -918,6 +982,33 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } +bool X86InstructionSelector::selectUnmergeValues(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_UNMERGE_VALUES) + return false; + + // Split to extracts. + unsigned NumDefs = I.getNumOperands() - 1; + unsigned SrcReg = I.getOperand(NumDefs).getReg(); + unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); + + for (unsigned Idx = 0; Idx < NumDefs; ++Idx) { + + MachineInstr &ExtrInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::G_EXTRACT), I.getOperand(Idx).getReg()) + .addReg(SrcReg) + .addImm(Idx * DefSize); + + if (!select(ExtrInst)) + return false; + } + + I.eraseFromParent(); + return true; +} + bool X86InstructionSelector::selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index a5fa3340c3f1..744ba21011af 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -69,12 +69,14 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { for (auto Ty : {s8, s16, s32, p0}) setAction({MemOp, Ty}, Legal); + setAction({MemOp, s1}, WidenScalar); // And everything's fine in addrspace 0. setAction({MemOp, 1, p0}, Legal); } // Pointer-handling setAction({G_FRAME_INDEX, p0}, Legal); + setAction({G_GLOBAL_VALUE, p0}, Legal); setAction({G_GEP, p0}, Legal); setAction({G_GEP, 1, s32}, Legal); @@ -90,8 +92,10 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar); // Extensions - setAction({G_ZEXT, s32}, Legal); - setAction({G_SEXT, s32}, Legal); + for (auto Ty : {s8, s16, s32}) { + setAction({G_ZEXT, Ty}, Legal); + setAction({G_SEXT, Ty}, Legal); + } for (auto Ty : {s1, s8, s16}) { setAction({G_ZEXT, 1, Ty}, Legal); @@ -125,12 +129,14 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { for (auto Ty : {s8, s16, s32, s64, p0}) setAction({MemOp, Ty}, Legal); + setAction({MemOp, s1}, WidenScalar); // And everything's fine in addrspace 0. setAction({MemOp, 1, p0}, Legal); } // Pointer-handling setAction({G_FRAME_INDEX, p0}, Legal); + setAction({G_GLOBAL_VALUE, p0}, Legal); setAction({G_GEP, p0}, Legal); setAction({G_GEP, 1, s32}, Legal); @@ -146,7 +152,7 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar); // Extensions - for (auto Ty : {s32, s64}) { + for (auto Ty : {s8, s16, s32, s64}) { setAction({G_ZEXT, Ty}, Legal); setAction({G_SEXT, Ty}, Legal); } diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 33bc8e11a572..fd2837b79103 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -1042,7 +1042,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI, void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, X86MCInstLower &MCIL) { - assert(Subtarget->is64Bit() && "XRay custom events only suports X86-64"); + assert(Subtarget->is64Bit() && "XRay custom events only supports X86-64"); // We want to emit the following pattern, which follows the x86 calling // convention to prepare for the trampoline call to be patched in. @@ -1332,6 +1332,32 @@ static std::string getShuffleComment(const MachineInstr *MI, return Comment; } +static void printConstant(const Constant *COp, raw_ostream &CS) { + if (isa<UndefValue>(COp)) { + CS << "u"; + } else if (auto *CI = dyn_cast<ConstantInt>(COp)) { + if (CI->getBitWidth() <= 64) { + CS << CI->getZExtValue(); + } else { + // print multi-word constant as (w0,w1) + const auto &Val = CI->getValue(); + CS << "("; + for (int i = 0, N = Val.getNumWords(); i < N; ++i) { + if (i > 0) + CS << ","; + CS << Val.getRawData()[i]; + } + CS << ")"; + } + } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { + SmallString<32> Str; + CF->getValueAPF().toString(Str); + CS << Str; + } else { + CS << "?"; + } +} + void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo(); @@ -1766,59 +1792,73 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { // For loads from a constant pool to a vector register, print the constant // loaded. CASE_ALL_MOV_RM() + case X86::VBROADCASTF128: + case X86::VBROADCASTI128: + case X86::VBROADCASTF32X4Z256rm: + case X86::VBROADCASTF32X4rm: + case X86::VBROADCASTF32X8rm: + case X86::VBROADCASTF64X2Z128rm: + case X86::VBROADCASTF64X2rm: + case X86::VBROADCASTF64X4rm: + case X86::VBROADCASTI32X4Z256rm: + case X86::VBROADCASTI32X4rm: + case X86::VBROADCASTI32X8rm: + case X86::VBROADCASTI64X2Z128rm: + case X86::VBROADCASTI64X2rm: + case X86::VBROADCASTI64X4rm: if (!OutStreamer->isVerboseAsm()) break; if (MI->getNumOperands() <= 4) break; if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + int NumLanes = 1; + // Override NumLanes for the broadcast instructions. + switch (MI->getOpcode()) { + case X86::VBROADCASTF128: NumLanes = 2; break; + case X86::VBROADCASTI128: NumLanes = 2; break; + case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; break; + case X86::VBROADCASTF32X4rm: NumLanes = 4; break; + case X86::VBROADCASTF32X8rm: NumLanes = 2; break; + case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; break; + case X86::VBROADCASTF64X2rm: NumLanes = 4; break; + case X86::VBROADCASTF64X4rm: NumLanes = 2; break; + case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; break; + case X86::VBROADCASTI32X4rm: NumLanes = 4; break; + case X86::VBROADCASTI32X8rm: NumLanes = 2; break; + case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; break; + case X86::VBROADCASTI64X2rm: NumLanes = 4; break; + case X86::VBROADCASTI64X4rm: NumLanes = 2; break; + } + std::string Comment; raw_string_ostream CS(Comment); const MachineOperand &DstOp = MI->getOperand(0); CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) { CS << "["; - for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) { - if (i != 0) - CS << ","; - if (CDS->getElementType()->isIntegerTy()) - CS << CDS->getElementAsInteger(i); - else if (CDS->getElementType()->isFloatTy()) - CS << CDS->getElementAsFloat(i); - else if (CDS->getElementType()->isDoubleTy()) - CS << CDS->getElementAsDouble(i); - else - CS << "?"; + for (int l = 0; l != NumLanes; ++l) { + for (int i = 0, NumElements = CDS->getNumElements(); i < NumElements; ++i) { + if (i != 0 || l != 0) + CS << ","; + if (CDS->getElementType()->isIntegerTy()) + CS << CDS->getElementAsInteger(i); + else if (CDS->getElementType()->isFloatTy()) + CS << CDS->getElementAsFloat(i); + else if (CDS->getElementType()->isDoubleTy()) + CS << CDS->getElementAsDouble(i); + else + CS << "?"; + } } CS << "]"; OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); } else if (auto *CV = dyn_cast<ConstantVector>(C)) { CS << "<"; - for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { - if (i != 0) - CS << ","; - Constant *COp = CV->getOperand(i); - if (isa<UndefValue>(COp)) { - CS << "u"; - } else if (auto *CI = dyn_cast<ConstantInt>(COp)) { - if (CI->getBitWidth() <= 64) { - CS << CI->getZExtValue(); - } else { - // print multi-word constant as (w0,w1) - const auto &Val = CI->getValue(); - CS << "("; - for (int i = 0, N = Val.getNumWords(); i < N; ++i) { - if (i > 0) - CS << ","; - CS << Val.getRawData()[i]; - } - CS << ")"; - } - } else if (auto *CF = dyn_cast<ConstantFP>(COp)) { - SmallString<32> Str; - CF->getValueAPF().toString(Str); - CS << Str; - } else { - CS << "?"; + for (int l = 0; l != NumLanes; ++l) { + for (int i = 0, NumOperands = CV->getNumOperands(); i < NumOperands; ++i) { + if (i != 0 || l != 0) + CS << ","; + printConstant(CV->getOperand(i), CS); } } CS << ">"; @@ -1826,6 +1866,85 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { } } break; + case X86::VBROADCASTSSrm: + case X86::VBROADCASTSSYrm: + case X86::VBROADCASTSSZ128m: + case X86::VBROADCASTSSZ256m: + case X86::VBROADCASTSSZm: + case X86::VBROADCASTSDYrm: + case X86::VBROADCASTSDZ256m: + case X86::VBROADCASTSDZm: + case X86::VPBROADCASTBrm: + case X86::VPBROADCASTBYrm: + case X86::VPBROADCASTBZ128m: + case X86::VPBROADCASTBZ256m: + case X86::VPBROADCASTBZm: + case X86::VPBROADCASTDrm: + case X86::VPBROADCASTDYrm: + case X86::VPBROADCASTDZ128m: + case X86::VPBROADCASTDZ256m: + case X86::VPBROADCASTDZm: + case X86::VPBROADCASTQrm: + case X86::VPBROADCASTQYrm: + case X86::VPBROADCASTQZ128m: + case X86::VPBROADCASTQZ256m: + case X86::VPBROADCASTQZm: + case X86::VPBROADCASTWrm: + case X86::VPBROADCASTWYrm: + case X86::VPBROADCASTWZ128m: + case X86::VPBROADCASTWZ256m: + case X86::VPBROADCASTWZm: + if (!OutStreamer->isVerboseAsm()) + break; + if (MI->getNumOperands() <= 4) + break; + if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) { + int NumElts; + switch (MI->getOpcode()) { + default: llvm_unreachable("Invalid opcode"); + case X86::VBROADCASTSSrm: NumElts = 4; break; + case X86::VBROADCASTSSYrm: NumElts = 8; break; + case X86::VBROADCASTSSZ128m: NumElts = 4; break; + case X86::VBROADCASTSSZ256m: NumElts = 8; break; + case X86::VBROADCASTSSZm: NumElts = 16; break; + case X86::VBROADCASTSDYrm: NumElts = 4; break; + case X86::VBROADCASTSDZ256m: NumElts = 4; break; + case X86::VBROADCASTSDZm: NumElts = 8; break; + case X86::VPBROADCASTBrm: NumElts = 16; break; + case X86::VPBROADCASTBYrm: NumElts = 32; break; + case X86::VPBROADCASTBZ128m: NumElts = 16; break; + case X86::VPBROADCASTBZ256m: NumElts = 32; break; + case X86::VPBROADCASTBZm: NumElts = 64; break; + case X86::VPBROADCASTDrm: NumElts = 4; break; + case X86::VPBROADCASTDYrm: NumElts = 8; break; + case X86::VPBROADCASTDZ128m: NumElts = 4; break; + case X86::VPBROADCASTDZ256m: NumElts = 8; break; + case X86::VPBROADCASTDZm: NumElts = 16; break; + case X86::VPBROADCASTQrm: NumElts = 2; break; + case X86::VPBROADCASTQYrm: NumElts = 4; break; + case X86::VPBROADCASTQZ128m: NumElts = 2; break; + case X86::VPBROADCASTQZ256m: NumElts = 4; break; + case X86::VPBROADCASTQZm: NumElts = 8; break; + case X86::VPBROADCASTWrm: NumElts = 8; break; + case X86::VPBROADCASTWYrm: NumElts = 16; break; + case X86::VPBROADCASTWZ128m: NumElts = 8; break; + case X86::VPBROADCASTWZ256m: NumElts = 16; break; + case X86::VPBROADCASTWZm: NumElts = 32; break; + } + + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + CS << "["; + for (int i = 0; i != NumElts; ++i) { + if (i != 0) + CS << ","; + printConstant(C, CS); + } + CS << "]"; + OutStreamer->AddComment(CS.str(), !EnablePrintSchedInfo); + } } MCInst TmpInst; diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index b8ec5883152c..6d85ca6cad64 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -24,8 +24,8 @@ def SandyBridgeModel : SchedMachineModel { // Based on the LSD (loop-stream detector) queue size. let LoopMicroOpBufferSize = 28; - // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow - // the scheduler to assign a default model to unrecognized opcodes. + // This flag is set to allow the scheduler to assign + // a default model to unrecognized opcodes. let CompleteModel = 0; } @@ -48,6 +48,7 @@ def SBPort23 : ProcResource<2>; def SBPort4 : ProcResource<1>; // Many micro-ops are capable of issuing on multiple ports. +def SBPort01 : ProcResGroup<[SBPort0, SBPort1]>; def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>; def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>; def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>; @@ -115,10 +116,10 @@ def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> { // Scalar and vector floating point. defm : SBWriteResPair<WriteFAdd, SBPort1, 3>; defm : SBWriteResPair<WriteFMul, SBPort0, 5>; -defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles. +defm : SBWriteResPair<WriteFDiv, SBPort0, 24>; defm : SBWriteResPair<WriteFRcp, SBPort0, 5>; defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>; -defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>; +defm : SBWriteResPair<WriteFSqrt, SBPort0, 14>; defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>; defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>; defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>; @@ -134,11 +135,11 @@ def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> { } // Vector integer operations. -defm : SBWriteResPair<WriteVecShift, SBPort05, 1>; -defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>; -defm : SBWriteResPair<WriteVecALU, SBPort15, 1>; +defm : SBWriteResPair<WriteVecShift, SBPort5, 1>; +defm : SBWriteResPair<WriteVecLogic, SBPort5, 1>; +defm : SBWriteResPair<WriteVecALU, SBPort1, 3>; defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>; -defm : SBWriteResPair<WriteShuffle, SBPort15, 1>; +defm : SBWriteResPair<WriteShuffle, SBPort5, 1>; defm : SBWriteResPair<WriteBlend, SBPort15, 1>; def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> { let Latency = 2; @@ -148,13 +149,15 @@ def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> { let Latency = 6; let ResourceCycles = [1, 1, 1]; } -def : WriteRes<WriteMPSAD, [SBPort0, SBPort1, SBPort5]> { - let Latency = 6; - let ResourceCycles = [1, 1, 1]; +def : WriteRes<WriteMPSAD, [SBPort0,SBPort15]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; } -def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> { - let Latency = 6; - let ResourceCycles = [1, 1, 1, 1]; +def : WriteRes<WriteMPSADLd, [SBPort0,SBPort23,SBPort15]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; } //////////////////////////////////////////////////////////////////////////////// @@ -204,13 +207,15 @@ def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> { } // Packed Compare Implicit Length Strings, Return Index -def : WriteRes<WritePCmpIStrI, [SBPort015]> { - let Latency = 3; +def : WriteRes<WritePCmpIStrI, [SBPort0]> { + let Latency = 11; + let NumMicroOps = 3; let ResourceCycles = [3]; } -def : WriteRes<WritePCmpIStrILd, [SBPort015, SBPort23]> { - let Latency = 3; - let ResourceCycles = [3, 1]; +def : WriteRes<WritePCmpIStrILd, [SBPort0,SBPort23]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; } // Packed Compare Explicit Length Strings, Return Index @@ -224,22 +229,26 @@ def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> { } // AES Instructions. -def : WriteRes<WriteAESDecEnc, [SBPort015]> { - let Latency = 8; - let ResourceCycles = [2]; +def : WriteRes<WriteAESDecEnc, [SBPort5,SBPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : WriteRes<WriteAESDecEncLd, [SBPort015, SBPort23]> { - let Latency = 8; - let ResourceCycles = [2, 1]; +def : WriteRes<WriteAESDecEncLd, [SBPort5,SBPort23,SBPort015]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : WriteRes<WriteAESIMC, [SBPort015]> { - let Latency = 8; +def : WriteRes<WriteAESIMC, [SBPort5]> { + let Latency = 12; + let NumMicroOps = 2; let ResourceCycles = [2]; } -def : WriteRes<WriteAESIMCLd, [SBPort015, SBPort23]> { - let Latency = 8; - let ResourceCycles = [2, 1]; +def : WriteRes<WriteAESIMCLd, [SBPort5,SBPort23]> { + let Latency = 18; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; } def : WriteRes<WriteAESKeyGen, [SBPort015]> { @@ -272,4 +281,2407 @@ def : WriteRes<WriteNop, []>; defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>; defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>; defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>; + +// Remaining SNB instrs. + +def SBWriteResGroup0 : SchedWriteRes<[SBPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup0], (instregex "CVTSS2SDrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSLLDri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSLLQri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSLLWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSRADri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSRAWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSRLDri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSRLQri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSRLWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VCVTSS2SDrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPMOVMSKBrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSLLDri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSLLQri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSLLWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSRADri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSRAWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSRLDri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSRLQri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSRLWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDYrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSYrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSrr")>; + +def SBWriteResGroup1 : SchedWriteRes<[SBPort1]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup1], (instregex "COMP_FST0r")>; +def: InstRW<[SBWriteResGroup1], (instregex "COM_FST0r")>; +def: InstRW<[SBWriteResGroup1], (instregex "UCOM_FPr")>; +def: InstRW<[SBWriteResGroup1], (instregex "UCOM_Fr")>; + +def SBWriteResGroup2 : SchedWriteRes<[SBPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup2], (instregex "ANDNPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "ANDNPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "ANDPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "ANDPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "FDECSTP")>; +def: InstRW<[SBWriteResGroup2], (instregex "FFREE")>; +def: InstRW<[SBWriteResGroup2], (instregex "FINCSTP")>; +def: InstRW<[SBWriteResGroup2], (instregex "FNOP")>; +def: InstRW<[SBWriteResGroup2], (instregex "INSERTPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "JMP64r")>; +def: InstRW<[SBWriteResGroup2], (instregex "LD_Frr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOV64toPQIrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVAPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVAPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVDDUPrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVDI2PDIrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVHLPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVLHPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVSDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVSHDUPrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVSLDUPrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVSSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVUPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVUPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "ORPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "ORPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "RETQ")>; +def: InstRW<[SBWriteResGroup2], (instregex "SHUFPDrri")>; +def: InstRW<[SBWriteResGroup2], (instregex "SHUFPSrri")>; +def: InstRW<[SBWriteResGroup2], (instregex "ST_FPrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "ST_Frr")>; +def: InstRW<[SBWriteResGroup2], (instregex "UNPCKHPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "UNPCKHPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "UNPCKLPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "UNPCKLPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VANDNPDYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VANDNPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VANDNPSYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VANDNPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VANDPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VANDPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VANDPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VEXTRACTF128rr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VINSERTF128rr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VINSERTPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOV64toPQIrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOV64toPQIrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPDYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPSYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVDDUPYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVDDUPrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVHLPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVHLPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVSDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVSHDUPYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVSHDUPrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVSLDUPYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVSLDUPrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVSSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPDYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPSYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VORPDYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VORPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VORPSYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VORPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDri")>; +def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDrm")>; +def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSri")>; +def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSrm")>; +def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPDYrri")>; +def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPDrri")>; +def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPSYrri")>; +def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPSrri")>; +def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPDYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPSYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VXORPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VXORPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "XORPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "XORPSrr")>; + +def SBWriteResGroup3 : SchedWriteRes<[SBPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup3], (instregex "LEA64_32r")>; + +def SBWriteResGroup4 : SchedWriteRes<[SBPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup4], (instregex "BLENDPDrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "BLENDPSrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "BT32ri8")>; +def: InstRW<[SBWriteResGroup4], (instregex "BT32rr")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTC32ri8")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTC32rr")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTR32ri8")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTR32rr")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTS32ri8")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTS32rr")>; +def: InstRW<[SBWriteResGroup4], (instregex "CDQ")>; +def: InstRW<[SBWriteResGroup4], (instregex "CQO")>; +def: InstRW<[SBWriteResGroup4], (instregex "LAHF")>; +def: InstRW<[SBWriteResGroup4], (instregex "SAHF")>; +def: InstRW<[SBWriteResGroup4], (instregex "SAR32ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "SAR8ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETAEr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETBr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETEr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETGEr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETGr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETLEr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETLr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETNEr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETNOr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETNPr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETNSr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETOr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETPr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETSr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHL32ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHL64r1")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHL8r1")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHL8ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHR32ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHR8ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPDYrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPDrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPSYrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPSrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQAYrr")>; +def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQArr")>; +def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQUYrr")>; +def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQUrr")>; + +def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup5], (instregex "KORTESTBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSBrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSDrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSWrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PADDQirr")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PALIGNR64irr")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSHUFBrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNBrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNDrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNWrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "PABSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PABSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PABSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PACKSSDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PACKSSWBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PACKUSDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PACKUSWBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDUSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDUSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PALIGNRrri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PAVGBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PAVGWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PBLENDWrri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXUBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXUDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXUWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINUBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINUDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINUWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXWQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXWQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSHUFBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSHUFDri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSHUFHWri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSHUFLWri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSIGNBrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSIGNDrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSIGNWrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSLLDQri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSRLDQri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBUSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBUSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHQDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLQDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VMASKMOVPSYrm")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPABSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPABSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPABSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPACKSSDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPACKSSWBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPACKUSDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPACKUSWBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDUSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDUSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPALIGNRrri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPAVGBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPAVGWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPBLENDWrri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINUBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINUDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINUWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXWQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXWQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFDri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFLWri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNBrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNDrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNWrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSLLDQri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSRLDQri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBUSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBUSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLQDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLWDrr")>; + +def SBWriteResGroup6 : SchedWriteRes<[SBPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup6], (instregex "ADD32ri8")>; +def: InstRW<[SBWriteResGroup6], (instregex "ADD32rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "ADD8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "ADD8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "AND32ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "AND64ri8")>; +def: InstRW<[SBWriteResGroup6], (instregex "AND64rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "AND8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "AND8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "CBW")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMC")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMP16ri8")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMP32i32")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMP64rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMP8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMP8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "CWDE")>; +def: InstRW<[SBWriteResGroup6], (instregex "DEC64r")>; +def: InstRW<[SBWriteResGroup6], (instregex "DEC8r")>; +def: InstRW<[SBWriteResGroup6], (instregex "INC64r")>; +def: InstRW<[SBWriteResGroup6], (instregex "INC8r")>; +def: InstRW<[SBWriteResGroup6], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MMX_MOVQ2DQrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOV32rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOV8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOV8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVDQArr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVDQUrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVPQI2QIrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVSX32rr16")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVSX32rr8")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVZX32rr16")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVZX32rr8")>; +def: InstRW<[SBWriteResGroup6], (instregex "NEG64r")>; +def: InstRW<[SBWriteResGroup6], (instregex "NEG8r")>; +def: InstRW<[SBWriteResGroup6], (instregex "NOT64r")>; +def: InstRW<[SBWriteResGroup6], (instregex "NOT8r")>; +def: InstRW<[SBWriteResGroup6], (instregex "OR64ri8")>; +def: InstRW<[SBWriteResGroup6], (instregex "OR64rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "OR8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "OR8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "PANDNrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "PANDrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "PORrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "PXORrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "STC")>; +def: InstRW<[SBWriteResGroup6], (instregex "SUB64ri8")>; +def: InstRW<[SBWriteResGroup6], (instregex "SUB64rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "SUB8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "SUB8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "TEST64rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "TEST8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "TEST8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VMOVPQI2QIrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VMOVZPQILo2PQIrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VPANDNrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VPANDrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VPORrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VPXORrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "XOR32rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "XOR64ri8")>; +def: InstRW<[SBWriteResGroup6], (instregex "XOR8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "XOR8rr")>; + +def SBWriteResGroup7 : SchedWriteRes<[SBPort0]> { + let Latency = 2; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup7], (instregex "MOVMSKPDrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "MOVMSKPSrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "MOVPDI2DIrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "MOVPQIto64rr")>; +def: InstRW<[SBWriteResGroup7], (instregex "PMOVMSKBrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPDYrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPDrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPSrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVPDI2DIrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVPQIto64rr")>; + +def SBWriteResGroup9 : SchedWriteRes<[SBPort0]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPDrr0")>; +def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPSrr0")>; +def: InstRW<[SBWriteResGroup9], (instregex "ROL32ri")>; +def: InstRW<[SBWriteResGroup9], (instregex "ROL8ri")>; +def: InstRW<[SBWriteResGroup9], (instregex "ROR32ri")>; +def: InstRW<[SBWriteResGroup9], (instregex "ROR8ri")>; +def: InstRW<[SBWriteResGroup9], (instregex "SETAr")>; +def: InstRW<[SBWriteResGroup9], (instregex "SETBEr")>; +def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPDYrr")>; +def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPDrr")>; +def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPSYrr")>; +def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPSrr")>; + +def SBWriteResGroup10 : SchedWriteRes<[SBPort15]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SBWriteResGroup10], (instregex "VPBLENDVBrr")>; + +def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SBWriteResGroup11], (instregex "SCASB")>; +def: InstRW<[SBWriteResGroup11], (instregex "SCASL")>; +def: InstRW<[SBWriteResGroup11], (instregex "SCASQ")>; +def: InstRW<[SBWriteResGroup11], (instregex "SCASW")>; + +def SBWriteResGroup12 : SchedWriteRes<[SBPort0,SBPort1]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup12], (instregex "COMISDrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "COMISSrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "UCOMISDrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "UCOMISSrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "VCOMISDrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "VCOMISSrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "VUCOMISDrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "VUCOMISSrr")>; + +def SBWriteResGroup13 : SchedWriteRes<[SBPort0,SBPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup13], (instregex "CVTPS2PDrr")>; +def: InstRW<[SBWriteResGroup13], (instregex "PTESTrr")>; +def: InstRW<[SBWriteResGroup13], (instregex "VCVTPS2PDYrr")>; +def: InstRW<[SBWriteResGroup13], (instregex "VCVTPS2PDrr")>; +def: InstRW<[SBWriteResGroup13], (instregex "VPTESTYrr")>; +def: InstRW<[SBWriteResGroup13], (instregex "VPTESTrr")>; + +def SBWriteResGroup14 : SchedWriteRes<[SBPort0,SBPort15]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup14], (instregex "PSLLDrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSLLQrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSLLWrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSRADrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSRAWrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSRLDrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSRLQrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSRLWrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSRADrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSRAWrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSRLDrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSRLQrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSRLWrr")>; + +def SBWriteResGroup15 : SchedWriteRes<[SBPort0,SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup15], (instregex "FNSTSW16r")>; + +def SBWriteResGroup16 : SchedWriteRes<[SBPort1,SBPort0]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup16], (instregex "BSWAP32r")>; + +def SBWriteResGroup17 : SchedWriteRes<[SBPort5,SBPort15]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup17], (instregex "PINSRBrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "PINSRDrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "PINSRQrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "PINSRWrri")>; +def: InstRW<[SBWriteResGroup17], (instregex "VPINSRBrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "VPINSRDrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "VPINSRQrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "VPINSRWrri")>; + +def SBWriteResGroup18 : SchedWriteRes<[SBPort5,SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup18], (instregex "MMX_MOVDQ2Qrr")>; + +def SBWriteResGroup19 : SchedWriteRes<[SBPort0,SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup19], (instregex "ADC64ri8")>; +def: InstRW<[SBWriteResGroup19], (instregex "ADC64rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "ADC8ri")>; +def: InstRW<[SBWriteResGroup19], (instregex "ADC8rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVAE32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVB32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVE32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVG32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVGE32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVL32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVLE32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVNE32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVNO32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVNP32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVNS32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVO32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVP32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVS32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "SBB32rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "SBB64ri8")>; +def: InstRW<[SBWriteResGroup19], (instregex "SBB8ri")>; +def: InstRW<[SBWriteResGroup19], (instregex "SBB8rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "SHLD32rri8")>; +def: InstRW<[SBWriteResGroup19], (instregex "SHRD32rri8")>; + +def SBWriteResGroup20 : SchedWriteRes<[SBPort0]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMADDUBSWrr64")>; +def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMULHRSWrr64")>; +def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMULUDQirr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMADDUBSWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMADDWDrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULDQrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULHRSWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULHUWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULHWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULLDrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULLWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULUDQrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PSADBWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VMOVMSKPSYrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMADDUBSWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMADDWDrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULDQrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULHRSWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULHWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULLDrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULLWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPSADBWrr")>; + +def SBWriteResGroup21 : SchedWriteRes<[SBPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup21], (instregex "ADDPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADDPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADDSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADDSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADDSUBPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADDSUBPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADD_FPrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADD_FST0r")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADD_FrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "BSF32rr")>; +def: InstRW<[SBWriteResGroup21], (instregex "BSR32rr")>; +def: InstRW<[SBWriteResGroup21], (instregex "CMPPDrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "CMPPSrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "CMPSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "CMPSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "CRC32r32r32")>; +def: InstRW<[SBWriteResGroup21], (instregex "CRC32r32r8")>; +def: InstRW<[SBWriteResGroup21], (instregex "CVTDQ2PSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "CVTPS2DQrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "CVTTPS2DQrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MAXPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MAXPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MAXSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MAXSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MINPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MINPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MINSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MINSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPI2PSirr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPS2PIirr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTTPS2PIirr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MUL8r")>; +def: InstRW<[SBWriteResGroup21], (instregex "POPCNT32rr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ROUNDPDr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ROUNDPSr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ROUNDSDr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ROUNDSSr")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FPrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FST0r")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUB_FPrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUB_FST0r")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUB_FrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDPDYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDPSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPDYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VBROADCASTF128")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPPDYrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPPDrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPPSYrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPPSrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTDQ2PSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTDQ2PSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTPS2DQYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTPS2DQrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTTPS2DQrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAXPDYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAXPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAXPSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAXPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAXSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAXSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMINPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMINPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMINSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMINSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VROUNDPDr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VROUNDPSr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VROUNDSDr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VSUBPDYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VSUBPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSrr")>; + +def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup22], (instregex "EXTRACTPSrr")>; +def: InstRW<[SBWriteResGroup22], (instregex "VEXTRACTPSrr")>; + +def SBWriteResGroup23 : SchedWriteRes<[SBPort0,SBPort15]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup23], (instregex "PEXTRBrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "PEXTRDrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "PEXTRQrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "PEXTRWri")>; +def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRBrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRDrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRQrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRWri")>; +def: InstRW<[SBWriteResGroup23], (instregex "SHL64rCL")>; +def: InstRW<[SBWriteResGroup23], (instregex "SHL8rCL")>; + +def SBWriteResGroup24 : SchedWriteRes<[SBPort15]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDSWrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDWrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBDrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBSWrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBWrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHADDDrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHADDSWrr128")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHADDWrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHSUBDrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHSUBSWrr128")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHSUBWrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHADDDrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHADDSWrr128")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHADDWrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBDrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBSWrr128")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBWrr")>; + +def SBWriteResGroup25 : SchedWriteRes<[SBPort015]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SBWriteResGroup25], (instregex "LEAVE64")>; +def: InstRW<[SBWriteResGroup25], (instregex "XADD32rr")>; +def: InstRW<[SBWriteResGroup25], (instregex "XADD8rr")>; + +def SBWriteResGroup26 : SchedWriteRes<[SBPort0,SBPort015]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup26], (instregex "CMOVA32rr")>; +def: InstRW<[SBWriteResGroup26], (instregex "CMOVBE32rr")>; + +def SBWriteResGroup27 : SchedWriteRes<[SBPort0,SBPort1]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup27], (instregex "MUL64r")>; + +def SBWriteResGroup28 : SchedWriteRes<[SBPort1,SBPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup28], (instregex "CVTDQ2PDrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTPD2DQrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTPD2PSrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTSD2SSrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTSI2SD64rr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTSI2SDrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTTPD2DQrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTPD2PIirr")>; +def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTPI2PDirr")>; +def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTTPD2PIirr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTDQ2PDYrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTDQ2PDrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2DQYrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2DQrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2PSYrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2PSrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTSI2SD64rr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTSI2SDrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTTPD2DQYrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTTPD2DQrr")>; + +def SBWriteResGroup29 : SchedWriteRes<[SBPort1,SBPort015]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup29], (instregex "MOV64sr")>; +def: InstRW<[SBWriteResGroup29], (instregex "PAUSE")>; + +def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup30], (instregex "MULPDrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "MULPSrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "MULSDrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "MULSSrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "MUL_FPrST0")>; +def: InstRW<[SBWriteResGroup30], (instregex "MUL_FST0r")>; +def: InstRW<[SBWriteResGroup30], (instregex "MUL_FrST0")>; +def: InstRW<[SBWriteResGroup30], (instregex "PCMPGTQrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "PHMINPOSUWrr128")>; +def: InstRW<[SBWriteResGroup30], (instregex "RCPPSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "RCPSSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "RSQRTPSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "RSQRTSSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULPDYrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULPDrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULPSYrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULPSrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULSDrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULSSrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VPCMPGTQrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VPHMINPOSUWrr128")>; +def: InstRW<[SBWriteResGroup30], (instregex "VRSQRTPSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VRSQRTSSr")>; + +def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup31], (instregex "MOV32rm")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOV8rm")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOVSX32rm16")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOVSX32rm8")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOVZX32rm16")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOVZX32rm8")>; +def: InstRW<[SBWriteResGroup31], (instregex "PREFETCH")>; + +def SBWriteResGroup32 : SchedWriteRes<[SBPort0,SBPort1]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup32], (instregex "CVTSD2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTSD2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTSS2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTSS2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTTSD2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTTSD2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTTSS2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTTSS2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTSD2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTSS2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTSS2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSD2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSD2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSS2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSS2SIrr")>; + +def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup33], (instregex "MOV64mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOV8mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVAPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVAPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVDQAmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVDQUmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVHPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVHPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVLPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVLPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVNTDQmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVNTI_64mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVNTImr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVNTPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVNTPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVPDI2DImr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVPQI2QImr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVPQIto64mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVSSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVUPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVUPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "PUSH64i8")>; +def: InstRW<[SBWriteResGroup33], (instregex "PUSH64r")>; +def: InstRW<[SBWriteResGroup33], (instregex "VEXTRACTF128mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPDYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPSYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQAYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQAmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQUYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQUmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVHPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVHPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVLPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVLPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTDQYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTDQmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPDYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPSYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVPDI2DImr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVPQI2QImr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVPQIto64mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVSDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVSSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPDYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPSYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPSmr")>; + +def SBWriteResGroup34 : SchedWriteRes<[SBPort0,SBPort15]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup34], (instregex "MPSADBWrri")>; +def: InstRW<[SBWriteResGroup34], (instregex "VMPSADBWrri")>; + +def SBWriteResGroup35 : SchedWriteRes<[SBPort1,SBPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup35], (instregex "CLI")>; +def: InstRW<[SBWriteResGroup35], (instregex "CVTSI2SS64rr")>; +def: InstRW<[SBWriteResGroup35], (instregex "CVTSI2SSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "HADDPDrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "HADDPSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "HSUBPDrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "HSUBPSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VCVTSI2SS64rr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VCVTSI2SSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHADDPDrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHADDPSYrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHADDPSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPDYrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPDrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPSYrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPSrr")>; + +def SBWriteResGroup36 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup36], (instregex "CALL64r")>; +def: InstRW<[SBWriteResGroup36], (instregex "EXTRACTPSmr")>; +def: InstRW<[SBWriteResGroup36], (instregex "VEXTRACTPSmr")>; + +def SBWriteResGroup37 : SchedWriteRes<[SBPort4,SBPort01,SBPort23]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPDYrm")>; +def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPDmr")>; +def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPSmr")>; + +def SBWriteResGroup38 : SchedWriteRes<[SBPort4,SBPort23,SBPort0]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup38], (instregex "SETAEm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETBm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETEm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETGEm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETGm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETLEm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETLm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETNEm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETNOm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETNPm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETNSm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETOm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETPm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETSm")>; + +def SBWriteResGroup39 : SchedWriteRes<[SBPort4,SBPort23,SBPort15]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup39], (instregex "PEXTRBmr")>; +def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRBmr")>; +def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRDmr")>; +def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRWmr")>; + +def SBWriteResGroup40 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup40], (instregex "MOV8mi")>; +def: InstRW<[SBWriteResGroup40], (instregex "STOSB")>; +def: InstRW<[SBWriteResGroup40], (instregex "STOSL")>; +def: InstRW<[SBWriteResGroup40], (instregex "STOSQ")>; +def: InstRW<[SBWriteResGroup40], (instregex "STOSW")>; + +def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup41], (instregex "FNINIT")>; + +def SBWriteResGroup42 : SchedWriteRes<[SBPort0,SBPort015]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG32rr")>; +def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG8rr")>; + +def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort0]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup43], (instregex "SETAm")>; +def: InstRW<[SBWriteResGroup43], (instregex "SETBEm")>; + +def SBWriteResGroup44 : SchedWriteRes<[SBPort0,SBPort4,SBPort5,SBPort23]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup44], (instregex "LDMXCSR")>; +def: InstRW<[SBWriteResGroup44], (instregex "STMXCSR")>; +def: InstRW<[SBWriteResGroup44], (instregex "VLDMXCSR")>; +def: InstRW<[SBWriteResGroup44], (instregex "VSTMXCSR")>; + +def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup45], (instregex "PEXTRDmr")>; +def: InstRW<[SBWriteResGroup45], (instregex "PEXTRQmr")>; +def: InstRW<[SBWriteResGroup45], (instregex "VPEXTRQmr")>; +def: InstRW<[SBWriteResGroup45], (instregex "PUSHF16")>; +def: InstRW<[SBWriteResGroup45], (instregex "PUSHF64")>; + +def SBWriteResGroup46 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup46], (instregex "CLFLUSH")>; + +def SBWriteResGroup47 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SBWriteResGroup47], (instregex "FXRSTOR")>; + +def SBWriteResGroup48 : SchedWriteRes<[SBPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup48], (instregex "LDDQUrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOV64toPQIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVAPDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVAPSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVDDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVDI2PDIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVDQArm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVDQUrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVNTDQArm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVSHDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVSLDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVSSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVUPDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVUPSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "POP64r")>; +def: InstRW<[SBWriteResGroup48], (instregex "VBROADCASTSSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VLDDQUYrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VLDDQUrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOV64toPQIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVAPDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVAPSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVDDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVDI2PDIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVDQArm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVDQUrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVNTDQArm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVQI2PQIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVSDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVSHDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVSLDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVSSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVUPDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVUPSrm")>; + +def SBWriteResGroup49 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup49], (instregex "JMP64m")>; +def: InstRW<[SBWriteResGroup49], (instregex "MOV64sm")>; + +def SBWriteResGroup50 : SchedWriteRes<[SBPort23,SBPort0]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup50], (instregex "BT64mi8")>; + +def SBWriteResGroup51 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSBrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSDrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSWrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PALIGNR64irm")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSHUFBrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNBrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNDrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNWrm64")>; + +def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup52], (instregex "ADD64rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "ADD8rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "AND64rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "AND8rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP64mi8")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP64mr")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP64rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP8mi")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP8mr")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP8rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "LODSL")>; +def: InstRW<[SBWriteResGroup52], (instregex "LODSQ")>; +def: InstRW<[SBWriteResGroup52], (instregex "OR64rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "OR8rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "SUB64rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "SUB8rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "XOR64rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "XOR8rm")>; + +def SBWriteResGroup53 : SchedWriteRes<[SBPort4,SBPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup53], (instregex "POP64rmm")>; +def: InstRW<[SBWriteResGroup53], (instregex "PUSH64rmm")>; +def: InstRW<[SBWriteResGroup53], (instregex "ST_F32m")>; +def: InstRW<[SBWriteResGroup53], (instregex "ST_F64m")>; +def: InstRW<[SBWriteResGroup53], (instregex "ST_FP32m")>; +def: InstRW<[SBWriteResGroup53], (instregex "ST_FP64m")>; +def: InstRW<[SBWriteResGroup53], (instregex "ST_FP80m")>; + +def SBWriteResGroup54 : SchedWriteRes<[SBPort23]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSDYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSSrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVAPDYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVAPSYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVDDUPYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVDQAYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVDQUYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVSHDUPYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVSLDUPYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVUPDYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVUPSYrm")>; + +def SBWriteResGroup55 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup55], (instregex "CVTPS2PDrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "CVTSS2SDrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "VCVTPS2PDYrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "VCVTPS2PDrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "VCVTSS2SDrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "VTESTPDrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "VTESTPSrm")>; + +def SBWriteResGroup56 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup56], (instregex "ANDNPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "ANDNPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "ANDPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "ANDPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "INSERTPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "MOVHPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "MOVHPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "MOVLPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "MOVLPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "ORPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "ORPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "SHUFPDrmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "SHUFPSrmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "UNPCKHPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "UNPCKHPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "UNPCKLPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "UNPCKLPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VANDNPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VANDNPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VANDPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VANDPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VBROADCASTF128")>; +def: InstRW<[SBWriteResGroup56], (instregex "VINSERTPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VMOVHPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VMOVHPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VMOVLPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VMOVLPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VORPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VORPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPDmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPDri")>; +def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPSmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPSri")>; +def: InstRW<[SBWriteResGroup56], (instregex "VSHUFPDrmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "VSHUFPSrmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKHPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKHPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKLPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKLPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VXORPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VXORPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "XORPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "XORPSrm")>; + +def SBWriteResGroup57 : SchedWriteRes<[SBPort5,SBPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup57], (instregex "AESDECLASTrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "AESDECrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "AESENCLASTrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "AESENCrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "KANDQrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "VAESDECLASTrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "VAESDECrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "VAESENCrr")>; + +def SBWriteResGroup58 : SchedWriteRes<[SBPort23,SBPort0]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup58], (instregex "BLENDPDrmi")>; +def: InstRW<[SBWriteResGroup58], (instregex "BLENDPSrmi")>; +def: InstRW<[SBWriteResGroup58], (instregex "VBLENDPDrmi")>; +def: InstRW<[SBWriteResGroup58], (instregex "VBLENDPSrmi")>; +def: InstRW<[SBWriteResGroup58], (instregex "VINSERTF128rm")>; + +def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup59], (instregex "MMX_PADDQirm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PABSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PABSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PABSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PACKSSDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PACKSSWBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PACKUSDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PACKUSWBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDUSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDUSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PALIGNRrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PAVGBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PAVGWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PBLENDWrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PINSRBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PINSRDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PINSRQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PINSRWrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXUBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXUDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXUWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINUBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINUDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINUWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXWQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXWQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSHUFBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSHUFDmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSHUFHWmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSHUFLWmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSIGNBrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSIGNDrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSIGNWrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBUSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBUSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHQDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLQDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPABSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPABSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPABSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPACKSSDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPACKSSWBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPACKUSDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPACKUSWBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDUSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDUSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPALIGNRrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPAVGBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPAVGWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPBLENDWrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPINSRBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPINSRDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPINSRQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPINSRWrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINUBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINUDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINUWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXWQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXWQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFDmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFHWmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFLWmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNBrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNDrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNWrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBUSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBUSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHQDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLQDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLWDrm")>; + +def SBWriteResGroup60 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup60], (instregex "PANDNrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "PANDrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "PORrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "PXORrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "VPANDNrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "VPANDrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "VPORrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "VPXORrm")>; + +def SBWriteResGroup61 : SchedWriteRes<[SBPort0,SBPort0]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup61], (instregex "VRCPPSr")>; +def: InstRW<[SBWriteResGroup61], (instregex "VRSQRTPSYr")>; + +def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup62], (instregex "VERRm")>; +def: InstRW<[SBWriteResGroup62], (instregex "VERWm")>; + +def SBWriteResGroup63 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup63], (instregex "LODSB")>; +def: InstRW<[SBWriteResGroup63], (instregex "LODSW")>; + +def SBWriteResGroup64 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup64], (instregex "FARJMP64")>; + +def SBWriteResGroup65 : SchedWriteRes<[SBPort23,SBPort0,SBPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup65], (instregex "ADC64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "ADC8rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVAE64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVB64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVE64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVG64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVGE64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVL64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVLE64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVNE64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVNO64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVNP64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVNS64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVO64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVP64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVS64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "SBB64rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "SBB8rm")>; + +def SBWriteResGroup66 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup66], (instregex "FNSTSWm")>; + +def SBWriteResGroup67 : SchedWriteRes<[SBPort1,SBPort5,SBPort015]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup67], (instregex "SLDT32r")>; +def: InstRW<[SBWriteResGroup67], (instregex "STR32r")>; + +def SBWriteResGroup68 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup68], (instregex "CALL64m")>; +def: InstRW<[SBWriteResGroup68], (instregex "FNSTCW16m")>; + +def SBWriteResGroup69 : SchedWriteRes<[SBPort4,SBPort23,SBPort0]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup69], (instregex "BTC64mi8")>; +def: InstRW<[SBWriteResGroup69], (instregex "BTR64mi8")>; +def: InstRW<[SBWriteResGroup69], (instregex "BTS64mi8")>; +def: InstRW<[SBWriteResGroup69], (instregex "SAR64mi")>; +def: InstRW<[SBWriteResGroup69], (instregex "SAR8mi")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHL64m1")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHL64mi")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHL8m1")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHL8mi")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHR64mi")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHR8mi")>; + +def SBWriteResGroup70 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup70], (instregex "ADD64mi8")>; +def: InstRW<[SBWriteResGroup70], (instregex "ADD64mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "ADD8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "ADD8mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "AND64mi8")>; +def: InstRW<[SBWriteResGroup70], (instregex "AND64mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "AND8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "AND8mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "DEC64m")>; +def: InstRW<[SBWriteResGroup70], (instregex "DEC8m")>; +def: InstRW<[SBWriteResGroup70], (instregex "INC64m")>; +def: InstRW<[SBWriteResGroup70], (instregex "INC8m")>; +def: InstRW<[SBWriteResGroup70], (instregex "NEG64m")>; +def: InstRW<[SBWriteResGroup70], (instregex "NEG8m")>; +def: InstRW<[SBWriteResGroup70], (instregex "NOT64m")>; +def: InstRW<[SBWriteResGroup70], (instregex "NOT8m")>; +def: InstRW<[SBWriteResGroup70], (instregex "OR64mi8")>; +def: InstRW<[SBWriteResGroup70], (instregex "OR64mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "OR8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "OR8mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "SUB64mi8")>; +def: InstRW<[SBWriteResGroup70], (instregex "SUB64mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "SUB8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "SUB8mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "TEST64rm")>; +def: InstRW<[SBWriteResGroup70], (instregex "TEST8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "TEST8rm")>; +def: InstRW<[SBWriteResGroup70], (instregex "XOR64mi8")>; +def: InstRW<[SBWriteResGroup70], (instregex "XOR64mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "XOR8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "XOR8mr")>; + +def SBWriteResGroup71 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup71], (instregex "MMX_PMADDUBSWrm64")>; +def: InstRW<[SBWriteResGroup71], (instregex "MMX_PMULHRSWrm64")>; +def: InstRW<[SBWriteResGroup71], (instregex "VTESTPDYrm")>; +def: InstRW<[SBWriteResGroup71], (instregex "VTESTPSYrm")>; + +def SBWriteResGroup72 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup72], (instregex "BSF64rm")>; +def: InstRW<[SBWriteResGroup72], (instregex "BSR64rm")>; +def: InstRW<[SBWriteResGroup72], (instregex "CRC32r32m16")>; +def: InstRW<[SBWriteResGroup72], (instregex "CRC32r32m8")>; +def: InstRW<[SBWriteResGroup72], (instregex "FCOM32m")>; +def: InstRW<[SBWriteResGroup72], (instregex "FCOM64m")>; +def: InstRW<[SBWriteResGroup72], (instregex "FCOMP32m")>; +def: InstRW<[SBWriteResGroup72], (instregex "FCOMP64m")>; +def: InstRW<[SBWriteResGroup72], (instregex "MUL8m")>; + +def SBWriteResGroup73 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup73], (instregex "VANDNPDYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VANDNPSYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VANDPDrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VANDPSrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VORPDYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VORPSYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VPERM2F128rm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPDYri")>; +def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPDmi")>; +def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPSYri")>; +def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPSmi")>; +def: InstRW<[SBWriteResGroup73], (instregex "VSHUFPDYrmi")>; +def: InstRW<[SBWriteResGroup73], (instregex "VSHUFPSYrmi")>; +def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKHPDrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKHPSrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKLPDYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKLPSYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VXORPDrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VXORPSrm")>; + +def SBWriteResGroup74 : SchedWriteRes<[SBPort23,SBPort0]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup74], (instregex "VBLENDPDYrmi")>; +def: InstRW<[SBWriteResGroup74], (instregex "VBLENDPSYrmi")>; + +def SBWriteResGroup75 : SchedWriteRes<[SBPort23,SBPort0]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPDrm0")>; +def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPSrm0")>; +def: InstRW<[SBWriteResGroup75], (instregex "VBLENDVPDrm")>; +def: InstRW<[SBWriteResGroup75], (instregex "VBLENDVPSrm")>; +def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPDrm")>; +def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPSrm")>; + +def SBWriteResGroup76 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup76], (instregex "PBLENDVBrr0")>; +def: InstRW<[SBWriteResGroup76], (instregex "VPBLENDVBrm")>; + +def SBWriteResGroup77 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup77], (instregex "COMISDrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "COMISSrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "UCOMISDrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "UCOMISSrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "VCOMISDrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "VCOMISSrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "VUCOMISDrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "VUCOMISSrm")>; + +def SBWriteResGroup78 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup78], (instregex "PTESTrm")>; +def: InstRW<[SBWriteResGroup78], (instregex "VPTESTrm")>; + +def SBWriteResGroup79 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup79], (instregex "PSLLDrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSLLQrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSLLWrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSRADrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSRAWrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSRLDrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSRLQrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSRLWrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSLLDri")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSLLQri")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSLLWri")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSRADrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSRAWrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSRLDrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSRLQrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSRLWrm")>; + +def SBWriteResGroup80 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDSWrm64")>; +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDWrm64")>; +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDrm64")>; +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBDrm64")>; +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBSWrm64")>; +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBWrm64")>; + +def SBWriteResGroup81 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG64rm")>; +def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG8rm")>; + +def SBWriteResGroup82 : SchedWriteRes<[SBPort23,SBPort0,SBPort015]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup82], (instregex "CMOVA64rm")>; +def: InstRW<[SBWriteResGroup82], (instregex "CMOVBE64rm")>; + +def SBWriteResGroup83 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[SBWriteResGroup83], (instregex "CMPSB")>; +def: InstRW<[SBWriteResGroup83], (instregex "CMPSL")>; +def: InstRW<[SBWriteResGroup83], (instregex "CMPSQ")>; +def: InstRW<[SBWriteResGroup83], (instregex "CMPSW")>; + +def SBWriteResGroup84 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,2]; +} +def: InstRW<[SBWriteResGroup84], (instregex "FLDCW16m")>; + +def SBWriteResGroup85 : SchedWriteRes<[SBPort4,SBPort23,SBPort0]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,2]; +} +def: InstRW<[SBWriteResGroup85], (instregex "ROL64mi")>; +def: InstRW<[SBWriteResGroup85], (instregex "ROL8mi")>; +def: InstRW<[SBWriteResGroup85], (instregex "ROR64mi")>; +def: InstRW<[SBWriteResGroup85], (instregex "ROR8mi")>; + +def SBWriteResGroup86 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,2]; +} +def: InstRW<[SBWriteResGroup86], (instregex "MOVSB")>; +def: InstRW<[SBWriteResGroup86], (instregex "MOVSL")>; +def: InstRW<[SBWriteResGroup86], (instregex "MOVSQ")>; +def: InstRW<[SBWriteResGroup86], (instregex "MOVSW")>; +def: InstRW<[SBWriteResGroup86], (instregex "XADD64rm")>; +def: InstRW<[SBWriteResGroup86], (instregex "XADD8rm")>; + +def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SBWriteResGroup87], (instregex "FARCALL64")>; + +def SBWriteResGroup88 : SchedWriteRes<[SBPort4,SBPort23,SBPort0,SBPort015]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SBWriteResGroup88], (instregex "SHLD64mri8")>; +def: InstRW<[SBWriteResGroup88], (instregex "SHRD64mri8")>; + +def SBWriteResGroup89 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup89], (instregex "MMX_PMULUDQirm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMADDUBSWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMADDWDrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULDQrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULHRSWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULHUWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULHWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULLDrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULLWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULUDQrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PSADBWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMADDUBSWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMADDWDrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULDQrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULHRSWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULHUWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULHWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULLDrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULLWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULUDQrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPSADBWrm")>; + +def SBWriteResGroup90 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup90], (instregex "ADDPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ADDPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ADDSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ADDSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ADDSUBPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ADDSUBPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CMPPDrmi")>; +def: InstRW<[SBWriteResGroup90], (instregex "CMPPSrmi")>; +def: InstRW<[SBWriteResGroup90], (instregex "CMPSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CVTDQ2PSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CVTPS2DQrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CVTSI2SD64rm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CVTSI2SDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CVTTPS2DQrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MAXPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MAXPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MAXSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MAXSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MINPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MINPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MINSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MINSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTPI2PSirm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTPS2PIirm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTTPS2PIirm")>; +def: InstRW<[SBWriteResGroup90], (instregex "POPCNT64rm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ROUNDPDm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ROUNDPSm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ROUNDSDm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ROUNDSSm")>; +def: InstRW<[SBWriteResGroup90], (instregex "SUBPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "SUBPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "SUBSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "SUBSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDSUBPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDSUBPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCMPPDrmi")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCMPPSrmi")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCMPSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCMPSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCVTDQ2PSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCVTPS2DQrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCVTSI2SD64rm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCVTSI2SDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCVTTPS2DQrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMAXPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMAXPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMAXSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMAXSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMINPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMINPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMINSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMINSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VROUNDPDm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VROUNDPSm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VROUNDSDm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VROUNDSSm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VSUBPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VSUBPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VSUBSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VSUBSSrm")>; + +def SBWriteResGroup91 : SchedWriteRes<[SBPort23,SBPort0]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup91], (instregex "VBLENDVPDYrm")>; +def: InstRW<[SBWriteResGroup91], (instregex "VBLENDVPSYrm")>; +def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPDrm")>; +def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPSrm")>; + +def SBWriteResGroup92 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup92], (instregex "DPPDrri")>; +def: InstRW<[SBWriteResGroup92], (instregex "VDPPDrri")>; + +def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup93], (instregex "CVTSD2SI64rm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTSD2SIrm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTSS2SI64rm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTSS2SIrm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTTSD2SI64rm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTTSD2SIrm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTTSS2SI64rm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTTSS2SIrm")>; +def: InstRW<[SBWriteResGroup93], (instregex "MUL64m")>; + +def SBWriteResGroup94 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup94], (instregex "VPTESTYrm")>; + +def SBWriteResGroup95 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup95], (instregex "LD_F32m")>; +def: InstRW<[SBWriteResGroup95], (instregex "LD_F64m")>; +def: InstRW<[SBWriteResGroup95], (instregex "LD_F80m")>; + +def SBWriteResGroup96 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup96], (instregex "PHADDDrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "PHADDSWrm128")>; +def: InstRW<[SBWriteResGroup96], (instregex "PHADDWrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "PHSUBDrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "PHSUBSWrm128")>; +def: InstRW<[SBWriteResGroup96], (instregex "PHSUBWrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHADDDrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHADDSWrm128")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHADDWrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBDrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBSWrm128")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBWrm")>; + +def SBWriteResGroup97 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup97], (instregex "IST_F16m")>; +def: InstRW<[SBWriteResGroup97], (instregex "IST_F32m")>; +def: InstRW<[SBWriteResGroup97], (instregex "IST_FP16m")>; +def: InstRW<[SBWriteResGroup97], (instregex "IST_FP32m")>; +def: InstRW<[SBWriteResGroup97], (instregex "IST_FP64m")>; +def: InstRW<[SBWriteResGroup97], (instregex "SHL64mCL")>; +def: InstRW<[SBWriteResGroup97], (instregex "SHL8mCL")>; + +def SBWriteResGroup98 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,2,3]; +} +def: InstRW<[SBWriteResGroup98], (instregex "ADC64mi8")>; +def: InstRW<[SBWriteResGroup98], (instregex "ADC8mi")>; +def: InstRW<[SBWriteResGroup98], (instregex "SBB64mi8")>; +def: InstRW<[SBWriteResGroup98], (instregex "SBB8mi")>; + +def SBWriteResGroup99 : SchedWriteRes<[SBPort4,SBPort23,SBPort0,SBPort015]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,2,2,1]; +} +def: InstRW<[SBWriteResGroup99], (instregex "ADC64mr")>; +def: InstRW<[SBWriteResGroup99], (instregex "ADC8mr")>; +def: InstRW<[SBWriteResGroup99], (instregex "SBB64mr")>; +def: InstRW<[SBWriteResGroup99], (instregex "SBB8mr")>; + +def SBWriteResGroup100 : SchedWriteRes<[SBPort4,SBPort5,SBPort23,SBPort0,SBPort015]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,1,2,1,1]; +} +def: InstRW<[SBWriteResGroup100], (instregex "BT64mr")>; +def: InstRW<[SBWriteResGroup100], (instregex "BTC64mr")>; +def: InstRW<[SBWriteResGroup100], (instregex "BTR64mr")>; +def: InstRW<[SBWriteResGroup100], (instregex "BTS64mr")>; + +def SBWriteResGroup101 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup101], (instregex "ADD_F32m")>; +def: InstRW<[SBWriteResGroup101], (instregex "ADD_F64m")>; +def: InstRW<[SBWriteResGroup101], (instregex "ILD_F16m")>; +def: InstRW<[SBWriteResGroup101], (instregex "ILD_F32m")>; +def: InstRW<[SBWriteResGroup101], (instregex "ILD_F64m")>; +def: InstRW<[SBWriteResGroup101], (instregex "SUBR_F32m")>; +def: InstRW<[SBWriteResGroup101], (instregex "SUBR_F64m")>; +def: InstRW<[SBWriteResGroup101], (instregex "SUB_F32m")>; +def: InstRW<[SBWriteResGroup101], (instregex "SUB_F64m")>; +def: InstRW<[SBWriteResGroup101], (instregex "VADDPDYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VADDPSYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VADDSUBPDYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VADDSUBPSYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VCMPPDYrmi")>; +def: InstRW<[SBWriteResGroup101], (instregex "VCMPPSYrmi")>; +def: InstRW<[SBWriteResGroup101], (instregex "VCVTDQ2PSYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VCVTPS2DQYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VCVTTPS2DQrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VMAXPDYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VMAXPSYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VMINPDrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VMINPSrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VROUNDPDm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VROUNDPSm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VSUBPDYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VSUBPSYrm")>; + +def SBWriteResGroup102 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup102], (instregex "VCVTSD2SI64rm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTSD2SI64rr")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTSS2SI64rm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTSS2SIrm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSD2SI64rm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSD2SI64rr")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSS2SI64rm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSS2SIrm")>; + +def SBWriteResGroup103 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup103], (instregex "CVTDQ2PDrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTPD2DQrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTPD2PSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTSD2SSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTSI2SS64rm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTSI2SSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTTPD2DQrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTPD2PIirm")>; +def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTPI2PDirm")>; +def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTTPD2PIirm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTDQ2PDYrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTDQ2PDrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTPD2DQrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTPD2PSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTSD2SSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTSI2SS64rm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTSI2SSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTTPD2DQrm")>; + +def SBWriteResGroup104 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup104], (instregex "MULPDrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "MULPSrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "MULSDrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "MULSSrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "PCMPGTQrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "PHMINPOSUWrm128")>; +def: InstRW<[SBWriteResGroup104], (instregex "RCPPSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "RCPSSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "RSQRTPSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "RSQRTSSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VMULPDrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VMULPSrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VMULSDrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VMULSSrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VPCMPGTQrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VPHMINPOSUWrm128")>; +def: InstRW<[SBWriteResGroup104], (instregex "VRCPPSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VRCPSSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VRSQRTPSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VRSQRTSSm")>; + +def SBWriteResGroup105 : SchedWriteRes<[SBPort0]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SBWriteResGroup105], (instregex "PCMPISTRIrr")>; +def: InstRW<[SBWriteResGroup105], (instregex "PCMPISTRM128rr")>; +def: InstRW<[SBWriteResGroup105], (instregex "VPCMPISTRIrr")>; +def: InstRW<[SBWriteResGroup105], (instregex "VPCMPISTRM128rr")>; + +def SBWriteResGroup106 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup106], (instregex "FICOM16m")>; +def: InstRW<[SBWriteResGroup106], (instregex "FICOM32m")>; +def: InstRW<[SBWriteResGroup106], (instregex "FICOMP16m")>; +def: InstRW<[SBWriteResGroup106], (instregex "FICOMP32m")>; + +def SBWriteResGroup107 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup107], (instregex "VCVTPD2DQYrm")>; +def: InstRW<[SBWriteResGroup107], (instregex "VCVTPD2PSYrm")>; +def: InstRW<[SBWriteResGroup107], (instregex "VCVTTPD2DQYrm")>; + +def SBWriteResGroup108 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup108], (instregex "MPSADBWrmi")>; +def: InstRW<[SBWriteResGroup108], (instregex "VMPSADBWrmi")>; + +def SBWriteResGroup109 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup109], (instregex "HADDPDrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "HADDPSrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "HSUBPDrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "HSUBPSrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "VHADDPDrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "VHADDPSrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "VHSUBPDrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "VHSUBPSrm")>; + +def SBWriteResGroup110 : SchedWriteRes<[SBPort5]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SBWriteResGroup110], (instregex "AESIMCrr")>; +def: InstRW<[SBWriteResGroup110], (instregex "VAESIMCrr")>; + +def SBWriteResGroup111 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup111], (instregex "MUL_F32m")>; +def: InstRW<[SBWriteResGroup111], (instregex "MUL_F64m")>; +def: InstRW<[SBWriteResGroup111], (instregex "VMULPDYrm")>; +def: InstRW<[SBWriteResGroup111], (instregex "VMULPSYrm")>; + +def SBWriteResGroup112 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup112], (instregex "DPPSrri")>; +def: InstRW<[SBWriteResGroup112], (instregex "VDPPSYrri")>; +def: InstRW<[SBWriteResGroup112], (instregex "VDPPSrri")>; + +def SBWriteResGroup113 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup113], (instregex "VHADDPDrm")>; +def: InstRW<[SBWriteResGroup113], (instregex "VHADDPSYrm")>; +def: InstRW<[SBWriteResGroup113], (instregex "VHSUBPDYrm")>; +def: InstRW<[SBWriteResGroup113], (instregex "VHSUBPSYrm")>; + +def SBWriteResGroup114 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup114], (instregex "ADD_FI16m")>; +def: InstRW<[SBWriteResGroup114], (instregex "ADD_FI32m")>; +def: InstRW<[SBWriteResGroup114], (instregex "SUBR_FI16m")>; +def: InstRW<[SBWriteResGroup114], (instregex "SUBR_FI32m")>; +def: InstRW<[SBWriteResGroup114], (instregex "SUB_FI16m")>; +def: InstRW<[SBWriteResGroup114], (instregex "SUB_FI32m")>; + +def SBWriteResGroup115 : SchedWriteRes<[SBPort5,SBPort23,SBPort015]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup115], (instregex "AESDECLASTrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "AESDECrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "AESENCLASTrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "AESENCrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "VAESDECLASTrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "VAESDECrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "VAESENCLASTrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "VAESENCrm")>; + +def SBWriteResGroup116 : SchedWriteRes<[SBPort0]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup116], (instregex "DIVPSrr")>; +def: InstRW<[SBWriteResGroup116], (instregex "DIVSSrr")>; +def: InstRW<[SBWriteResGroup116], (instregex "SQRTPSr")>; +def: InstRW<[SBWriteResGroup116], (instregex "SQRTSSr")>; +def: InstRW<[SBWriteResGroup116], (instregex "VDIVPSrr")>; +def: InstRW<[SBWriteResGroup116], (instregex "VDIVSSrr")>; +def: InstRW<[SBWriteResGroup116], (instregex "VSQRTPSr")>; + +def SBWriteResGroup117 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 14; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup117], (instregex "VSQRTSSm")>; + +def SBWriteResGroup118 : SchedWriteRes<[SBPort0,SBPort23,SBPort0]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SBWriteResGroup118], (instregex "VRCPPSm")>; +def: InstRW<[SBWriteResGroup118], (instregex "VRSQRTPSYm")>; + +def SBWriteResGroup119 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 15; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI16m")>; +def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI32m")>; + +def SBWriteResGroup120 : SchedWriteRes<[SBPort0,SBPort1,SBPort5,SBPort23]> { + let Latency = 15; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup120], (instregex "DPPDrmi")>; +def: InstRW<[SBWriteResGroup120], (instregex "VDPPDrmi")>; + +def SBWriteResGroup121 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} +def: InstRW<[SBWriteResGroup121], (instregex "PCMPISTRIrm")>; +def: InstRW<[SBWriteResGroup121], (instregex "PCMPISTRM128rm")>; +def: InstRW<[SBWriteResGroup121], (instregex "VPCMPISTRIrm")>; +def: InstRW<[SBWriteResGroup121], (instregex "VPCMPISTRM128rm")>; + +def SBWriteResGroup122 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 18; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup122], (instregex "AESIMCrm")>; +def: InstRW<[SBWriteResGroup122], (instregex "VAESIMCrm")>; + +def SBWriteResGroup123 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 20; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup123], (instregex "DIVPSrm")>; +def: InstRW<[SBWriteResGroup123], (instregex "DIVSSrm")>; +def: InstRW<[SBWriteResGroup123], (instregex "SQRTPSm")>; +def: InstRW<[SBWriteResGroup123], (instregex "SQRTSSm")>; +def: InstRW<[SBWriteResGroup123], (instregex "VDIVPSrm")>; +def: InstRW<[SBWriteResGroup123], (instregex "VDIVSSrm")>; +def: InstRW<[SBWriteResGroup123], (instregex "VSQRTPSm")>; + +def SBWriteResGroup124 : SchedWriteRes<[SBPort0]> { + let Latency = 21; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup124], (instregex "VSQRTSDr")>; + +def SBWriteResGroup125 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 21; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup125], (instregex "VSQRTSDm")>; + +def SBWriteResGroup126 : SchedWriteRes<[SBPort0]> { + let Latency = 22; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup126], (instregex "DIVPDrr")>; +def: InstRW<[SBWriteResGroup126], (instregex "DIVSDrr")>; +def: InstRW<[SBWriteResGroup126], (instregex "SQRTPDr")>; +def: InstRW<[SBWriteResGroup126], (instregex "SQRTSDr")>; +def: InstRW<[SBWriteResGroup126], (instregex "VDIVPDrr")>; +def: InstRW<[SBWriteResGroup126], (instregex "VDIVSDrr")>; +def: InstRW<[SBWriteResGroup126], (instregex "VSQRTPDr")>; + +def SBWriteResGroup127 : SchedWriteRes<[SBPort0]> { + let Latency = 24; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FPrST0")>; +def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FST0r")>; +def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FrST0")>; +def: InstRW<[SBWriteResGroup127], (instregex "DIV_FPrST0")>; +def: InstRW<[SBWriteResGroup127], (instregex "DIV_FST0r")>; +def: InstRW<[SBWriteResGroup127], (instregex "DIV_FrST0")>; + +def SBWriteResGroup128 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 28; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup128], (instregex "DIVPDrm")>; +def: InstRW<[SBWriteResGroup128], (instregex "DIVSDrm")>; +def: InstRW<[SBWriteResGroup128], (instregex "SQRTPDm")>; +def: InstRW<[SBWriteResGroup128], (instregex "SQRTSDm")>; +def: InstRW<[SBWriteResGroup128], (instregex "VDIVPDrm")>; +def: InstRW<[SBWriteResGroup128], (instregex "VDIVSDrm")>; +def: InstRW<[SBWriteResGroup128], (instregex "VSQRTPDm")>; + +def SBWriteResGroup129 : SchedWriteRes<[SBPort0,SBPort0]> { + let Latency = 29; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup129], (instregex "VDIVPSYrr")>; +def: InstRW<[SBWriteResGroup129], (instregex "VSQRTPSYr")>; + +def SBWriteResGroup130 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 31; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup130], (instregex "DIVR_F32m")>; +def: InstRW<[SBWriteResGroup130], (instregex "DIVR_F64m")>; +def: InstRW<[SBWriteResGroup130], (instregex "DIV_F32m")>; +def: InstRW<[SBWriteResGroup130], (instregex "DIV_F64m")>; + +def SBWriteResGroup131 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 34; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup131], (instregex "DIVR_FI16m")>; +def: InstRW<[SBWriteResGroup131], (instregex "DIVR_FI32m")>; +def: InstRW<[SBWriteResGroup131], (instregex "DIV_FI16m")>; +def: InstRW<[SBWriteResGroup131], (instregex "DIV_FI32m")>; + +def SBWriteResGroup132 : SchedWriteRes<[SBPort0,SBPort23,SBPort0]> { + let Latency = 36; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SBWriteResGroup132], (instregex "VDIVPSYrm")>; +def: InstRW<[SBWriteResGroup132], (instregex "VSQRTPSYm")>; + +def SBWriteResGroup133 : SchedWriteRes<[SBPort0,SBPort0]> { + let Latency = 45; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup133], (instregex "VDIVPDYrr")>; +def: InstRW<[SBWriteResGroup133], (instregex "VSQRTPDYr")>; + +def SBWriteResGroup134 : SchedWriteRes<[SBPort0,SBPort23,SBPort0]> { + let Latency = 52; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SBWriteResGroup134], (instregex "VDIVPDYrm")>; +def: InstRW<[SBWriteResGroup134], (instregex "VSQRTPDYm")>; + +def SBWriteResGroup135 : SchedWriteRes<[SBPort0]> { + let Latency = 114; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup135], (instregex "VSQRTSSr")>; + } // SchedModel diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 6cb2a3694d92..ed53893b779c 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -369,5 +369,82 @@ def : WriteRes<WriteSystem, [JAny]> { let Latency = 100; } def : WriteRes<WriteMicrocoded, [JAny]> { let Latency = 100; } def : WriteRes<WriteFence, [JSAGU]>; def : WriteRes<WriteNop, []>; + +//////////////////////////////////////////////////////////////////////////////// +// AVX instructions. +//////////////////////////////////////////////////////////////////////////////// + +def WriteFAddY: SchedWriteRes<[JFPU0]> { + let Latency = 3; + let ResourceCycles = [2]; +} +def : InstRW<[WriteFAddY], (instregex "VADD(SUB)?P(S|D)Yrr", "VSUBP(S|D)Yrr")>; + +def WriteFAddYLd: SchedWriteRes<[JLAGU, JFPU0]> { + let Latency = 8; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteFAddYLd, ReadAfterLd], (instregex "VADD(SUB)?P(S|D)Yrm", "VSUBP(S|D)Yrm")>; + +def WriteFDivY: SchedWriteRes<[JFPU1]> { + let Latency = 38; + let ResourceCycles = [38]; +} +def : InstRW<[WriteFDivY], (instregex "VDIVP(D|S)Yrr")>; + +def WriteFDivYLd: SchedWriteRes<[JLAGU, JFPU1]> { + let Latency = 43; + let ResourceCycles = [1, 38]; +} +def : InstRW<[WriteFDivYLd, ReadAfterLd], (instregex "VDIVP(S|D)Yrm")>; + +def WriteVMULYPD: SchedWriteRes<[JFPU1]> { + let Latency = 4; + let ResourceCycles = [4]; +} +def : InstRW<[WriteVMULYPD], (instregex "VMULPDYrr")>; + +def WriteVMULYPDLd: SchedWriteRes<[JLAGU, JFPU1]> { + let Latency = 9; + let ResourceCycles = [1, 4]; +} +def : InstRW<[WriteVMULYPDLd, ReadAfterLd], (instregex "VMULPDYrm")>; + +def WriteVMULYPS: SchedWriteRes<[JFPU1]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : InstRW<[WriteVMULYPS], (instregex "VMULPSYrr", "VRCPPSYr", "VRSQRTPSYr")>; + +def WriteVMULYPSLd: SchedWriteRes<[JLAGU, JFPU1]> { + let Latency = 7; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteVMULYPSLd, ReadAfterLd], (instregex "VMULPSYrm", "VRCPPSYm", "VRSQRTPSYm")>; + +def WriteVSQRTYPD: SchedWriteRes<[JFPU1]> { + let Latency = 54; + let ResourceCycles = [54]; +} +def : InstRW<[WriteVSQRTYPD], (instregex "VSQRTPDYr")>; + +def WriteVSQRTYPDLd: SchedWriteRes<[JLAGU, JFPU1]> { + let Latency = 59; + let ResourceCycles = [1, 54]; +} +def : InstRW<[WriteVSQRTYPDLd], (instregex "VSQRTPDYm")>; + +def WriteVSQRTYPS: SchedWriteRes<[JFPU1]> { + let Latency = 42; + let ResourceCycles = [42]; +} +def : InstRW<[WriteVSQRTYPS], (instregex "VSQRTPSYr")>; + +def WriteVSQRTYPSLd: SchedWriteRes<[JLAGU, JFPU1]> { + let Latency = 47; + let ResourceCycles = [1, 42]; +} +def : InstRW<[WriteVSQRTYPSLd], (instregex "VSQRTPSYm")>; + } // SchedModel diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index 5ba8534d32d3..c9924f264939 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -142,10 +142,15 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::FDIV, MVT::v2f64, 69 }, // divpd { ISD::FADD, MVT::v2f64, 2 }, // addpd { ISD::FSUB, MVT::v2f64, 2 }, // subpd - // v2i64/v4i64 mul is custom lowered as a series of long - // multiplies(3), shifts(3) and adds(2). - // slm muldq version throughput is 2 - { ISD::MUL, MVT::v2i64, 11 }, + // v2i64/v4i64 mul is custom lowered as a series of long: + // multiplies(3), shifts(3) and adds(2) + // slm muldq version throughput is 2 and addq throughput 4 + // thus: 3X2 (muldq throughput) + 3X1 (shift throuput) + + // 3X4 (addq throughput) = 17 + { ISD::MUL, MVT::v2i64, 17 }, + // slm addq\subq throughput is 4 + { ISD::ADD, MVT::v2i64, 4 }, + { ISD::SUB, MVT::v2i64, 4 }, }; if (ST->isSLM()) { |