Diffstat (limited to 'lib/Target/X86')
114 files changed, 41020 insertions, 12429 deletions
diff --git a/lib/Target/X86/AsmParser/LLVMBuild.txt b/lib/Target/X86/AsmParser/LLVMBuild.txt index 9f94d5d38864..67c0d1358d80 100644 --- a/lib/Target/X86/AsmParser/LLVMBuild.txt +++ b/lib/Target/X86/AsmParser/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = X86AsmParser parent = X86 -required_libraries = MC MCParser Support X86Desc X86Info +required_libraries = MC MCParser Support X86Desc X86Info X86AsmPrinter add_to_library_groups = X86 diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp index f7e31de65f6d..2c376fd062ca 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp @@ -38,7 +38,7 @@ // Currently we have only AddressSanitizer instrumentation, but we're // planning to implement MemorySanitizer for inline assembly too. If // you're not familiar with AddressSanitizer algorithm, please, read -// https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm. +// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm // // When inline assembly is parsed by an instance of X86AsmParser, all // instructions are emitted via EmitInstruction method. That's the @@ -193,11 +193,10 @@ public: ~X86AddressSanitizer() override = default; // X86AsmInstrumentation implementation: - void InstrumentAndEmitInstruction(const MCInst &Inst, - OperandVector &Operands, - MCContext &Ctx, - const MCInstrInfo &MII, - MCStreamer &Out) override { + void InstrumentAndEmitInstruction(const MCInst &Inst, OperandVector &Operands, + MCContext &Ctx, const MCInstrInfo &MII, + MCStreamer &Out, + /* unused */ bool) override { InstrumentMOVS(Inst, Operands, Ctx, MII, Out); if (RepPrefix) EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX)); @@ -611,7 +610,7 @@ private: EmitInstruction(Out, MCInstBuilder(X86::CLD)); EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS)); - EmitInstruction(Out, MCInstBuilder(X86::AND64ri8) + EmitInstruction(Out, MCInstBuilder(X86::AND32ri8) .addReg(X86::ESP) .addReg(X86::ESP) .addImm(-16)); @@ -1045,13 +1044,13 @@ X86AsmInstrumentation::~X86AsmInstrumentation() = default; void X86AsmInstrumentation::InstrumentAndEmitInstruction( const MCInst &Inst, OperandVector &Operands, MCContext &Ctx, - const MCInstrInfo &MII, MCStreamer &Out) { - EmitInstruction(Out, Inst); + const MCInstrInfo &MII, MCStreamer &Out, bool PrintSchedInfoEnabled) { + EmitInstruction(Out, Inst, PrintSchedInfoEnabled); } -void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, - const MCInst &Inst) { - Out.EmitInstruction(Inst, *STI); +void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, const MCInst &Inst, + bool PrintSchedInfoEnabled) { + Out.EmitInstruction(Inst, *STI, PrintSchedInfoEnabled); } unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx, diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h index 97a55cd8ad98..42a9dc3ba26a 100644 --- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h +++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h @@ -42,7 +42,8 @@ public: virtual void InstrumentAndEmitInstruction( const MCInst &Inst, SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands, - MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out); + MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out, + bool PrintSchedInfoEnabled); protected: friend X86AsmInstrumentation * @@ -54,7 +55,8 @@ protected: unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer 
&Out); - void EmitInstruction(MCStreamer &Out, const MCInst &Inst); + void EmitInstruction(MCStreamer &Out, const MCInst &Inst, + bool PrintSchedInfoEnabled = false); const MCSubtargetInfo *&STI; diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp index c1d216c8b7af..87c65347e334 100644 --- a/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -7,7 +7,9 @@ // //===----------------------------------------------------------------------===// +#include "InstPrinter/X86IntelInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86TargetStreamer.h" #include "X86AsmInstrumentation.h" #include "X86AsmParserCommon.h" #include "X86Operand.h" @@ -37,6 +39,14 @@ using namespace llvm; +static bool checkScale(unsigned Scale, StringRef &ErrMsg) { + if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) { + ErrMsg = "scale factor in address must be 1, 2, 4 or 8"; + return true; + } + return false; +} + namespace { static const char OpPrecedence[] = { @@ -59,7 +69,6 @@ static const char OpPrecedence[] = { }; class X86AsmParser : public MCTargetAsmParser { - const MCInstrInfo &MII; ParseInstructionInfo *InstInfo; std::unique_ptr<X86AsmInstrumentation> Instrumentation; bool Code16GCC; @@ -72,6 +81,13 @@ private: return Result; } + X86TargetStreamer &getTargetStreamer() { + assert(getParser().getStreamer().getTargetStreamer() && + "do not have a target streamer"); + MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer(); + return static_cast<X86TargetStreamer &>(TS); + } + unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst, uint64_t &ErrorInfo, bool matchingInlineAsm, unsigned VariantID = 0) { @@ -125,8 +141,8 @@ private: int64_t popOperand() { assert (!PostfixStack.empty() && "Poped an empty stack!"); ICToken Op = PostfixStack.pop_back_val(); - assert ((Op.first == IC_IMM || Op.first == IC_REGISTER) - && "Expected and immediate or register!"); + if (!(Op.first == IC_IMM || Op.first == IC_REGISTER)) + return -1; // The invalid Scale value will be caught later by checkScale return Op.second; } void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) { @@ -293,6 +309,7 @@ private: }; enum IntelExprState { + IES_INIT, IES_OR, IES_XOR, IES_AND, @@ -320,16 +337,20 @@ private: int64_t Imm; const MCExpr *Sym; StringRef SymName; - bool StopOnLBrac, AddImmPrefix; InfixCalculator IC; InlineAsmIdentifierInfo Info; + short BracCount; + bool MemExpr; public: - IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) : - State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0), - Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac), - AddImmPrefix(addimmprefix) { Info.clear(); } - + IntelExprStateMachine() + : State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), + TmpReg(0), Scale(1), Imm(0), Sym(nullptr), BracCount(0), + MemExpr(false) {} + + void addImm(int64_t imm) { Imm += imm; } + short getBracCount() { return BracCount; } + bool isMemExpr() { return MemExpr; } unsigned getBaseReg() { return BaseReg; } unsigned getIndexReg() { return IndexReg; } unsigned getScale() { return Scale; } @@ -339,13 +360,8 @@ private: bool isValidEndState() { return State == IES_RBRAC || State == IES_INTEGER; } - bool getStopOnLBrac() { return StopOnLBrac; } - bool getAddImmPrefix() { return AddImmPrefix; } bool hadError() { return State == IES_ERROR; } - - InlineAsmIdentifierInfo &getIdentifierInfo() { - return Info; - } + InlineAsmIdentifierInfo 
&getIdentifierInfo() { return Info; } void onOr() { IntelExprState CurrState = State; @@ -422,7 +438,7 @@ private: } PrevState = CurrState; } - void onPlus() { + bool onPlus(StringRef &ErrMsg) { IntelExprState CurrState = State; switch (State) { default: @@ -439,7 +455,10 @@ private: if (!BaseReg) { BaseReg = TmpReg; } else { - assert (!IndexReg && "BaseReg/IndexReg already set!"); + if (IndexReg) { + ErrMsg = "BaseReg/IndexReg already set!"; + return true; + } IndexReg = TmpReg; Scale = 1; } @@ -447,8 +466,9 @@ private: break; } PrevState = CurrState; + return false; } - void onMinus() { + bool onMinus(StringRef &ErrMsg) { IntelExprState CurrState = State; switch (State) { default: @@ -470,12 +490,17 @@ private: case IES_RBRAC: case IES_INTEGER: case IES_REGISTER: + case IES_INIT: State = IES_MINUS; // push minus operator if it is not a negate operator if (CurrState == IES_REGISTER || CurrState == IES_RPAREN || CurrState == IES_INTEGER || CurrState == IES_RBRAC) IC.pushOperator(IC_MINUS); - else + else if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) { + // We have negate operator for Scale: it's illegal + ErrMsg = "Scale can't be negative"; + return true; + } else IC.pushOperator(IC_NEG); if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { // If we already have a BaseReg, then assume this is the IndexReg with @@ -483,7 +508,10 @@ private: if (!BaseReg) { BaseReg = TmpReg; } else { - assert (!IndexReg && "BaseReg/IndexReg already set!"); + if (IndexReg) { + ErrMsg = "BaseReg/IndexReg already set!"; + return true; + } IndexReg = TmpReg; Scale = 1; } @@ -491,6 +519,7 @@ private: break; } PrevState = CurrState; + return false; } void onNot() { IntelExprState CurrState = State; @@ -511,13 +540,15 @@ private: case IES_MOD: case IES_LPAREN: case IES_LBRAC: + case IES_INIT: State = IES_NOT; IC.pushOperator(IC_NOT); break; } PrevState = CurrState; } - void onRegister(unsigned Reg) { + + bool onRegister(unsigned Reg, StringRef &ErrMsg) { IntelExprState CurrState = State; switch (State) { default: @@ -525,6 +556,7 @@ private: break; case IES_PLUS: case IES_LPAREN: + case IES_LBRAC: State = IES_REGISTER; TmpReg = Reg; IC.pushOperand(IC_REGISTER); @@ -532,11 +564,16 @@ private: case IES_MULTIPLY: // Index Register - Scale * Register if (PrevState == IES_INTEGER) { - assert (!IndexReg && "IndexReg already set!"); + if (IndexReg) { + ErrMsg = "BaseReg/IndexReg already set!"; + return true; + } State = IES_REGISTER; IndexReg = Reg; // Get the scale and replace the 'Scale * Register' with '0'. 
Scale = IC.popOperand(); + if (checkScale(Scale, ErrMsg)) + return true; IC.pushOperand(IC_IMM); IC.popOperator(); } else { @@ -545,9 +582,20 @@ private: break; } PrevState = CurrState; + return false; } - void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) { + bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName, + const InlineAsmIdentifierInfo &IDInfo, + bool ParsingInlineAsm, StringRef &ErrMsg) { + // InlineAsm: Treat an enum value as an integer + if (ParsingInlineAsm) + if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) + return onInteger(IDInfo.Enum.EnumVal, ErrMsg); + // Treat a symbolic constant like an integer + if (auto *CE = dyn_cast<MCConstantExpr>(SymRef)) + return onInteger(CE->getValue(), ErrMsg); PrevState = State; + bool HasSymbol = Sym != nullptr; switch (State) { default: State = IES_ERROR; @@ -555,12 +603,20 @@ private: case IES_PLUS: case IES_MINUS: case IES_NOT: + case IES_INIT: + case IES_LBRAC: + MemExpr = true; State = IES_INTEGER; Sym = SymRef; SymName = SymRefName; IC.pushOperand(IC_IMM); + if (ParsingInlineAsm) + Info = IDInfo; break; } + if (HasSymbol) + ErrMsg = "cannot use more than one symbol in memory operand"; + return HasSymbol; } bool onInteger(int64_t TmpInt, StringRef &ErrMsg) { IntelExprState CurrState = State; @@ -580,16 +636,19 @@ private: case IES_MOD: case IES_MULTIPLY: case IES_LPAREN: + case IES_INIT: + case IES_LBRAC: State = IES_INTEGER; if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) { // Index Register - Register * Scale - assert (!IndexReg && "IndexReg already set!"); + if (IndexReg) { + ErrMsg = "BaseReg/IndexReg already set!"; + return true; + } IndexReg = TmpReg; Scale = TmpInt; - if(Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) { - ErrMsg = "scale factor in address must be 1, 2, 4 or 8"; + if (checkScale(Scale, ErrMsg)) return true; - } // Get the scale and replace the 'Register * Scale' with '0'. 
IC.popOperator(); } else { @@ -640,19 +699,30 @@ private: break; } } - void onLBrac() { + bool onLBrac() { + if (BracCount) + return true; PrevState = State; switch (State) { default: State = IES_ERROR; break; case IES_RBRAC: + case IES_INTEGER: + case IES_RPAREN: State = IES_PLUS; IC.pushOperator(IC_PLUS); break; + case IES_INIT: + assert(!BracCount && "BracCount should be zero on parsing's start"); + State = IES_LBRAC; + break; } + MemExpr = true; + BracCount++; + return false; } - void onRBrac() { + bool onRBrac() { IntelExprState CurrState = State; switch (State) { default: @@ -661,6 +731,8 @@ private: case IES_INTEGER: case IES_REGISTER: case IES_RPAREN: + if (BracCount-- != 1) + return true; State = IES_RBRAC; if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) { // If we already have a BaseReg, then assume this is the IndexReg with @@ -676,6 +748,7 @@ private: break; } PrevState = CurrState; + return false; } void onLParen() { IntelExprState CurrState = State; @@ -695,6 +768,8 @@ private: case IES_DIVIDE: case IES_MOD: case IES_LPAREN: + case IES_INIT: + case IES_LBRAC: State = IES_LPAREN; IC.pushOperator(IC_LPAREN); break; @@ -747,34 +822,41 @@ private: std::unique_ptr<X86Operand> ParseATTOperand(); std::unique_ptr<X86Operand> ParseIntelOperand(); std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator(); - bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp); - unsigned IdentifyIntelOperator(StringRef Name); - unsigned ParseIntelOperator(unsigned OpKind); - std::unique_ptr<X86Operand> - ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size); + bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End); + unsigned IdentifyIntelInlineAsmOperator(StringRef Name); + unsigned ParseIntelInlineAsmOperator(unsigned OpKind); std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End); bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM); + void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start, + SMLoc End); bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End); - std::unique_ptr<X86Operand> - ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp, - bool isSymbol, unsigned Size); - bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier, - InlineAsmIdentifierInfo &Info, - bool IsUnevaluatedOperand, SMLoc &End); + bool ParseIntelInlineAsmIdentifier(const MCExpr *&Val, StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, SMLoc &End); std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc); + bool ParseIntelMemoryOperandSize(unsigned &Size); std::unique_ptr<X86Operand> CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, - InlineAsmIdentifierInfo &Info, - bool AllowBetterSizeMatch = false); + const InlineAsmIdentifierInfo &Info); bool parseDirectiveEven(SMLoc L); bool ParseDirectiveWord(unsigned Size, SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); + /// CodeView FPO data directives. 
+ bool parseDirectiveFPOProc(SMLoc L); + bool parseDirectiveFPOSetFrame(SMLoc L); + bool parseDirectiveFPOPushReg(SMLoc L); + bool parseDirectiveFPOStackAlloc(SMLoc L); + bool parseDirectiveFPOEndPrologue(SMLoc L); + bool parseDirectiveFPOEndProc(SMLoc L); + bool parseDirectiveFPOData(SMLoc L); + + bool validateInstruction(MCInst &Inst, const OperandVector &Ops); bool processInstruction(MCInst &Inst, const OperandVector &Ops); /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds @@ -828,7 +910,7 @@ private: MCSubtargetInfo &STI = copySTI(); FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit}); FeatureBitset OldMode = STI.getFeatureBits() & AllModes; - unsigned FB = ComputeAvailableFeatures( + uint64_t FB = ComputeAvailableFeatures( STI.ToggleFeature(OldMode.flip(mode))); setAvailableFeatures(FB); @@ -858,7 +940,7 @@ public: X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser, const MCInstrInfo &mii, const MCTargetOptions &Options) - : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr), + : MCTargetAsmParser(Options, sti, mii), InstInfo(nullptr), Code16GCC(false) { // Initialize the set of available features. @@ -885,8 +967,8 @@ static unsigned MatchRegisterName(StringRef Name); /// } -static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg, - StringRef &ErrMsg) { +static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg, + unsigned Scale, StringRef &ErrMsg) { // If we have both a base register and an index register make sure they are // both 64-bit or 32-bit registers. // To support VSIB, IndexReg can be 128-bit or 256-bit registers. @@ -925,7 +1007,7 @@ static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg, } } } - return false; + return checkScale(Scale, ErrMsg); } bool X86AsmParser::ParseRegister(unsigned &RegNo, @@ -1016,19 +1098,31 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo, EndLoc = Parser.getTok().getEndLoc(); - // If this is "db[0-7]", match it as an alias - // for dr[0-7]. - if (RegNo == 0 && Tok.getString().size() == 3 && - Tok.getString().startswith("db")) { - switch (Tok.getString()[2]) { - case '0': RegNo = X86::DR0; break; - case '1': RegNo = X86::DR1; break; - case '2': RegNo = X86::DR2; break; - case '3': RegNo = X86::DR3; break; - case '4': RegNo = X86::DR4; break; - case '5': RegNo = X86::DR5; break; - case '6': RegNo = X86::DR6; break; - case '7': RegNo = X86::DR7; break; + // If this is "db[0-15]", match it as an alias + // for dr[0-15]. + if (RegNo == 0 && Tok.getString().startswith("db")) { + if (Tok.getString().size() == 3) { + switch (Tok.getString()[2]) { + case '0': RegNo = X86::DR0; break; + case '1': RegNo = X86::DR1; break; + case '2': RegNo = X86::DR2; break; + case '3': RegNo = X86::DR3; break; + case '4': RegNo = X86::DR4; break; + case '5': RegNo = X86::DR5; break; + case '6': RegNo = X86::DR6; break; + case '7': RegNo = X86::DR7; break; + case '8': RegNo = X86::DR8; break; + case '9': RegNo = X86::DR9; break; + } + } else if (Tok.getString().size() == 4 && Tok.getString()[2] == '1') { + switch (Tok.getString()[3]) { + case '0': RegNo = X86::DR10; break; + case '1': RegNo = X86::DR11; break; + case '2': RegNo = X86::DR12; break; + case '3': RegNo = X86::DR13; break; + case '4': RegNo = X86::DR14; break; + case '5': RegNo = X86::DR15; break; + } } if (RegNo != 0) { @@ -1198,124 +1292,48 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() { return ParseATTOperand(); } -/// getIntelMemOperandSize - Return intel memory operand size. 
-static unsigned getIntelMemOperandSize(StringRef OpStr) { - unsigned Size = StringSwitch<unsigned>(OpStr) - .Cases("BYTE", "byte", 8) - .Cases("WORD", "word", 16) - .Cases("DWORD", "dword", 32) - .Cases("FWORD", "fword", 48) - .Cases("QWORD", "qword", 64) - .Cases("MMWORD","mmword", 64) - .Cases("XWORD", "xword", 80) - .Cases("TBYTE", "tbyte", 80) - .Cases("XMMWORD", "xmmword", 128) - .Cases("YMMWORD", "ymmword", 256) - .Cases("ZMMWORD", "zmmword", 512) - .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter - .Default(0); - return Size; -} - std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm( unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier, - InlineAsmIdentifierInfo &Info, bool AllowBetterSizeMatch) { + const InlineAsmIdentifierInfo &Info) { // If we found a decl other than a VarDecl, then assume it is a FuncDecl or // some other label reference. - if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) { + if (Info.isKind(InlineAsmIdentifierInfo::IK_Label)) { // Insert an explicit size if the user didn't have one. if (!Size) { Size = getPointerWidth(); InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start, /*Len=*/0, Size); } - // Create an absolute memory reference in order to match against // instructions taking a PC relative operand. return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size, - Identifier, Info.OpDecl); + Identifier, Info.Label.Decl); } - - // We either have a direct symbol reference, or an offset from a symbol. The // parser always puts the symbol on the LHS, so look there for size // calculation purposes. unsigned FrontendSize = 0; - const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp); - bool IsSymRef = - isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp); - if (IsSymRef && !Size && Info.Type) - FrontendSize = Info.Type * 8; // Size is in terms of bits in this context. - - // When parsing inline assembly we set the base register to a non-zero value + void *Decl = nullptr; + bool IsGlobalLV = false; + if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) { + // Size is in terms of bits in this context. + FrontendSize = Info.Var.Type * 8; + Decl = Info.Var.Decl; + IsGlobalLV = Info.Var.IsGlobalLV; + } + // It is widely common for MS InlineAsm to use a global variable and one/two + // registers in a mmory expression, and though unaccessible via rip/eip. + if (IsGlobalLV && (BaseReg || IndexReg)) { + return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End); + // Otherwise, we set the base register to a non-zero value // if we don't know the actual value at this time. This is necessary to // get the matching correct in some cases. - BaseReg = BaseReg ? BaseReg : 1; - return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, - IndexReg, Scale, Start, End, Size, Identifier, - Info.OpDecl, FrontendSize); -} - -static void -RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites, - StringRef SymName, int64_t ImmDisp, - int64_t FinalImmDisp, SMLoc &BracLoc, - SMLoc &StartInBrac, SMLoc &End) { - // Remove the '[' and ']' from the IR string. 
- AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1); - AsmRewrites.emplace_back(AOK_Skip, End, 1); - - // If ImmDisp is non-zero, then we parsed a displacement before the - // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp]) - // If ImmDisp doesn't match the displacement computed by the state machine - // then we have an additional displacement in the bracketed expression. - if (ImmDisp != FinalImmDisp) { - if (ImmDisp) { - // We have an immediate displacement before the bracketed expression. - // Adjust this to match the final immediate displacement. - bool Found = false; - for (AsmRewrite &AR : AsmRewrites) { - if (AR.Loc.getPointer() > BracLoc.getPointer()) - continue; - if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) { - assert (!Found && "ImmDisp already rewritten."); - AR.Kind = AOK_Imm; - AR.Len = BracLoc.getPointer() - AR.Loc.getPointer(); - AR.Val = FinalImmDisp; - Found = true; - break; - } - } - assert (Found && "Unable to rewrite ImmDisp."); - (void)Found; - } else { - // We have a symbolic and an immediate displacement, but no displacement - // before the bracketed expression. Put the immediate displacement - // before the bracketed expression. - AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp); - } - } - // Remove all the ImmPrefix rewrites within the brackets. - // We may have some Imm rewrties as a result of an operator applying, - // remove them as well - for (AsmRewrite &AR : AsmRewrites) { - if (AR.Loc.getPointer() < StartInBrac.getPointer()) - continue; - if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) - AR.Kind = AOK_Delete; - } - const char *SymLocPtr = SymName.data(); - // Skip everything before the symbol. - if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) { - assert(Len > 0 && "Expected a non-negative length."); - AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len); - } - // Skip everything after the symbol. - if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) { - SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size()); - assert(Len > 0 && "Expected a non-negative length."); - AsmRewrites.emplace_back(AOK_Skip, Loc, Len); + } else { + BaseReg = BaseReg ? BaseReg : 1; + return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, + IndexReg, Scale, Start, End, Size, Identifier, + Decl, FrontendSize); } } @@ -1348,77 +1366,80 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); + StringRef ErrMsg; AsmToken::TokenKind PrevTK = AsmToken::Error; bool Done = false; while (!Done) { bool UpdateLocLex = true; - AsmToken::TokenKind TK = getLexer().getKind(); - // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an - // identifier. Don't try an parse it as a register. - if (PrevTK != AsmToken::Error && Tok.getString().startswith(".") && - TK != AsmToken::Identifier) - break; - - // If we're parsing an immediate expression, we don't expect a '['. 
- if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac) - break; switch (TK) { - default: { - if (SM.isValidEndState()) { - Done = true; + default: + if ((Done = SM.isValidEndState())) break; - } return Error(Tok.getLoc(), "unknown token in expression"); - } - case AsmToken::EndOfStatement: { + case AsmToken::EndOfStatement: Done = true; break; - } + case AsmToken::Real: + // DotOperator: [ebx].0 + UpdateLocLex = false; + if (ParseIntelDotOperator(SM, End)) + return true; + break; case AsmToken::String: case AsmToken::Identifier: { - // This could be a register or a symbolic displacement. - unsigned TmpReg; - const MCExpr *Val; SMLoc IdentLoc = Tok.getLoc(); StringRef Identifier = Tok.getString(); UpdateLocLex = false; - if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) { - SM.onRegister(TmpReg); - } else if (ParseIntelNamedOperator(Identifier, SM)) { - UpdateLocLex = true; - } else if (!isParsingInlineAsm()) { - if (getParser().parsePrimaryExpr(Val, End)) + // Register + unsigned Reg; + if (Tok.isNot(AsmToken::String) && !ParseRegister(Reg, IdentLoc, End)) { + if (SM.onRegister(Reg, ErrMsg)) + return Error(Tok.getLoc(), ErrMsg); + break; + } + // Operator synonymous ("not", "or" etc.) + if ((UpdateLocLex = ParseIntelNamedOperator(Identifier, SM))) + break; + // Symbol reference, when parsing assembly content + InlineAsmIdentifierInfo Info; + const MCExpr *Val; + if (!isParsingInlineAsm()) { + if (getParser().parsePrimaryExpr(Val, End)) { return Error(Tok.getLoc(), "Unexpected identifier!"); - SM.onIdentifierExpr(Val, Identifier); - } else if (unsigned OpKind = IdentifyIntelOperator(Identifier)) { - if (OpKind == IOK_OFFSET) + } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) { + return Error(IdentLoc, ErrMsg); + } else + break; + } + // MS InlineAsm operators (TYPE/LENGTH/SIZE) + if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) { + if (OpKind == IOK_OFFSET) return Error(IdentLoc, "Dealing OFFSET operator as part of" "a compound immediate expression is yet to be supported"); - int64_t Val = ParseIntelOperator(OpKind); - if (!Val) + if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) { + if (SM.onInteger(Val, ErrMsg)) + return Error(IdentLoc, ErrMsg); + } else return true; - StringRef ErrMsg; - if (SM.onInteger(Val, ErrMsg)) - return Error(IdentLoc, ErrMsg); - } else if (Identifier.find('.') != StringRef::npos && - PrevTK == AsmToken::RBrac) { - return false; - } else { - InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); - if (ParseIntelIdentifier(Val, Identifier, Info, - /*Unevaluated=*/false, End)) + break; + } + // MS Dot Operator expression + if (Identifier.count('.') && PrevTK == AsmToken::RBrac) { + if (ParseIntelDotOperator(SM, End)) return true; - SM.onIdentifierExpr(Val, Identifier); + break; } + // MS InlineAsm identifier + if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End)) + return true; + else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg)) + return Error(IdentLoc, ErrMsg); break; } case AsmToken::Integer: { - StringRef ErrMsg; - if (isParsingInlineAsm() && SM.getAddImmPrefix()) - InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc()); // Look for 'b' or 'f' following an Integer as a directional label SMLoc Loc = getTok().getLoc(); int64_t IntVal = getTok().getIntVal(); @@ -1435,7 +1456,10 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { if (IDVal == "b" && Sym->isUndefined()) return Error(Loc, "invalid reference to 
undefined symbol"); StringRef Identifier = Sym->getName(); - SM.onIdentifierExpr(Val, Identifier); + InlineAsmIdentifierInfo Info; + if (SM.onIdentifierExpr(Val, Identifier, Info, + isParsingInlineAsm(), ErrMsg)) + return Error(Loc, ErrMsg); End = consumeToken(); } else { if (SM.onInteger(IntVal, ErrMsg)) @@ -1447,11 +1471,18 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { } break; } - case AsmToken::Plus: SM.onPlus(); break; - case AsmToken::Minus: SM.onMinus(); break; + case AsmToken::Plus: + if (SM.onPlus(ErrMsg)) + return Error(getTok().getLoc(), ErrMsg); + break; + case AsmToken::Minus: + if (SM.onMinus(ErrMsg)) + return Error(getTok().getLoc(), ErrMsg); + break; case AsmToken::Tilde: SM.onNot(); break; case AsmToken::Star: SM.onStar(); break; case AsmToken::Slash: SM.onDivide(); break; + case AsmToken::Percent: SM.onMod(); break; case AsmToken::Pipe: SM.onOr(); break; case AsmToken::Caret: SM.onXor(); break; case AsmToken::Amp: SM.onAnd(); break; @@ -1459,8 +1490,14 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { SM.onLShift(); break; case AsmToken::GreaterGreater: SM.onRShift(); break; - case AsmToken::LBrac: SM.onLBrac(); break; - case AsmToken::RBrac: SM.onRBrac(); break; + case AsmToken::LBrac: + if (SM.onLBrac()) + return Error(Tok.getLoc(), "unexpected bracket encountered"); + break; + case AsmToken::RBrac: + if (SM.onRBrac()) + return Error(Tok.getLoc(), "unexpected bracket encountered"); + break; case AsmToken::LParen: SM.onLParen(); break; case AsmToken::RParen: SM.onRParen(); break; } @@ -1475,112 +1512,49 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) { return false; } -std::unique_ptr<X86Operand> -X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start, - int64_t ImmDisp, bool isSymbol, - unsigned Size) { - MCAsmParser &Parser = getParser(); - const AsmToken &Tok = Parser.getTok(); - SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc(); - if (getLexer().isNot(AsmToken::LBrac)) - return ErrorOperand(BracLoc, "Expected '[' token!"); - Parser.Lex(); // Eat '[' - - SMLoc StartInBrac = Parser.getTok().getLoc(); - // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We - // may have already parsed an immediate displacement before the bracketed - // expression. - IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true); - if (ParseIntelExpression(SM, End)) - return nullptr; - - const MCExpr *Disp = nullptr; - if (const MCExpr *Sym = SM.getSym()) { - // A symbolic displacement. - Disp = Sym; - if (isParsingInlineAsm()) - RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(), - ImmDisp, SM.getImm(), BracLoc, StartInBrac, - End); - } - - if (SM.getImm() || !Disp) { - const MCExpr *Imm = MCConstantExpr::create(SM.getImm(), getContext()); - if (Disp) - Disp = MCBinaryExpr::createAdd(Disp, Imm, getContext()); - else - Disp = Imm; // An immediate displacement only. - } - - // Parse struct field access. Intel requires a dot, but MSVC doesn't. MSVC - // will in fact do global lookup the field name inside all global typedefs, - // but we don't emulate that. 
- if ((Parser.getTok().getKind() == AsmToken::Identifier || - Parser.getTok().getKind() == AsmToken::Dot || - Parser.getTok().getKind() == AsmToken::Real) && - Parser.getTok().getString().find('.') != StringRef::npos) { - const MCExpr *NewDisp; - if (ParseIntelDotOperator(Disp, NewDisp)) - return nullptr; - - End = Tok.getEndLoc(); - Parser.Lex(); // Eat the field. - Disp = NewDisp; - } - - if (isSymbol) { - if (SM.getSym()) { - Error(Start, "cannot use more than one symbol in memory operand"); - return nullptr; - } - if (SM.getBaseReg()) { - Error(Start, "cannot use base register with variable reference"); - return nullptr; - } - if (SM.getIndexReg()) { - Error(Start, "cannot use index register with variable reference"); - return nullptr; - } - } - - int BaseReg = SM.getBaseReg(); - int IndexReg = SM.getIndexReg(); - int Scale = SM.getScale(); - if (!isParsingInlineAsm()) { - // handle [-42] - if (!BaseReg && !IndexReg) { - if (!SegReg) - return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size); - return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1, - Start, End, Size); - } - StringRef ErrMsg; - if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) { - Error(StartInBrac, ErrMsg); - return nullptr; - } - return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg, - IndexReg, Scale, Start, End, Size); - } - - InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo(); - return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start, - End, Size, SM.getSymName(), Info, - isParsingInlineAsm()); +void X86AsmParser::RewriteIntelExpression(IntelExprStateMachine &SM, + SMLoc Start, SMLoc End) { + SMLoc Loc = Start; + unsigned ExprLen = End.getPointer() - Start.getPointer(); + // Skip everything before a symbol displacement (if we have one) + if (SM.getSym()) { + StringRef SymName = SM.getSymName(); + if (unsigned Len = SymName.data() - Start.getPointer()) + InstInfo->AsmRewrites->emplace_back(AOK_Skip, Start, Len); + Loc = SMLoc::getFromPointer(SymName.data() + SymName.size()); + ExprLen = End.getPointer() - (SymName.data() + SymName.size()); + // If we have only a symbol than there's no need for complex rewrite, + // simply skip everything after it + if (!(SM.getBaseReg() || SM.getIndexReg() || SM.getImm())) { + if (ExprLen) + InstInfo->AsmRewrites->emplace_back(AOK_Skip, Loc, ExprLen); + return; + } + } + // Build an Intel Expression rewrite + StringRef BaseRegStr; + StringRef IndexRegStr; + if (SM.getBaseReg()) + BaseRegStr = X86IntelInstPrinter::getRegisterName(SM.getBaseReg()); + if (SM.getIndexReg()) + IndexRegStr = X86IntelInstPrinter::getRegisterName(SM.getIndexReg()); + // Emit it + IntelExpr Expr(BaseRegStr, IndexRegStr, SM.getScale(), SM.getImm(), SM.isMemExpr()); + InstInfo->AsmRewrites->emplace_back(Loc, ExprLen, Expr); } // Inline assembly may use variable names with namespace alias qualifiers. 
-bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, - StringRef &Identifier, - InlineAsmIdentifierInfo &Info, - bool IsUnevaluatedOperand, SMLoc &End) { +bool X86AsmParser::ParseIntelInlineAsmIdentifier(const MCExpr *&Val, + StringRef &Identifier, + InlineAsmIdentifierInfo &Info, + bool IsUnevaluatedOperand, + SMLoc &End) { MCAsmParser &Parser = getParser(); assert(isParsingInlineAsm() && "Expected to be parsing inline assembly."); Val = nullptr; StringRef LineBuf(Identifier.data()); - void *Result = - SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); + SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand); const AsmToken &Tok = Parser.getTok(); SMLoc Loc = Tok.getLoc(); @@ -1596,12 +1570,13 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, // The frontend should end parsing on an assembler token boundary, unless it // failed parsing. - assert((End.getPointer() == EndPtr || !Result) && - "frontend claimed part of a token?"); + assert((End.getPointer() == EndPtr || + Info.isKind(InlineAsmIdentifierInfo::IK_Invalid)) && + "frontend claimed part of a token?"); // If the identifier lookup was unsuccessful, assume that we are dealing with // a label. - if (!Result) { + if (Info.isKind(InlineAsmIdentifierInfo::IK_Invalid)) { StringRef InternalName = SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(), Loc, false); @@ -1609,8 +1584,8 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, // Push a rewrite for replacing the identifier name with the internal name. InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(), InternalName); - } - + } else if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) + return false; // Create the symbol reference. MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier); MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None; @@ -1618,57 +1593,6 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val, return false; } -/// \brief Parse intel style segment override. -std::unique_ptr<X86Operand> -X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, - unsigned Size) { - MCAsmParser &Parser = getParser(); - assert(SegReg != 0 && "Tried to parse a segment override without a segment!"); - const AsmToken &Tok = Parser.getTok(); // Eat colon. - if (Tok.isNot(AsmToken::Colon)) - return ErrorOperand(Tok.getLoc(), "Expected ':' token!"); - Parser.Lex(); // Eat ':' - - int64_t ImmDisp = 0; - if (getLexer().is(AsmToken::Integer)) { - ImmDisp = Tok.getIntVal(); - AsmToken ImmDispToken = Parser.Lex(); // Eat the integer. - - if (isParsingInlineAsm()) - InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc()); - - if (getLexer().isNot(AsmToken::LBrac)) { - // An immediate following a 'segment register', 'colon' token sequence can - // be followed by a bracketed expression. If it isn't we know we have our - // final segment override. 
- const MCExpr *Disp = MCConstantExpr::create(ImmDisp, getContext()); - return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, - /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1, - Start, ImmDispToken.getEndLoc(), Size); - } - } - - if (getLexer().is(AsmToken::LBrac)) - return ParseIntelBracExpression(SegReg, Start, ImmDisp, false, Size); - - const MCExpr *Val; - SMLoc End; - if (!isParsingInlineAsm()) { - if (getParser().parsePrimaryExpr(Val, End)) - return ErrorOperand(Tok.getLoc(), "unknown token in expression"); - - return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size); - } - - InlineAsmIdentifierInfo Info; - StringRef Identifier = Tok.getString(); - if (ParseIntelIdentifier(Val, Identifier, Info, - /*Unevaluated=*/false, End)) - return nullptr; - return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0, - /*Scale=*/1, Start, End, Size, Identifier, Info); -} - //ParseRoundingModeOp - Parse AVX-512 rounding mode operand std::unique_ptr<X86Operand> X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) { @@ -1708,17 +1632,9 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) { } /// Parse the '.' operator. -bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, - const MCExpr *&NewDisp) { - MCAsmParser &Parser = getParser(); - const AsmToken &Tok = Parser.getTok(); - int64_t OrigDispVal, DotDispVal; - - // FIXME: Handle non-constant expressions. - if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp)) - OrigDispVal = OrigDisp->getValue(); - else - return Error(Tok.getLoc(), "Non-constant offsets are not supported!"); +bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) { + const AsmToken &Tok = getTok(); + unsigned Offset; // Drop the optional '.'. StringRef DotDispStr = Tok.getString(); @@ -1729,24 +1645,21 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp, if (Tok.is(AsmToken::Real)) { APInt DotDisp; DotDispStr.getAsInteger(10, DotDisp); - DotDispVal = DotDisp.getZExtValue(); + Offset = DotDisp.getZExtValue(); } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { - unsigned DotDisp; std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.'); if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second, - DotDisp)) + Offset)) return Error(Tok.getLoc(), "Unable to lookup field reference!"); - DotDispVal = DotDisp; } else return Error(Tok.getLoc(), "Unexpected token type!"); - if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) { - SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data()); - unsigned Len = DotDispStr.size(); - InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, DotDispVal); - } - - NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext()); + // Eat the DotExpression and update End + End = SMLoc::getFromPointer(DotDispStr.data()); + const char *DotExprEndLoc = DotDispStr.data() + DotDispStr.size(); + while (Tok.getLoc().getPointer() < DotExprEndLoc) + Lex(); + SM.addImm(Offset); return false; } @@ -1762,10 +1675,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { InlineAsmIdentifierInfo Info; SMLoc Start = Tok.getLoc(), End; StringRef Identifier = Tok.getString(); - if (ParseIntelIdentifier(Val, Identifier, Info, - /*Unevaluated=*/false, End)) + if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, + /*Unevaluated=*/false, End)) return nullptr; + void *Decl = nullptr; + // FIXME: MS evaluates "offset <Constant>" to the underlying integral + if 
(Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) + return ErrorOperand(Start, "offset operator cannot yet handle constants"); + else if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) + Decl = Info.Var.Decl; // Don't emit the offset operator. InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7); @@ -1776,12 +1695,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() { unsigned RegNo = is64BitMode() ? X86::RBX : (Parse32 ? X86::EBX : X86::BX); return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true, - OffsetOfLoc, Identifier, Info.OpDecl); + OffsetOfLoc, Identifier, Decl); } // Query a candidate string for being an Intel assembly operator // Report back its kind, or IOK_INVALID if does not evaluated as a known one -unsigned X86AsmParser::IdentifyIntelOperator(StringRef Name) { +unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) { return StringSwitch<unsigned>(Name) .Cases("TYPE","type",IOK_TYPE) .Cases("SIZE","size",IOK_SIZE) @@ -1796,41 +1715,62 @@ unsigned X86AsmParser::IdentifyIntelOperator(StringRef Name) { /// variable. A variable's size is the product of its LENGTH and TYPE. The /// TYPE operator returns the size of a C or C++ type or variable. If the /// variable is an array, TYPE returns the size of a single element. -unsigned X86AsmParser::ParseIntelOperator(unsigned OpKind) { +unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); - SMLoc TypeLoc = Tok.getLoc(); Parser.Lex(); // Eat operator. const MCExpr *Val = nullptr; InlineAsmIdentifierInfo Info; SMLoc Start = Tok.getLoc(), End; StringRef Identifier = Tok.getString(); - if (ParseIntelIdentifier(Val, Identifier, Info, - /*Unevaluated=*/true, End)) + if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, + /*Unevaluated=*/true, End)) return 0; - if (!Info.OpDecl) { + if (!Info.isKind(InlineAsmIdentifierInfo::IK_Var)) { Error(Start, "unable to lookup expression"); return 0; } - + unsigned CVal = 0; switch(OpKind) { default: llvm_unreachable("Unexpected operand kind!"); - case IOK_LENGTH: CVal = Info.Length; break; - case IOK_SIZE: CVal = Info.Size; break; - case IOK_TYPE: CVal = Info.Type; break; + case IOK_LENGTH: CVal = Info.Var.Length; break; + case IOK_SIZE: CVal = Info.Var.Size; break; + case IOK_TYPE: CVal = Info.Var.Type; break; } - // Rewrite the type operator and the C or C++ type or variable in terms of an - // immediate. E.g. TYPE foo -> $$4 - unsigned Len = End.getPointer() - TypeLoc.getPointer(); - InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal); - return CVal; } +bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) { + Size = StringSwitch<unsigned>(getTok().getString()) + .Cases("BYTE", "byte", 8) + .Cases("WORD", "word", 16) + .Cases("DWORD", "dword", 32) + .Cases("FLOAT", "float", 32) + .Cases("LONG", "long", 32) + .Cases("FWORD", "fword", 48) + .Cases("DOUBLE", "double", 64) + .Cases("QWORD", "qword", 64) + .Cases("MMWORD","mmword", 64) + .Cases("XWORD", "xword", 80) + .Cases("TBYTE", "tbyte", 80) + .Cases("XMMWORD", "xmmword", 128) + .Cases("YMMWORD", "ymmword", 256) + .Cases("ZMMWORD", "zmmword", 512) + .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter + .Default(0); + if (Size) { + const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word). + if (!(Tok.getString().equals("PTR") || Tok.getString().equals("ptr"))) + return Error(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); + Lex(); // Eat ptr. 
+ } + return false; +} + std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { MCAsmParser &Parser = getParser(); const AsmToken &Tok = Parser.getTok(); @@ -1840,100 +1780,76 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() { // Should be handled as part of immediate expression, as other operators // Currently, only supported as a stand-alone operand if (isParsingInlineAsm()) - if (IdentifyIntelOperator(Tok.getString()) == IOK_OFFSET) + if (IdentifyIntelInlineAsmOperator(Tok.getString()) == IOK_OFFSET) return ParseIntelOffsetOfOperator(); - bool PtrInOperand = false; - unsigned Size = getIntelMemOperandSize(Tok.getString()); - if (Size) { - Parser.Lex(); // Eat operand size (e.g., byte, word). - if (Tok.getString() != "PTR" && Tok.getString() != "ptr") - return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!"); - Parser.Lex(); // Eat ptr. - PtrInOperand = true; - } + // Parse optional Size directive. + unsigned Size; + if (ParseIntelMemoryOperandSize(Size)) + return nullptr; + bool PtrInOperand = bool(Size); Start = Tok.getLoc(); - // rounding mode token + // Rounding mode operand. if (getSTI().getFeatureBits()[X86::FeatureAVX512] && getLexer().is(AsmToken::LCurly)) return ParseRoundingModeOp(Start, End); - // Register. + // Register operand. unsigned RegNo = 0; - if (getLexer().is(AsmToken::Identifier) && - !ParseRegister(RegNo, Start, End)) { - // If this is a segment register followed by a ':', then this is the start - // of a segment override, otherwise this is a normal register reference. - // In case it is a normal register and there is ptr in the operand this - // is an error + if (Tok.is(AsmToken::Identifier) && !ParseRegister(RegNo, Start, End)) { if (RegNo == X86::RIP) return ErrorOperand(Start, "rip can only be used as a base register"); - if (getLexer().isNot(AsmToken::Colon)) { - if (PtrInOperand) { - return ErrorOperand(Start, "expected memory operand after " - "'ptr', found register operand instead"); - } - return X86Operand::CreateReg(RegNo, Start, End); - } - return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size); + // A Register followed by ':' is considered a segment override + if (Tok.isNot(AsmToken::Colon)) + return !PtrInOperand ? X86Operand::CreateReg(RegNo, Start, End) : + ErrorOperand(Start, "expected memory operand after 'ptr', " + "found register operand instead"); + // An alleged segment override. check if we have a valid segment register + if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo)) + return ErrorOperand(Start, "invalid segment register"); + // Eat ':' and update Start location + Start = Lex().getLoc(); } // Immediates and Memory - - // Parse [ BaseReg + Scale*IndexReg + Disp ]. 
- if (getLexer().is(AsmToken::LBrac)) - return ParseIntelBracExpression(/*SegReg=*/0, Start, /*ImmDisp=*/0, false, - Size); - - AsmToken StartTok = Tok; - IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true, - /*AddImmPrefix=*/false); + IntelExprStateMachine SM; if (ParseIntelExpression(SM, End)) return nullptr; - bool isSymbol = SM.getSym() && SM.getSym()->getKind() != MCExpr::Constant; + if (isParsingInlineAsm()) + RewriteIntelExpression(SM, Start, Tok.getLoc()); + int64_t Imm = SM.getImm(); - if (SM.getSym() && SM.getSym()->getKind() == MCExpr::Constant) - SM.getSym()->evaluateAsAbsolute(Imm); - - if (StartTok.isNot(AsmToken::Identifier) && - StartTok.isNot(AsmToken::String) && isParsingInlineAsm()) { - unsigned Len = Tok.getLoc().getPointer() - Start.getPointer(); - if (StartTok.getString().size() == Len) - // Just add a prefix if this wasn't a complex immediate expression. - InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start); - else - // Otherwise, rewrite the complex expression as a single immediate. - InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm); - } - - if (getLexer().isNot(AsmToken::LBrac)) { - // If a directional label (ie. 1f or 2b) was parsed above from - // ParseIntelExpression() then SM.getSym() was set to a pointer to - // to the MCExpr with the directional local symbol and this is a - // memory operand not an immediate operand. - if (isSymbol) { - if (isParsingInlineAsm()) - return CreateMemForInlineAsm(/*SegReg=*/0, SM.getSym(), /*BaseReg=*/0, - /*IndexReg=*/0, - /*Scale=*/1, Start, End, Size, - SM.getSymName(), SM.getIdentifierInfo()); - return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End, - Size); - } - - const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext()); - return X86Operand::CreateImm(ImmExpr, Start, End); - } - - // Only positive immediates are valid. - if (Imm < 0) - return ErrorOperand(Start, "expected a positive immediate displacement " - "before bracketed expr."); - - return ParseIntelBracExpression(/*SegReg=*/0, Start, Imm, isSymbol, Size); + const MCExpr *Disp = SM.getSym(); + const MCExpr *ImmDisp = MCConstantExpr::create(Imm, getContext()); + if (Disp && Imm) + Disp = MCBinaryExpr::createAdd(Disp, ImmDisp, getContext()); + if (!Disp) + Disp = ImmDisp; + + // RegNo != 0 specifies a valid segment register, + // and we are parsing a segment override + if (!SM.isMemExpr() && !RegNo) + return X86Operand::CreateImm(Disp, Start, End); + + StringRef ErrMsg; + unsigned BaseReg = SM.getBaseReg(); + unsigned IndexReg = SM.getIndexReg(); + unsigned Scale = SM.getScale(); + + if ((BaseReg || IndexReg) && + CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, ErrMsg)) + return ErrorOperand(Start, ErrMsg); + if (isParsingInlineAsm()) + return CreateMemForInlineAsm(RegNo, Disp, BaseReg, IndexReg, + Scale, Start, End, Size, SM.getSymName(), + SM.getIdentifierInfo()); + if (!(BaseReg || IndexReg || RegNo)) + return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size); + return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, + BaseReg, IndexReg, Scale, Start, End, Size); } std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() { @@ -2055,14 +1971,20 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, // no errors. // Query for the need of further parsing for a {%k<NUM>} mark if (!Z || getLexer().is(AsmToken::LCurly)) { - const SMLoc StartLoc = Z ? consumeToken() : consumedToken; + SMLoc StartLoc = Z ? 
consumeToken() : consumedToken; // Parse an op-mask register mark ({%k<NUM>}), which is now to be // expected - if (std::unique_ptr<X86Operand> Op = ParseOperand()) { + unsigned RegNo; + SMLoc RegLoc; + if (!ParseRegister(RegNo, RegLoc, StartLoc) && + X86MCRegisterClasses[X86::VK1RegClassID].contains(RegNo)) { + if (RegNo == X86::K0) + return Error(RegLoc, "Register k0 can't be used as write mask"); if (!getLexer().is(AsmToken::RCurly)) return Error(getLexer().getLoc(), "Expected } at this point"); Operands.push_back(X86Operand::CreateToken("{", StartLoc)); - Operands.push_back(std::move(Op)); + Operands.push_back( + X86Operand::CreateReg(RegNo, StartLoc, StartLoc)); Operands.push_back(X86Operand::CreateToken("}", consumeToken())); } else return Error(getLexer().getLoc(), @@ -2072,7 +1994,8 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands, // Have we've found a parsing error, or found no (expected) {z} mark // - report an error if (ParseZ(Z, consumeToken()) || !Z) - return true; + return Error(getLexer().getLoc(), + "Expected a {z} mark at this point"); } // '{z}' on its own is meaningless, hence should be ignored. @@ -2125,9 +2048,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, // memory operand consumed. } else { SMLoc ExprEnd; + getLexer().UnLex(AsmToken(AsmToken::LParen, "(")); - // It must be an parenthesized expression, parse it now. - if (getParser().parseParenExpression(Disp, ExprEnd)) + // It must be either an parenthesized expression, or an expression that + // begins from a parenthesized expression, parse it now. Example: (1+2) or + // (1+2)+3 + if (getParser().parseExpression(Disp, ExprEnd)) return nullptr; // After parsing the base expression we could either have a parenthesized @@ -2258,7 +2184,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg, } StringRef ErrMsg; - if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) { + if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, ErrMsg)) { Error(BaseLoc, ErrMsg); return nullptr; } @@ -2275,7 +2201,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, InstInfo = &Info; StringRef PatchedName = Name; - if (Name == "jmp" && isParsingIntelSyntax() && isParsingInlineAsm()) { + if ((Name.equals("jmp") || Name.equals("jc") || Name.equals("jz")) && + isParsingIntelSyntax() && isParsingInlineAsm()) { StringRef NextTok = Parser.getTok().getString(); if (NextTok == "short") { SMLoc NameEndLoc = @@ -2417,22 +2344,57 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } } - Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); // Determine whether this is an instruction prefix. - bool isPrefix = - Name == "lock" || Name == "rep" || - Name == "repe" || Name == "repz" || - Name == "repne" || Name == "repnz" || - Name == "rex64" || Name == "data16" || Name == "data32"; + // FIXME: + // Enhance prefixes integrity robustness. 
for example, following forms + // are currently tolerated: + // repz repnz <insn> ; GAS errors for the use of two similar prefixes + // lock addq %rax, %rbx ; Destination operand must be of memory type + // xacquire <insn> ; xacquire must be accompanied by 'lock' + bool isPrefix = StringSwitch<bool>(Name) + .Cases("rex64", "data32", "data16", true) + .Cases("xacquire", "xrelease", true) + .Cases("acquire", "release", isParsingIntelSyntax()) + .Default(false); + + auto isLockRepeatPrefix = [](StringRef N) { + return StringSwitch<bool>(N) + .Cases("lock", "rep", "repe", "repz", "repne", "repnz", true) + .Default(false); + }; bool CurlyAsEndOfStatement = false; + + unsigned Flags = X86::IP_NO_PREFIX; + while (isLockRepeatPrefix(Name.lower())) { + unsigned Prefix = + StringSwitch<unsigned>(Name) + .Cases("lock", "lock", X86::IP_HAS_LOCK) + .Cases("rep", "repe", "repz", X86::IP_HAS_REPEAT) + .Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE) + .Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible) + Flags |= Prefix; + Name = Parser.getTok().getString(); + Parser.Lex(); // eat the prefix + // Hack: we could have something like + // "lock; cmpxchg16b $1" or "lock\0A\09incl" or "lock/incl" + while (Name.startswith(";") || Name.startswith("\n") || + Name.startswith("\t") || Name.startswith("/")) { + Name = Parser.getTok().getString(); + Parser.Lex(); // go to next prefix or instr + } + } + + if (Flags) + PatchedName = Name; + Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc)); + // This does the actual operand parsing. Don't parse any more if we have a // prefix juxtaposed with an operation like "lock incl 4(%rax)", because we // just want to parse the "lock" as the first instruction and the "incl" as // the next one. if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) { - // Parse '*' modifier. 
if (getLexer().is(AsmToken::Star)) Operands.push_back(X86Operand::CreateToken("*", consumeToken())); @@ -2670,6 +2632,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name, } } + if (Flags) + Operands.push_back(X86Operand::CreatePrefix(Flags, NameLoc, NameLoc)); return false; } @@ -2677,12 +2641,79 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) { return false; } +bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) { + const MCRegisterInfo *MRI = getContext().getRegisterInfo(); + + switch (Inst.getOpcode()) { + case X86::VGATHERDPDYrm: + case X86::VGATHERDPDrm: + case X86::VGATHERDPSYrm: + case X86::VGATHERDPSrm: + case X86::VGATHERQPDYrm: + case X86::VGATHERQPDrm: + case X86::VGATHERQPSYrm: + case X86::VGATHERQPSrm: + case X86::VPGATHERDDYrm: + case X86::VPGATHERDDrm: + case X86::VPGATHERDQYrm: + case X86::VPGATHERDQrm: + case X86::VPGATHERQDYrm: + case X86::VPGATHERQDrm: + case X86::VPGATHERQQYrm: + case X86::VPGATHERQQrm: { + unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg()); + unsigned Mask = MRI->getEncodingValue(Inst.getOperand(1).getReg()); + unsigned Index = + MRI->getEncodingValue(Inst.getOperand(3 + X86::AddrIndexReg).getReg()); + if (Dest == Mask || Dest == Index || Mask == Index) + return Warning(Ops[0]->getStartLoc(), "mask, index, and destination " + "registers should be distinct"); + break; + } + case X86::VGATHERDPDZ128rm: + case X86::VGATHERDPDZ256rm: + case X86::VGATHERDPDZrm: + case X86::VGATHERDPSZ128rm: + case X86::VGATHERDPSZ256rm: + case X86::VGATHERDPSZrm: + case X86::VGATHERQPDZ128rm: + case X86::VGATHERQPDZ256rm: + case X86::VGATHERQPDZrm: + case X86::VGATHERQPSZ128rm: + case X86::VGATHERQPSZ256rm: + case X86::VGATHERQPSZrm: + case X86::VPGATHERDDZ128rm: + case X86::VPGATHERDDZ256rm: + case X86::VPGATHERDDZrm: + case X86::VPGATHERDQZ128rm: + case X86::VPGATHERDQZ256rm: + case X86::VPGATHERDQZrm: + case X86::VPGATHERQDZ128rm: + case X86::VPGATHERQDZ256rm: + case X86::VPGATHERQDZrm: + case X86::VPGATHERQQZ128rm: + case X86::VPGATHERQQZ256rm: + case X86::VPGATHERQQZrm: { + unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg()); + unsigned Index = + MRI->getEncodingValue(Inst.getOperand(4 + X86::AddrIndexReg).getReg()); + if (Dest == Index) + return Warning(Ops[0]->getStartLoc(), "index and destination registers " + "should be distinct"); + break; + } + } + + return false; +} + static const char *getSubtargetFeatureName(uint64_t Val); void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out) { - Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, getContext(), - MII, Out); + Instrumentation->InstrumentAndEmitInstruction( + Inst, Operands, getContext(), MII, Out, + getParser().shouldPrintSchedInfo()); } bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, @@ -2737,6 +2768,16 @@ bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo, return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm); } +static unsigned getPrefixes(OperandVector &Operands) { + unsigned Result = 0; + X86Operand &Prefix = static_cast<X86Operand &>(*Operands.back()); + if (Prefix.isPrefix()) { + Result = Prefix.getPrefix(); + Operands.pop_back(); + } + return Result; +} + bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, OperandVector &Operands, MCStreamer &Out, @@ -2751,13 +2792,20 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode, 
MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); bool WasOriginallyInvalidOperand = false; + unsigned Prefixes = getPrefixes(Operands); + MCInst Inst; + if (Prefixes) + Inst.setFlags(Prefixes); + // First, try a direct match. switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm, isParsingIntelSyntax())) { default: llvm_unreachable("Unexpected match result!"); case Match_Success: + if (!MatchingInlineAsm && validateInstruction(Inst, Operands)) + return true; // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the // individual transformations can chain off each other. @@ -2917,12 +2965,16 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, StringRef Mnemonic = Op.getToken(); SMRange EmptyRange = None; StringRef Base = Op.getToken(); + unsigned Prefixes = getPrefixes(Operands); // First, handle aliases that expand to multiple instructions. MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm); MCInst Inst; + if (Prefixes) + Inst.setFlags(Prefixes); + // Find one unsized memory operand, if present. X86Operand *UnsizedMemOp = nullptr; for (const auto &Op : Operands) { @@ -3043,6 +3095,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode, // instruction will already have been filled in correctly, since the failing // matches won't have modified it). if (NumSuccessfulMatches == 1) { + if (!MatchingInlineAsm && validateInstruction(Inst, Operands)) + return true; // Some instructions need post-processing to, for example, tweak which // encoding is selected. Loop on it while changes happen so the individual // transformations can chain off each other. @@ -3121,6 +3175,19 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return false; } else if (IDVal == ".even") return parseDirectiveEven(DirectiveID.getLoc()); + else if (IDVal == ".cv_fpo_proc") + return parseDirectiveFPOProc(DirectiveID.getLoc()); + else if (IDVal == ".cv_fpo_setframe") + return parseDirectiveFPOSetFrame(DirectiveID.getLoc()); + else if (IDVal == ".cv_fpo_pushreg") + return parseDirectiveFPOPushReg(DirectiveID.getLoc()); + else if (IDVal == ".cv_fpo_stackalloc") + return parseDirectiveFPOStackAlloc(DirectiveID.getLoc()); + else if (IDVal == ".cv_fpo_endprologue") + return parseDirectiveFPOEndPrologue(DirectiveID.getLoc()); + else if (IDVal == ".cv_fpo_endproc") + return parseDirectiveFPOEndProc(DirectiveID.getLoc()); + return true; } @@ -3218,6 +3285,71 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { return false; } +// .cv_fpo_proc foo +bool X86AsmParser::parseDirectiveFPOProc(SMLoc L) { + MCAsmParser &Parser = getParser(); + StringRef ProcName; + int64_t ParamsSize; + if (Parser.parseIdentifier(ProcName)) + return Parser.TokError("expected symbol name"); + if (Parser.parseIntToken(ParamsSize, "expected parameter byte count")) + return true; + if (!isUIntN(32, ParamsSize)) + return Parser.TokError("parameters size out of range"); + if (Parser.parseEOL("unexpected tokens")) + return addErrorSuffix(" in '.cv_fpo_proc' directive"); + MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName); + return getTargetStreamer().emitFPOProc(ProcSym, ParamsSize, L); +} + +// .cv_fpo_setframe ebp +bool X86AsmParser::parseDirectiveFPOSetFrame(SMLoc L) { + MCAsmParser &Parser = getParser(); + unsigned Reg; + SMLoc DummyLoc; + if (ParseRegister(Reg, DummyLoc, DummyLoc) || + Parser.parseEOL("unexpected tokens")) + return 
addErrorSuffix(" in '.cv_fpo_setframe' directive"); + return getTargetStreamer().emitFPOSetFrame(Reg, L); +} + +// .cv_fpo_pushreg ebx +bool X86AsmParser::parseDirectiveFPOPushReg(SMLoc L) { + MCAsmParser &Parser = getParser(); + unsigned Reg; + SMLoc DummyLoc; + if (ParseRegister(Reg, DummyLoc, DummyLoc) || + Parser.parseEOL("unexpected tokens")) + return addErrorSuffix(" in '.cv_fpo_pushreg' directive"); + return getTargetStreamer().emitFPOPushReg(Reg, L); +} + +// .cv_fpo_stackalloc 20 +bool X86AsmParser::parseDirectiveFPOStackAlloc(SMLoc L) { + MCAsmParser &Parser = getParser(); + int64_t Offset; + if (Parser.parseIntToken(Offset, "expected offset") || + Parser.parseEOL("unexpected tokens")) + return addErrorSuffix(" in '.cv_fpo_stackalloc' directive"); + return getTargetStreamer().emitFPOStackAlloc(Offset, L); +} + +// .cv_fpo_endprologue +bool X86AsmParser::parseDirectiveFPOEndPrologue(SMLoc L) { + MCAsmParser &Parser = getParser(); + if (Parser.parseEOL("unexpected tokens")) + return addErrorSuffix(" in '.cv_fpo_endprologue' directive"); + return getTargetStreamer().emitFPOEndPrologue(L); +} + +// .cv_fpo_endproc +bool X86AsmParser::parseDirectiveFPOEndProc(SMLoc L) { + MCAsmParser &Parser = getParser(); + if (Parser.parseEOL("unexpected tokens")) + return addErrorSuffix(" in '.cv_fpo_endproc' directive"); + return getTargetStreamer().emitFPOEndProc(L); +} + // Force static initialization. extern "C" void LLVMInitializeX86AsmParser() { RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target()); diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h index 0fba15cc692c..43a0561e769b 100644 --- a/lib/Target/X86/AsmParser/X86Operand.h +++ b/lib/Target/X86/AsmParser/X86Operand.h @@ -10,6 +10,7 @@ #ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H #define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H +#include "MCTargetDesc/X86MCTargetDesc.h" #include "X86AsmParserCommon.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" @@ -28,12 +29,7 @@ namespace llvm { /// X86Operand - Instances of this class represent a parsed X86 machine /// instruction. 
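
The .cv_fpo_* handlers above give hand-written assembly a way to describe a CodeView FPO prologue directive by directive. A plausible sequence, following the operand shapes shown in the handler comments (.cv_fpo_proc takes a symbol and a parameter byte count, setframe/pushreg take a register, stackalloc an allocation size), might be:

.cv_fpo_proc _foo 8
.cv_fpo_pushreg ebp
.cv_fpo_setframe ebp
.cv_fpo_stackalloc 20
.cv_fpo_endprologue
  # ... function body ...
.cv_fpo_endproc

The directives only annotate the prologue for the target streamer; the actual push/mov/sub instructions are written separately and are omitted here.
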
struct X86Operand : public MCParsedAsmOperand { - enum KindTy { - Token, - Register, - Immediate, - Memory - } Kind; + enum KindTy { Token, Register, Immediate, Memory, Prefix } Kind; SMLoc StartLoc, EndLoc; SMLoc OffsetOfLoc; @@ -50,6 +46,10 @@ struct X86Operand : public MCParsedAsmOperand { unsigned RegNo; }; + struct PrefOp { + unsigned Prefixes; + }; + struct ImmOp { const MCExpr *Val; }; @@ -73,6 +73,7 @@ struct X86Operand : public MCParsedAsmOperand { struct RegOp Reg; struct ImmOp Imm; struct MemOp Mem; + struct PrefOp Pref; }; X86Operand(KindTy K, SMLoc Start, SMLoc End) @@ -111,6 +112,11 @@ struct X86Operand : public MCParsedAsmOperand { return Reg.RegNo; } + unsigned getPrefix() const { + assert(Kind == Prefix && "Invalid access!"); + return Pref.Prefixes; + } + const MCExpr *getImm() const { assert(Kind == Immediate && "Invalid access!"); return Imm.Val; @@ -387,6 +393,7 @@ struct X86Operand : public MCParsedAsmOperand { return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64); } + bool isPrefix() const { return Kind == Prefix; } bool isReg() const override { return Kind == Register; } bool isGR32orGR64() const { @@ -509,6 +516,13 @@ struct X86Operand : public MCParsedAsmOperand { return Res; } + static std::unique_ptr<X86Operand> + CreatePrefix(unsigned Prefixes, SMLoc StartLoc, SMLoc EndLoc) { + auto Res = llvm::make_unique<X86Operand>(Prefix, StartLoc, EndLoc); + Res->Pref.Prefixes = Prefixes; + return Res; + } + static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val, SMLoc StartLoc, SMLoc EndLoc) { auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc); diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt index 6e08d4cff6ea..7e0df2941467 100644 --- a/lib/Target/X86/CMakeLists.txt +++ b/lib/Target/X86/CMakeLists.txt @@ -11,33 +11,21 @@ tablegen(LLVM X86GenFastISel.inc -gen-fast-isel) tablegen(LLVM X86GenCallingConv.inc -gen-callingconv) tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget) tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables) -if(LLVM_BUILD_GLOBAL_ISEL) - tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank) - tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel) -endif() - -add_public_tablegen_target(X86CommonTableGen) - -# Add GlobalISel files if the build option was enabled. 
-set(GLOBAL_ISEL_FILES - X86CallLowering.cpp - X86LegalizerInfo.cpp - X86RegisterBankInfo.cpp - X86InstructionSelector.cpp - ) +tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank) +tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel) -if(LLVM_BUILD_GLOBAL_ISEL) - set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES}) -else() - set(GLOBAL_ISEL_BUILD_FILES "") - set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES}) +if (X86_GEN_FOLD_TABLES) + tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables) endif() +add_public_tablegen_target(X86CommonTableGen) set(sources X86AsmPrinter.cpp X86CallFrameOptimization.cpp + X86CallLowering.cpp X86CmovConversion.cpp + X86DomainReassignment.cpp X86ExpandPseudo.cpp X86FastISel.cpp X86FixupBWInsts.cpp @@ -45,17 +33,20 @@ set(sources X86FixupSetCC.cpp X86FloatingPoint.cpp X86FrameLowering.cpp + X86InstructionSelector.cpp X86ISelDAGToDAG.cpp X86ISelLowering.cpp X86InterleavedAccess.cpp X86InstrFMA3Info.cpp X86InstrInfo.cpp X86EvexToVex.cpp + X86LegalizerInfo.cpp X86MCInstLower.cpp X86MachineFunctionInfo.cpp X86MacroFusion.cpp X86OptimizeLEAs.cpp X86PadShortFunction.cpp + X86RegisterBankInfo.cpp X86RegisterInfo.cpp X86SelectionDAGInfo.cpp X86ShuffleDecodeConstantPool.cpp @@ -67,7 +58,6 @@ set(sources X86WinAllocaExpander.cpp X86WinEHState.cpp X86CallingConv.cpp - ${GLOBAL_ISEL_BUILD_FILES} ) add_llvm_target(X86CodeGen ${sources}) diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp index 4ce908b1da64..c58254ae38c1 100644 --- a/lib/Target/X86/Disassembler/X86Disassembler.cpp +++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp @@ -74,6 +74,7 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/X86BaseInfo.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "X86DisassemblerDecoder.h" #include "llvm/MC/MCContext.h" @@ -232,7 +233,24 @@ MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction( return Fail; } else { Size = InternalInstr.length; - return (!translateInstruction(Instr, InternalInstr, this)) ? Success : Fail; + bool Ret = translateInstruction(Instr, InternalInstr, this); + if (!Ret) { + unsigned Flags = X86::IP_NO_PREFIX; + if (InternalInstr.hasAdSize) + Flags |= X86::IP_HAS_AD_SIZE; + if (!InternalInstr.mandatoryPrefix) { + if (InternalInstr.hasOpSize) + Flags |= X86::IP_HAS_OP_SIZE; + if (InternalInstr.repeatPrefix == 0xf2) + Flags |= X86::IP_HAS_REPEAT_NE; + else if (InternalInstr.repeatPrefix == 0xf3 && + // It should not be 'pause' f3 90 + InternalInstr.opcode != 0x90) + Flags |= X86::IP_HAS_REPEAT; + } + Instr.setFlags(Flags); + } + return (!Ret) ? Success : Fail; } } @@ -315,12 +333,12 @@ static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) { unsigned baseRegNo; if (insn.mode == MODE_64BIT) - baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::RSI; + baseRegNo = insn.hasAdSize ? X86::ESI : X86::RSI; else if (insn.mode == MODE_32BIT) - baseRegNo = insn.prefixPresent[0x67] ? X86::SI : X86::ESI; + baseRegNo = insn.hasAdSize ? X86::SI : X86::ESI; else { assert(insn.mode == MODE_16BIT); - baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::SI; + baseRegNo = insn.hasAdSize ? X86::ESI : X86::SI; } MCOperand baseReg = MCOperand::createReg(baseRegNo); mcInst.addOperand(baseReg); @@ -340,12 +358,12 @@ static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) { unsigned baseRegNo; if (insn.mode == MODE_64BIT) - baseRegNo = insn.prefixPresent[0x67] ? 
X86::EDI : X86::RDI; + baseRegNo = insn.hasAdSize ? X86::EDI : X86::RDI; else if (insn.mode == MODE_32BIT) - baseRegNo = insn.prefixPresent[0x67] ? X86::DI : X86::EDI; + baseRegNo = insn.hasAdSize ? X86::DI : X86::EDI; else { assert(insn.mode == MODE_16BIT); - baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::DI; + baseRegNo = insn.hasAdSize ? X86::EDI : X86::DI; } MCOperand baseReg = MCOperand::createReg(baseRegNo); mcInst.addOperand(baseReg); @@ -746,102 +764,6 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn, baseReg = MCOperand::createReg(0); } - // Check whether we are handling VSIB addressing mode for GATHER. - // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and - // we should use SIB_INDEX_XMM4|YMM4 for VSIB. - // I don't see a way to get the correct IndexReg in readSIB: - // We can tell whether it is VSIB or SIB after instruction ID is decoded, - // but instruction ID may not be decoded yet when calling readSIB. - uint32_t Opcode = mcInst.getOpcode(); - bool IndexIs128 = (Opcode == X86::VGATHERDPDrm || - Opcode == X86::VGATHERDPDYrm || - Opcode == X86::VGATHERQPDrm || - Opcode == X86::VGATHERDPSrm || - Opcode == X86::VGATHERQPSrm || - Opcode == X86::VPGATHERDQrm || - Opcode == X86::VPGATHERDQYrm || - Opcode == X86::VPGATHERQQrm || - Opcode == X86::VPGATHERDDrm || - Opcode == X86::VPGATHERQDrm || - Opcode == X86::VGATHERDPDZ128rm || - Opcode == X86::VGATHERDPDZ256rm || - Opcode == X86::VGATHERDPSZ128rm || - Opcode == X86::VGATHERQPDZ128rm || - Opcode == X86::VGATHERQPSZ128rm || - Opcode == X86::VPGATHERDDZ128rm || - Opcode == X86::VPGATHERDQZ128rm || - Opcode == X86::VPGATHERDQZ256rm || - Opcode == X86::VPGATHERQDZ128rm || - Opcode == X86::VPGATHERQQZ128rm || - Opcode == X86::VSCATTERDPDZ128mr || - Opcode == X86::VSCATTERDPDZ256mr || - Opcode == X86::VSCATTERDPSZ128mr || - Opcode == X86::VSCATTERQPDZ128mr || - Opcode == X86::VSCATTERQPSZ128mr || - Opcode == X86::VPSCATTERDDZ128mr || - Opcode == X86::VPSCATTERDQZ128mr || - Opcode == X86::VPSCATTERDQZ256mr || - Opcode == X86::VPSCATTERQDZ128mr || - Opcode == X86::VPSCATTERQQZ128mr); - bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm || - Opcode == X86::VGATHERDPSYrm || - Opcode == X86::VGATHERQPSYrm || - Opcode == X86::VGATHERDPDZrm || - Opcode == X86::VPGATHERDQZrm || - Opcode == X86::VPGATHERQQYrm || - Opcode == X86::VPGATHERDDYrm || - Opcode == X86::VPGATHERQDYrm || - Opcode == X86::VGATHERDPSZ256rm || - Opcode == X86::VGATHERQPDZ256rm || - Opcode == X86::VGATHERQPSZ256rm || - Opcode == X86::VPGATHERDDZ256rm || - Opcode == X86::VPGATHERQQZ256rm || - Opcode == X86::VPGATHERQDZ256rm || - Opcode == X86::VSCATTERDPDZmr || - Opcode == X86::VPSCATTERDQZmr || - Opcode == X86::VSCATTERDPSZ256mr || - Opcode == X86::VSCATTERQPDZ256mr || - Opcode == X86::VSCATTERQPSZ256mr || - Opcode == X86::VPSCATTERDDZ256mr || - Opcode == X86::VPSCATTERQQZ256mr || - Opcode == X86::VPSCATTERQDZ256mr || - Opcode == X86::VGATHERPF0DPDm || - Opcode == X86::VGATHERPF1DPDm || - Opcode == X86::VSCATTERPF0DPDm || - Opcode == X86::VSCATTERPF1DPDm); - bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm || - Opcode == X86::VGATHERDPSZrm || - Opcode == X86::VGATHERQPSZrm || - Opcode == X86::VPGATHERQQZrm || - Opcode == X86::VPGATHERDDZrm || - Opcode == X86::VPGATHERQDZrm || - Opcode == X86::VSCATTERQPDZmr || - Opcode == X86::VSCATTERDPSZmr || - Opcode == X86::VSCATTERQPSZmr || - Opcode == X86::VPSCATTERQQZmr || - Opcode == X86::VPSCATTERDDZmr || - Opcode == X86::VPSCATTERQDZmr || - Opcode == X86::VGATHERPF0DPSm || - Opcode == 
X86::VGATHERPF0QPDm || - Opcode == X86::VGATHERPF0QPSm || - Opcode == X86::VGATHERPF1DPSm || - Opcode == X86::VGATHERPF1QPDm || - Opcode == X86::VGATHERPF1QPSm || - Opcode == X86::VSCATTERPF0DPSm || - Opcode == X86::VSCATTERPF0QPDm || - Opcode == X86::VSCATTERPF0QPSm || - Opcode == X86::VSCATTERPF1DPSm || - Opcode == X86::VSCATTERPF1QPDm || - Opcode == X86::VSCATTERPF1QPSm); - if (IndexIs128 || IndexIs256 || IndexIs512) { - unsigned IndexOffset = insn.sibIndex - - (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX); - SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 : - IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0; - insn.sibIndex = (SIBIndex)(IndexBase + - (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset)); - } - if (insn.sibIndex != SIB_INDEX_NONE) { switch (insn.sibIndex) { default: @@ -969,6 +891,9 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand, case TYPE_BNDR: return translateRMRegister(mcInst, insn); case TYPE_M: + case TYPE_MVSIBX: + case TYPE_MVSIBY: + case TYPE_MVSIBZ: return translateRMMemory(mcInst, insn, Dis); } } @@ -1034,6 +959,9 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand, insn, Dis); return false; + case ENCODING_IRC: + mcInst.addOperand(MCOperand::createImm(insn.RC)); + return false; case ENCODING_SI: return translateSrcIndex(mcInst, insn); case ENCODING_DI: diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp index 577b7a776c6d..843d037ad3cd 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp @@ -277,38 +277,44 @@ static void dbgprintf(struct InternalInstruction* insn, insn->dlog(insn->dlogArg, buffer); } -/* - * setPrefixPresent - Marks that a particular prefix is present at a particular - * location. - * - * @param insn - The instruction to be marked as having the prefix. - * @param prefix - The prefix that is present. - * @param location - The location where the prefix is located (in the address - * space of the instruction's reader). - */ -static void setPrefixPresent(struct InternalInstruction* insn, - uint8_t prefix, - uint64_t location) -{ - insn->prefixPresent[prefix] = 1; - insn->prefixLocations[prefix] = location; +static bool isREX(struct InternalInstruction *insn, uint8_t prefix) { + if (insn->mode == MODE_64BIT) + return prefix >= 0x40 && prefix <= 0x4f; + return false; } /* - * isPrefixAtLocation - Queries an instruction to determine whether a prefix is - * present at a given location. + * setPrefixPresent - Marks that a particular prefix is present as mandatory * - * @param insn - The instruction to be queried. - * @param prefix - The prefix. - * @param location - The location to query. - * @return - Whether the prefix is at that location. + * @param insn - The instruction to be marked as having the prefix. + * @param prefix - The prefix that is present. */ -static bool isPrefixAtLocation(struct InternalInstruction* insn, - uint8_t prefix, - uint64_t location) -{ - return insn->prefixPresent[prefix] == 1 && - insn->prefixLocations[prefix] == location; +static void setPrefixPresent(struct InternalInstruction *insn, uint8_t prefix) { + uint8_t nextByte; + switch (prefix) { + case 0xf2: + case 0xf3: + if (lookAtByte(insn, &nextByte)) + break; + // TODO: + // 1. There could be several 0x66 + // 2. if (nextByte == 0x66) and nextNextByte != 0x0f then + // it's not mandatory prefix + // 3. 
if (nextByte >= 0x40 && nextByte <= 0x4f) it's REX and we need + // 0x0f exactly after it to be mandatory prefix + if (isREX(insn, nextByte) || nextByte == 0x0f || nextByte == 0x66) + // The last of 0xf2 /0xf3 is mandatory prefix + insn->mandatoryPrefix = prefix; + insn->repeatPrefix = prefix; + break; + case 0x66: + if (lookAtByte(insn, &nextByte)) + break; + // 0x66 can't overwrite existing mandatory prefix and should be ignored + if (!insn->mandatoryPrefix && (nextByte == 0x0f || isREX(insn, nextByte))) + insn->mandatoryPrefix = prefix; + break; + } } /* @@ -322,19 +328,12 @@ static bool isPrefixAtLocation(struct InternalInstruction* insn, */ static int readPrefixes(struct InternalInstruction* insn) { bool isPrefix = true; - bool prefixGroups[4] = { false }; - uint64_t prefixLocation; uint8_t byte = 0; uint8_t nextByte; - bool hasAdSize = false; - bool hasOpSize = false; - dbgprintf(insn, "readPrefixes()"); while (isPrefix) { - prefixLocation = insn->readerCursor; - /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */ if (consumeByte(insn, &byte)) break; @@ -343,13 +342,10 @@ static int readPrefixes(struct InternalInstruction* insn) { * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then * break and let it be disassembled as a normal "instruction". */ - if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) + if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) // LOCK break; - if (insn->readerCursor - 1 == insn->startLocation - && (byte == 0xf2 || byte == 0xf3) - && !lookAtByte(insn, &nextByte)) - { + if ((byte == 0xf2 || byte == 0xf3) && !lookAtByte(insn, &nextByte)) { /* * If the byte is 0xf2 or 0xf3, and any of the following conditions are * met: @@ -357,39 +353,41 @@ static int readPrefixes(struct InternalInstruction* insn) { * - it is followed by an xchg instruction * then it should be disassembled as a xacquire/xrelease not repne/rep. */ - if ((byte == 0xf2 || byte == 0xf3) && - ((nextByte == 0xf0) || - ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) + if (((nextByte == 0xf0) || + ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) { insn->xAcquireRelease = true; + if (!(byte == 0xf3 && nextByte == 0x90)) // PAUSE instruction support + break; + } /* * Also if the byte is 0xf3, and the following condition is met: * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or * "mov mem, imm" (opcode 0xc6/0xc7) instructions. * then it should be disassembled as an xrelease not rep. 
*/ - if (byte == 0xf3 && - (nextByte == 0x88 || nextByte == 0x89 || - nextByte == 0xc6 || nextByte == 0xc7)) + if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 || + nextByte == 0xc6 || nextByte == 0xc7)) { insn->xAcquireRelease = true; - if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) { - if (consumeByte(insn, &nextByte)) + if (nextByte != 0x90) // PAUSE instruction support + break; + } + if (isREX(insn, nextByte)) { + uint8_t nnextByte; + // Go to REX prefix after the current one + if (consumeByte(insn, &nnextByte)) return -1; - if (lookAtByte(insn, &nextByte)) + // We should be able to read next byte after REX prefix + if (lookAtByte(insn, &nnextByte)) return -1; unconsumeByte(insn); } - if (nextByte != 0x0f && nextByte != 0x90) - break; } switch (byte) { case 0xf0: /* LOCK */ case 0xf2: /* REPNE/REPNZ */ case 0xf3: /* REP or REPE/REPZ */ - if (prefixGroups[0]) - dbgprintf(insn, "Redundant Group 1 prefix"); - prefixGroups[0] = true; - setPrefixPresent(insn, byte, prefixLocation); + setPrefixPresent(insn, byte); break; case 0x2e: /* CS segment override -OR- Branch not taken */ case 0x36: /* SS segment override -OR- Branch taken */ @@ -420,24 +418,15 @@ static int readPrefixes(struct InternalInstruction* insn) { debug("Unhandled override"); return -1; } - if (prefixGroups[1]) - dbgprintf(insn, "Redundant Group 2 prefix"); - prefixGroups[1] = true; - setPrefixPresent(insn, byte, prefixLocation); + setPrefixPresent(insn, byte); break; case 0x66: /* Operand-size override */ - if (prefixGroups[2]) - dbgprintf(insn, "Redundant Group 3 prefix"); - prefixGroups[2] = true; - hasOpSize = true; - setPrefixPresent(insn, byte, prefixLocation); + insn->hasOpSize = true; + setPrefixPresent(insn, byte); break; case 0x67: /* Address-size override */ - if (prefixGroups[3]) - dbgprintf(insn, "Redundant Group 4 prefix"); - prefixGroups[3] = true; - hasAdSize = true; - setPrefixPresent(insn, byte, prefixLocation); + insn->hasAdSize = true; + setPrefixPresent(insn, byte); break; default: /* Not a prefix byte */ isPrefix = false; @@ -469,7 +458,6 @@ static int readPrefixes(struct InternalInstruction* insn) { } else { unconsumeByte(insn); /* unconsume byte1 */ unconsumeByte(insn); /* unconsume byte */ - insn->necessaryPrefixLocation = insn->readerCursor - 2; } if (insn->vectorExtensionType == TYPE_EVEX) { @@ -505,13 +493,10 @@ static int readPrefixes(struct InternalInstruction* insn) { return -1; } - if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) { + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) insn->vectorExtensionType = TYPE_VEX_3B; - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } else { + else unconsumeByte(insn); - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } if (insn->vectorExtensionType == TYPE_VEX_3B) { insn->vectorExtensionPrefix[0] = byte; @@ -520,13 +505,12 @@ static int readPrefixes(struct InternalInstruction* insn) { /* We simulate the REX prefix for simplicity's sake */ - if (insn->mode == MODE_64BIT) { + if (insn->mode == MODE_64BIT) insn->rexPrefix = 0x40 | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3) | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2) | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1) | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0); - } dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx", insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], @@ -540,26 +524,24 @@ static int readPrefixes(struct InternalInstruction* insn) { return -1; } - if (insn->mode == MODE_64BIT || (byte1 & 
0xc0) == 0xc0) { + if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) insn->vectorExtensionType = TYPE_VEX_2B; - } else { + else unconsumeByte(insn); - } if (insn->vectorExtensionType == TYPE_VEX_2B) { insn->vectorExtensionPrefix[0] = byte; consumeByte(insn, &insn->vectorExtensionPrefix[1]); - if (insn->mode == MODE_64BIT) { + if (insn->mode == MODE_64BIT) insn->rexPrefix = 0x40 | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2); - } switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) { default: break; case VEX_PREFIX_66: - hasOpSize = true; + insn->hasOpSize = true; break; } @@ -575,13 +557,10 @@ static int readPrefixes(struct InternalInstruction* insn) { return -1; } - if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */ + if ((byte1 & 0x38) != 0x0) /* 0 in these 3 bits is a POP instruction. */ insn->vectorExtensionType = TYPE_XOP; - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } else { + else unconsumeByte(insn); - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } if (insn->vectorExtensionType == TYPE_XOP) { insn->vectorExtensionPrefix[0] = byte; @@ -590,19 +569,18 @@ static int readPrefixes(struct InternalInstruction* insn) { /* We simulate the REX prefix for simplicity's sake */ - if (insn->mode == MODE_64BIT) { + if (insn->mode == MODE_64BIT) insn->rexPrefix = 0x40 | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3) | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2) | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1) | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0); - } switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) { default: break; case VEX_PREFIX_66: - hasOpSize = true; + insn->hasOpSize = true; break; } @@ -610,51 +588,35 @@ static int readPrefixes(struct InternalInstruction* insn) { insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1], insn->vectorExtensionPrefix[2]); } - } else { - if (insn->mode == MODE_64BIT) { - if ((byte & 0xf0) == 0x40) { - uint8_t opcodeByte; - - if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) { - dbgprintf(insn, "Redundant REX prefix"); - return -1; - } - - insn->rexPrefix = byte; - insn->necessaryPrefixLocation = insn->readerCursor - 2; - - dbgprintf(insn, "Found REX prefix 0x%hhx", byte); - } else { - unconsumeByte(insn); - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - } else { - unconsumeByte(insn); - insn->necessaryPrefixLocation = insn->readerCursor - 1; - } - } + } else if (isREX(insn, byte)) { + if (lookAtByte(insn, &nextByte)) + return -1; + insn->rexPrefix = byte; + dbgprintf(insn, "Found REX prefix 0x%hhx", byte); + } else + unconsumeByte(insn); if (insn->mode == MODE_16BIT) { - insn->registerSize = (hasOpSize ? 4 : 2); - insn->addressSize = (hasAdSize ? 4 : 2); - insn->displacementSize = (hasAdSize ? 4 : 2); - insn->immediateSize = (hasOpSize ? 4 : 2); + insn->registerSize = (insn->hasOpSize ? 4 : 2); + insn->addressSize = (insn->hasAdSize ? 4 : 2); + insn->displacementSize = (insn->hasAdSize ? 4 : 2); + insn->immediateSize = (insn->hasOpSize ? 4 : 2); } else if (insn->mode == MODE_32BIT) { - insn->registerSize = (hasOpSize ? 2 : 4); - insn->addressSize = (hasAdSize ? 2 : 4); - insn->displacementSize = (hasAdSize ? 2 : 4); - insn->immediateSize = (hasOpSize ? 2 : 4); + insn->registerSize = (insn->hasOpSize ? 2 : 4); + insn->addressSize = (insn->hasAdSize ? 2 : 4); + insn->displacementSize = (insn->hasAdSize ? 2 : 4); + insn->immediateSize = (insn->hasOpSize ? 
2 : 4); } else if (insn->mode == MODE_64BIT) { if (insn->rexPrefix && wFromREX(insn->rexPrefix)) { insn->registerSize = 8; - insn->addressSize = (hasAdSize ? 4 : 8); + insn->addressSize = (insn->hasAdSize ? 4 : 8); insn->displacementSize = 4; insn->immediateSize = 4; } else { - insn->registerSize = (hasOpSize ? 2 : 4); - insn->addressSize = (hasAdSize ? 4 : 8); - insn->displacementSize = (hasOpSize ? 2 : 4); - insn->immediateSize = (hasOpSize ? 2 : 4); + insn->registerSize = (insn->hasOpSize ? 2 : 4); + insn->addressSize = (insn->hasAdSize ? 4 : 8); + insn->displacementSize = (insn->hasOpSize ? 2 : 4); + insn->immediateSize = (insn->hasOpSize ? 2 : 4); } } @@ -758,7 +720,10 @@ static int readOpcode(struct InternalInstruction* insn) { insn->opcodeType = TWOBYTE; } - } + } else if (insn->mandatoryPrefix) + // The opcode with mandatory prefix must start with opcode escape. + // If not it's legacy repeat prefix + insn->mandatoryPrefix = 0; /* * At this point we have consumed the full opcode. @@ -950,19 +915,44 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { } else { return -1; } - } else { - if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) + } else if (!insn->mandatoryPrefix) { + // If we don't have mandatory prefix we should use legacy prefixes here + if (insn->hasOpSize && (insn->mode != MODE_16BIT)) attrMask |= ATTR_OPSIZE; - else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation)) + if (insn->hasAdSize) attrMask |= ATTR_ADSIZE; - else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation)) - attrMask |= ATTR_XS; - else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation)) + if (insn->opcodeType == ONEBYTE) { + if (insn->repeatPrefix == 0xf3 && (insn->opcode == 0x90)) + // Special support for PAUSE + attrMask |= ATTR_XS; + } else { + if (insn->repeatPrefix == 0xf2) + attrMask |= ATTR_XD; + else if (insn->repeatPrefix == 0xf3) + attrMask |= ATTR_XS; + } + } else { + switch (insn->mandatoryPrefix) { + case 0xf2: attrMask |= ATTR_XD; + break; + case 0xf3: + attrMask |= ATTR_XS; + break; + case 0x66: + if (insn->mode != MODE_16BIT) + attrMask |= ATTR_OPSIZE; + break; + case 0x67: + attrMask |= ATTR_ADSIZE; + break; + } } - if (insn->rexPrefix & 0x08) + if (insn->rexPrefix & 0x08) { attrMask |= ATTR_REXW; + attrMask &= ~ATTR_ADSIZE; + } /* * JCXZ/JECXZ need special handling for 16-bit mode because the meaning @@ -977,8 +967,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { * CALL/JMP/JCC instructions need to ignore 0x66 and consume 4 bytes */ - if (insn->mode == MODE_64BIT && - isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) { + if ((insn->mode == MODE_64BIT) && insn->hasOpSize) { switch (insn->opcode) { case 0xE8: case 0xE9: @@ -1058,9 +1047,9 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { */ if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) { /* Make sure we observed the prefixes in any position. */ - if (insn->prefixPresent[0x67]) + if (insn->hasAdSize) attrMask |= ATTR_ADSIZE; - if (insn->prefixPresent[0x66]) + if (insn->hasOpSize) attrMask |= ATTR_OPSIZE; /* In 16-bit, invert the attributes. 
*/ @@ -1075,7 +1064,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { return 0; } - if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) && + if ((insn->mode == MODE_16BIT || insn->hasOpSize) && !(attrMask & ATTR_OPSIZE)) { /* * The instruction tables make no distinction between instructions that @@ -1108,7 +1097,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg); if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) && - (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) { + (insn->mode == MODE_16BIT) ^ insn->hasOpSize) { insn->instructionID = instructionIDWithOpsize; insn->spec = specifierForUID(instructionIDWithOpsize); } else { @@ -1169,7 +1158,6 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) { * @return - 0 if the SIB byte was successfully read; nonzero otherwise. */ static int readSIB(struct InternalInstruction* insn) { - SIBIndex sibIndexBase = SIB_INDEX_NONE; SIBBase sibBaseBase = SIB_BASE_NONE; uint8_t index, base; @@ -1185,11 +1173,11 @@ static int readSIB(struct InternalInstruction* insn) { dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode"); return -1; case 4: - sibIndexBase = SIB_INDEX_EAX; + insn->sibIndexBase = SIB_INDEX_EAX; sibBaseBase = SIB_BASE_EAX; break; case 8: - sibIndexBase = SIB_INDEX_RAX; + insn->sibIndexBase = SIB_INDEX_RAX; sibBaseBase = SIB_BASE_RAX; break; } @@ -1199,26 +1187,10 @@ static int readSIB(struct InternalInstruction* insn) { index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3); - // FIXME: The fifth bit (bit index 4) is only to be used for instructions - // that understand VSIB indexing. ORing the bit in here is mildy dangerous - // because performing math on an 'enum SIBIndex' can produce garbage. - // Excluding the "none" value, it should cover 6 spaces of register names: - // - 16 possibilities for 16-bit GPR starting at SIB_INDEX_BX_SI - // - 16 possibilities for 32-bit GPR starting at SIB_INDEX_EAX - // - 16 possibilities for 64-bit GPR starting at SIB_INDEX_RAX - // - 32 possibilities for each of XMM, YMM, ZMM registers - // When sibIndexBase gets assigned SIB_INDEX_RAX as it does in 64-bit mode, - // summing in a fully decoded index between 0 and 31 can end up with a value - // that looks like something in the low half of the XMM range. 
- // translateRMMemory() tries to reverse the damage, with only partial success, - // as evidenced by known bugs in "test/MC/Disassembler/X86/x86-64.txt" - if (insn->vectorExtensionType == TYPE_EVEX) - index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4; - if (index == 0x4) { insn->sibIndex = SIB_INDEX_NONE; } else { - insn->sibIndex = (SIBIndex)(sibIndexBase + index); + insn->sibIndex = (SIBIndex)(insn->sibIndexBase + index); } insn->sibScale = 1 << scaleFromSIB(insn->sib); @@ -1483,9 +1455,9 @@ static int readModRM(struct InternalInstruction* insn) { case TYPE_MM64: \ return prefix##_MM0 + (index & 0x7); \ case TYPE_SEGMENTREG: \ - if (index > 5) \ + if ((index & 7) > 5) \ *valid = 0; \ - return prefix##_ES + index; \ + return prefix##_ES + (index & 7); \ case TYPE_DEBUGREG: \ return prefix##_DR0 + index; \ case TYPE_CONTROLREG: \ @@ -1494,6 +1466,12 @@ static int readModRM(struct InternalInstruction* insn) { if (index > 3) \ *valid = 0; \ return prefix##_BND0 + index; \ + case TYPE_MVSIBX: \ + return prefix##_XMM0 + index; \ + case TYPE_MVSIBY: \ + return prefix##_YMM0 + index; \ + case TYPE_MVSIBZ: \ + return prefix##_ZMM0 + index; \ } \ } @@ -1549,7 +1527,6 @@ static int fixupReg(struct InternalInstruction *insn, return -1; break; CASE_ENCODING_RM: - CASE_ENCODING_VSIB: if (insn->eaBase >= insn->eaRegBase) { insn->eaBase = (EABase)fixupRMValue(insn, (OperandType)op->type, @@ -1747,8 +1724,39 @@ static int readOperands(struct InternalInstruction* insn) { needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0); if (readModRM(insn)) return -1; - if (fixupReg(insn, &Op)) + + // Reject if SIB wasn't used. + if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64) + return -1; + + // If sibIndex was set to SIB_INDEX_NONE, index offset is 4. + if (insn->sibIndex == SIB_INDEX_NONE) + insn->sibIndex = (SIBIndex)4; + + // If EVEX.v2 is set this is one of the 16-31 registers. + if (insn->vectorExtensionType == TYPE_EVEX && + v2FromEVEX4of4(insn->vectorExtensionPrefix[3])) + insn->sibIndex = (SIBIndex)(insn->sibIndex + 16); + + // Adjust the index register to the correct size. + switch ((OperandType)Op.type) { + default: + debug("Unhandled VSIB index type"); return -1; + case TYPE_MVSIBX: + insn->sibIndex = (SIBIndex)(SIB_INDEX_XMM0 + + (insn->sibIndex - insn->sibIndexBase)); + break; + case TYPE_MVSIBY: + insn->sibIndex = (SIBIndex)(SIB_INDEX_YMM0 + + (insn->sibIndex - insn->sibIndexBase)); + break; + case TYPE_MVSIBZ: + insn->sibIndex = (SIBIndex)(SIB_INDEX_ZMM0 + + (insn->sibIndex - insn->sibIndexBase)); + break; + } + // Apply the AVX512 compressed displacement scaling factor. 
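
With the fix-up above, the VSIB index register class now comes from the operand type instead of the long per-opcode list deleted from translateRMMemory earlier in this diff. For example, a TYPE_MVSIBX operand whose SIB index field (with REX.X folded in) is 2 leaves readSIB with sibIndexBase + 2, and the adjustment maps it to SIB_INDEX_XMM0 + 2, which translates to XMM2; with EVEX.v2 set, the same encoding lands sixteen registers higher, on XMM18. The compressed-displacement scaling named in the comment just above is then applied to that operand by the code that follows.
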
if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8) insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB); @@ -1797,6 +1805,10 @@ static int readOperands(struct InternalInstruction* insn) { if (readImmediate(insn, insn->addressSize)) return -1; break; + case ENCODING_IRC: + insn->RC = (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 1) | + lFromEVEX4of4(insn->vectorExtensionPrefix[3]); + break; case ENCODING_RB: if (readOpcodeRegister(insn, 1)) return -1; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h index b07fd0b17d35..ecd9d8dccafa 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h @@ -546,24 +546,26 @@ struct InternalInstruction { // Prefix state - // 1 if the prefix byte corresponding to the entry is present; 0 if not - uint8_t prefixPresent[0x100]; - // contains the location (for use with the reader) of the prefix byte - uint64_t prefixLocations[0x100]; + // The possible mandatory prefix + uint8_t mandatoryPrefix; // The value of the vector extension prefix(EVEX/VEX/XOP), if present uint8_t vectorExtensionPrefix[4]; // The type of the vector extension prefix VectorExtensionType vectorExtensionType; // The value of the REX prefix, if present uint8_t rexPrefix; - // The location where a mandatory prefix would have to be (i.e., right before - // the opcode, or right before the REX prefix if one is present). - uint64_t necessaryPrefixLocation; // The segment override type SegmentOverride segmentOverride; // 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease bool xAcquireRelease; + // Address-size override + bool hasAdSize; + // Operand-size override + bool hasOpSize; + // The repeat prefix if any + uint8_t repeatPrefix; + // Sizes of various critical pieces of data, in bytes uint8_t registerSize; uint8_t addressSize; @@ -637,10 +639,14 @@ struct InternalInstruction { Reg reg; // SIB state + SIBIndex sibIndexBase; SIBIndex sibIndex; uint8_t sibScale; SIBBase sibBase; + // Embedded rounding control. 
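
The RC field declared just below caches the EVEX embedded rounding control that ENCODING_IRC turns into an immediate operand. The decoder builds it from the L'L bits of the fourth EVEX prefix byte, so the values 0 through 3 correspond to round-to-nearest, toward negative infinity, toward positive infinity, and toward zero, in line with the static-rounding constants visible in the X86BaseInfo.h hunk later in this diff. A small sketch of the same extraction, assuming only that byte layout:

#include <cstdint>

// EVEX prefix bytes are 0x62, P0, P1, P2; L is bit 5 and L' is bit 6 of P2.
static uint8_t evexRoundingControl(uint8_t P2) {
  uint8_t L  = (P2 >> 5) & 1;
  uint8_t L2 = (P2 >> 6) & 1;
  return (L2 << 1) | L;   // the value the decoder stores in insn->RC
}
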
+ uint8_t RC; + ArrayRef<OperandSpecifier> operands; }; diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h index e0f4399b3687..ad1404860fb6 100644 --- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h +++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h @@ -382,6 +382,7 @@ enum ModRMDecisionType { \ ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \ ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \ + ENUM_ENTRY(ENCODING_IRC, "Immediate for static rounding control") \ ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \ "opcode byte") \ ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \ @@ -410,6 +411,9 @@ enum OperandEncoding { ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \ ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \ ENUM_ENTRY(TYPE_M, "Memory operand") \ + ENUM_ENTRY(TYPE_MVSIBX, "Memory operand using XMM index") \ + ENUM_ENTRY(TYPE_MVSIBY, "Memory operand using YMM index") \ + ENUM_ENTRY(TYPE_MVSIBZ, "Memory operand using ZMM index") \ ENUM_ENTRY(TYPE_SRCIDX, "memory at source index") \ ENUM_ENTRY(TYPE_DSTIDX, "memory at destination index") \ ENUM_ENTRY(TYPE_MOFFS, "memory offset (relative to segment base)") \ diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp index 4d91300c7ede..0c99dbbe328b 100644 --- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp @@ -50,8 +50,16 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, getRegisterName); + unsigned Flags = MI->getFlags(); if (TSFlags & X86II::LOCK) OS << "\tlock\t"; + if (!(TSFlags & X86II::LOCK) && Flags & X86::IP_HAS_LOCK) + OS << "\tlock\t"; + + if (Flags & X86::IP_HAS_REPEAT_NE) + OS << "\trepne\t"; + else if (Flags & X86::IP_HAS_REPEAT) + OS << "\trep\t"; // Output CALLpcrel32 as "callq" in 64-bit mode. // In Intel annotation it's always emitted as "call". diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index f5f3a4cc83dc..a46f22ff40f5 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -205,16 +205,14 @@ static MVT getZeroExtensionResultType(const MCInst *MI) { } /// Wraps the destination register name with AVX512 mask/maskz filtering. 
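
The rewrite that follows replaces getMaskName, which built and returned a std::string, with printMasking, which streams the mask decoration straight into the comment stream. The comment format itself is unchanged: a masked VMOVDDUP, for instance, might be annotated as "zmm0 {%k1} = zmm1[0,0,2,2,4,4,6,6]", with an extra " {z}" appended for the zeroing (maskz) form.
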
-static std::string getMaskName(const MCInst *MI, const char *DestName, - const char *(*getRegName)(unsigned)) { - std::string OpMaskName(DestName); - +static void printMasking(raw_ostream &OS, const MCInst *MI, + const char *(*getRegName)(unsigned)) { bool MaskWithZero = false; const char *MaskRegName = nullptr; switch (MI->getOpcode()) { default: - return OpMaskName; + return; CASE_MASKZ_MOVDUP(MOVDDUP, m) CASE_MASKZ_MOVDUP(MOVDDUP, r) CASE_MASKZ_MOVDUP(MOVSHDUP, m) @@ -293,6 +291,8 @@ static std::string getMaskName(const MCInst *MI, const char *DestName, CASE_MASKZ_INS_COMMON(BROADCASTI32X4, , rm) CASE_MASKZ_INS_COMMON(BROADCASTF32X8, , rm) CASE_MASKZ_INS_COMMON(BROADCASTI32X8, , rm) + CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z128, r) + CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z128, m) CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, r) CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, r) CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, m) @@ -382,6 +382,8 @@ static std::string getMaskName(const MCInst *MI, const char *DestName, CASE_MASK_INS_COMMON(BROADCASTI32X4, , rm) CASE_MASK_INS_COMMON(BROADCASTF32X8, , rm) CASE_MASK_INS_COMMON(BROADCASTI32X8, , rm) + CASE_MASK_INS_COMMON(BROADCASTI32X2, Z128, r) + CASE_MASK_INS_COMMON(BROADCASTI32X2, Z128, m) CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, r) CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, r) CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, m) @@ -395,15 +397,11 @@ static std::string getMaskName(const MCInst *MI, const char *DestName, } // MASK: zmmX {%kY} - OpMaskName += " {%"; - OpMaskName += MaskRegName; - OpMaskName += "}"; + OS << " {%" << MaskRegName << "}"; // MASKZ: zmmX {%kY} {z} if (MaskWithZero) - OpMaskName += " {z}"; - - return OpMaskName; + OS << " {z}"; } //===----------------------------------------------------------------------===// @@ -585,12 +583,12 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPSLLDQYri: case X86::VPSLLDQZ128rr: case X86::VPSLLDQZ256rr: - case X86::VPSLLDQZ512rr: + case X86::VPSLLDQZrr: Src1Name = getRegName(MI->getOperand(1).getReg()); LLVM_FALLTHROUGH; case X86::VPSLLDQZ128rm: case X86::VPSLLDQZ256rm: - case X86::VPSLLDQZ512rm: + case X86::VPSLLDQZrm: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), @@ -603,12 +601,12 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VPSRLDQYri: case X86::VPSRLDQZ128rr: case X86::VPSRLDQZ256rr: - case X86::VPSRLDQZ512rr: + case X86::VPSRLDQZrr: Src1Name = getRegName(MI->getOperand(1).getReg()); LLVM_FALLTHROUGH; case X86::VPSRLDQZ128rm: case X86::VPSRLDQZ256rm: - case X86::VPSRLDQZ512rm: + case X86::VPSRLDQZrm: DestName = getRegName(MI->getOperand(0).getReg()); if (MI->getOperand(NumOperands - 1).isImm()) DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0), @@ -1090,6 +1088,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeSubVectorBroadcast(MVT::v16f32, MVT::v8f32, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, r) + Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg()); + LLVM_FALLTHROUGH; + CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, m) + DecodeSubVectorBroadcast(MVT::v4f32, MVT::v2f32, ShuffleMask); + DestName = getRegName(MI->getOperand(0).getReg()); + break; CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r) CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r) Src1Name = getRegName(MI->getOperand(NumOperands - 
1).getReg()); @@ -1149,7 +1154,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, return false; if (!DestName) DestName = Src1Name; - OS << (DestName ? getMaskName(MI, DestName, getRegName) : "mem") << " = "; + if (DestName) { + OS << DestName; + printMasking(OS, MI, getRegName); + } else + OS << "mem"; + + OS << " = "; // If the two sources are the same, canonicalize the input elements to be // from the first src so that we get larger element spans. diff --git a/lib/Target/X86/InstPrinter/X86InstComments.h b/lib/Target/X86/InstPrinter/X86InstComments.h index c6d0d85a7d3d..629c02c95c7f 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.h +++ b/lib/Target/X86/InstPrinter/X86InstComments.h @@ -15,10 +15,13 @@ #ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H #define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H +#include "llvm/CodeGen/MachineInstr.h" + namespace llvm { enum AsmComments { - AC_EVEX_2_VEX = 0x2 // For instr that was compressed from EVEX to VEX. + // For instr that was compressed from EVEX to VEX. + AC_EVEX_2_VEX = MachineInstr::TAsmComments }; class MCInst; diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp index d6af6712d5a1..1f02600a7982 100644 --- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp +++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp @@ -41,7 +41,13 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS, uint64_t TSFlags = Desc.TSFlags; if (TSFlags & X86II::LOCK) - OS << "\tlock\n"; + OS << "\tlock\t"; + + unsigned Flags = MI->getFlags(); + if (Flags & X86::IP_HAS_REPEAT_NE) + OS << "\trepne\t"; + else if (Flags & X86::IP_HAS_REPEAT) + OS << "\trep\t"; printInstruction(MI, OS); @@ -152,6 +158,7 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo, O << formatImm((int64_t)Op.getImm()); } else { assert(Op.isExpr() && "unknown operand kind in printOperand"); + O << "offset "; Op.getExpr()->print(O, &MAI); } } diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt index 33df9ec7dcde..8d0d9fa1215c 100644 --- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt +++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt @@ -5,6 +5,7 @@ add_llvm_library(LLVMX86Desc X86MCCodeEmitter.cpp X86MachObjectWriter.cpp X86ELFObjectWriter.cpp - X86WinCOFFStreamer.cpp X86WinCOFFObjectWriter.cpp + X86WinCOFFStreamer.cpp + X86WinCOFFTargetStreamer.cpp ) diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 733eac7c0321..78385ae1877b 100644 --- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -20,12 +20,9 @@ #include "llvm/MC/MCMachObjectWriter.h" #include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCRegisterInfo.h" -#include "llvm/MC/MCSectionCOFF.h" -#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/TargetRegistry.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -76,7 +73,7 @@ class X86AsmBackend : public MCAsmBackend { public: X86AsmBackend(const Target &T, StringRef CPU) : MCAsmBackend(), CPU(CPU), - MaxNopLength((CPU == "slm") ? 7 : 15) { + MaxNopLength((CPU == "slm" || CPU == "silvermont") ? 
7 : 15) { HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" && CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" && CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" && @@ -389,7 +386,8 @@ public: ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) : ELFX86AsmBackend(T, OSABI, CPU) {} - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386); } }; @@ -399,7 +397,8 @@ public: ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) : ELFX86AsmBackend(T, OSABI, CPU) {} - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_X86_64); } @@ -410,7 +409,8 @@ public: ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) : ELFX86AsmBackend(T, OSABI, CPU) {} - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_IAMCU); } @@ -421,7 +421,8 @@ public: ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) : ELFX86AsmBackend(T, OSABI, CPU) {} - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64); } }; @@ -443,7 +444,8 @@ public: .Default(MCAsmBackend::getFixupKind(Name)); } - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { return createX86WinCOFFObjectWriter(OS, Is64Bit); } }; @@ -804,7 +806,8 @@ public: StringRef CPU) : DarwinX86AsmBackend(T, MRI, CPU, false) {} - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { return createX86MachObjectWriter(OS, /*Is64Bit=*/false, MachO::CPU_TYPE_I386, MachO::CPU_SUBTYPE_I386_ALL); @@ -824,7 +827,8 @@ public: StringRef CPU, MachO::CPUSubTypeX86 st) : DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {} - MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override { + std::unique_ptr<MCObjectWriter> + createObjectWriter(raw_pwrite_stream &OS) const override { return createX86MachObjectWriter(OS, /*Is64Bit=*/true, MachO::CPU_TYPE_X86_64, Subtype); } diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h index d8953da4abb2..07cc488d047e 100644 --- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h +++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h @@ -51,6 +51,18 @@ namespace X86 { TO_ZERO = 3, CUR_DIRECTION = 4 }; + + /// The constants to describe instr prefixes if there are + enum IPREFIXES { + IP_NO_PREFIX = 0, + IP_HAS_OP_SIZE = 1, + IP_HAS_AD_SIZE = 2, + IP_HAS_REPEAT_NE = 4, + IP_HAS_REPEAT = 8, + IP_HAS_LOCK = 16, + NO_SCHED_INFO = 32 // Don't add sched comment to the current instr because + // it was already added + }; } // end namespace X86; /// X86II - This namespace holds all of the target specific flags that @@ -356,13 +368,15 @@ namespace 
X86II { // OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix. // OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in // 32-bit mode. OpSize32 means this is a 32-bit instruction needs a 0x66 - // prefix in 16-bit mode. + // prefix in 16-bit mode. OpSizeIgnore means that the instruction may + // take a optional 0x66 byte but should not emit with one. OpSizeShift = 7, OpSizeMask = 0x3 << OpSizeShift, - OpSizeFixed = 0 << OpSizeShift, - OpSize16 = 1 << OpSizeShift, - OpSize32 = 2 << OpSizeShift, + OpSizeFixed = 0 << OpSizeShift, + OpSize16 = 1 << OpSizeShift, + OpSize32 = 2 << OpSizeShift, + OpSizeIgnore = 3 << OpSizeShift, // AsSize - AdSizeX implies this instruction determines its need of 0x67 // prefix from a normal ModRM memory operand. The other types indicate that diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 4da4eebec038..4cdbae4d0d96 100644 --- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -15,6 +15,7 @@ #include "llvm/MC/MCELFObjectWriter.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/ErrorHandling.h" #include <cassert> @@ -297,10 +298,9 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, return getRelocType32(Ctx, Modifier, getType32(Type), IsPCRel, Kind); } -MCObjectWriter *llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS, - bool IsELF64, uint8_t OSABI, - uint16_t EMachine) { - MCELFObjectTargetWriter *MOTW = - new X86ELFObjectWriter(IsELF64, OSABI, EMachine); - return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true); +std::unique_ptr<MCObjectWriter> +llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64, + uint8_t OSABI, uint16_t EMachine) { + auto MOTW = llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine); + return createELFObjectWriter(std::move(MOTW), OS, /*IsLittleEndian=*/true); } diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp index 1538a515f419..fa7c352a1b63 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp @@ -13,10 +13,7 @@ #include "X86MCAsmInfo.h" #include "llvm/ADT/Triple.h" -#include "llvm/BinaryFormat/ELF.h" -#include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" -#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCStreamer.h" #include "llvm/Support/CommandLine.h" using namespace llvm; @@ -27,11 +24,11 @@ enum AsmWriterFlavorTy { ATT = 0, Intel = 1 }; -static cl::opt<AsmWriterFlavorTy> -AsmWriterFlavor("x86-asm-syntax", cl::init(ATT), - cl::desc("Choose style of code to emit from X86 backend:"), - cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"), - clEnumValN(Intel, "intel", "Emit Intel-style assembly"))); +static cl::opt<AsmWriterFlavorTy> AsmWriterFlavor( + "x86-asm-syntax", cl::init(ATT), cl::Hidden, + cl::desc("Choose style of code to emit from X86 backend:"), + cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"), + clEnumValN(Intel, "intel", "Emit Intel-style assembly"))); static cl::opt<bool> MarkedJTDataRegions("mark-data-regions", cl::init(true), diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 10e2bbc64d3c..a7059c6914df 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ 
b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -380,7 +380,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op, return X86::reloc_riprel_4byte_movq_load; case X86::CALL64m: case X86::JMP64m: - case X86::TEST64rm: + case X86::TEST64mr: case X86::ADC64rm: case X86::ADD64rm: case X86::AND64rm: @@ -1108,7 +1108,7 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EmitByte(0x66, CurByte, OS); // Emit the LOCK opcode prefix. - if (TSFlags & X86II::LOCK) + if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK) EmitByte(0xF0, CurByte, OS); switch (TSFlags & X86II::OpPrefixMask) { @@ -1130,6 +1130,8 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, EmitByte(0x40 | REX, CurByte, OS); Ret = true; } + } else { + assert(!(TSFlags & X86II::REX_W) && "REX.W requires 64bit mode."); } // 0x0F escape code must be emitted just before the opcode. @@ -1159,6 +1161,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, unsigned Opcode = MI.getOpcode(); const MCInstrDesc &Desc = MCII.get(Opcode); uint64_t TSFlags = Desc.TSFlags; + unsigned Flags = MI.getFlags(); // Pseudo instructions don't get encoded. if ((TSFlags & X86II::FormMask) == X86II::Pseudo) @@ -1194,8 +1197,10 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS, MI, OS); // Emit the repeat opcode prefix as needed. - if (TSFlags & X86II::REP) + if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT) EmitByte(0xF3, CurByte, OS); + if (Flags & X86::IP_HAS_REPEAT_NE) + EmitByte(0xF2, CurByte, OS); // Emit the address size opcode prefix as needed. bool need_address_override; diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp index 22cb0fac33cb..cdd43478baed 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp @@ -16,6 +16,7 @@ #include "InstPrinter/X86IntelInstPrinter.h" #include "X86MCAsmInfo.h" #include "llvm/ADT/Triple.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" #include "llvm/MC/MCInstrAnalysis.h" #include "llvm/MC/MCInstrInfo.h" #include "llvm/MC/MCRegisterInfo.h" @@ -72,52 +73,128 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) { MRI->mapLLVMRegToSEHReg(Reg, SEH); } - // These CodeView registers are numbered sequentially starting at value 1. - static const MCPhysReg LowCVRegs[] = { - X86::AL, X86::CL, X86::DL, X86::BL, X86::AH, X86::CH, - X86::DH, X86::BH, X86::AX, X86::CX, X86::DX, X86::BX, - X86::SP, X86::BP, X86::SI, X86::DI, X86::EAX, X86::ECX, - X86::EDX, X86::EBX, X86::ESP, X86::EBP, X86::ESI, X86::EDI, + // Mapping from CodeView to MC register id. 
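
The RegMap table that follows replaces the old offset-based numbering (CodeView ids handed out sequentially from 1, 128, 154, 252 and 324 for the different register banks, as in the code removed further down) with explicit codeview::RegisterId to MCPhysReg pairs. Presumably it is installed with the same MCRegisterInfo hook the removed loops used, along the lines of:

for (const auto &Entry : RegMap)
  MRI->mapLLVMRegToCVReg(Entry.Reg, static_cast<int>(Entry.CVReg));
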
+ static const struct { + codeview::RegisterId CVReg; + MCPhysReg Reg; + } RegMap[] = { + { codeview::RegisterId::AL, X86::AL}, + { codeview::RegisterId::CL, X86::CL}, + { codeview::RegisterId::DL, X86::DL}, + { codeview::RegisterId::BL, X86::BL}, + { codeview::RegisterId::AH, X86::AH}, + { codeview::RegisterId::CH, X86::CH}, + { codeview::RegisterId::DH, X86::DH}, + { codeview::RegisterId::BH, X86::BH}, + { codeview::RegisterId::AX, X86::AX}, + { codeview::RegisterId::CX, X86::CX}, + { codeview::RegisterId::DX, X86::DX}, + { codeview::RegisterId::BX, X86::BX}, + { codeview::RegisterId::SP, X86::SP}, + { codeview::RegisterId::BP, X86::BP}, + { codeview::RegisterId::SI, X86::SI}, + { codeview::RegisterId::DI, X86::DI}, + { codeview::RegisterId::EAX, X86::EAX}, + { codeview::RegisterId::ECX, X86::ECX}, + { codeview::RegisterId::EDX, X86::EDX}, + { codeview::RegisterId::EBX, X86::EBX}, + { codeview::RegisterId::ESP, X86::ESP}, + { codeview::RegisterId::EBP, X86::EBP}, + { codeview::RegisterId::ESI, X86::ESI}, + { codeview::RegisterId::EDI, X86::EDI}, + + { codeview::RegisterId::EFLAGS, X86::EFLAGS}, + + { codeview::RegisterId::ST0, X86::FP0}, + { codeview::RegisterId::ST1, X86::FP1}, + { codeview::RegisterId::ST2, X86::FP2}, + { codeview::RegisterId::ST3, X86::FP3}, + { codeview::RegisterId::ST4, X86::FP4}, + { codeview::RegisterId::ST5, X86::FP5}, + { codeview::RegisterId::ST6, X86::FP6}, + { codeview::RegisterId::ST7, X86::FP7}, + + { codeview::RegisterId::XMM0, X86::XMM0}, + { codeview::RegisterId::XMM1, X86::XMM1}, + { codeview::RegisterId::XMM2, X86::XMM2}, + { codeview::RegisterId::XMM3, X86::XMM3}, + { codeview::RegisterId::XMM4, X86::XMM4}, + { codeview::RegisterId::XMM5, X86::XMM5}, + { codeview::RegisterId::XMM6, X86::XMM6}, + { codeview::RegisterId::XMM7, X86::XMM7}, + + { codeview::RegisterId::XMM8, X86::XMM8}, + { codeview::RegisterId::XMM9, X86::XMM9}, + { codeview::RegisterId::XMM10, X86::XMM10}, + { codeview::RegisterId::XMM11, X86::XMM11}, + { codeview::RegisterId::XMM12, X86::XMM12}, + { codeview::RegisterId::XMM13, X86::XMM13}, + { codeview::RegisterId::XMM14, X86::XMM14}, + { codeview::RegisterId::XMM15, X86::XMM15}, + + { codeview::RegisterId::SIL, X86::SIL}, + { codeview::RegisterId::DIL, X86::DIL}, + { codeview::RegisterId::BPL, X86::BPL}, + { codeview::RegisterId::SPL, X86::SPL}, + { codeview::RegisterId::RAX, X86::RAX}, + { codeview::RegisterId::RBX, X86::RBX}, + { codeview::RegisterId::RCX, X86::RCX}, + { codeview::RegisterId::RDX, X86::RDX}, + { codeview::RegisterId::RSI, X86::RSI}, + { codeview::RegisterId::RDI, X86::RDI}, + { codeview::RegisterId::RBP, X86::RBP}, + { codeview::RegisterId::RSP, X86::RSP}, + { codeview::RegisterId::R8, X86::R8}, + { codeview::RegisterId::R9, X86::R9}, + { codeview::RegisterId::R10, X86::R10}, + { codeview::RegisterId::R11, X86::R11}, + { codeview::RegisterId::R12, X86::R12}, + { codeview::RegisterId::R13, X86::R13}, + { codeview::RegisterId::R14, X86::R14}, + { codeview::RegisterId::R15, X86::R15}, + { codeview::RegisterId::R8B, X86::R8B}, + { codeview::RegisterId::R9B, X86::R9B}, + { codeview::RegisterId::R10B, X86::R10B}, + { codeview::RegisterId::R11B, X86::R11B}, + { codeview::RegisterId::R12B, X86::R12B}, + { codeview::RegisterId::R13B, X86::R13B}, + { codeview::RegisterId::R14B, X86::R14B}, + { codeview::RegisterId::R15B, X86::R15B}, + { codeview::RegisterId::R8W, X86::R8W}, + { codeview::RegisterId::R9W, X86::R9W}, + { codeview::RegisterId::R10W, X86::R10W}, + { codeview::RegisterId::R11W, X86::R11W}, + { 
codeview::RegisterId::R12W, X86::R12W}, + { codeview::RegisterId::R13W, X86::R13W}, + { codeview::RegisterId::R14W, X86::R14W}, + { codeview::RegisterId::R15W, X86::R15W}, + { codeview::RegisterId::R8D, X86::R8D}, + { codeview::RegisterId::R9D, X86::R9D}, + { codeview::RegisterId::R10D, X86::R10D}, + { codeview::RegisterId::R11D, X86::R11D}, + { codeview::RegisterId::R12D, X86::R12D}, + { codeview::RegisterId::R13D, X86::R13D}, + { codeview::RegisterId::R14D, X86::R14D}, + { codeview::RegisterId::R15D, X86::R15D}, + { codeview::RegisterId::AMD64_YMM0, X86::YMM0}, + { codeview::RegisterId::AMD64_YMM1, X86::YMM1}, + { codeview::RegisterId::AMD64_YMM2, X86::YMM2}, + { codeview::RegisterId::AMD64_YMM3, X86::YMM3}, + { codeview::RegisterId::AMD64_YMM4, X86::YMM4}, + { codeview::RegisterId::AMD64_YMM5, X86::YMM5}, + { codeview::RegisterId::AMD64_YMM6, X86::YMM6}, + { codeview::RegisterId::AMD64_YMM7, X86::YMM7}, + { codeview::RegisterId::AMD64_YMM8, X86::YMM8}, + { codeview::RegisterId::AMD64_YMM9, X86::YMM9}, + { codeview::RegisterId::AMD64_YMM10, X86::YMM10}, + { codeview::RegisterId::AMD64_YMM11, X86::YMM11}, + { codeview::RegisterId::AMD64_YMM12, X86::YMM12}, + { codeview::RegisterId::AMD64_YMM13, X86::YMM13}, + { codeview::RegisterId::AMD64_YMM14, X86::YMM14}, + { codeview::RegisterId::AMD64_YMM15, X86::YMM15}, }; - unsigned CVLowRegStart = 1; - for (unsigned I = 0; I < array_lengthof(LowCVRegs); ++I) - MRI->mapLLVMRegToCVReg(LowCVRegs[I], I + CVLowRegStart); - - MRI->mapLLVMRegToCVReg(X86::EFLAGS, 34); - - // The x87 registers start at 128 and are numbered sequentially. - unsigned FP0Start = 128; - for (unsigned I = 0; I < 8; ++I) - MRI->mapLLVMRegToCVReg(X86::FP0 + I, FP0Start + I); - - // The low 8 XMM registers start at 154 and are numbered sequentially. - unsigned CVXMM0Start = 154; - for (unsigned I = 0; I < 8; ++I) - MRI->mapLLVMRegToCVReg(X86::XMM0 + I, CVXMM0Start + I); - - // The high 8 XMM registers start at 252 and are numbered sequentially. - unsigned CVXMM8Start = 252; - for (unsigned I = 0; I < 8; ++I) - MRI->mapLLVMRegToCVReg(X86::XMM8 + I, CVXMM8Start + I); - - // FIXME: XMM16 and above from AVX512 not yet documented. - - // AMD64 registers start at 324 and count up. 
- unsigned CVX64RegStart = 324; - static const MCPhysReg CVX64Regs[] = { - X86::SIL, X86::DIL, X86::BPL, X86::SPL, X86::RAX, X86::RBX, - X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBP, X86::RSP, - X86::R8, X86::R9, X86::R10, X86::R11, X86::R12, X86::R13, - X86::R14, X86::R15, X86::R8B, X86::R9B, X86::R10B, X86::R11B, - X86::R12B, X86::R13B, X86::R14B, X86::R15B, X86::R8W, X86::R9W, - X86::R10W, X86::R11W, X86::R12W, X86::R13W, X86::R14W, X86::R15W, - X86::R8D, X86::R9D, X86::R10D, X86::R11D, X86::R12D, X86::R13D, - X86::R14D, X86::R15D, X86::YMM0, X86::YMM1, X86::YMM2, X86::YMM3, - X86::YMM4, X86::YMM5, X86::YMM6, X86::YMM7, X86::YMM8, X86::YMM9, - X86::YMM10, X86::YMM11, X86::YMM12, X86::YMM13, X86::YMM14, X86::YMM15, - }; - for (unsigned I = 0; I < array_lengthof(CVX64Regs); ++I) - MRI->mapLLVMRegToCVReg(CVX64Regs[I], CVX64RegStart + I); + for (unsigned I = 0; I < array_lengthof(RegMap); ++I) + MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg)); } MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT, @@ -198,18 +275,6 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI, return MAI; } -static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM, - CodeModel::Model &CM) { - bool is64Bit = TT.getArch() == Triple::x86_64; - - // For static codegen, if we're not already set, use Small codegen. - if (CM == CodeModel::Default) - CM = CodeModel::Small; - else if (CM == CodeModel::JITDefault) - // 64-bit JIT places everything in the same buffer except external funcs. - CM = is64Bit ? CodeModel::Large : CodeModel::Small; -} - static MCInstPrinter *createX86MCInstPrinter(const Triple &T, unsigned SyntaxVariant, const MCAsmInfo &MAI, @@ -238,9 +303,6 @@ extern "C" void LLVMInitializeX86TargetMC() { // Register the MC asm info. RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo); - // Register the MC codegen info. - RegisterMCAdjustCodeGenOptsFn Y(*T, adjustCodeGenOpts); - // Register the MC instruction info. TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo); @@ -257,7 +319,13 @@ extern "C" void LLVMInitializeX86TargetMC() { // Register the code emitter. TargetRegistry::RegisterMCCodeEmitter(*T, createX86MCCodeEmitter); - // Register the object streamer. + // Register the obj target streamer. + TargetRegistry::RegisterObjectTargetStreamer(*T, + createX86ObjectTargetStreamer); + + // Register the asm target streamer. + TargetRegistry::RegisterAsmTargetStreamer(*T, createX86AsmTargetStreamer); + TargetRegistry::RegisterCOFFStreamer(*T, createX86WinCOFFStreamer); // Register the MCInstPrinter. diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h index f73e734b9b0e..c5859b600ad2 100644 --- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h +++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h @@ -77,25 +77,41 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI, const Triple &TT, StringRef CPU, const MCTargetOptions &Options); +/// Implements X86-only directives for assembly emission. +MCTargetStreamer *createX86AsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrint, + bool isVerboseAsm); + +/// Implements X86-only directives for object files. +MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &OS, + const MCSubtargetInfo &STI); + /// Construct an X86 Windows COFF machine code streamer which will generate /// PE/COFF format object files. /// /// Takes ownership of \p AB and \p CE. 
-MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, - raw_pwrite_stream &OS, MCCodeEmitter *CE, - bool RelaxAll, bool IncrementalLinkerCompatible); +MCStreamer *createX86WinCOFFStreamer(MCContext &C, + std::unique_ptr<MCAsmBackend> &&AB, + raw_pwrite_stream &OS, + std::unique_ptr<MCCodeEmitter> &&CE, + bool RelaxAll, + bool IncrementalLinkerCompatible); /// Construct an X86 Mach-O object writer. -MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, - uint32_t CPUType, - uint32_t CPUSubtype); +std::unique_ptr<MCObjectWriter> createX86MachObjectWriter(raw_pwrite_stream &OS, + bool Is64Bit, + uint32_t CPUType, + uint32_t CPUSubtype); /// Construct an X86 ELF object writer. -MCObjectWriter *createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64, - uint8_t OSABI, uint16_t EMachine); +std::unique_ptr<MCObjectWriter> createX86ELFObjectWriter(raw_pwrite_stream &OS, + bool IsELF64, + uint8_t OSABI, + uint16_t EMachine); /// Construct an X86 Win COFF object writer. -MCObjectWriter *createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, - bool Is64Bit); +std::unique_ptr<MCObjectWriter> +createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit); /// Returns the sub or super register of a specific X86 register. /// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX. diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index 8f2017e990c5..965f7de809b3 100644 --- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -597,11 +597,10 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer, Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE); } -MCObjectWriter *llvm::createX86MachObjectWriter(raw_pwrite_stream &OS, - bool Is64Bit, uint32_t CPUType, - uint32_t CPUSubtype) { - return createMachObjectWriter(new X86MachObjectWriter(Is64Bit, - CPUType, - CPUSubtype), - OS, /*IsLittleEndian=*/true); +std::unique_ptr<MCObjectWriter> +llvm::createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit, + uint32_t CPUType, uint32_t CPUSubtype) { + return createMachObjectWriter( + llvm::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS, + /*IsLittleEndian=*/true); } diff --git a/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h b/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h new file mode 100644 index 000000000000..8d38cd32b82c --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h @@ -0,0 +1,34 @@ +//===- X86TargetStreamer.h ------------------------------*- C++ -*---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86TARGETSTREAMER_H +#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86TARGETSTREAMER_H + +#include "llvm/MC/MCStreamer.h" + +namespace llvm { + +/// X86 target streamer implementing x86-only assembly directives. 
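+/// Two implementations are provided: an assembly flavour that prints the
+/// corresponding .cv_fpo_* directives, and a COFF object flavour that
+/// accumulates them and lowers them to a CodeView FrameData subsection
+/// (see X86WinCOFFTargetStreamer.cpp).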
+class X86TargetStreamer : public MCTargetStreamer { +public: + X86TargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + + virtual bool emitFPOProc(const MCSymbol *ProcSym, unsigned ParamsSize, + SMLoc L = {}) = 0; + virtual bool emitFPOEndPrologue(SMLoc L = {}) = 0; + virtual bool emitFPOEndProc(SMLoc L = {}) = 0; + virtual bool emitFPOData(const MCSymbol *ProcSym, SMLoc L = {}) = 0; + virtual bool emitFPOPushReg(unsigned Reg, SMLoc L = {}) = 0; + virtual bool emitFPOStackAlloc(unsigned StackAlloc, SMLoc L = {}) = 0; + virtual bool emitFPOSetFrame(unsigned Reg, SMLoc L = {}) = 0; +}; + +} // end namespace llvm + +#endif diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 807f7a6ddb19..5139bb46b561 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -13,6 +13,7 @@ #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixup.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCValue.h" #include "llvm/MC/MCWinCOFFObjectWriter.h" #include "llvm/Support/ErrorHandling.h" @@ -104,8 +105,8 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, llvm_unreachable("Unsupported COFF machine type."); } -MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, - bool Is64Bit) { - MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit); - return createWinCOFFObjectWriter(MOTW, OS); +std::unique_ptr<MCObjectWriter> +llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit) { + auto MOTW = llvm::make_unique<X86WinCOFFObjectWriter>(Is64Bit); + return createWinCOFFObjectWriter(std::move(MOTW), OS); } diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp index d04511873b46..5b1357ae4a7b 100644 --- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp @@ -8,6 +8,9 @@ //===----------------------------------------------------------------------===// #include "X86MCTargetDesc.h" +#include "X86TargetStreamer.h" +#include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCWin64EH.h" #include "llvm/MC/MCWinCOFFStreamer.h" @@ -17,17 +20,18 @@ namespace { class X86WinCOFFStreamer : public MCWinCOFFStreamer { Win64EH::UnwindEmitter EHStreamer; public: - X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE, - raw_pwrite_stream &OS) - : MCWinCOFFStreamer(C, AB, *CE, OS) {} + X86WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB, + std::unique_ptr<MCCodeEmitter> CE, raw_pwrite_stream &OS) + : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {} - void EmitWinEHHandlerData() override; + void EmitWinEHHandlerData(SMLoc Loc) override; void EmitWindowsUnwindTables() override; + void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override; void FinishImpl() override; }; -void X86WinCOFFStreamer::EmitWinEHHandlerData() { - MCStreamer::EmitWinEHHandlerData(); +void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) { + MCStreamer::EmitWinEHHandlerData(Loc); // We have to emit the unwind info now, because this directive // actually switches to the .xdata section! 
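For orientation, here is a minimal sketch (not part of the patch) of how a client could drive the FPO interface declared in X86TargetStreamer.h above to describe a simple 32-bit prologue. It assumes the X86 register enum and an MCStreamer with an X86 target streamer already attached, which is how the asm printer changes later in this diff use it.

  // Hypothetical driver for the .cv_fpo_* directive interface; describes a
  // "push ebp; mov ebp, esp; sub esp, 8" prologue for a function with 8
  // bytes of stack arguments.
  #include "MCTargetDesc/X86MCTargetDesc.h"   // X86::EBP (register enum)
  #include "MCTargetDesc/X86TargetStreamer.h" // X86TargetStreamer interface
  #include "llvm/MC/MCStreamer.h"

  static void describeSimplePrologue(llvm::MCStreamer &Out,
                                     const llvm::MCSymbol *Fn) {
    auto *XTS =
        static_cast<llvm::X86TargetStreamer *>(Out.getTargetStreamer());
    XTS->emitFPOProc(Fn, /*ParamsSize=*/8); // .cv_fpo_proc Fn 8
    XTS->emitFPOPushReg(llvm::X86::EBP);    // .cv_fpo_pushreg ebp
    XTS->emitFPOSetFrame(llvm::X86::EBP);   // .cv_fpo_setframe ebp
    XTS->emitFPOStackAlloc(8);              // .cv_fpo_stackalloc 8
    XTS->emitFPOEndPrologue();              // .cv_fpo_endprologue
    // ... the function body is streamed here ...
    XTS->emitFPOEndProc();                  // .cv_fpo_endproc
  }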
@@ -40,6 +44,12 @@ void X86WinCOFFStreamer::EmitWindowsUnwindTables() { EHStreamer.Emit(*this); } +void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) { + X86TargetStreamer *XTS = + static_cast<X86TargetStreamer *>(getTargetStreamer()); + XTS->emitFPOData(ProcSym, Loc); +} + void X86WinCOFFStreamer::FinishImpl() { EmitFrames(nullptr); EmitWindowsUnwindTables(); @@ -48,11 +58,14 @@ void X86WinCOFFStreamer::FinishImpl() { } } -MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, +MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, + std::unique_ptr<MCAsmBackend> &&AB, raw_pwrite_stream &OS, - MCCodeEmitter *CE, bool RelaxAll, + std::unique_ptr<MCCodeEmitter> &&CE, + bool RelaxAll, bool IncrementalLinkerCompatible) { - X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS); + X86WinCOFFStreamer *S = + new X86WinCOFFStreamer(C, std::move(AB), std::move(CE), OS); S->getAssembler().setRelaxAll(RelaxAll); S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible); return S; diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp new file mode 100644 index 000000000000..093dab4f2f96 --- /dev/null +++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp @@ -0,0 +1,415 @@ +//===-- X86WinCOFFTargetStreamer.cpp ----------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "X86MCTargetDesc.h" +#include "X86TargetStreamer.h" +#include "llvm/DebugInfo/CodeView/CodeView.h" +#include "llvm/MC/MCCodeView.h" +#include "llvm/MC/MCContext.h" +#include "llvm/MC/MCInstPrinter.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/MC/MCSubtargetInfo.h" +#include "llvm/Support/FormattedStream.h" + +using namespace llvm; +using namespace llvm::codeview; + +namespace { +/// Implements Windows x86-only directives for assembly emission. +class X86WinCOFFAsmTargetStreamer : public X86TargetStreamer { + formatted_raw_ostream &OS; + MCInstPrinter &InstPrinter; + +public: + X86WinCOFFAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS, + MCInstPrinter &InstPrinter) + : X86TargetStreamer(S), OS(OS), InstPrinter(InstPrinter) {} + + bool emitFPOProc(const MCSymbol *ProcSym, unsigned ParamsSize, + SMLoc L) override; + bool emitFPOEndPrologue(SMLoc L) override; + bool emitFPOEndProc(SMLoc L) override; + bool emitFPOData(const MCSymbol *ProcSym, SMLoc L) override; + bool emitFPOPushReg(unsigned Reg, SMLoc L) override; + bool emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) override; + bool emitFPOSetFrame(unsigned Reg, SMLoc L) override; +}; + +/// Represents a single FPO directive. +struct FPOInstruction { + MCSymbol *Label; + enum Operation { + PushReg, + StackAlloc, + SetFrame, + } Op; + unsigned RegOrOffset; +}; + +struct FPOData { + const MCSymbol *Function = nullptr; + MCSymbol *Begin = nullptr; + MCSymbol *PrologueEnd = nullptr; + MCSymbol *End = nullptr; + unsigned ParamsSize = 0; + + SmallVector<FPOInstruction, 5> Instructions; +}; + +/// Implements Windows x86-only directives for object emission. +class X86WinCOFFTargetStreamer : public X86TargetStreamer { + /// Map from function symbol to its FPO data. + DenseMap<const MCSymbol *, std::unique_ptr<FPOData>> AllFPOData; + + /// Current FPO data created by .cv_fpo_proc. 
+ std::unique_ptr<FPOData> CurFPOData; + + bool haveOpenFPOData() { return !!CurFPOData; } + + /// Diagnoses an error at L if we are not in an FPO prologue. Return true on + /// error. + bool checkInFPOPrologue(SMLoc L); + + MCSymbol *emitFPOLabel(); + + MCContext &getContext() { return getStreamer().getContext(); } + +public: + X86WinCOFFTargetStreamer(MCStreamer &S) : X86TargetStreamer(S) {} + + bool emitFPOProc(const MCSymbol *ProcSym, unsigned ParamsSize, + SMLoc L) override; + bool emitFPOEndPrologue(SMLoc L) override; + bool emitFPOEndProc(SMLoc L) override; + bool emitFPOData(const MCSymbol *ProcSym, SMLoc L) override; + bool emitFPOPushReg(unsigned Reg, SMLoc L) override; + bool emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) override; + bool emitFPOSetFrame(unsigned Reg, SMLoc L) override; +}; +} // end namespace + +bool X86WinCOFFAsmTargetStreamer::emitFPOProc(const MCSymbol *ProcSym, + unsigned ParamsSize, SMLoc L) { + OS << "\t.cv_fpo_proc\t"; + ProcSym->print(OS, getStreamer().getContext().getAsmInfo()); + OS << ' ' << ParamsSize << '\n'; + return false; +} + +bool X86WinCOFFAsmTargetStreamer::emitFPOEndPrologue(SMLoc L) { + OS << "\t.cv_fpo_endprologue\n"; + return false; +} + +bool X86WinCOFFAsmTargetStreamer::emitFPOEndProc(SMLoc L) { + OS << "\t.cv_fpo_endproc\n"; + return false; +} + +bool X86WinCOFFAsmTargetStreamer::emitFPOData(const MCSymbol *ProcSym, + SMLoc L) { + OS << "\t.cv_fpo_data\t"; + ProcSym->print(OS, getStreamer().getContext().getAsmInfo()); + OS << '\n'; + return false; +} + +bool X86WinCOFFAsmTargetStreamer::emitFPOPushReg(unsigned Reg, SMLoc L) { + OS << "\t.cv_fpo_pushreg\t"; + InstPrinter.printRegName(OS, Reg); + OS << '\n'; + return false; +} + +bool X86WinCOFFAsmTargetStreamer::emitFPOStackAlloc(unsigned StackAlloc, + SMLoc L) { + OS << "\t.cv_fpo_stackalloc\t" << StackAlloc << '\n'; + return false; +} + +bool X86WinCOFFAsmTargetStreamer::emitFPOSetFrame(unsigned Reg, SMLoc L) { + OS << "\t.cv_fpo_setframe\t"; + InstPrinter.printRegName(OS, Reg); + OS << '\n'; + return false; +} + +bool X86WinCOFFTargetStreamer::checkInFPOPrologue(SMLoc L) { + if (!haveOpenFPOData() || CurFPOData->PrologueEnd) { + getContext().reportError( + L, + "directive must appear between .cv_fpo_proc and .cv_fpo_endprologue"); + return true; + } + return false; +} + +MCSymbol *X86WinCOFFTargetStreamer::emitFPOLabel() { + MCSymbol *Label = getContext().createTempSymbol("cfi", true); + getStreamer().EmitLabel(Label); + return Label; +} + +bool X86WinCOFFTargetStreamer::emitFPOProc(const MCSymbol *ProcSym, + unsigned ParamsSize, SMLoc L) { + if (haveOpenFPOData()) { + getContext().reportError( + L, "opening new .cv_fpo_proc before closing previous frame"); + return true; + } + CurFPOData = llvm::make_unique<FPOData>(); + CurFPOData->Function = ProcSym; + CurFPOData->Begin = emitFPOLabel(); + CurFPOData->ParamsSize = ParamsSize; + return false; +} + +bool X86WinCOFFTargetStreamer::emitFPOEndProc(SMLoc L) { + if (!haveOpenFPOData()) { + getContext().reportError(L, ".cv_fpo_endproc must appear after .cv_proc"); + return true; + } + if (!CurFPOData->PrologueEnd) { + // Complain if there were prologue setup instructions but no end prologue. + if (!CurFPOData->Instructions.empty()) { + getContext().reportError(L, "missing .cv_fpo_endprologue"); + CurFPOData->Instructions.clear(); + } + + // Claim there is a zero-length prologue to make the label math work out + // later. 
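+    // (Begin and PrologueEnd then name the same address, so the PrologSize
+    // field emitted for this function comes out as zero.)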
+ CurFPOData->PrologueEnd = CurFPOData->Begin; + } + + CurFPOData->End = emitFPOLabel(); + const MCSymbol *Fn = CurFPOData->Function; + AllFPOData.insert({Fn, std::move(CurFPOData)}); + return false; +} + +bool X86WinCOFFTargetStreamer::emitFPOSetFrame(unsigned Reg, SMLoc L) { + if (checkInFPOPrologue(L)) + return true; + FPOInstruction Inst; + Inst.Label = emitFPOLabel(); + Inst.Op = FPOInstruction::SetFrame; + Inst.RegOrOffset = Reg; + CurFPOData->Instructions.push_back(Inst); + return false; +} + +bool X86WinCOFFTargetStreamer::emitFPOPushReg(unsigned Reg, SMLoc L) { + if (checkInFPOPrologue(L)) + return true; + FPOInstruction Inst; + Inst.Label = emitFPOLabel(); + Inst.Op = FPOInstruction::PushReg; + Inst.RegOrOffset = Reg; + CurFPOData->Instructions.push_back(Inst); + return false; +} + +bool X86WinCOFFTargetStreamer::emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) { + if (checkInFPOPrologue(L)) + return true; + FPOInstruction Inst; + Inst.Label = emitFPOLabel(); + Inst.Op = FPOInstruction::StackAlloc; + Inst.RegOrOffset = StackAlloc; + CurFPOData->Instructions.push_back(Inst); + return false; +} + +bool X86WinCOFFTargetStreamer::emitFPOEndPrologue(SMLoc L) { + if (checkInFPOPrologue(L)) + return true; + CurFPOData->PrologueEnd = emitFPOLabel(); + return false; +} + +namespace { +struct RegSaveOffset { + RegSaveOffset(unsigned Reg, unsigned Offset) : Reg(Reg), Offset(Offset) {} + + unsigned Reg = 0; + unsigned Offset = 0; +}; + +struct FPOStateMachine { + explicit FPOStateMachine(const FPOData *FPO) : FPO(FPO) {} + + const FPOData *FPO = nullptr; + unsigned FrameReg = 0; + unsigned FrameRegOff = 0; + unsigned CurOffset = 0; + unsigned LocalSize = 0; + unsigned SavedRegSize = 0; + unsigned Flags = 0; // FIXME: Set HasSEH / HasEH. + + SmallString<128> FrameFunc; + + SmallVector<RegSaveOffset, 4> RegSaveOffsets; + + void emitFrameDataRecord(MCStreamer &OS, MCSymbol *Label); +}; +} // end namespace + +static Printable printFPOReg(const MCRegisterInfo *MRI, unsigned LLVMReg) { + return Printable([MRI, LLVMReg](raw_ostream &OS) { + switch (LLVMReg) { + // MSVC only seems to emit symbolic register names for EIP, EBP, and ESP, + // but the format seems to support more than that, so we emit them. + case X86::EAX: OS << "$eax"; break; + case X86::EBX: OS << "$ebx"; break; + case X86::ECX: OS << "$ecx"; break; + case X86::EDX: OS << "$edx"; break; + case X86::EDI: OS << "$edi"; break; + case X86::ESI: OS << "$esi"; break; + case X86::ESP: OS << "$esp"; break; + case X86::EBP: OS << "$ebp"; break; + case X86::EIP: OS << "$eip"; break; + // Otherwise, get the codeview register number and print $N. + default: + OS << '$' << MRI->getCodeViewRegNum(LLVMReg); + break; + } + }); +} + +void FPOStateMachine::emitFrameDataRecord(MCStreamer &OS, MCSymbol *Label) { + unsigned CurFlags = Flags; + if (Label == FPO->Begin) + CurFlags |= FrameData::IsFunctionStart; + + // Compute the new FrameFunc string. + FrameFunc.clear(); + raw_svector_ostream FuncOS(FrameFunc); + const MCRegisterInfo *MRI = OS.getContext().getRegisterInfo(); + if (FrameReg) { + // CFA is FrameReg + FrameRegOff. + FuncOS << "$T0 " << printFPOReg(MRI, FrameReg) << " " << FrameRegOff + << " + = "; + } else { + // The address of return address is ESP + CurOffset, but we use .raSearch to + // match MSVC. This seems to ask the debugger to subtract some combination + // of LocalSize and SavedRegSize from ESP and grovel around in that memory + // to find the address of a plausible return address. 
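+    // For illustration (not in the original patch): for a frameless function
+    // whose prologue only pushed ESI, the complete FrameFunc program built by
+    // this function would read:
+    //   "$T0 .raSearch = $eip $T0 ^ = $esp $T0 4 + = $esi $T0 4 - ^ = "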
+ FuncOS << "$T0 .raSearch = "; + } + + // Caller's $eip should be dereferenced CFA, and $esp should be CFA plus 4. + FuncOS << "$eip $T0 ^ = $esp $T0 4 + = "; + + // Each saved register is stored at an unchanging negative CFA offset. + for (RegSaveOffset RO : RegSaveOffsets) + FuncOS << printFPOReg(MRI, RO.Reg) << " $T0 " << RO.Offset << " - ^ = "; + + // Add it to the CV string table. + CodeViewContext &CVCtx = OS.getContext().getCVContext(); + unsigned FrameFuncStrTabOff = CVCtx.addToStringTable(FuncOS.str()).second; + + // MSVC has only ever been observed to emit a MaxStackSize of zero. + unsigned MaxStackSize = 0; + + // The FrameData record format is: + // ulittle32_t RvaStart; + // ulittle32_t CodeSize; + // ulittle32_t LocalSize; + // ulittle32_t ParamsSize; + // ulittle32_t MaxStackSize; + // ulittle32_t FrameFunc; // String table offset + // ulittle16_t PrologSize; + // ulittle16_t SavedRegsSize; + // ulittle32_t Flags; + + OS.emitAbsoluteSymbolDiff(Label, FPO->Begin, 4); // RvaStart + OS.emitAbsoluteSymbolDiff(FPO->End, Label, 4); // CodeSize + OS.EmitIntValue(LocalSize, 4); + OS.EmitIntValue(FPO->ParamsSize, 4); + OS.EmitIntValue(MaxStackSize, 4); + OS.EmitIntValue(FrameFuncStrTabOff, 4); // FrameFunc + OS.emitAbsoluteSymbolDiff(FPO->PrologueEnd, Label, 2); + OS.EmitIntValue(SavedRegSize, 2); + OS.EmitIntValue(CurFlags, 4); +} + +/// Compute and emit the real CodeView FrameData subsection. +bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) { + MCStreamer &OS = getStreamer(); + MCContext &Ctx = OS.getContext(); + + auto I = AllFPOData.find(ProcSym); + if (I == AllFPOData.end()) { + Ctx.reportError(L, Twine("no FPO data found for symbol ") + + ProcSym->getName()); + return true; + } + const FPOData *FPO = I->second.get(); + assert(FPO->Begin && FPO->End && FPO->PrologueEnd && "missing FPO label"); + + MCSymbol *FrameBegin = Ctx.createTempSymbol(), + *FrameEnd = Ctx.createTempSymbol(); + + OS.EmitIntValue(unsigned(DebugSubsectionKind::FrameData), 4); + OS.emitAbsoluteSymbolDiff(FrameEnd, FrameBegin, 4); + OS.EmitLabel(FrameBegin); + + // Start with the RVA of the function in question. + OS.EmitValue(MCSymbolRefExpr::create(FPO->Function, + MCSymbolRefExpr::VK_COFF_IMGREL32, Ctx), + 4); + + // Emit a sequence of FrameData records. + FPOStateMachine FSM(FPO); + + FSM.emitFrameDataRecord(OS, FPO->Begin); + for (const FPOInstruction &Inst : FPO->Instructions) { + switch (Inst.Op) { + case FPOInstruction::PushReg: + FSM.CurOffset += 4; + FSM.SavedRegSize += 4; + FSM.RegSaveOffsets.push_back({Inst.RegOrOffset, FSM.CurOffset}); + break; + case FPOInstruction::SetFrame: + FSM.FrameReg = Inst.RegOrOffset; + FSM.FrameRegOff = FSM.CurOffset; + break; + case FPOInstruction::StackAlloc: + FSM.CurOffset += Inst.RegOrOffset; + FSM.LocalSize += Inst.RegOrOffset; + // No need to emit FrameData for stack allocations with a frame pointer. + if (FSM.FrameReg) + continue; + break; + } + FSM.emitFrameDataRecord(OS, Inst.Label); + } + + OS.EmitValueToAlignment(4, 0); + OS.EmitLabel(FrameEnd); + return false; +} + +MCTargetStreamer *llvm::createX86AsmTargetStreamer(MCStreamer &S, + formatted_raw_ostream &OS, + MCInstPrinter *InstPrinter, + bool IsVerboseAsm) { + // FIXME: This makes it so we textually assemble COFF directives on ELF. + // That's kind of nonsensical. + return new X86WinCOFFAsmTargetStreamer(S, OS, *InstPrinter); +} + +MCTargetStreamer * +llvm::createX86ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) { + // No need to register a target streamer. 
+ if (!STI.getTargetTriple().isOSBinFormatCOFF()) + return nullptr; + // Registers itself to the MCStreamer. + return new X86WinCOFFTargetStreamer(S); +} diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt index e6896e805568..73cf27692447 100644 --- a/lib/Target/X86/README-SSE.txt +++ b/lib/Target/X86/README-SSE.txt @@ -145,15 +145,15 @@ This is the llvm code after instruction scheduling: cond_next140 (0xa910740, LLVM BB @0xa90beb0): %reg1078 = MOV32ri -3 - %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0 - %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40 + %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0 + %reg1037 = MOV32rm %reg1024, 1, %noreg, 40 %reg1080 = IMUL32rr %reg1079, %reg1037 - %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0 + %reg1081 = MOV32rm %reg1058, 1, %noreg, 0 %reg1038 = LEA32r %reg1081, 1, %reg1080, -3 - %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32 + %reg1036 = MOV32rm %reg1024, 1, %noreg, 32 %reg1082 = SHL32ri %reg1038, 4 %reg1039 = ADD32rr %reg1036, %reg1082 - %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0 + %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0 %reg1034 = SHUFPSrr %reg1083, %reg1083, 170 %reg1032 = SHUFPSrr %reg1083, %reg1083, 0 %reg1035 = SHUFPSrr %reg1083, %reg1083, 255 @@ -166,32 +166,32 @@ cond_next140 (0xa910740, LLVM BB @0xa90beb0): Still ok. After register allocation: cond_next140 (0xa910740, LLVM BB @0xa90beb0): - %EAX = MOV32ri -3 - %EDX = MOV32rm <fi#3>, 1, %NOREG, 0 - ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0 - %EDX = MOV32rm <fi#7>, 1, %NOREG, 0 - %EDX = MOV32rm %EDX, 1, %NOREG, 40 - IMUL32rr %EAX<def&use>, %EDX - %ESI = MOV32rm <fi#5>, 1, %NOREG, 0 - %ESI = MOV32rm %ESI, 1, %NOREG, 0 - MOV32mr <fi#4>, 1, %NOREG, 0, %ESI - %EAX = LEA32r %ESI, 1, %EAX, -3 - %ESI = MOV32rm <fi#7>, 1, %NOREG, 0 - %ESI = MOV32rm %ESI, 1, %NOREG, 32 - %EDI = MOV32rr %EAX - SHL32ri %EDI<def&use>, 4 - ADD32rr %EDI<def&use>, %ESI - %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0 - %XMM1 = MOVAPSrr %XMM0 - SHUFPSrr %XMM1<def&use>, %XMM1, 170 - %XMM2 = MOVAPSrr %XMM0 - SHUFPSrr %XMM2<def&use>, %XMM2, 0 - %XMM3 = MOVAPSrr %XMM0 - SHUFPSrr %XMM3<def&use>, %XMM3, 255 - SHUFPSrr %XMM0<def&use>, %XMM0, 85 - %EBX = MOV32rr %EDI - AND32ri8 %EBX<def&use>, 15 - CMP32ri8 %EBX, 0 + %eax = MOV32ri -3 + %edx = MOV32rm %stack.3, 1, %noreg, 0 + ADD32rm %eax<def&use>, %edx, 1, %noreg, 0 + %edx = MOV32rm %stack.7, 1, %noreg, 0 + %edx = MOV32rm %edx, 1, %noreg, 40 + IMUL32rr %eax<def&use>, %edx + %esi = MOV32rm %stack.5, 1, %noreg, 0 + %esi = MOV32rm %esi, 1, %noreg, 0 + MOV32mr %stack.4, 1, %noreg, 0, %esi + %eax = LEA32r %esi, 1, %eax, -3 + %esi = MOV32rm %stack.7, 1, %noreg, 0 + %esi = MOV32rm %esi, 1, %noreg, 32 + %edi = MOV32rr %eax + SHL32ri %edi<def&use>, 4 + ADD32rr %edi<def&use>, %esi + %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0 + %xmm1 = MOVAPSrr %xmm0 + SHUFPSrr %xmm1<def&use>, %xmm1, 170 + %xmm2 = MOVAPSrr %xmm0 + SHUFPSrr %xmm2<def&use>, %xmm2, 0 + %xmm3 = MOVAPSrr %xmm0 + SHUFPSrr %xmm3<def&use>, %xmm3, 255 + SHUFPSrr %xmm0<def&use>, %xmm0, 85 + %ebx = MOV32rr %edi + AND32ri8 %ebx<def&use>, 15 + CMP32ri8 %ebx, 0 JE mbb<cond_next204,0xa914d30> This looks really bad. The problem is shufps is a destructive opcode. 
Since it diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt index 09626e13849d..a3ea4595ac1e 100644 --- a/lib/Target/X86/README-X86-64.txt +++ b/lib/Target/X86/README-X86-64.txt @@ -103,20 +103,20 @@ LBB1_3: ## bb Before regalloc, we have: - %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def> + %reg1025 = IMUL32rri8 %reg1024, 45, implicit-def %eflags JMP mbb<bb2,0x203afb0> Successors according to CFG: 0x203afb0 (#3) bb1: 0x203af60, LLVM BB @0x1e02310, ID#2: Predecessors according to CFG: 0x203aec0 (#0) - %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def> + %reg1026 = IMUL32rri8 %reg1024, 78, implicit-def %eflags Successors according to CFG: 0x203afb0 (#3) bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3: Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2) - %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>, + %reg1027 = PHI %reg1025, mbb<bb,0x203af10>, %reg1026, mbb<bb1,0x203af60> - %reg1029<def> = MOVZX64rr32 %reg1027 + %reg1029 = MOVZX64rr32 %reg1027 so we'd have to know that IMUL32rri8 leaves the high word zero extended and to be able to recognize the zero extend. This could also presumably be implemented diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt index 799157c926e6..11652af9f1fc 100644 --- a/lib/Target/X86/README.txt +++ b/lib/Target/X86/README.txt @@ -987,11 +987,11 @@ bb7: ; preds = %entry to: foo: # @foo -# BB#0: # %entry +# %bb.0: # %entry movl 4(%esp), %ecx cmpb $0, 16(%esp) je .LBB0_2 -# BB#1: # %bb +# %bb.1: # %bb movl 8(%esp), %eax addl %ecx, %eax ret @@ -1073,7 +1073,7 @@ declare void @exit(i32) noreturn nounwind This compiles into: _abort_gzip: ## @abort_gzip -## BB#0: ## %entry +## %bb.0: ## %entry subl $12, %esp movb _in_exit.4870.b, %al cmpb $1, %al @@ -1396,7 +1396,7 @@ define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize { } bar: # @bar -# BB#0: +# %bb.0: movb (%rdi), %al andb $1, %al movzbl %al, %eax @@ -1633,7 +1633,7 @@ In the real code, we get a lot more wrong than this. However, even in this code we generate: _foo: ## @foo -## BB#0: ## %entry +## %bb.0: ## %entry movb (%rsi), %al movb (%rdi), %cl cmpb %al, %cl @@ -1646,12 +1646,12 @@ LBB0_2: ## %if.end movb 1(%rdi), %cl cmpb %al, %cl jne LBB0_1 -## BB#3: ## %if.end38 +## %bb.3: ## %if.end38 movb 2(%rsi), %al movb 2(%rdi), %cl cmpb %al, %cl jne LBB0_1 -## BB#4: ## %if.end60 +## %bb.4: ## %if.end60 movb 3(%rdi), %al cmpb 3(%rsi), %al LBB0_5: ## %if.end60 diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp index d2654fc67ed5..16c2b56c48b5 100644 --- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp +++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp @@ -22,8 +22,8 @@ Target &llvm::getTheX86_64Target() { extern "C" void LLVMInitializeX86TargetInfo() { RegisterTarget<Triple::x86, /*HasJIT=*/true> X( - getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above"); + getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above", "X86"); RegisterTarget<Triple::x86_64, /*HasJIT=*/true> Y( - getTheX86_64Target(), "x86-64", "64-bit X86: EM64T and AMD64"); + getTheX86_64Target(), "x86-64", "64-bit X86: EM64T and AMD64", "X86"); } diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h index 91201d1fec85..5631648d2dc8 100644 --- a/lib/Target/X86/X86.h +++ b/lib/Target/X86/X86.h @@ -92,9 +92,13 @@ FunctionPass *createX86CmovConverterPass(); /// the upper portions of registers, and to save code size. 
FunctionPass *createX86FixupBWInsts(); +/// Return a Machine IR pass that reassigns instruction chains from one domain +/// to another, when profitable. +FunctionPass *createX86DomainReassignmentPass(); + void initializeFixupBWInstPassPass(PassRegistry &); -/// This pass replaces EVEX ecnoded of AVX-512 instructiosn by VEX +/// This pass replaces EVEX encoded of AVX-512 instructiosn by VEX /// encoding when possible in order to reduce code size. FunctionPass *createX86EvexToVexInsts(); diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td index 54eabeac5126..08731cd0204c 100644 --- a/lib/Target/X86/X86.td +++ b/lib/Target/X86/X86.td @@ -95,8 +95,6 @@ def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true", def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true", "64-bit with cmpxchg16b", [Feature64Bit]>; -def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true", - "Bit testing of memory is slow">; def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true", "SHLD instruction is slow">; def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true", @@ -118,9 +116,15 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX", def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2", "Enable AVX2 instructions", [FeatureAVX]>; +def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", + "Enable three-operand fused multiple-add", + [FeatureAVX]>; +def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", + "Support 16-bit floating point conversion instructions", + [FeatureAVX]>; def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F", "Enable AVX-512 instructions", - [FeatureAVX2]>; + [FeatureAVX2, FeatureFMA, FeatureF16C]>; def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true", "Enable AVX-512 Exponential and Reciprocal Instructions", [FeatureAVX512]>; @@ -148,17 +152,29 @@ def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true", def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true", "Enable AVX-512 Vector Byte Manipulation Instructions", [FeatureBWI]>; +def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true", + "Enable AVX-512 further Vector Byte Manipulation Instructions", + [FeatureBWI]>; def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true", "Enable AVX-512 Integer Fused Multiple-Add", [FeatureAVX512]>; def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true", "Enable protection keys">; +def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true", + "Enable AVX-512 Vector Neural Network Instructions", + [FeatureAVX512]>; +def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true", + "Enable AVX-512 Bit Algorithms", + [FeatureBWI]>; def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true", "Enable packed carry-less multiplication instructions", [FeatureSSE2]>; -def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true", - "Enable three-operand fused multiple-add", - [FeatureAVX]>; +def FeatureGFNI : SubtargetFeature<"gfni", "HasGFNI", "true", + "Enable Galois Field Arithmetic Instructions", + [FeatureSSE2]>; +def FeatureVPCLMULQDQ : SubtargetFeature<"vpclmulqdq", "HasVPCLMULQDQ", "true", + "Enable vpclmulqdq instructions", + [FeatureAVX, FeaturePCLMUL]>; def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true", "Enable four-operand fused multiple-add", [FeatureAVX, FeatureSSE4A]>; @@ -171,6 +187,9 @@ def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem", def FeatureAES : 
SubtargetFeature<"aes", "HasAES", "true", "Enable AES instructions", [FeatureSSE2]>; +def FeatureVAES : SubtargetFeature<"vaes", "HasVAES", "true", + "Promote selected AES instructions to AVX512/AVX registers", + [FeatureAVX, FeatureAES]>; def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true", "Enable TBM instructions">; def FeatureLWP : SubtargetFeature<"lwp", "HasLWP", "true", @@ -179,9 +198,6 @@ def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true", "Support MOVBE instruction">; def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true", "Support RDRAND instruction">; -def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true", - "Support 16-bit floating point conversion instructions", - [FeatureAVX]>; def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true", "Support FS/GS Base instructions">; def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true", @@ -197,6 +213,10 @@ def FeatureADX : SubtargetFeature<"adx", "HasADX", "true", def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true", "Enable SHA instructions", [FeatureSSE2]>; +def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true", + "Support CET Shadow-Stack instructions">; +def FeatureIBT : SubtargetFeature<"ibt", "HasIBT", "true", + "Support CET Indirect-Branch-Tracking instructions">; def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true", "Support PRFCHW instructions">; def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true", @@ -226,14 +246,12 @@ def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true", "Flush A Cache Line Optimized">; def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true", "Cache Line Write Back">; -// TODO: This feature ought to be renamed. -// What it really refers to are CPUs for which certain instructions -// (which ones besides the example below?) are microcoded. -// The best examples of this are the memory forms of CALL and PUSH -// instructions, which should be avoided in favor of a MOV + register CALL/PUSH. -def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect", - "CallRegIndirect", "true", - "Call register indirect">; +// On some processors, instructions that implicitly take two memory operands are +// slow. In practice, this means that CALL, PUSH, and POP with memory operands +// should be avoided in favor of a MOV + register CALL/PUSH/POP. +def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops", + "SlowTwoMemOps", "true", + "Two memory operand instructions are slow">; def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true", "LEA instruction needs inputs at AG stage">; def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true", @@ -290,11 +308,50 @@ def FeatureERMSB "ermsb", "HasERMSB", "true", "REP MOVS/STOS are fast">; +// Sandy Bridge and newer processors have many instructions that can be +// fused with conditional branches and pass through the CPU as a single +// operation. +def FeatureMacroFusion + : SubtargetFeature<"macrofusion", "HasMacroFusion", "true", + "Various instructions can be fused with conditional branches">; + +// Gather is available since Haswell (AVX2 set). So technically, we can +// generate Gathers on all AVX2 processors. But the overhead on HSW is high. +// Skylake Client processor has faster Gathers than HSW and performance is +// similar to Skylake Server (AVX-512). 
+def FeatureHasFastGather + : SubtargetFeature<"fast-gather", "HasFastGather", "true", + "Indicates if gather is reasonably fast.">; + //===----------------------------------------------------------------------===// -// X86 processors supported. +// Register File Description +//===----------------------------------------------------------------------===// + +include "X86RegisterInfo.td" +include "X86RegisterBanks.td" + +//===----------------------------------------------------------------------===// +// Instruction Descriptions //===----------------------------------------------------------------------===// include "X86Schedule.td" +include "X86InstrInfo.td" + +def X86InstrInfo : InstrInfo; + +//===----------------------------------------------------------------------===// +// X86 processors supported. +//===----------------------------------------------------------------------===// + +include "X86ScheduleAtom.td" +include "X86SchedSandyBridge.td" +include "X86SchedHaswell.td" +include "X86SchedBroadwell.td" +include "X86ScheduleSLM.td" +include "X86ScheduleZnver1.td" +include "X86ScheduleBtVer2.td" +include "X86SchedSkylakeClient.td" +include "X86SchedSkylakeServer.td" def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom", "Intel Atom processors">; @@ -302,6 +359,20 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM", "Intel Silvermont processors">; def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM", "Intel Goldmont processors">; +def ProcIntelHSW : SubtargetFeature<"haswell", "X86ProcFamily", + "IntelHaswell", "Intel Haswell processors">; +def ProcIntelBDW : SubtargetFeature<"broadwell", "X86ProcFamily", + "IntelBroadwell", "Intel Broadwell processors">; +def ProcIntelSKL : SubtargetFeature<"skylake", "X86ProcFamily", + "IntelSkylake", "Intel Skylake processors">; +def ProcIntelKNL : SubtargetFeature<"knl", "X86ProcFamily", + "IntelKNL", "Intel Knights Landing processors">; +def ProcIntelSKX : SubtargetFeature<"skx", "X86ProcFamily", + "IntelSKX", "Intel Skylake Server processors">; +def ProcIntelCNL : SubtargetFeature<"cannonlake", "X86ProcFamily", + "IntelCannonlake", "Intel Cannonlake processors">; +def ProcIntelICL : SubtargetFeature<"icelake", "X86ProcFamily", + "IntelIcelake", "Intel Icelake processors">; class Proc<string Name, list<SubtargetFeature> Features> : ProcessorModel<Name, GenericModel, Features>; @@ -312,14 +383,18 @@ def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>; def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>; def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>; def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; -def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16]>; -def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>; + +foreach P = ["i686", "pentiumpro"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>; +} + def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureCMOV, FeatureFXSR]>; -def : Proc<"pentium3", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureSSE1, FeatureFXSR]>; -def : Proc<"pentium3m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>; + +foreach P = ["pentium3", "pentium3m"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE1, + FeatureFXSR]>; +} // Enable the PostRAScheduler for SSE2 and SSE3 class cpus. 
// The intent is to enable it for pentium4 which is the current default @@ -333,15 +408,13 @@ def : Proc<"pentium3m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, def : ProcessorModel<"pentium-m", GenericPostRAModel, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>; - -def : ProcessorModel<"pentium4", GenericPostRAModel, - [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE2, FeatureFXSR]>; -def : ProcessorModel<"pentium4m", GenericPostRAModel, - [FeatureX87, FeatureSlowUAMem16, FeatureMMX, - FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>; +foreach P = ["pentium4", "pentium4m"] in { + def : ProcessorModel<P, GenericPostRAModel, + [FeatureX87, FeatureSlowUAMem16, FeatureMMX, + FeatureSSE2, FeatureFXSR]>; +} // Intel Quark. def : Proc<"lakemont", []>; @@ -349,20 +422,19 @@ def : Proc<"lakemont", []>; // Intel Core Duo. def : ProcessorModel<"yonah", SandyBridgeModel, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, - FeatureFXSR, FeatureSlowBTMem]>; + FeatureFXSR]>; // NetBurst. def : ProcessorModel<"prescott", GenericPostRAModel, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, - FeatureFXSR, FeatureSlowBTMem]>; + FeatureFXSR]>; def : ProcessorModel<"nocona", GenericPostRAModel, [ FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3, FeatureFXSR, - FeatureCMPXCHG16B, - FeatureSlowBTMem + FeatureCMPXCHG16B ]>; // Intel Core 2 Solo/Duo. @@ -373,8 +445,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [ FeatureSSSE3, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureMacroFusion ]>; def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureX87, @@ -383,8 +455,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [ FeatureSSE41, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureMacroFusion ]>; // Atom CPUs. 
@@ -397,11 +469,10 @@ class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [ FeatureFXSR, FeatureCMPXCHG16B, FeatureMOVBE, - FeatureSlowBTMem, FeatureLEAForSP, FeatureSlowDivide32, FeatureSlowDivide64, - FeatureCallRegIndirect, + FeatureSlowTwoMemOps, FeatureLEAUsesAG, FeaturePadShortFunctions, FeatureLAHFSAHF @@ -421,11 +492,10 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [ FeaturePCLMUL, FeatureAES, FeatureSlowDivide64, - FeatureCallRegIndirect, + FeatureSlowTwoMemOps, FeaturePRFCHW, FeatureSlowLEA, FeatureSlowIncDec, - FeatureSlowBTMem, FeatureSlowPMULLD, FeatureLAHFSAHF ]>; @@ -444,10 +514,9 @@ class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [ FeaturePCLMUL, FeatureAES, FeaturePRFCHW, - FeatureCallRegIndirect, + FeatureSlowTwoMemOps, FeatureSlowLEA, FeatureSlowIncDec, - FeatureSlowBTMem, FeatureLAHFSAHF, FeatureMPX, FeatureSHA, @@ -457,7 +526,8 @@ class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [ FeatureXSAVEOPT, FeatureXSAVEC, FeatureXSAVES, - FeatureCLFLUSHOPT + FeatureCLFLUSHOPT, + FeatureFSGSBase ]>; def : GoldmontProc<"goldmont">; @@ -468,9 +538,9 @@ class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ FeatureSSE42, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeaturePOPCNT, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureMacroFusion ]>; def : NehalemProc<"nehalem">; def : NehalemProc<"corei7">; @@ -483,11 +553,11 @@ class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [ FeatureSSE42, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeaturePOPCNT, FeatureAES, FeaturePCLMUL, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureMacroFusion ]>; def : WestmereProc<"westmere">; @@ -518,12 +588,13 @@ def SNBFeatures : ProcessorFeatures<[], [ FeatureLAHFSAHF, FeatureSlow3OpsLEA, FeatureFastScalarFSQRT, - FeatureFastSHLDRotate + FeatureFastSHLDRotate, + FeatureSlowIncDec, + FeatureMacroFusion ]>; class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, SNBFeatures.Value, [ - FeatureSlowBTMem, FeatureSlowUAMem32 ]>; def : SandyBridgeProc<"sandybridge">; @@ -537,7 +608,6 @@ def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [ class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel, IVBFeatures.Value, [ - FeatureSlowBTMem, FeatureSlowUAMem32 ]>; def : IvyBridgeProc<"ivybridge">; @@ -550,12 +620,13 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [ FeatureERMSB, FeatureFMA, FeatureLZCNT, - FeatureMOVBE, - FeatureSlowIncDec + FeatureMOVBE ]>; class HaswellProc<string Name> : ProcModel<Name, HaswellModel, - HSWFeatures.Value, []>; + HSWFeatures.Value, [ + ProcIntelHSW +]>; def : HaswellProc<"haswell">; def : HaswellProc<"core-avx2">; // Legacy alias. 
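To make the mechanics of these definitions concrete: each SubtargetFeature names a boolean member of X86Subtarget (its second template argument, for example "HasMacroFusion" or "HasFastGather"), which the generated subtarget code sets when a CPU or -mattr flag enables the feature. The sketch below shows how a backend component might consult such a bit; it assumes the usual generated-style accessor exists and is an illustration, not code from this patch.

  // Sketch only: gate an optimization on a subtarget feature bit. Assumes
  // X86Subtarget exposes hasFastGather() for the HasFastGather field named
  // by FeatureHasFastGather above.
  #include "X86Subtarget.h" // assumed available inside the X86 backend

  static bool preferGatherLowering(const llvm::X86Subtarget &ST,
                                   unsigned NumElts) {
    if (!ST.hasFastGather())
      return false;
    // Even a fast gather rarely pays off for very small element counts.
    return NumElts >= 4;
  }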
@@ -563,8 +634,10 @@ def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [ FeatureADX, FeatureRDSEED ]>; -class BroadwellProc<string Name> : ProcModel<Name, HaswellModel, - BDWFeatures.Value, []>; +class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel, + BDWFeatures.Value, [ + ProcIntelBDW +]>; def : BroadwellProc<"broadwell">; def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [ @@ -577,14 +650,14 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [ FeatureFastVectorFSQRT ]>; -// FIXME: define SKL model -class SkylakeClientProc<string Name> : ProcModel<Name, HaswellModel, - SKLFeatures.Value, []>; +class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel, + SKLFeatures.Value, [ + ProcIntelSKL, + FeatureHasFastGather +]>; def : SkylakeClientProc<"skylake">; -// FIXME: define KNL model -class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel, - IVBFeatures.Value, [ +def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [ FeatureAVX512, FeatureERI, FeatureCDI, @@ -596,11 +669,29 @@ class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel, FeatureLZCNT, FeatureBMI, FeatureBMI2, - FeatureFMA, - FeatureFastPartialYMMorZMMWrite + FeatureFMA +]>; + +// FIXME: define KNL model +class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel, + KNLFeatures.Value, [ + ProcIntelKNL, + FeatureSlowTwoMemOps, + FeatureFastPartialYMMorZMMWrite, + FeatureHasFastGather ]>; def : KnightsLandingProc<"knl">; +class KnightsMillProc<string Name> : ProcModel<Name, HaswellModel, + KNLFeatures.Value, [ + ProcIntelKNL, + FeatureSlowTwoMemOps, + FeatureFastPartialYMMorZMMWrite, + FeatureHasFastGather, + FeatureVPOPCNTDQ +]>; +def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features + def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [ FeatureAVX512, FeatureCDI, @@ -611,9 +702,11 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [ FeatureCLWB ]>; -// FIXME: define SKX model -class SkylakeServerProc<string Name> : ProcModel<Name, HaswellModel, - SKXFeatures.Value, []>; +class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel, + SKXFeatures.Value, [ + ProcIntelSKX, + FeatureHasFastGather +]>; def : SkylakeServerProc<"skylake-avx512">; def : SkylakeServerProc<"skx">; // Legacy alias. @@ -623,57 +716,60 @@ def CNLFeatures : ProcessorFeatures<SKXFeatures.Value, [ FeatureSHA ]>; -class CannonlakeProc<string Name> : ProcModel<Name, HaswellModel, - CNLFeatures.Value, []>; +class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel, + CNLFeatures.Value, [ + ProcIntelCNL, + FeatureHasFastGather +]>; def : CannonlakeProc<"cannonlake">; +def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [ + FeatureBITALG, + FeatureVAES, + FeatureVBMI2, + FeatureVNNI, + FeatureVPCLMULQDQ, + FeatureVPOPCNTDQ, + FeatureGFNI +]>; + +class IcelakeProc<string Name> : ProcModel<Name, SkylakeServerModel, + ICLFeatures.Value, [ + ProcIntelICL, + FeatureHasFastGather +]>; +def : IcelakeProc<"icelake">; + // AMD CPUs. 
def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>; def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>; -def : Proc<"athlon", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-tbird", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-4", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1, - Feature3DNowA, FeatureFXSR, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"athlon-xp", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1, - Feature3DNowA, FeatureFXSR, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"athlon-mp", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1, - Feature3DNowA, FeatureFXSR, FeatureSlowBTMem, - FeatureSlowSHLD]>; -def : Proc<"k8", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, - Feature3DNowA, FeatureFXSR, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, - Feature3DNowA, FeatureFXSR, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, - Feature3DNowA, FeatureFXSR, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon-fx", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, - Feature3DNowA, FeatureFXSR, Feature64Bit, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"k8-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, - Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"opteron-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, - Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"athlon64-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, - Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B, - FeatureSlowBTMem, FeatureSlowSHLD]>; -def : Proc<"amdfam10", [FeatureX87, FeatureSSE4A, Feature3DNowA, - FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD, - FeatureLAHFSAHF]>; -def : Proc<"barcelona", [FeatureX87, FeatureSSE4A, Feature3DNowA, - FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT, - FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD, - FeatureLAHFSAHF]>; + +foreach P = ["athlon", "athlon-tbird"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, Feature3DNowA, FeatureSlowSHLD]>; +} + +foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE1, + Feature3DNowA, FeatureFXSR, FeatureSlowSHLD]>; +} + +foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA, + FeatureFXSR, Feature64Bit, FeatureSlowSHLD]>; +} + +foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in { + def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA, + FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowSHLD]>; +} + +foreach P = ["amdfam10", "barcelona"] in { + def : Proc<P, [FeatureX87, FeatureSSE4A, Feature3DNowA, FeatureFXSR, + FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT, + FeatureSlowSHLD, FeatureLAHFSAHF]>; +} // Bobcat def : Proc<"btver1", [ @@ -732,7 +828,8 @@ def : Proc<"bdver1", [ FeatureXSAVE, FeatureLWP, FeatureSlowSHLD, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureMacroFusion ]>; // Piledriver def : Proc<"bdver2", [ @@ -756,7 +853,8 @@ def : Proc<"bdver2", [ FeatureLWP, FeatureFMA, FeatureSlowSHLD, - FeatureLAHFSAHF + FeatureLAHFSAHF, + 
FeatureMacroFusion ]>; // Steamroller @@ -783,7 +881,8 @@ def : Proc<"bdver3", [ FeatureXSAVEOPT, FeatureSlowSHLD, FeatureFSGSBase, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureMacroFusion ]>; // Excavator @@ -811,7 +910,8 @@ def : Proc<"bdver4", [ FeatureSlowSHLD, FeatureFSGSBase, FeatureLAHFSAHF, - FeatureMWAITX + FeatureMWAITX, + FeatureMacroFusion ]>; // Znver1 @@ -831,6 +931,7 @@ def: ProcessorModel<"znver1", Znver1Model, [ FeatureFastLZCNT, FeatureLAHFSAHF, FeatureLZCNT, + FeatureMacroFusion, FeatureMMX, FeatureMOVBE, FeatureMWAITX, @@ -866,24 +967,16 @@ def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX, // covers a huge swath of x86 processors. If there are specific scheduling // knobs which need to be tuned differently for AMD chips, we might consider // forming a common base for them. -def : ProcessorModel<"x86-64", SandyBridgeModel, - [FeatureX87, FeatureMMX, FeatureSSE2, FeatureFXSR, - Feature64Bit, FeatureSlowBTMem ]>; - -//===----------------------------------------------------------------------===// -// Register File Description -//===----------------------------------------------------------------------===// - -include "X86RegisterInfo.td" -include "X86RegisterBanks.td" - -//===----------------------------------------------------------------------===// -// Instruction Descriptions -//===----------------------------------------------------------------------===// - -include "X86InstrInfo.td" - -def X86InstrInfo : InstrInfo; +def : ProcessorModel<"x86-64", SandyBridgeModel, [ + FeatureX87, + FeatureMMX, + FeatureSSE2, + FeatureFXSR, + Feature64Bit, + FeatureSlow3OpsLEA, + FeatureSlowIncDec, + FeatureMacroFusion +]>; //===----------------------------------------------------------------------===// // Calling Conventions diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp index dc15aeadaa61..71526dd77f11 100644 --- a/lib/Target/X86/X86AsmPrinter.cpp +++ b/lib/Target/X86/X86AsmPrinter.cpp @@ -15,6 +15,7 @@ #include "X86AsmPrinter.h" #include "InstPrinter/X86ATTInstPrinter.h" #include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86TargetStreamer.h" #include "X86InstrInfo.h" #include "X86MachineFunctionInfo.h" #include "llvm/BinaryFormat/COFF.h" @@ -22,12 +23,10 @@ #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineValueType.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/IR/DebugInfo.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" -#include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCCodeEmitter.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -40,6 +39,10 @@ #include "llvm/Support/TargetRegistry.h" using namespace llvm; +X86AsmPrinter::X86AsmPrinter(TargetMachine &TM, + std::unique_ptr<MCStreamer> Streamer) + : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this) {} + //===----------------------------------------------------------------------===// // Primitive Helper Functions. 
//===----------------------------------------------------------------------===// @@ -51,13 +54,16 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { SMShadowTracker.startFunction(MF); CodeEmitter.reset(TM.getTarget().createMCCodeEmitter( - *MF.getSubtarget().getInstrInfo(), *MF.getSubtarget().getRegisterInfo(), + *Subtarget->getInstrInfo(), *Subtarget->getRegisterInfo(), MF.getContext())); + EmitFPOData = + Subtarget->isTargetWin32() && MF.getMMI().getModule()->getCodeViewFlag(); + SetupMachineFunction(MF); if (Subtarget->isTargetCOFF()) { - bool Local = MF.getFunction()->hasLocalLinkage(); + bool Local = MF.getFunction().hasLocalLinkage(); OutStreamer->BeginCOFFSymbolDef(CurrentFnSym); OutStreamer->EmitCOFFSymbolStorageClass( Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL); @@ -72,10 +78,30 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) { // Emit the XRay table for this function. emitXRayTable(); + EmitFPOData = false; + // We didn't modify anything. return false; } +void X86AsmPrinter::EmitFunctionBodyStart() { + if (EmitFPOData) { + X86TargetStreamer *XTS = + static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()); + unsigned ParamsSize = + MF->getInfo<X86MachineFunctionInfo>()->getArgumentStackSize(); + XTS->emitFPOProc(CurrentFnSym, ParamsSize); + } +} + +void X86AsmPrinter::EmitFunctionBodyEnd() { + if (EmitFPOData) { + X86TargetStreamer *XTS = + static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()); + XTS->emitFPOEndProc(); + } +} + /// printSymbolOperand - Print a raw symbol reference operand. This handles /// jump tables, constant pools, global address and external symbols, all of /// which print to a label with various suffixes for relocation types etc. diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h index d7c3b74d3efb..7e70789ac82c 100644 --- a/lib/Target/X86/X86AsmPrinter.h +++ b/lib/Target/X86/X86AsmPrinter.h @@ -14,6 +14,7 @@ #include "llvm/CodeGen/AsmPrinter.h" #include "llvm/CodeGen/FaultMaps.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/MC/MCCodeEmitter.h" #include "llvm/Target/TargetMachine.h" // Implemented in X86MCInstLower.cpp @@ -30,6 +31,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { StackMaps SM; FaultMaps FM; std::unique_ptr<MCCodeEmitter> CodeEmitter; + bool EmitFPOData = false; // This utility class tracks the length of a stackmap instruction's 'shadow'. // It is used by the X86AsmPrinter to ensure that the stackmap shadow @@ -95,14 +97,11 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter { void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL); - // Helper function that emits the XRay sleds we've collected for a particular - // function. - void EmitXRayTable(); + // Choose between emitting .seh_ directives and .cv_fpo_ directives. 
+ void EmitSEHInstruction(const MachineInstr *MI); public: - explicit X86AsmPrinter(TargetMachine &TM, - std::unique_ptr<MCStreamer> Streamer) - : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this) {} + X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer); StringRef getPassName() const override { return "X86 Assembly Printer"; @@ -117,6 +116,7 @@ public: void EmitInstruction(const MachineInstr *MI) override; void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override { + AsmPrinter::EmitBasicBlockEnd(MBB); SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo()); } @@ -133,10 +133,13 @@ public: bool doInitialization(Module &M) override { SMShadowTracker.reset(0); SM.reset(); + FM.reset(); return AsmPrinter::doInitialization(M); } bool runOnMachineFunction(MachineFunction &F) override; + void EmitFunctionBodyStart() override; + void EmitFunctionBodyEnd() override; }; } // end namespace llvm diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp index 765af67de160..522dc7926b94 100644 --- a/lib/Target/X86/X86CallFrameOptimization.cpp +++ b/lib/Target/X86/X86CallFrameOptimization.cpp @@ -34,14 +34,14 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" #include "llvm/MC/MCDwarf.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetInstrInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" #include <cassert> #include <cstddef> #include <cstdint> @@ -56,18 +56,27 @@ static cl::opt<bool> cl::desc("Avoid optimizing x86 call frames for size"), cl::init(false), cl::Hidden); +namespace llvm { +void initializeX86CallFrameOptimizationPass(PassRegistry &); +} + namespace { class X86CallFrameOptimization : public MachineFunctionPass { public: - X86CallFrameOptimization() : MachineFunctionPass(ID) {} + X86CallFrameOptimization() : MachineFunctionPass(ID) { + initializeX86CallFrameOptimizationPass( + *PassRegistry::getPassRegistry()); + } bool runOnMachineFunction(MachineFunction &MF) override; + static char ID; + private: // Information we know about a particular call site struct CallContext { - CallContext() : FrameSetup(nullptr), MovVector(4, nullptr) {} + CallContext() : FrameSetup(nullptr), ArgStoreVector(4, nullptr) {} // Iterator referring to the frame setup instruction MachineBasicBlock::iterator FrameSetup; @@ -81,8 +90,8 @@ private: // The total displacement of all passed parameters int64_t ExpectedDist = 0; - // The sequence of movs used to pass the parameters - SmallVector<MachineInstr *, 4> MovVector; + // The sequence of storing instructions used to pass the parameters + SmallVector<MachineInstr *, 4> ArgStoreVector; // True if this call site has no stack parameters bool NoStackParams = false; @@ -120,12 +129,12 @@ private: MachineRegisterInfo *MRI; unsigned SlotSize; unsigned Log2SlotSize; - static char ID; }; -char X86CallFrameOptimization::ID = 0; - } // end anonymous namespace +char X86CallFrameOptimization::ID = 0; +INITIALIZE_PASS(X86CallFrameOptimization, DEBUG_TYPE, + "X86 Call Frame Optimization", false, false) // This checks whether the transformation is legal. 
// Also returns false in cases where it's potentially legal, but @@ -139,7 +148,7 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) { // is a danger of that being generated. if (STI->isTargetDarwin() && (!MF.getLandingPads().empty() || - (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF)))) + (MF.getFunction().needsUnwindTableEntry() && !TFL->hasFP(MF)))) return false; // It is not valid to change the stack pointer outside the prolog/epilog @@ -234,7 +243,7 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) { assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size"); Log2SlotSize = Log2_32(SlotSize); - if (skipFunction(*MF.getFunction()) || !isLegal(MF)) + if (skipFunction(MF.getFunction()) || !isLegal(MF)) return false; unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode(); @@ -271,11 +280,27 @@ X86CallFrameOptimization::classifyInstruction( if (MI == MBB.end()) return Exit; - // The instructions we actually care about are movs onto the stack - int Opcode = MI->getOpcode(); - if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr || - Opcode == X86::MOV64mi32 || Opcode == X86::MOV64mr) - return Convert; + // The instructions we actually care about are movs onto the stack or special + // cases of constant-stores to stack + switch (MI->getOpcode()) { + case X86::AND16mi8: + case X86::AND32mi8: + case X86::AND64mi8: { + MachineOperand ImmOp = MI->getOperand(X86::AddrNumOperands); + return ImmOp.getImm() == 0 ? Convert : Exit; + } + case X86::OR16mi8: + case X86::OR32mi8: + case X86::OR64mi8: { + MachineOperand ImmOp = MI->getOperand(X86::AddrNumOperands); + return ImmOp.getImm() == -1 ? Convert : Exit; + } + case X86::MOV32mi: + case X86::MOV32mr: + case X86::MOV64mi32: + case X86::MOV64mr: + return Convert; + } // Not all calling conventions have only stack MOVs between the stack // adjust and the call. @@ -354,32 +379,40 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, ++I; unsigned StackPtr = RegInfo.getStackRegister(); + auto StackPtrCopyInst = MBB.end(); // SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual - // register here. If it's there, use that virtual register as stack pointer - // instead. - if (I->isCopy() && I->getOperand(0).isReg() && I->getOperand(1).isReg() && - I->getOperand(1).getReg() == StackPtr) { - Context.SPCopy = &*I++; - StackPtr = Context.SPCopy->getOperand(0).getReg(); - } + // register. If it's there, use that virtual register as stack pointer + // instead. Also, we need to locate this instruction so that we can later + // safely ignore it while doing the conservative processing of the call chain. + // The COPY can be located anywhere between the call-frame setup + // instruction and its first use. We use the call instruction as a boundary + // because it is usually cheaper to check if an instruction is a call than + // checking if an instruction uses a register. + for (auto J = I; !J->isCall(); ++J) + if (J->isCopy() && J->getOperand(0).isReg() && J->getOperand(1).isReg() && + J->getOperand(1).getReg() == StackPtr) { + StackPtrCopyInst = J; + Context.SPCopy = &*J++; + StackPtr = Context.SPCopy->getOperand(0).getReg(); + break; + } // Scan the call setup sequence for the pattern we're looking for. // We only handle a simple case - a sequence of store instructions that // push a sequence of stack-slot-aligned values onto the stack, with // no gaps between them. 
if (MaxAdjust > 4) - Context.MovVector.resize(MaxAdjust, nullptr); + Context.ArgStoreVector.resize(MaxAdjust, nullptr); - InstClassification Classification; DenseSet<unsigned int> UsedRegs; - while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) != - Exit) { - if (Classification == Skip) { - ++I; + for (InstClassification Classification = Skip; Classification != Exit; ++I) { + // If this is the COPY of the stack pointer, it's ok to ignore. + if (I == StackPtrCopyInst) + continue; + Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs); + if (Classification != Convert) continue; - } - // We know the instruction has a supported store opcode. // We only want movs of the form: // mov imm/reg, k(%StackPtr) @@ -407,13 +440,13 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, return; StackDisp >>= Log2SlotSize; - assert((size_t)StackDisp < Context.MovVector.size() && + assert((size_t)StackDisp < Context.ArgStoreVector.size() && "Function call has more parameters than the stack is adjusted for."); // If the same stack slot is being filled twice, something's fishy. - if (Context.MovVector[StackDisp] != nullptr) + if (Context.ArgStoreVector[StackDisp] != nullptr) return; - Context.MovVector[StackDisp] = &*I; + Context.ArgStoreVector[StackDisp] = &*I; for (const MachineOperand &MO : I->uses()) { if (!MO.isReg()) @@ -422,10 +455,10 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, if (RegInfo.isPhysicalRegister(Reg)) UsedRegs.insert(Reg); } - - ++I; } + --I; + // We now expect the end of the sequence. If we stopped early, // or reached the end of the block without finding a call, bail. if (I == MBB.end() || !I->isCall()) @@ -436,14 +469,14 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF, return; // Now, go through the vector, and see that we don't have any gaps, - // but only a series of MOVs. - auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end(); + // but only a series of storing instructions. + auto MMI = Context.ArgStoreVector.begin(), MME = Context.ArgStoreVector.end(); for (; MMI != MME; ++MMI, Context.ExpectedDist += SlotSize) if (*MMI == nullptr) break; // If the call had no parameters, do nothing - if (MMI == Context.MovVector.begin()) + if (MMI == Context.ArgStoreVector.begin()) return; // We are either at the last parameter, or a gap. @@ -466,17 +499,23 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, DebugLoc DL = FrameSetup->getDebugLoc(); bool Is64Bit = STI->is64Bit(); - // Now, iterate through the vector in reverse order, and replace the movs - // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to + // Now, iterate through the vector in reverse order, and replace the store to + // stack with pushes. MOVmi/MOVmr doesn't have any defs, so no need to // replace uses. 
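The rewrite performed by the loop that follows is easiest to see at the source level. A minimal sketch for a 32-bit cdecl call (the function names are illustrative, not taken from this patch):

    void callee(int a, int b, int c);

    void caller() {
      // Before this pass the lowering reserves the call frame and emits
      // MOV32mi stores such as "movl $3, 8(%esp)"; the loop below replaces
      // each store with a PUSHi32 ("pushl $3", "pushl $2", "pushl $1"),
      // walking the argument slots in reverse so the lowest slot is pushed last.
      callee(1, 2, 3);
    }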
for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) { - MachineBasicBlock::iterator MOV = *Context.MovVector[Idx]; - MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands); + MachineBasicBlock::iterator Store = *Context.ArgStoreVector[Idx]; + MachineOperand PushOp = Store->getOperand(X86::AddrNumOperands); MachineBasicBlock::iterator Push = nullptr; unsigned PushOpcode; - switch (MOV->getOpcode()) { + switch (Store->getOpcode()) { default: llvm_unreachable("Unexpected Opcode!"); + case X86::AND16mi8: + case X86::AND32mi8: + case X86::AND64mi8: + case X86::OR16mi8: + case X86::OR32mi8: + case X86::OR64mi8: case X86::MOV32mi: case X86::MOV64mi32: PushOpcode = Is64Bit ? X86::PUSH64i32 : X86::PUSHi32; @@ -497,7 +536,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg // in preparation for the PUSH64. The upper 32 bits can be undef. - if (Is64Bit && MOV->getOpcode() == X86::MOV32mr) { + if (Is64Bit && Store->getOpcode() == X86::MOV32mr) { unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass); Reg = MRI->createVirtualRegister(&X86::GR64RegClass); BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg); @@ -541,7 +580,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF, MBB, std::next(Push), DL, MCCFIInstruction::createAdjustCfaOffset(nullptr, SlotSize)); - MBB.erase(MOV); + MBB.erase(Store); } // The stack-pointer copy is no longer used in the call sequences. diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp index 99aeec67c326..ccb982f9ac16 100644 --- a/lib/Target/X86/X86CallLowering.cpp +++ b/lib/Target/X86/X86CallLowering.cpp @@ -1,4 +1,4 @@ -//===-- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering -----------===// +//===- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering ------------===// // // The LLVM Compiler Infrastructure // @@ -6,33 +6,50 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -/// +// /// \file /// This file implements the lowering of LLVM calls to machine code calls for /// GlobalISel. 
-/// +// //===----------------------------------------------------------------------===// #include "X86CallLowering.h" #include "X86CallingConv.h" #include "X86ISelLowering.h" #include "X86InstrInfo.h" -#include "X86TargetMachine.h" - +#include "X86RegisterInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/CodeGen/Analysis.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" +#include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/LowLevelType.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFrameInfo.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineValueType.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/Value.h" +#include "llvm/MC/MCRegisterInfo.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include <cassert> +#include <cstdint> using namespace llvm; #include "X86GenCallingConv.inc" -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "This shouldn't be built without GISel" -#endif - X86CallLowering::X86CallLowering(const X86TargetLowering &TLI) : CallLowering(&TLI) {} @@ -41,7 +58,6 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, const DataLayout &DL, MachineRegisterInfo &MRI, SplitArgTy PerformArgSplit) const { - const X86TargetLowering &TLI = *getTLI<X86TargetLowering>(); LLVMContext &Context = OrigArg.Ty->getContext(); @@ -82,14 +98,29 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg, } namespace { -struct FuncReturnHandler : public CallLowering::ValueHandler { - FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - MachineInstrBuilder &MIB, CCAssignFn *AssignFn) - : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + +struct OutgoingValueHandler : public CallLowering::ValueHandler { + OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + MachineInstrBuilder &MIB, CCAssignFn *AssignFn) + : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), + DL(MIRBuilder.getMF().getDataLayout()), + STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {} unsigned getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { - llvm_unreachable("Don't know how to get a stack address yet"); + LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0)); + LLT SType = LLT::scalar(DL.getPointerSizeInBits(0)); + unsigned SPReg = MRI.createGenericVirtualRegister(p0); + MIRBuilder.buildCopy(SPReg, STI.getRegisterInfo()->getStackRegister()); + + unsigned OffsetReg = MRI.createGenericVirtualRegister(SType); + MIRBuilder.buildConstant(OffsetReg, Offset); + + unsigned AddrReg = MRI.createGenericVirtualRegister(p0); + MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg); + + MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset); + return AddrReg; } void assignValueToReg(unsigned ValVReg, unsigned PhysReg, @@ -101,16 +132,43 @@ struct FuncReturnHandler : public CallLowering::ValueHandler { void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { - 
llvm_unreachable("Don't know how to assign a value to an address yet"); + unsigned ExtReg = extendRegister(ValVReg, VA); + auto MMO = MIRBuilder.getMF().getMachineMemOperand( + MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(), + /* Alignment */ 0); + MIRBuilder.buildStore(ExtReg, Addr, *MMO); + } + + bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + const CallLowering::ArgInfo &Info, CCState &State) override { + bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State); + StackSize = State.getNextStackOffset(); + + static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2, + X86::XMM3, X86::XMM4, X86::XMM5, + X86::XMM6, X86::XMM7}; + if (!Info.IsFixed) + NumXMMRegs = State.getFirstUnallocated(XMMArgRegs); + + return Res; } + uint64_t getStackSize() { return StackSize; } + uint64_t getNumXmmRegs() { return NumXMMRegs; } + +protected: MachineInstrBuilder &MIB; + uint64_t StackSize = 0; + const DataLayout &DL; + const X86Subtarget &STI; + unsigned NumXMMRegs = 0; }; -} // End anonymous namespace. + +} // end anonymous namespace bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, const Value *Val, unsigned VReg) const { - assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg"); auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0); @@ -119,7 +177,7 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, MachineFunction &MF = MIRBuilder.getMF(); MachineRegisterInfo &MRI = MF.getRegInfo(); auto &DL = MF.getDataLayout(); - const Function &F = *MF.getFunction(); + const Function &F = MF.getFunction(); ArgInfo OrigArg{VReg, Val->getType()}; setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F); @@ -131,7 +189,7 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, })) return false; - FuncReturnHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86); + OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86); if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; } @@ -141,14 +199,15 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder, } namespace { -struct FormalArgHandler : public CallLowering::ValueHandler { - FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, - CCAssignFn *AssignFn, const DataLayout &DL) - : ValueHandler(MIRBuilder, MRI, AssignFn), DL(DL) {} + +struct IncomingValueHandler : public CallLowering::ValueHandler { + IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : ValueHandler(MIRBuilder, MRI, AssignFn), + DL(MIRBuilder.getMF().getDataLayout()) {} unsigned getStackAddress(uint64_t Size, int64_t Offset, MachinePointerInfo &MPO) override { - auto &MFI = MIRBuilder.getMF().getFrameInfo(); int FI = MFI.CreateFixedObject(Size, Offset, true); MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI); @@ -161,7 +220,6 @@ struct FormalArgHandler : public CallLowering::ValueHandler { void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size, MachinePointerInfo &MPO, CCValAssign &VA) override { - auto MMO = MIRBuilder.getMF().getMachineMemOperand( MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 0); @@ -170,13 +228,54 @@ struct FormalArgHandler : public CallLowering::ValueHandler { void assignValueToReg(unsigned ValVReg, unsigned PhysReg, CCValAssign &VA) override { - MIRBuilder.getMBB().addLiveIn(PhysReg); - MIRBuilder.buildCopy(ValVReg, PhysReg); + markPhysRegUsed(PhysReg); + switch (VA.getLocInfo()) { + 
default: + MIRBuilder.buildCopy(ValVReg, PhysReg); + break; + case CCValAssign::LocInfo::SExt: + case CCValAssign::LocInfo::ZExt: + case CCValAssign::LocInfo::AExt: { + auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg); + MIRBuilder.buildTrunc(ValVReg, Copy); + break; + } + } } + /// How the physical register gets marked varies between formal + /// parameters (it's a basic-block live-in), and a call instruction + /// (it's an implicit-def of the BL). + virtual void markPhysRegUsed(unsigned PhysReg) = 0; + +protected: const DataLayout &DL; }; -} // namespace + +struct FormalArgHandler : public IncomingValueHandler { + FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn) + : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {} + + void markPhysRegUsed(unsigned PhysReg) override { + MIRBuilder.getMBB().addLiveIn(PhysReg); + } +}; + +struct CallReturnHandler : public IncomingValueHandler { + CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI, + CCAssignFn *AssignFn, MachineInstrBuilder &MIB) + : IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {} + + void markPhysRegUsed(unsigned PhysReg) override { + MIB.addDef(PhysReg, RegState::Implicit); + } + +protected: + MachineInstrBuilder &MIB; +}; + +} // end anonymous namespace bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, @@ -219,7 +318,7 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, if (!MBB.empty()) MIRBuilder.setInstr(*MBB.begin()); - FormalArgHandler Handler(MIRBuilder, MRI, CC_X86, DL); + FormalArgHandler Handler(MIRBuilder, MRI, CC_X86); if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) return false; @@ -228,3 +327,114 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder, return true; } + +bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, + CallingConv::ID CallConv, + const MachineOperand &Callee, + const ArgInfo &OrigRet, + ArrayRef<ArgInfo> OrigArgs) const { + MachineFunction &MF = MIRBuilder.getMF(); + const Function &F = MF.getFunction(); + MachineRegisterInfo &MRI = MF.getRegInfo(); + auto &DL = F.getParent()->getDataLayout(); + const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>(); + const TargetInstrInfo &TII = *STI.getInstrInfo(); + auto TRI = STI.getRegisterInfo(); + + // Handle only Linux C, X86_64_SysV calling conventions for now. + if (!STI.isTargetLinux() || + !(CallConv == CallingConv::C || CallConv == CallingConv::X86_64_SysV)) + return false; + + unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); + auto CallSeqStart = MIRBuilder.buildInstr(AdjStackDown); + + // Create a temporarily-floating call instruction so we can add the implicit + // uses of arg registers. + bool Is64Bit = STI.is64Bit(); + unsigned CallOpc = Callee.isReg() + ? (Is64Bit ? X86::CALL64r : X86::CALL32r) + : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32); + + auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc).add(Callee).addRegMask( + TRI->getCallPreservedMask(MF, CallConv)); + + SmallVector<ArgInfo, 8> SplitArgs; + for (const auto &OrigArg : OrigArgs) { + + // TODO: handle not simple cases. + if (OrigArg.Flags.isByVal()) + return false; + + if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI, + [&](ArrayRef<unsigned> Regs) { + MIRBuilder.buildUnmerge(Regs, OrigArg.Reg); + })) + return false; + } + // Do the actual argument marshalling. 
+ OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + return false; + + bool IsFixed = OrigArgs.empty() ? true : OrigArgs.back().IsFixed; + if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(CallConv)) { + // From AMD64 ABI document: + // For calls that may call functions that use varargs or stdargs + // (prototype-less calls or calls to functions containing ellipsis (...) in + // the declaration) %al is used as hidden argument to specify the number + // of SSE registers used. The contents of %al do not need to match exactly + // the number of registers, but must be an ubound on the number of SSE + // registers used and is in the range 0 - 8 inclusive. + + MIRBuilder.buildInstr(X86::MOV8ri) + .addDef(X86::AL) + .addImm(Handler.getNumXmmRegs()); + MIB.addUse(X86::AL, RegState::Implicit); + } + + // Now we can add the actual call instruction to the correct basic block. + MIRBuilder.insertInstr(MIB); + + // If Callee is a reg, since it is used by a target specific + // instruction, it must have a register class matching the + // constraint of that instruction. + if (Callee.isReg()) + MIB->getOperand(0).setReg(constrainOperandRegClass( + MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(), + *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(), + Callee.getReg(), 0)); + + // Finally we can copy the returned value back into its virtual-register. In + // symmetry with the arguments, the physical register must be an + // implicit-define of the call instruction. + + if (OrigRet.Reg) { + SplitArgs.clear(); + SmallVector<unsigned, 8> NewRegs; + + if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI, + [&](ArrayRef<unsigned> Regs) { + NewRegs.assign(Regs.begin(), Regs.end()); + })) + return false; + + CallReturnHandler Handler(MIRBuilder, MRI, RetCC_X86, MIB); + if (!handleAssignments(MIRBuilder, SplitArgs, Handler)) + return false; + + if (!NewRegs.empty()) + MIRBuilder.buildMerge(OrigRet.Reg, NewRegs); + } + + CallSeqStart.addImm(Handler.getStackSize()) + .addImm(0 /* see getFrameTotalSize */) + .addImm(0 /* see getFrameAdjustment */); + + unsigned AdjStackUp = TII.getCallFrameDestroyOpcode(); + MIRBuilder.buildInstr(AdjStackUp) + .addImm(Handler.getStackSize()) + .addImm(0 /* NumBytesForCalleeToPop */); + + return true; +} diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h index 6a5dabf33a0a..6c9dc1565dad 100644 --- a/lib/Target/X86/X86CallLowering.h +++ b/lib/Target/X86/X86CallLowering.h @@ -1,4 +1,4 @@ -//===-- llvm/lib/Target/X86/X86CallLowering.h - Call lowering -----===// +//===- llvm/lib/Target/X86/X86CallLowering.h - Call lowering ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -6,24 +6,24 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -/// +// /// \file /// This file describes how to lower LLVM calls to machine code calls. 
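The %al convention quoted from the AMD64 ABI above is worth a concrete example. A self-contained sketch, only to illustrate the rule that lowerCall models with the MOV8ri of Handler.getNumXmmRegs(); it is not code from this patch:

    #include <cstdio>

    int main() {
      double x = 2.5;
      // Variadic call on x86-64 SysV: x travels in %xmm0, so the caller emits
      // "movb $1, %al" before the call -- any upper bound in the range 0-8 on
      // the number of SSE registers actually used for arguments is allowed.
      std::printf("%f\n", x);
      return 0;
    }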
-/// +// //===----------------------------------------------------------------------===// -#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING -#define LLVM_LIB_TARGET_X86_X86CALLLOWERING +#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING_H +#define LLVM_LIB_TARGET_X86_X86CALLLOWERING_H #include "llvm/ADT/ArrayRef.h" #include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include <functional> namespace llvm { -class Function; -class MachineIRBuilder; +class DataLayout; +class MachineRegisterInfo; class X86TargetLowering; -class Value; class X86CallLowering : public CallLowering { public: @@ -35,14 +35,20 @@ public: bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F, ArrayRef<unsigned> VRegs) const override; + bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv, + const MachineOperand &Callee, const ArgInfo &OrigRet, + ArrayRef<ArgInfo> OrigArgs) const override; + private: /// A function of this type is used to perform value split action. - typedef std::function<void(ArrayRef<unsigned>)> SplitArgTy; + using SplitArgTy = std::function<void(ArrayRef<unsigned>)>; bool splitToValueTypes(const ArgInfo &OrigArgInfo, SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL, MachineRegisterInfo &MRI, SplitArgTy SplitArg) const; }; -} // namespace llvm -#endif + +} // end namespace llvm + +#endif // LLVM_LIB_TARGET_X86_X86CALLLOWERING_H diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td index 26461986427d..5d806fe60b86 100644 --- a/lib/Target/X86/X86CallingConv.td +++ b/lib/Target/X86/X86CallingConv.td @@ -500,7 +500,7 @@ def CC_X86_64_C : CallingConv<[ // A SwiftError is passed in R12. CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, - // For Swift Calling Convention, pass sret in %RAX. + // For Swift Calling Convention, pass sret in %rax. CCIfCC<"CallingConv::Swift", CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>, @@ -592,6 +592,9 @@ def CC_X86_Win64_C : CallingConv<[ // The 'nest' parameter, if any, is passed in R10. CCIfNest<CCAssignToReg<[R10]>>, + // A SwiftError is passed in R12. + CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>, + // 128 bit vectors are passed by pointer CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>, @@ -1047,6 +1050,8 @@ def CSR_Win64_NoSSE : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R1 def CSR_Win64 : CalleeSavedRegs<(add CSR_Win64_NoSSE, (sequence "XMM%u", 6, 15))>; +def CSR_Win64_SwiftError : CalleeSavedRegs<(sub CSR_Win64, R12)>; + // The function used by Darwin to obtain the address of a thread-local variable // uses rdi to pass a single parameter and rax for the return value. All other // GPRs are preserved. diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp index bfc834435de5..489d9d86e254 100644 --- a/lib/Target/X86/X86CmovConversion.cpp +++ b/lib/Target/X86/X86CmovConversion.cpp @@ -1,4 +1,4 @@ -//====-- X86CmovConversion.cpp - Convert Cmov to Branch -------------------===// +//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===// // // The LLVM Compiler Infrastructure // @@ -6,104 +6,146 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// +// /// \file -/// This file implements a pass that converts X86 cmov instructions into branch -/// when profitable. This pass is conservative, i.e., it applies transformation -/// if and only if it can gaurantee a gain with high confidence. 
+/// This file implements a pass that converts X86 cmov instructions into +/// branches when profitable. This pass is conservative. It transforms if and +/// only if it can guarantee a gain with high confidence. /// /// Thus, the optimization applies under the following conditions: -/// 1. Consider as a candidate only CMOV in most inner loop, assuming that -/// most hotspots are represented by these loops. -/// 2. Given a group of CMOV instructions, that are using same EFLAGS def +/// 1. Consider as candidates only CMOVs in innermost loops (assume that +/// most hotspots are represented by these loops). +/// 2. Given a group of CMOV instructions that are using the same EFLAGS def /// instruction: -/// a. Consider them as candidates only if all have same code condition or -/// opposite one, to prevent generating more than one conditional jump -/// per EFLAGS def instruction. +/// a. Consider them as candidates only if all have the same code condition +/// or the opposite one to prevent generating more than one conditional +/// jump per EFLAGS def instruction. /// b. Consider them as candidates only if all are profitable to be -/// converted, assuming that one bad conversion may casue a degradation. -/// 3. Apply conversion only for loop that are found profitable and only for +/// converted (assume that one bad conversion may cause a degradation). +/// 3. Apply conversion only for loops that are found profitable and only for /// CMOV candidates that were found profitable. -/// a. Loop is considered profitable only if conversion will reduce its -/// depth cost by some thrishold. +/// a. A loop is considered profitable only if conversion will reduce its +/// depth cost by some threshold. /// b. CMOV is considered profitable if the cost of its condition is higher /// than the average cost of its true-value and false-value by 25% of -/// branch-misprediction-penalty, this to assure no degredassion even -/// with 25% branch misprediction. +/// branch-misprediction-penalty. This assures no degradation even with +/// 25% branch misprediction. /// /// Note: This pass is assumed to run on SSA machine code. 
+// //===----------------------------------------------------------------------===// // // External interfaces: // FunctionPass *llvm::createX86CmovConverterPass(); // bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF); // +//===----------------------------------------------------------------------===// #include "X86.h" #include "X86InstrInfo.h" -#include "X86Subtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSchedule.h" -#include "llvm/IR/InstIterator.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/MC/MCSchedule.h" +#include "llvm/Pass.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" +#include <algorithm> +#include <cassert> +#include <iterator> +#include <utility> + using namespace llvm; -#define DEBUG_TYPE "x86-cmov-converter" +#define DEBUG_TYPE "x86-cmov-conversion" STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups"); STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates"); STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops"); STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups"); -namespace { +namespace llvm { + +void initializeX86CmovConverterPassPass(PassRegistry &); + +} // end namespace llvm + // This internal switch can be used to turn off the cmov/branch optimization. static cl::opt<bool> EnableCmovConverter("x86-cmov-converter", cl::desc("Enable the X86 cmov-to-branch optimization."), cl::init(true), cl::Hidden); +static cl::opt<unsigned> + GainCycleThreshold("x86-cmov-converter-threshold", + cl::desc("Minimum gain per loop (in cycles) threshold."), + cl::init(4), cl::Hidden); + +static cl::opt<bool> ForceMemOperand( + "x86-cmov-converter-force-mem-operand", + cl::desc("Convert cmovs to branches whenever they have memory operands."), + cl::init(true), cl::Hidden); + +namespace { + /// Converts X86 cmov instructions into branches when profitable. class X86CmovConverterPass : public MachineFunctionPass { public: - X86CmovConverterPass() : MachineFunctionPass(ID) {} - ~X86CmovConverterPass() {} + X86CmovConverterPass() : MachineFunctionPass(ID) { + initializeX86CmovConverterPassPass(*PassRegistry::getPassRegistry()); + } StringRef getPassName() const override { return "X86 cmov Conversion"; } bool runOnMachineFunction(MachineFunction &MF) override; void getAnalysisUsage(AnalysisUsage &AU) const override; -private: /// Pass identification, replacement for typeid. static char ID; - const MachineRegisterInfo *MRI; +private: + MachineRegisterInfo *MRI; const TargetInstrInfo *TII; + const TargetRegisterInfo *TRI; TargetSchedModel TSchedModel; /// List of consecutive CMOV instructions. 
- typedef SmallVector<MachineInstr *, 2> CmovGroup; - typedef SmallVector<CmovGroup, 2> CmovGroups; + using CmovGroup = SmallVector<MachineInstr *, 2>; + using CmovGroups = SmallVector<CmovGroup, 2>; /// Collect all CMOV-group-candidates in \p CurrLoop and update \p /// CmovInstGroups accordingly. /// - /// \param CurrLoop Loop being processed. + /// \param Blocks List of blocks to process. /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop. /// \returns true iff it found any CMOV-group-candidate. - bool collectCmovCandidates(MachineLoop *CurrLoop, CmovGroups &CmovInstGroups); + bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks, + CmovGroups &CmovInstGroups, + bool IncludeLoads = false); /// Check if it is profitable to transform each CMOV-group-candidates into /// branch. Remove all groups that are not profitable from \p CmovInstGroups. /// - /// \param CurrLoop Loop being processed. + /// \param Blocks List of blocks to process. /// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop. /// \returns true iff any CMOV-group-candidate remain. - bool checkForProfitableCmovCandidates(MachineLoop *CurrLoop, + bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups); /// Convert the given list of consecutive CMOV instructions into a branch. @@ -112,6 +154,8 @@ private: void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const; }; +} // end anonymous namespace + char X86CmovConverterPass::ID = 0; void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const { @@ -120,7 +164,7 @@ void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const { } bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; if (!EnableCmovConverter) return false; @@ -133,10 +177,36 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { const TargetSubtargetInfo &STI = MF.getSubtarget(); MRI = &MF.getRegInfo(); TII = STI.getInstrInfo(); + TRI = STI.getRegisterInfo(); TSchedModel.init(STI.getSchedModel(), &STI, TII); + // Before we handle the more subtle cases of register-register CMOVs inside + // of potentially hot loops, we want to quickly remove all CMOVs with + // a memory operand. The CMOV will risk a stall waiting for the load to + // complete that speculative execution behind a branch is better suited to + // handle on modern x86 chips. + if (ForceMemOperand) { + CmovGroups AllCmovGroups; + SmallVector<MachineBasicBlock *, 4> Blocks; + for (auto &MBB : MF) + Blocks.push_back(&MBB); + if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) { + for (auto &Group : AllCmovGroups) { + // Skip any group that doesn't do at least one memory operand cmov. + if (!llvm::any_of(Group, [&](MachineInstr *I) { return I->mayLoad(); })) + continue; + + // For CMOV groups which we can rewrite and which contain a memory load, + // always rewrite them. On x86, a CMOV will dramatically amplify any + // memory latency by blocking speculative execution. 
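A minimal source-level sketch of the memory-operand case this block targets (names are made up for illustration):

    // A select of this shape may be lowered to a CMOV that folds the load into
    // its false operand. As a CMOV the load sits unconditionally on the
    // critical path; as a branch it only has to complete when the predicate
    // (or the branch predictor) selects it -- which is why load-bearing groups
    // are always rewritten here.
    int pick(bool take_mem, const int &m, int fallback) {
      return take_mem ? m : fallback;
    }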
+ Changed = true; + convertCmovInstsToBranches(Group); + } + } + } + //===--------------------------------------------------------------------===// - // Algorithm + // Register-operand Conversion Algorithm // --------- // For each inner most loop // collectCmovCandidates() { @@ -157,32 +227,41 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) { // // Note: For more details, see each function description. //===--------------------------------------------------------------------===// - for (MachineBasicBlock &MBB : MF) { - MachineLoop *CurrLoop = MLI.getLoopFor(&MBB); + // Build up the loops in pre-order. + SmallVector<MachineLoop *, 4> Loops(MLI.begin(), MLI.end()); + // Note that we need to check size on each iteration as we accumulate child + // loops. + for (int i = 0; i < (int)Loops.size(); ++i) + for (MachineLoop *Child : Loops[i]->getSubLoops()) + Loops.push_back(Child); + + for (MachineLoop *CurrLoop : Loops) { // Optimize only inner most loops. - if (!CurrLoop || CurrLoop->getHeader() != &MBB || - !CurrLoop->getSubLoops().empty()) + if (!CurrLoop->getSubLoops().empty()) continue; // List of consecutive CMOV instructions to be processed. CmovGroups CmovInstGroups; - if (!collectCmovCandidates(CurrLoop, CmovInstGroups)) + if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups)) continue; - if (!checkForProfitableCmovCandidates(CurrLoop, CmovInstGroups)) + if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(), + CmovInstGroups)) continue; Changed = true; for (auto &Group : CmovInstGroups) convertCmovInstsToBranches(Group); } + return Changed; } -bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop, - CmovGroups &CmovInstGroups) { +bool X86CmovConverterPass::collectCmovCandidates( + ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups, + bool IncludeLoads) { //===--------------------------------------------------------------------===// // Collect all CMOV-group-candidates and add them into CmovInstGroups. // @@ -204,24 +283,29 @@ bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop, // Current processed CMOV-Group. CmovGroup Group; - for (auto *MBB : CurrLoop->getBlocks()) { + for (auto *MBB : Blocks) { Group.clear(); // Condition code of first CMOV instruction current processed range and its // opposite condition code. - X86::CondCode FirstCC, FirstOppCC; + X86::CondCode FirstCC, FirstOppCC, MemOpCC; // Indicator of a non CMOVrr instruction in the current processed range. bool FoundNonCMOVInst = false; // Indicator for current processed CMOV-group if it should be skipped. bool SkipGroup = false; for (auto &I : *MBB) { + // Skip debug instructions. + if (I.isDebugValue()) + continue; X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode()); // Check if we found a X86::CMOVrr instruction. - if (CC != X86::COND_INVALID && !I.mayLoad()) { + if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) { if (Group.empty()) { // We found first CMOV in the range, reset flags. FirstCC = CC; FirstOppCC = X86::GetOppositeBranchCondition(CC); + // Clear out the prior group's memory operand CC. + MemOpCC = X86::COND_INVALID; FoundNonCMOVInst = false; SkipGroup = false; } @@ -231,6 +315,24 @@ bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop, if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC)) // Mark the SKipGroup indicator to skip current processed CMOV-Group. SkipGroup = true; + if (I.mayLoad()) { + if (MemOpCC == X86::COND_INVALID) + // The first memory operand CMOV. 
+ MemOpCC = CC; + else if (CC != MemOpCC) + // Can't handle mixed conditions with memory operands. + SkipGroup = true; + } + // Check if we were relying on zero-extending behavior of the CMOV. + if (!SkipGroup && + llvm::any_of( + MRI->use_nodbg_instructions(I.defs().begin()->getReg()), + [&](MachineInstr &UseI) { + return UseI.getOpcode() == X86::SUBREG_TO_REG; + })) + // FIXME: We should model the cost of using an explicit MOV to handle + // the zero-extension rather than just refusing to handle this. + SkipGroup = true; continue; } // If Group is empty, keep looking for first CMOV in the range. @@ -278,7 +380,7 @@ static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) { } bool X86CmovConverterPass::checkForProfitableCmovCandidates( - MachineLoop *CurrLoop, CmovGroups &CmovInstGroups) { + ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) { struct DepthInfo { /// Depth of original loop. unsigned Depth; @@ -328,10 +430,13 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( //===--------------------------------------------------------------------===// for (unsigned I = 0; I < LoopIterations; ++I) { DepthInfo &MaxDepth = LoopDepth[I]; - for (auto *MBB : CurrLoop->getBlocks()) { + for (auto *MBB : Blocks) { // Clear physical registers Def map. RegDefMaps[PhyRegType].clear(); for (MachineInstr &MI : *MBB) { + // Skip debug instructions. + if (MI.isDebugValue()) + continue; unsigned MIDepth = 0; unsigned MIDepthOpt = 0; bool IsCMOV = CmovInstructions.count(&MI); @@ -389,19 +494,28 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates( // Critical-path is iteration dependent - there is dependency of // critical-path instructions on critical-path instructions of // previous iteration. - // Thus, it is required to check the gradient of the gain - the - // change in Depth-Diff compared to the change in Loop-Depth between - // 1st and 2nd iterations. + // Thus, check the gain percent of the 2nd iteration (similar to the + // previous case), but it is also required to check the gradient of + // the gain - the change in Depth-Diff compared to the change in + // Loop-Depth between 1st and 2nd iterations. // To be conservative, the gradient need to be at least 50%. // + // In addition, In order not to optimize loops with very small gain, the + // gain (in cycles) after 2nd iteration should not be less than a given + // threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply. + // // If loop is not worth optimizing, remove all CMOV-group-candidates. //===--------------------------------------------------------------------===// + if (Diff[1] < GainCycleThreshold) + return false; + bool WorthOptLoop = false; if (Diff[1] == Diff[0]) WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth; else if (Diff[1] > Diff[0]) WorthOptLoop = - (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth); + (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) && + (Diff[1] * 8 >= LoopDepth[1].Depth); if (!WorthOptLoop) return false; @@ -481,11 +595,36 @@ static bool checkEFLAGSLive(MachineInstr *MI) { return false; } +/// Given /p First CMOV instruction and /p Last CMOV instruction representing a +/// group of CMOV instructions, which may contain debug instructions in between, +/// move all debug instructions to after the last CMOV instruction, making the +/// CMOV group consecutive. 
+static void packCmovGroup(MachineInstr *First, MachineInstr *Last) { + assert(X86::getCondFromCMovOpc(Last->getOpcode()) != X86::COND_INVALID && + "Last instruction in a CMOV group must be a CMOV instruction"); + + SmallVector<MachineInstr *, 2> DBGInstructions; + for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) { + if (I->isDebugValue()) + DBGInstructions.push_back(&*I); + } + + // Splice the debug instruction after the cmov group. + MachineBasicBlock *MBB = First->getParent(); + for (auto *MI : DBGInstructions) + MBB->insertAfter(Last, MI->removeFromParent()); +} + void X86CmovConverterPass::convertCmovInstsToBranches( SmallVectorImpl<MachineInstr *> &Group) const { assert(!Group.empty() && "No CMOV instructions to convert"); ++NumOfOptimizedCmovGroups; + // If the CMOV group is not packed, e.g., there are debug instructions between + // first CMOV and last CMOV, then pack the group and make the CMOV instruction + // consecutive by moving the debug instructions to after the last CMOV. + packCmovGroup(Group.front(), Group.back()); + // To convert a CMOVcc instruction, we actually have to insert the diamond // control-flow pattern. The incoming instruction knows the destination vreg // to set, the condition code register to branch on, the true/false values to @@ -518,8 +657,18 @@ void X86CmovConverterPass::convertCmovInstsToBranches( MachineInstr &MI = *Group.front(); MachineInstr *LastCMOV = Group.back(); DebugLoc DL = MI.getDebugLoc(); + X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode())); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + // Potentially swap the condition codes so that any memory operand to a CMOV + // is in the *false* position instead of the *true* position. We can invert + // any non-memory operand CMOV instructions to cope with this and we ensure + // memory operand CMOVs are only included with a single condition code. + if (llvm::any_of(Group, [&](MachineInstr *I) { + return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC; + })) + std::swap(CC, OppCC); + MachineBasicBlock *MBB = MI.getParent(); MachineFunction::iterator It = ++MBB->getIterator(); MachineFunction *F = MBB->getParent(); @@ -556,7 +705,111 @@ void X86CmovConverterPass::convertCmovInstsToBranches( MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); MachineBasicBlock::iterator MIItEnd = std::next(MachineBasicBlock::iterator(LastCMOV)); + MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin(); MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); + + // First we need to insert an explicit load on the false path for any memory + // operand. We also need to potentially do register rewriting here, but it is + // simpler as the memory operands are always on the false path so we can + // simply take that input, whatever it is. + DenseMap<unsigned, unsigned> FalseBBRegRewriteTable; + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) { + auto &MI = *MIIt++; + // Skip any CMOVs in this group which don't load from memory. + if (!MI.mayLoad()) { + // Remember the false-side register input. + unsigned FalseReg = + MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2) + .getReg(); + // Walk back through any intermediate cmovs referenced. 
+ while (true) { + auto FRIt = FalseBBRegRewriteTable.find(FalseReg); + if (FRIt == FalseBBRegRewriteTable.end()) + break; + FalseReg = FRIt->second; + } + FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg; + continue; + } + + // The condition must be the *opposite* of the one we've decided to branch + // on as the branch will go *around* the load and the load should happen + // when the CMOV condition is false. + assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC && + "Can only handle memory-operand cmov instructions with a condition " + "opposite to the selected branch direction."); + + // The goal is to rewrite the cmov from: + // + // MBB: + // %A = CMOVcc %B (tied), (mem) + // + // to + // + // MBB: + // %A = CMOVcc %B (tied), %C + // FalseMBB: + // %C = MOV (mem) + // + // Which will allow the next loop to rewrite the CMOV in terms of a PHI: + // + // MBB: + // JMP!cc SinkMBB + // FalseMBB: + // %C = MOV (mem) + // SinkMBB: + // %A = PHI [ %C, FalseMBB ], [ %B, MBB] + + // Get a fresh register to use as the destination of the MOV. + const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg()); + unsigned TmpReg = MRI->createVirtualRegister(RC); + + SmallVector<MachineInstr *, 4> NewMIs; + bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg, + /*UnfoldLoad*/ true, + /*UnfoldStore*/ false, NewMIs); + (void)Unfolded; + assert(Unfolded && "Should never fail to unfold a loading cmov!"); + + // Move the new CMOV to just before the old one and reset any impacted + // iterator. + auto *NewCMOV = NewMIs.pop_back_val(); + assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC && + "Last new instruction isn't the expected CMOV!"); + DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump()); + MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV); + if (&*MIItBegin == &MI) + MIItBegin = MachineBasicBlock::iterator(NewCMOV); + + // Sink whatever instructions were needed to produce the unfolded operand + // into the false block. + for (auto *NewMI : NewMIs) { + DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump()); + FalseMBB->insert(FalseInsertionPoint, NewMI); + // Re-map any operands that are from other cmovs to the inputs for this block. + for (auto &MOp : NewMI->uses()) { + if (!MOp.isReg()) + continue; + auto It = FalseBBRegRewriteTable.find(MOp.getReg()); + if (It == FalseBBRegRewriteTable.end()) + continue; + + MOp.setReg(It->second); + // This might have been a kill when it referenced the cmov result, but + // it won't necessarily be once rewritten. + // FIXME: We could potentially improve this by tracking whether the + // operand to the cmov was also a kill, and then skipping the PHI node + // construction below. + MOp.setIsKill(false); + } + } + MBB->erase(MachineBasicBlock::iterator(MI), + std::next(MachineBasicBlock::iterator(MI))); + + // Add this PHI to the rewrite table. + FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg; + } + // As we are creating the PHIs, we have to be careful if there is more than // one. Later CMOVs may reference the results of earlier CMOVs, but later // PHIs have to reference the individual true/false inputs from earlier PHIs. @@ -604,7 +857,11 @@ void X86CmovConverterPass::convertCmovInstsToBranches( MBB->erase(MIItBegin, MIItEnd); } -} // End anonymous namespace. 
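For readers following the diamond construction above, a source-level analogue may help; the two functions below are only a sketch and do not come from the patch itself:

    // A select that the backend would normally keep as a single CMOV...
    int select_cmov(int a, int b, int x, int y) {
      return a < b ? x : y;            // typically one CMOVcc on x86
    }

    // ...and the shape it takes after conversion: a conditional jump around a
    // "false" block, with the two values joined again afterwards -- roughly the
    // MBB / FalseMBB / SinkMBB-plus-PHI structure built by
    // convertCmovInstsToBranches.
    int select_branch(int a, int b, int x, int y) {
      int r = y;                       // false side (FalseMBB)
      if (a < b)
        r = x;                         // true side
      return r;                        // join point (SinkMBB PHI)
    }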
+INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion", + false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion", + false, false) FunctionPass *llvm::createX86CmovConverterPass() { return new X86CmovConverterPass(); diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp new file mode 100644 index 000000000000..0a87fb4533c2 --- /dev/null +++ b/lib/Target/X86/X86DomainReassignment.cpp @@ -0,0 +1,753 @@ +//===--- X86DomainReassignment.cpp - Selectively switch register classes---===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass attempts to find instruction chains (closures) in one domain, +// and convert them to equivalent instructions in a different domain, +// if profitable. +// +//===----------------------------------------------------------------------===// + +#include "X86.h" +#include "X86InstrInfo.h" +#include "X86Subtarget.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/Support/Debug.h" +#include <bitset> + +using namespace llvm; + +namespace llvm { +void initializeX86DomainReassignmentPass(PassRegistry &); +} + +#define DEBUG_TYPE "x86-domain-reassignment" + +STATISTIC(NumClosuresConverted, "Number of closures converted by the pass"); + +static cl::opt<bool> DisableX86DomainReassignment( + "disable-x86-domain-reassignment", cl::Hidden, + cl::desc("X86: Disable Virtual Register Reassignment."), cl::init(false)); + +namespace { +enum RegDomain { NoDomain = -1, GPRDomain, MaskDomain, OtherDomain, NumDomains }; + +static bool isGPR(const TargetRegisterClass *RC) { + return X86::GR64RegClass.hasSubClassEq(RC) || + X86::GR32RegClass.hasSubClassEq(RC) || + X86::GR16RegClass.hasSubClassEq(RC) || + X86::GR8RegClass.hasSubClassEq(RC); +} + +static bool isMask(const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) { + return X86::VK16RegClass.hasSubClassEq(RC); +} + +static RegDomain getDomain(const TargetRegisterClass *RC, + const TargetRegisterInfo *TRI) { + if (isGPR(RC)) + return GPRDomain; + if (isMask(RC, TRI)) + return MaskDomain; + return OtherDomain; +} + +/// Return a register class equivalent to \p SrcRC, in \p Domain. +static const TargetRegisterClass *getDstRC(const TargetRegisterClass *SrcRC, + RegDomain Domain) { + assert(Domain == MaskDomain && "add domain"); + if (X86::GR8RegClass.hasSubClassEq(SrcRC)) + return &X86::VK8RegClass; + if (X86::GR16RegClass.hasSubClassEq(SrcRC)) + return &X86::VK16RegClass; + if (X86::GR32RegClass.hasSubClassEq(SrcRC)) + return &X86::VK32RegClass; + if (X86::GR64RegClass.hasSubClassEq(SrcRC)) + return &X86::VK64RegClass; + llvm_unreachable("add register class"); + return nullptr; +} + +/// Abstract Instruction Converter class. +class InstrConverterBase { +protected: + unsigned SrcOpcode; + +public: + InstrConverterBase(unsigned SrcOpcode) : SrcOpcode(SrcOpcode) {} + + virtual ~InstrConverterBase() {} + + /// \returns true if \p MI is legal to convert. 
+  virtual bool isLegal(const MachineInstr *MI,
+                       const TargetInstrInfo *TII) const {
+    assert(MI->getOpcode() == SrcOpcode &&
+           "Wrong instruction passed to converter");
+    return true;
+  }
+
+  /// Applies conversion to \p MI.
+  ///
+  /// \returns true if \p MI is no longer needed, and can be deleted.
+  virtual bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                            MachineRegisterInfo *MRI) const = 0;
+
+  /// \returns the cost increment incurred by converting \p MI.
+  virtual double getExtraCost(const MachineInstr *MI,
+                              MachineRegisterInfo *MRI) const = 0;
+};
+
+/// An Instruction Converter which ignores the given instruction.
+/// For example, PHI instructions can be safely ignored since only the registers
+/// need to change.
+class InstrIgnore : public InstrConverterBase {
+public:
+  InstrIgnore(unsigned SrcOpcode) : InstrConverterBase(SrcOpcode) {}
+
+  bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                    MachineRegisterInfo *MRI) const override {
+    assert(isLegal(MI, TII) && "Cannot convert instruction");
+    return false;
+  }
+
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    return 0;
+  }
+};
+
+/// An Instruction Converter which replaces an instruction with another.
+class InstrReplacer : public InstrConverterBase {
+public:
+  /// Opcode of the destination instruction.
+  unsigned DstOpcode;
+
+  InstrReplacer(unsigned SrcOpcode, unsigned DstOpcode)
+      : InstrConverterBase(SrcOpcode), DstOpcode(DstOpcode) {}
+
+  bool isLegal(const MachineInstr *MI,
+               const TargetInstrInfo *TII) const override {
+    if (!InstrConverterBase::isLegal(MI, TII))
+      return false;
+    // It's illegal to replace an instruction that implicitly defines a register
+    // with an instruction that doesn't, unless that register is dead.
+    for (auto &MO : MI->implicit_operands())
+      if (MO.isReg() && MO.isDef() && !MO.isDead() &&
+          !TII->get(DstOpcode).hasImplicitDefOfPhysReg(MO.getReg()))
+        return false;
+    return true;
+  }
+
+  bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                    MachineRegisterInfo *MRI) const override {
+    assert(isLegal(MI, TII) && "Cannot convert instruction");
+    MachineInstrBuilder Bld =
+        BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(DstOpcode));
+    // Transfer explicit operands from original instruction. Implicit operands
+    // are handled by BuildMI.
+    for (auto &Op : MI->explicit_operands())
+      Bld.add(Op);
+    return true;
+  }
+
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    // Assuming instructions have the same cost.
+    return 0;
+  }
+};
+
+/// An Instruction Converter which replaces an instruction with another, and
+/// adds a COPY from the new instruction's destination to the old one's.
+class InstrReplacerDstCOPY : public InstrConverterBase {
+public:
+  unsigned DstOpcode;
+
+  InstrReplacerDstCOPY(unsigned SrcOpcode, unsigned DstOpcode)
+      : InstrConverterBase(SrcOpcode), DstOpcode(DstOpcode) {}
+
+  bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                    MachineRegisterInfo *MRI) const override {
+    assert(isLegal(MI, TII) && "Cannot convert instruction");
+    MachineBasicBlock *MBB = MI->getParent();
+    auto &DL = MI->getDebugLoc();
+
+    unsigned Reg = MRI->createVirtualRegister(
+        TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(),
+                         *MBB->getParent()));
+    MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg);
+    for (unsigned Idx = 1, End = MI->getNumOperands(); Idx < End; ++Idx)
+      Bld.add(MI->getOperand(Idx));
+
+    BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY))
+        .add(MI->getOperand(0))
+        .addReg(Reg);
+
+    return true;
+  }
+
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    // Assuming instructions have the same cost, and that COPY is in the same
+    // domain so it will be eliminated.
+    return 0;
+  }
+};
+
+/// An Instruction Converter for replacing COPY instructions.
+class InstrCOPYReplacer : public InstrReplacer {
+public:
+  RegDomain DstDomain;
+
+  InstrCOPYReplacer(unsigned SrcOpcode, RegDomain DstDomain, unsigned DstOpcode)
+      : InstrReplacer(SrcOpcode, DstOpcode), DstDomain(DstDomain) {}
+
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    assert(MI->getOpcode() == TargetOpcode::COPY && "Expected a COPY");
+
+    for (auto &MO : MI->operands()) {
+      // Physical registers will not be converted. Assume that converting the
+      // COPY to the destination domain will eventually result in an actual
+      // instruction.
+      if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+        return 1;
+
+      RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg()),
+                                     MRI->getTargetRegisterInfo());
+      // Converting a cross domain COPY to a same domain COPY should eliminate
+      // an instruction.
+      if (OpDomain == DstDomain)
+        return -1;
+    }
+    return 0;
+  }
+};
+
+/// An Instruction Converter which replaces an instruction with a COPY.
+class InstrReplaceWithCopy : public InstrConverterBase {
+public:
+  // Source instruction operand index, to be used as the COPY source.
+  unsigned SrcOpIdx;
+
+  InstrReplaceWithCopy(unsigned SrcOpcode, unsigned SrcOpIdx)
+      : InstrConverterBase(SrcOpcode), SrcOpIdx(SrcOpIdx) {}
+
+  bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+                    MachineRegisterInfo *MRI) const override {
+    assert(isLegal(MI, TII) && "Cannot convert instruction");
+    BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+            TII->get(TargetOpcode::COPY))
+        .add({MI->getOperand(0), MI->getOperand(SrcOpIdx)});
+    return true;
+  }
+
+  double getExtraCost(const MachineInstr *MI,
+                      MachineRegisterInfo *MRI) const override {
+    return 0;
+  }
+};
+
+/// An Instruction Converter which completely deletes an instruction.
+/// For example, IMPLICIT_DEF instructions can be deleted when converting from
+/// GPR to mask.
+class InstrDeleter : public InstrConverterBase { +public: + InstrDeleter(unsigned SrcOpcode) : InstrConverterBase(SrcOpcode) {} + + bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII, + MachineRegisterInfo *MRI) const override { + assert(isLegal(MI, TII) && "Cannot convert instruction"); + return true; + } + + double getExtraCost(const MachineInstr *MI, + MachineRegisterInfo *MRI) const override { + return 0; + } +}; + +// Key type to be used by the Instruction Converters map. +// A converter is identified by <destination domain, source opcode> +typedef std::pair<int, unsigned> InstrConverterBaseKeyTy; + +typedef DenseMap<InstrConverterBaseKeyTy, InstrConverterBase *> + InstrConverterBaseMap; + +/// A closure is a set of virtual register representing all of the edges in +/// the closure, as well as all of the instructions connected by those edges. +/// +/// A closure may encompass virtual registers in the same register bank that +/// have different widths. For example, it may contain 32-bit GPRs as well as +/// 64-bit GPRs. +/// +/// A closure that computes an address (i.e. defines a virtual register that is +/// used in a memory operand) excludes the instructions that contain memory +/// operands using the address. Such an instruction will be included in a +/// different closure that manipulates the loaded or stored value. +class Closure { +private: + const TargetInstrInfo *TII; + MachineRegisterInfo *MRI; + + /// Virtual registers in the closure. + DenseSet<unsigned> Edges; + + /// Instructions in the closure. + SmallVector<MachineInstr *, 8> Instrs; + + /// A map of available Instruction Converters. + const InstrConverterBaseMap &Converters; + + /// The register domain of this closure. + RegDomain Domain; + + /// Domains which this closure can legally be reassigned to. + std::bitset<NumDomains> LegalDstDomains; + + /// Enqueue \p Reg to be considered for addition to the closure. + void visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist); + + /// Add \p MI to this closure. + void encloseInstr(MachineInstr *MI); + + /// Calculate the total cost of reassigning the closure to \p Domain. + double calculateCost(RegDomain Domain) const; + + /// All edges that are included in some closure. + DenseSet<unsigned> &EnclosedEdges; + + /// All instructions that are included in some closure. + DenseMap<MachineInstr *, Closure *> &EnclosedInstrs; + +public: + Closure(const TargetInstrInfo *TII, MachineRegisterInfo *MRI, + const InstrConverterBaseMap &Converters, + std::initializer_list<RegDomain> LegalDstDomainList, + DenseSet<unsigned> &EnclosedEdges, + DenseMap<MachineInstr *, Closure *> &EnclosedInstrs) + : TII(TII), MRI(MRI), Converters(Converters), Domain(NoDomain), + EnclosedEdges(EnclosedEdges), EnclosedInstrs(EnclosedInstrs) { + for (RegDomain D : LegalDstDomainList) + LegalDstDomains.set(D); + } + + /// Starting from \Reg, expand the closure as much as possible. + void buildClosure(unsigned E); + + /// /returns true if it is profitable to reassign the closure to \p Domain. + bool isReassignmentProfitable(RegDomain Domain) const; + + /// Reassign the closure to \p Domain. + void Reassign(RegDomain Domain) const; + + /// Mark this closure as illegal for reassignment to all domains. + void setAllIllegal() { LegalDstDomains.reset(); } + + /// \returns true if this closure has domains which are legal to reassign to. + bool hasLegalDstDomain() const { return LegalDstDomains.any(); } + + /// \returns true if is legal to reassign this closure to domain \p RD. 
+ bool isLegal(RegDomain RD) const { return LegalDstDomains[RD]; } + + bool empty() const { return Edges.empty(); } +}; + +class X86DomainReassignment : public MachineFunctionPass { +public: + static char ID; + + X86DomainReassignment() : MachineFunctionPass(ID) { + initializeX86DomainReassignmentPass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &MF) override; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return "X86 Domain Reassignment Pass"; + } + +private: + const X86Subtarget *STI; + MachineRegisterInfo *MRI; + const X86InstrInfo *TII; + + /// A map of available Instruction Converters. + InstrConverterBaseMap Converters; + + /// Initialize Converters map. + void initConverters(); +}; + +char X86DomainReassignment::ID = 0; + +} // End anonymous namespace. + +void Closure::visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist) { + if (EnclosedEdges.count(Reg)) + return; + + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + return; + + if (!MRI->hasOneDef(Reg)) + return; + + RegDomain RD = getDomain(MRI->getRegClass(Reg), MRI->getTargetRegisterInfo()); + // First edge in closure sets the domain. + if (Domain == NoDomain) + Domain = RD; + + if (Domain != RD) + return; + + Worklist.push_back(Reg); +} + +void Closure::encloseInstr(MachineInstr *MI) { + auto I = EnclosedInstrs.find(MI); + if (I != EnclosedInstrs.end()) { + if (I->second != this) + // Instruction already belongs to another closure, avoid conflicts between + // closure and mark this closure as illegal. + setAllIllegal(); + return; + } + + EnclosedInstrs[MI] = this; + Instrs.push_back(MI); + + // Mark closure as illegal for reassignment to domains, if there is no + // converter for the instruction or if the converter cannot convert the + // instruction. + for (unsigned i = 0; i != LegalDstDomains.size(); ++i) { + if (LegalDstDomains[i]) { + InstrConverterBase *IC = Converters.lookup({i, MI->getOpcode()}); + if (!IC || !IC->isLegal(MI, TII)) + LegalDstDomains[i] = false; + } + } +} + +double Closure::calculateCost(RegDomain DstDomain) const { + assert(isLegal(DstDomain) && "Cannot calculate cost for illegal closure"); + + double Cost = 0.0; + for (auto MI : Instrs) + Cost += + Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI); + return Cost; +} + +bool Closure::isReassignmentProfitable(RegDomain Domain) const { + return calculateCost(Domain) < 0.0; +} + +void Closure::Reassign(RegDomain Domain) const { + assert(isLegal(Domain) && "Cannot convert illegal closure"); + + // Iterate all instructions in the closure, convert each one using the + // appropriate converter. + SmallVector<MachineInstr *, 8> ToErase; + for (auto MI : Instrs) + if (Converters.lookup({Domain, MI->getOpcode()}) + ->convertInstr(MI, TII, MRI)) + ToErase.push_back(MI); + + // Iterate all registers in the closure, replace them with registers in the + // destination domain. + for (unsigned Reg : Edges) { + MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain)); + for (auto &MO : MRI->use_operands(Reg)) { + if (MO.isReg()) + // Remove all subregister references as they are not valid in the + // destination domain. + MO.setSubReg(0); + } + } + + for (auto MI : ToErase) + MI->eraseFromParent(); +} + +/// \returns true when \p Reg is used as part of an address calculation in \p +/// MI. 
+static bool usedAsAddr(const MachineInstr &MI, unsigned Reg,
+                       const TargetInstrInfo *TII) {
+  if (!MI.mayLoadOrStore())
+    return false;
+
+  const MCInstrDesc &Desc = TII->get(MI.getOpcode());
+  int MemOpStart = X86II::getMemoryOperandNo(Desc.TSFlags);
+  if (MemOpStart == -1)
+    return false;
+
+  MemOpStart += X86II::getOperandBias(Desc);
+  for (unsigned MemOpIdx = MemOpStart,
+                MemOpEnd = MemOpStart + X86::AddrNumOperands;
+       MemOpIdx < MemOpEnd; ++MemOpIdx) {
+    auto &Op = MI.getOperand(MemOpIdx);
+    if (Op.isReg() && Op.getReg() == Reg)
+      return true;
+  }
+  return false;
+}
+
+void Closure::buildClosure(unsigned Reg) {
+  SmallVector<unsigned, 4> Worklist;
+  visitRegister(Reg, Worklist);
+  while (!Worklist.empty()) {
+    unsigned CurReg = Worklist.pop_back_val();
+
+    // Register already in this closure.
+    if (!Edges.insert(CurReg).second)
+      continue;
+
+    MachineInstr *DefMI = MRI->getVRegDef(CurReg);
+    encloseInstr(DefMI);
+
+    // Add registers used by the defining MI to the worklist.
+    // Do not add registers which are used in address calculation; they will be
+    // added to a different closure.
+    int OpEnd = DefMI->getNumOperands();
+    const MCInstrDesc &Desc = DefMI->getDesc();
+    int MemOp = X86II::getMemoryOperandNo(Desc.TSFlags);
+    if (MemOp != -1)
+      MemOp += X86II::getOperandBias(Desc);
+    for (int OpIdx = 0; OpIdx < OpEnd; ++OpIdx) {
+      if (OpIdx == MemOp) {
+        // Skip address calculation.
+        OpIdx += (X86::AddrNumOperands - 1);
+        continue;
+      }
+      auto &Op = DefMI->getOperand(OpIdx);
+      if (!Op.isReg() || !Op.isUse())
+        continue;
+      visitRegister(Op.getReg(), Worklist);
+    }
+
+    // Expand closure through register uses.
+    for (auto &UseMI : MRI->use_nodbg_instructions(CurReg)) {
+      // We would like to avoid converting closures which calculate addresses,
+      // as these should remain in GPRs.
+ if (usedAsAddr(UseMI, CurReg, TII)) { + setAllIllegal(); + continue; + } + encloseInstr(&UseMI); + + for (auto &DefOp : UseMI.defs()) { + if (!DefOp.isReg()) + continue; + + unsigned DefReg = DefOp.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(DefReg)) { + setAllIllegal(); + continue; + } + visitRegister(DefReg, Worklist); + } + } + } +} + +void X86DomainReassignment::initConverters() { + Converters[{MaskDomain, TargetOpcode::PHI}] = + new InstrIgnore(TargetOpcode::PHI); + + Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] = + new InstrDeleter(TargetOpcode::IMPLICIT_DEF); + + Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] = + new InstrReplaceWithCopy(TargetOpcode::INSERT_SUBREG, 2); + + Converters[{MaskDomain, TargetOpcode::COPY}] = + new InstrCOPYReplacer(TargetOpcode::COPY, MaskDomain, TargetOpcode::COPY); + + auto createReplacerDstCOPY = [&](unsigned From, unsigned To) { + Converters[{MaskDomain, From}] = new InstrReplacerDstCOPY(From, To); + }; + + createReplacerDstCOPY(X86::MOVZX32rm16, X86::KMOVWkm); + createReplacerDstCOPY(X86::MOVZX64rm16, X86::KMOVWkm); + + createReplacerDstCOPY(X86::MOVZX32rr16, X86::KMOVWkk); + createReplacerDstCOPY(X86::MOVZX64rr16, X86::KMOVWkk); + + if (STI->hasDQI()) { + createReplacerDstCOPY(X86::MOVZX16rm8, X86::KMOVBkm); + createReplacerDstCOPY(X86::MOVZX32rm8, X86::KMOVBkm); + createReplacerDstCOPY(X86::MOVZX64rm8, X86::KMOVBkm); + + createReplacerDstCOPY(X86::MOVZX16rr8, X86::KMOVBkk); + createReplacerDstCOPY(X86::MOVZX32rr8, X86::KMOVBkk); + createReplacerDstCOPY(X86::MOVZX64rr8, X86::KMOVBkk); + } + + auto createReplacer = [&](unsigned From, unsigned To) { + Converters[{MaskDomain, From}] = new InstrReplacer(From, To); + }; + + createReplacer(X86::MOV16rm, X86::KMOVWkm); + createReplacer(X86::MOV16mr, X86::KMOVWmk); + createReplacer(X86::MOV16rr, X86::KMOVWkk); + createReplacer(X86::SHR16ri, X86::KSHIFTRWri); + createReplacer(X86::SHL16ri, X86::KSHIFTLWri); + createReplacer(X86::NOT16r, X86::KNOTWrr); + createReplacer(X86::OR16rr, X86::KORWrr); + createReplacer(X86::AND16rr, X86::KANDWrr); + createReplacer(X86::XOR16rr, X86::KXORWrr); + + if (STI->hasBWI()) { + createReplacer(X86::MOV32rm, X86::KMOVDkm); + createReplacer(X86::MOV64rm, X86::KMOVQkm); + + createReplacer(X86::MOV32mr, X86::KMOVDmk); + createReplacer(X86::MOV64mr, X86::KMOVQmk); + + createReplacer(X86::MOV32rr, X86::KMOVDkk); + createReplacer(X86::MOV64rr, X86::KMOVQkk); + + createReplacer(X86::SHR32ri, X86::KSHIFTRDri); + createReplacer(X86::SHR64ri, X86::KSHIFTRQri); + + createReplacer(X86::SHL32ri, X86::KSHIFTLDri); + createReplacer(X86::SHL64ri, X86::KSHIFTLQri); + + createReplacer(X86::ADD32rr, X86::KADDDrr); + createReplacer(X86::ADD64rr, X86::KADDQrr); + + createReplacer(X86::NOT32r, X86::KNOTDrr); + createReplacer(X86::NOT64r, X86::KNOTQrr); + + createReplacer(X86::OR32rr, X86::KORDrr); + createReplacer(X86::OR64rr, X86::KORQrr); + + createReplacer(X86::AND32rr, X86::KANDDrr); + createReplacer(X86::AND64rr, X86::KANDQrr); + + createReplacer(X86::ANDN32rr, X86::KANDNDrr); + createReplacer(X86::ANDN64rr, X86::KANDNQrr); + + createReplacer(X86::XOR32rr, X86::KXORDrr); + createReplacer(X86::XOR64rr, X86::KXORQrr); + + createReplacer(X86::TEST32rr, X86::KTESTDrr); + createReplacer(X86::TEST64rr, X86::KTESTQrr); + } + + if (STI->hasDQI()) { + createReplacer(X86::ADD8rr, X86::KADDBrr); + createReplacer(X86::ADD16rr, X86::KADDWrr); + + createReplacer(X86::AND8rr, X86::KANDBrr); + + createReplacer(X86::MOV8rm, X86::KMOVBkm); + createReplacer(X86::MOV8mr, 
X86::KMOVBmk); + createReplacer(X86::MOV8rr, X86::KMOVBkk); + + createReplacer(X86::NOT8r, X86::KNOTBrr); + + createReplacer(X86::OR8rr, X86::KORBrr); + + createReplacer(X86::SHR8ri, X86::KSHIFTRBri); + createReplacer(X86::SHL8ri, X86::KSHIFTLBri); + + createReplacer(X86::TEST8rr, X86::KTESTBrr); + createReplacer(X86::TEST16rr, X86::KTESTWrr); + + createReplacer(X86::XOR8rr, X86::KXORBrr); + } +} + +bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + if (DisableX86DomainReassignment) + return false; + + DEBUG(dbgs() << "***** Machine Function before Domain Reassignment *****\n"); + DEBUG(MF.print(dbgs())); + + STI = &MF.getSubtarget<X86Subtarget>(); + // GPR->K is the only transformation currently supported, bail out early if no + // AVX512. + if (!STI->hasAVX512()) + return false; + + MRI = &MF.getRegInfo(); + assert(MRI->isSSA() && "Expected MIR to be in SSA form"); + + TII = STI->getInstrInfo(); + initConverters(); + bool Changed = false; + + DenseSet<unsigned> EnclosedEdges; + DenseMap<MachineInstr *, Closure *> EnclosedInstrs; + + std::vector<Closure> Closures; + + // Go over all virtual registers and calculate a closure. + for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx); + + // GPR only current source domain supported. + if (!isGPR(MRI->getRegClass(Reg))) + continue; + + // Register already in closure. + if (EnclosedEdges.count(Reg)) + continue; + + // Calculate closure starting with Reg. + Closure C(TII, MRI, Converters, {MaskDomain}, EnclosedEdges, + EnclosedInstrs); + C.buildClosure(Reg); + + // Collect all closures that can potentially be converted. + if (!C.empty() && C.isLegal(MaskDomain)) + Closures.push_back(std::move(C)); + } + + for (Closure &C : Closures) + if (C.isReassignmentProfitable(MaskDomain)) { + C.Reassign(MaskDomain); + ++NumClosuresConverted; + Changed = true; + } + + for (auto I : Converters) + delete I.second; + + DEBUG(dbgs() << "***** Machine Function after Domain Reassignment *****\n"); + DEBUG(MF.print(dbgs())); + + return Changed; +} + +INITIALIZE_PASS(X86DomainReassignment, "x86-domain-reassignment", + "X86 Domain Reassignment Pass", false, false) + +/// Returns an instance of the Domain Reassignment pass. +FunctionPass *llvm::createX86DomainReassignmentPass() { + return new X86DomainReassignment(); +} diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp index 6472bbbc9016..6dd4631a4844 100755 --- a/lib/Target/X86/X86EvexToVex.cpp +++ b/lib/Target/X86/X86EvexToVex.cpp @@ -1,4 +1,4 @@ -//===----------------------- X86EvexToVex.cpp ----------------------------===// +//===- X86EvexToVex.cpp ---------------------------------------------------===// // Compress EVEX instructions to VEX encoding when possible to reduce code size // // The LLVM Compiler Infrastructure @@ -6,18 +6,19 @@ // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. // -//===---------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// +// /// \file /// This file defines the pass that goes over all AVX-512 instructions which /// are encoded using the EVEX prefix and if possible replaces them by their /// corresponding VEX encoding which is usually shorter by 2 bytes. 
/// EVEX instructions may be encoded via the VEX prefix when the AVX-512 /// instruction has a corresponding AVX/AVX2 opcode and when it does not -/// use the xmm or the mask registers or xmm/ymm registers wuith indexes +/// use the xmm or the mask registers or xmm/ymm registers with indexes /// higher than 15. /// The pass applies code reduction on the generated code for AVX-512 instrs. -/// -//===---------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// #include "InstPrinter/X86InstComments.h" #include "MCTargetDesc/X86BaseInfo.h" @@ -54,7 +55,7 @@ namespace { class EvexToVexInstPass : public MachineFunctionPass { /// X86EvexToVexCompressTable - Evex to Vex encoding opcode map. - typedef DenseMap<unsigned, uint16_t> EvexToVexTableType; + using EvexToVexTableType = DenseMap<unsigned, uint16_t>; EvexToVexTableType EvexToVex128Table; EvexToVexTableType EvexToVex256Table; @@ -101,10 +102,10 @@ private: const X86InstrInfo *TII; }; -char EvexToVexInstPass::ID = 0; - } // end anonymous namespace +char EvexToVexInstPass::ID = 0; + bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) { TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); @@ -118,8 +119,8 @@ bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) { /// EVEX encoded instrs by VEX encoding when possible. for (MachineBasicBlock &MBB : MF) { - // Traverse the basic block. - for (MachineInstr &MI : MBB) + // Traverse the basic block. + for (MachineInstr &MI : MBB) Changed |= CompressEvexToVexImpl(MI); } @@ -131,6 +132,75 @@ void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable, EvexToVexTable[EvexOp] = VexOp; } +static bool usesExtendedRegister(const MachineInstr &MI) { + auto isHiRegIdx = [](unsigned Reg) { + // Check for XMM register with indexes between 16 - 31. + if (Reg >= X86::XMM16 && Reg <= X86::XMM31) + return true; + + // Check for YMM register with indexes between 16 - 31. + if (Reg >= X86::YMM16 && Reg <= X86::YMM31) + return true; + + return false; + }; + + // Check that operands are not ZMM regs or + // XMM/YMM regs with hi indexes between 16 - 31. + for (const MachineOperand &MO : MI.explicit_operands()) { + if (!MO.isReg()) + continue; + + unsigned Reg = MO.getReg(); + + assert(!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31) && + "ZMM instructions should not be in the EVEX->VEX tables"); + + if (isHiRegIdx(Reg)) + return true; + } + + return false; +} + +// Do any custom cleanup needed to finalize the conversion. +static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) { + (void)NewOpc; + unsigned Opc = MI.getOpcode(); + switch (Opc) { + case X86::VALIGNDZ128rri: + case X86::VALIGNDZ128rmi: + case X86::VALIGNQZ128rri: + case X86::VALIGNQZ128rmi: { + assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) && + "Unexpected new opcode!"); + unsigned Scale = (Opc == X86::VALIGNQZ128rri || + Opc == X86::VALIGNQZ128rmi) ? 
8 : 4; + MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1); + Imm.setImm(Imm.getImm() * Scale); + break; + } + case X86::VSHUFF32X4Z256rmi: + case X86::VSHUFF32X4Z256rri: + case X86::VSHUFF64X2Z256rmi: + case X86::VSHUFF64X2Z256rri: + case X86::VSHUFI32X4Z256rmi: + case X86::VSHUFI32X4Z256rri: + case X86::VSHUFI64X2Z256rmi: + case X86::VSHUFI64X2Z256rri: { + assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr || + NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) && + "Unexpected new opcode!"); + MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1); + int64_t ImmVal = Imm.getImm(); + // Set bit 5, move bit 1 to bit 4, copy bit 0. + Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1)); + break; + } + } +} + + // For EVEX instructions that can be encoded using VEX encoding // replace them by the VEX encoding in order to reduce size. bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { @@ -147,18 +217,18 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { // Check for EVEX instructions only. if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX) return false; - - // Check for EVEX instructions with mask or broadcast as in these cases - // the EVEX prefix is needed in order to carry this information + + // Check for EVEX instructions with mask or broadcast as in these cases + // the EVEX prefix is needed in order to carry this information // thus preventing the transformation to VEX encoding. if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B)) return false; - + // Check for non EVEX_V512 instrs only. // EVEX_V512 instr: bit EVEX_L2 = 1; bit VEX_L = 0. if ((Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L)) - return false; - + return false; + // EVEX_V128 instr: bit EVEX_L2 = 0, bit VEX_L = 0. bool IsEVEX_V128 = (!(Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L)); @@ -176,7 +246,6 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { if (It != EvexToVex256Table.end()) NewOpc = It->second; } - // Check for EVEX_V128 or Scalar instructions. else if (IsEVEX_V128) { // Search for opcode in the EvexToVex128 table. @@ -188,36 +257,14 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const { if (!NewOpc) return false; - auto isHiRegIdx = [](unsigned Reg) { - // Check for XMM register with indexes between 16 - 31. - if (Reg >= X86::XMM16 && Reg <= X86::XMM31) - return true; - - // Check for YMM register with indexes between 16 - 31. - if (Reg >= X86::YMM16 && Reg <= X86::YMM31) - return true; - + if (usesExtendedRegister(MI)) return false; - }; - - // Check that operands are not ZMM regs or - // XMM/YMM regs with hi indexes between 16 - 31. 
- for (const MachineOperand &MO : MI.explicit_operands()) { - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - - assert (!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31)); + performCustomAdjustments(MI, NewOpc); - if (isHiRegIdx(Reg)) - return false; - } - - const MCInstrDesc &MCID = TII->get(NewOpc); - MI.setDesc(MCID); + MI.setDesc(TII->get(NewOpc)); MI.setAsmPrinterFlag(AC_EVEX_2_VEX); - return true; + return true; } INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false) diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp index 5dfd95f71301..ab2ef26d1cc9 100644 --- a/lib/Target/X86/X86ExpandPseudo.cpp +++ b/lib/Target/X86/X86ExpandPseudo.cpp @@ -222,7 +222,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case X86::EH_RESTORE: { // Restore ESP and EBP, and optionally ESI if required. bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality( - MBB.getParent()->getFunction()->getPersonalityFn())); + MBB.getParent()->getFunction().getPersonalityFn())); X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH); MBBI->eraseFromParent(); return true; diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp index 527e5d568ac6..5dae485f4c9f 100644 --- a/lib/Target/X86/X86FastISel.cpp +++ b/lib/Target/X86/X86FastISel.cpp @@ -110,6 +110,8 @@ private: bool X86SelectZExt(const Instruction *I); + bool X86SelectSExt(const Instruction *I); + bool X86SelectBranch(const Instruction *I); bool X86SelectShift(const Instruction *I); @@ -208,8 +210,8 @@ getX86SSEConditionCode(CmpInst::Predicate Predicate) { case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH; case CmpInst::FCMP_UGT: CC = 6; break; case CmpInst::FCMP_ORD: CC = 7; break; - case CmpInst::FCMP_UEQ: - case CmpInst::FCMP_ONE: CC = 8; break; + case CmpInst::FCMP_UEQ: CC = 8; break; + case CmpInst::FCMP_ONE: CC = 12; break; } return std::make_pair(CC, NeedSwap); @@ -329,10 +331,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, switch (VT.getSimpleVT().SimpleTy) { default: return false; case MVT::i1: - // TODO: Support this properly. - if (Subtarget->hasAVX512()) - return false; - LLVM_FALLTHROUGH; case MVT::i8: Opc = X86::MOV8rm; RC = &X86::GR8RegClass; @@ -353,7 +351,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, case MVT::f32: if (X86ScalarSSEf32) { Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm; - RC = &X86::FR32RegClass; + RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; RC = &X86::RFP32RegClass; @@ -362,7 +360,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, case MVT::f64: if (X86ScalarSSEf64) { Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm; - RC = &X86::FR64RegClass; + RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; RC = &X86::RFP64RegClass; @@ -381,7 +379,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, else Opc = HasVLX ? X86::VMOVUPSZ128rm : HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm; - RC = &X86::VR128RegClass; + RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v2f64: if (IsNonTemporal && Alignment >= 16 && HasSSE41) @@ -393,7 +391,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, else Opc = HasVLX ? X86::VMOVUPDZ128rm : HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm; - RC = &X86::VR128RegClass; + RC = HasVLX ? 
&X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v4i32: case MVT::v2i64: @@ -408,7 +406,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, else Opc = HasVLX ? X86::VMOVDQU64Z128rm : HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm; - RC = &X86::VR128RegClass; + RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass; break; case MVT::v8f32: assert(HasAVX); @@ -420,19 +418,19 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm; else Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm; - RC = &X86::VR256RegClass; + RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v4f64: assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) - Opc = X86::VMOVNTDQAYrm; + Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm; else if (IsNonTemporal && Alignment >= 16) return false; // Force split for X86::VMOVNTDQArm else if (Alignment >= 32) Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm; else Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm; - RC = &X86::VR256RegClass; + RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v8i32: case MVT::v4i64: @@ -440,14 +438,14 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM, case MVT::v32i8: assert(HasAVX); if (IsNonTemporal && Alignment >= 32 && HasAVX2) - Opc = X86::VMOVNTDQAYrm; + Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm; else if (IsNonTemporal && Alignment >= 16) return false; // Force split for X86::VMOVNTDQArm else if (Alignment >= 32) Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm; else Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm; - RC = &X86::VR256RegClass; + RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass; break; case MVT::v16f32: assert(HasAVX512); @@ -510,16 +508,6 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill, case MVT::f80: // No f80 support yet. default: return false; case MVT::i1: { - // In case ValReg is a K register, COPY to a GPR - if (MRI.getRegClass(ValReg) == &X86::VK1RegClass) { - unsigned KValReg = ValReg; - ValReg = createResultReg(&X86::GR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ValReg) - .addReg(KValReg); - ValReg = fastEmitInst_extractsubreg(MVT::i8, ValReg, /*Kill=*/true, - X86::sub_8bit); - } // Mask out all but lowest bit. unsigned AndResult = createResultReg(&X86::GR8RegClass); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, @@ -1077,10 +1065,6 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { (AM.Base.Reg != 0 || AM.IndexReg != 0)) return false; - // Can't handle DLL Import. - if (GV->hasDLLImportStorageClass()) - return false; - // Can't handle TLS. if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV)) if (GVar->isThreadLocal()) @@ -1089,8 +1073,9 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) { // Okay, we've committed to selecting this global. Set up the basic address. AM.GV = GV; - // No ABI requires an extra load for anything other than DLLImport, which - // we rejected above. Return a direct reference to the global. + // Return a direct reference to the global. Fastisel can handle calls to + // functions that require loads, such as dllimport and nonlazybind + // functions. if (Subtarget->isPICStyleRIPRel()) { // Use rip-relative addressing if we can. Above we verified that the // base and index registers are unused. 
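// A source-level sketch of a callee whose address needs an extra load
// (hypothetical declaration, for illustration only): a dllimport function is
// reached through the import table, so the call is emitted as an indirect call
// through a memory operand (CALL64m with a RIP-relative address on x86-64,
// CALL32m otherwise) rather than a direct CALLpcrel32/CALL64pcrel32. Functions
// carrying the IR-level nonlazybind attribute are handled the same way,
// through their GOT entry.
__declspec(dllimport) void ImportedCallee(); // MSVC/clang-cl spelling
void CallImported() { ImportedCallee(); }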
@@ -1254,16 +1239,6 @@ bool X86FastISel::X86SelectRet(const Instruction *I) { if (SrcVT == MVT::i1) { if (Outs[0].Flags.isSExt()) return false; - // In case SrcReg is a K register, COPY to a GPR - if (MRI.getRegClass(SrcReg) == &X86::VK1RegClass) { - unsigned KSrcReg = SrcReg; - SrcReg = createResultReg(&X86::GR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), SrcReg) - .addReg(KSrcReg); - SrcReg = fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true, - X86::sub_8bit); - } SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; } @@ -1367,6 +1342,7 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) { } static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { + bool HasAVX512 = Subtarget->hasAVX512(); bool HasAVX = Subtarget->hasAVX(); bool X86ScalarSSEf32 = Subtarget->hasSSE1(); bool X86ScalarSSEf64 = Subtarget->hasSSE2(); @@ -1378,9 +1354,15 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) { case MVT::i32: return X86::CMP32rr; case MVT::i64: return X86::CMP64rr; case MVT::f32: - return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0; + return X86ScalarSSEf32 + ? (HasAVX512 ? X86::VUCOMISSZrr + : HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) + : 0; case MVT::f64: - return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0; + return X86ScalarSSEf64 + ? (HasAVX512 ? X86::VUCOMISDZrr + : HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) + : 0; } } @@ -1453,9 +1435,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) { if (!isTypeLegal(I->getOperand(0)->getType(), VT)) return false; - if (I->getType()->isIntegerTy(1) && Subtarget->hasAVX512()) - return false; - // Try to optimize or fold the cmp. CmpInst::Predicate Predicate = optimizeCmpPredicate(CI); unsigned ResultReg = 0; @@ -1555,17 +1534,6 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { // Handle zero-extension from i1 to i8, which is common. MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); if (SrcVT == MVT::i1) { - // In case ResultReg is a K register, COPY to a GPR - if (MRI.getRegClass(ResultReg) == &X86::VK1RegClass) { - unsigned KResultReg = ResultReg; - ResultReg = createResultReg(&X86::GR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ResultReg) - .addReg(KResultReg); - ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true, - X86::sub_8bit); - } - // Set the high bits to zero. ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false); SrcVT = MVT::i8; @@ -1593,6 +1561,15 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg) .addImm(0).addReg(Result32).addImm(X86::sub_32bit); + } else if (DstVT == MVT::i16) { + // i8->i16 doesn't exist in the autogenerated isel table. Need to zero + // extend to 32-bits and then extract down to 16-bits. 
+ unsigned Result32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8), + Result32).addReg(ResultReg); + + ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true, + X86::sub_16bit); } else if (DstVT != MVT::i8) { ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND, ResultReg, /*Kill=*/true); @@ -1604,6 +1581,52 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) { return true; } +bool X86FastISel::X86SelectSExt(const Instruction *I) { + EVT DstVT = TLI.getValueType(DL, I->getType()); + if (!TLI.isTypeLegal(DstVT)) + return false; + + unsigned ResultReg = getRegForValue(I->getOperand(0)); + if (ResultReg == 0) + return false; + + // Handle sign-extension from i1 to i8. + MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType()); + if (SrcVT == MVT::i1) { + // Set the high bits to zero. + unsigned ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg, + /*TODO: Kill=*/false); + if (ZExtReg == 0) + return false; + + // Negate the result to make an 8-bit sign extended value. + ResultReg = createResultReg(&X86::GR8RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::NEG8r), + ResultReg).addReg(ZExtReg); + + SrcVT = MVT::i8; + } + + if (DstVT == MVT::i16) { + // i8->i16 doesn't exist in the autogenerated isel table. Need to sign + // extend to 32-bits and then extract down to 16-bits. + unsigned Result32 = createResultReg(&X86::GR32RegClass); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8), + Result32).addReg(ResultReg); + + ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true, + X86::sub_16bit); + } else if (DstVT != MVT::i8) { + ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND, + ResultReg, /*Kill=*/true); + if (ResultReg == 0) + return false; + } + + updateValueMap(I, ResultReg); + return true; +} + bool X86FastISel::X86SelectBranch(const Instruction *I) { // Unconditional branches are selected by tablegen-generated code. // Handle a conditional branch. 
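// A small check of the i1 sign-extension trick used above (illustrative only):
// an i1 holds 0 or 1, so zero-extending it to i8 and negating yields exactly
// the sign-extended value, 0 -> 0 and 1 -> 0xFF (-1). This is what the
// zero-extend plus NEG8r sequence computes, avoiding the need for a dedicated
// i1 sext pattern.
static signed char SignExtendFromI1(bool B) {
  unsigned char Z = B ? 1 : 0;         // fastEmitZExtFromI1: zero-extend to i8
  return static_cast<signed char>(-Z); // NEG8r: 0 -> 0, 1 -> -1 (0xFF)
}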
@@ -1766,41 +1789,34 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) { bool X86FastISel::X86SelectShift(const Instruction *I) { unsigned CReg = 0, OpReg = 0; const TargetRegisterClass *RC = nullptr; - if (I->getType()->isIntegerTy(8)) { - CReg = X86::CL; - RC = &X86::GR8RegClass; - switch (I->getOpcode()) { - case Instruction::LShr: OpReg = X86::SHR8rCL; break; - case Instruction::AShr: OpReg = X86::SAR8rCL; break; - case Instruction::Shl: OpReg = X86::SHL8rCL; break; - default: return false; - } - } else if (I->getType()->isIntegerTy(16)) { + assert(!I->getType()->isIntegerTy(8) && + "i8 shifts should be handled by autogenerated table"); + if (I->getType()->isIntegerTy(16)) { CReg = X86::CX; RC = &X86::GR16RegClass; switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected shift opcode"); case Instruction::LShr: OpReg = X86::SHR16rCL; break; case Instruction::AShr: OpReg = X86::SAR16rCL; break; case Instruction::Shl: OpReg = X86::SHL16rCL; break; - default: return false; } } else if (I->getType()->isIntegerTy(32)) { CReg = X86::ECX; RC = &X86::GR32RegClass; switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected shift opcode"); case Instruction::LShr: OpReg = X86::SHR32rCL; break; case Instruction::AShr: OpReg = X86::SAR32rCL; break; case Instruction::Shl: OpReg = X86::SHL32rCL; break; - default: return false; } } else if (I->getType()->isIntegerTy(64)) { CReg = X86::RCX; RC = &X86::GR64RegClass; switch (I->getOpcode()) { + default: llvm_unreachable("Unexpected shift opcode"); case Instruction::LShr: OpReg = X86::SHR64rCL; break; case Instruction::AShr: OpReg = X86::SAR64rCL; break; case Instruction::Shl: OpReg = X86::SHL64rCL; break; - default: return false; } } else { return false; @@ -1820,10 +1836,10 @@ bool X86FastISel::X86SelectShift(const Instruction *I) { // The shift instruction uses X86::CL. If we defined a super-register // of X86::CL, emit a subreg KILL to precisely describe what we're doing here. - if (CReg != X86::CL) - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::KILL), X86::CL) - .addReg(CReg, RegState::Kill); + assert(CReg != X86::CL && "CReg should be a super register of CL"); + BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, + TII.get(TargetOpcode::KILL), X86::CL) + .addReg(CReg, RegState::Kill); unsigned ResultReg = createResultReg(RC); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg) @@ -1960,12 +1976,12 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) { // Generate the DIV/IDIV instruction. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpEntry.OpDivRem)).addReg(Op1Reg); - // For i8 remainder, we can't reference AH directly, as we'll end - // up with bogus copies like %R9B = COPY %AH. Reference AX - // instead to prevent AH references in a REX instruction. + // For i8 remainder, we can't reference ah directly, as we'll end + // up with bogus copies like %r9b = COPY %ah. Reference ax + // instead to prevent ah references in a rex instruction. // // The current assumption of the fast register allocator is that isel - // won't generate explicit references to the GPR8_NOREX registers. If + // won't generate explicit references to the GR8_NOREX registers. If // the allocator and/or the backend get enhanced to be more robust in // that regard, this can be, and should be, removed. 
unsigned ResultReg = 0; @@ -2159,7 +2175,7 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) { unsigned CC; bool NeedSwap; std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate); - if (CC > 7) + if (CC > 7 && !Subtarget->hasAVX()) return false; if (NeedSwap) @@ -2394,7 +2410,8 @@ bool X86FastISel::X86SelectSIToFP(const Instruction *I) { if (!Subtarget->hasAVX()) return false; - if (!I->getOperand(0)->getType()->isIntegerTy(32)) + Type *InTy = I->getOperand(0)->getType(); + if (!InTy->isIntegerTy(32) && !InTy->isIntegerTy(64)) return false; // Select integer to float/double conversion. @@ -2407,11 +2424,11 @@ bool X86FastISel::X86SelectSIToFP(const Instruction *I) { if (I->getType()->isDoubleTy()) { // sitofp int -> double - Opcode = X86::VCVTSI2SDrr; + Opcode = InTy->isIntegerTy(64) ? X86::VCVTSI642SDrr : X86::VCVTSI2SDrr; RC = &X86::FR64RegClass; } else if (I->getType()->isFloatTy()) { // sitofp int -> float - Opcode = X86::VCVTSI2SSrr; + Opcode = InTy->isIntegerTy(64) ? X86::VCVTSI642SSrr : X86::VCVTSI2SSrr; RC = &X86::FR32RegClass; } else return false; @@ -2461,9 +2478,13 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I, bool X86FastISel::X86SelectFPExt(const Instruction *I) { if (X86ScalarSSEf64 && I->getType()->isDoubleTy() && I->getOperand(0)->getType()->isFloatTy()) { + bool HasAVX512 = Subtarget->hasAVX512(); // fpext from float to double. - unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr; - return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR64RegClass); + unsigned Opc = + HasAVX512 ? X86::VCVTSS2SDZrr + : Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr; + return X86SelectFPExtOrFPTrunc( + I, Opc, HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass); } return false; @@ -2472,9 +2493,13 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) { bool X86FastISel::X86SelectFPTrunc(const Instruction *I) { if (X86ScalarSSEf64 && I->getType()->isFloatTy() && I->getOperand(0)->getType()->isDoubleTy()) { + bool HasAVX512 = Subtarget->hasAVX512(); // fptrunc from double to float. - unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr; - return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR32RegClass); + unsigned Opc = + HasAVX512 ? X86::VCVTSD2SSZrr + : Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr; + return X86SelectFPExtOrFPTrunc( + I, Opc, HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass); } return false; @@ -2485,8 +2510,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { EVT DstVT = TLI.getValueType(DL, I->getType()); // This code only handles truncation to byte. - // TODO: Support truncate to i1 with AVX512. - if (DstVT != MVT::i8 && (DstVT != MVT::i1 || Subtarget->hasAVX512())) + if (DstVT != MVT::i8 && DstVT != MVT::i1) return false; if (!TLI.isTypeLegal(SrcVT)) return false; @@ -2502,22 +2526,9 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) { return true; } - bool KillInputReg = false; - if (!Subtarget->is64Bit()) { - // If we're on x86-32; we can't extract an i8 from a general register. - // First issue a copy to GR16_ABCD or GR32_ABCD. - const TargetRegisterClass *CopyRC = - (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass; - unsigned CopyReg = createResultReg(CopyRC); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg); - InputReg = CopyReg; - KillInputReg = true; - } - // Issue an extract_subreg. 
unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8, - InputReg, KillInputReg, + InputReg, false, X86::sub_8bit); if (!ResultReg) return false; @@ -3300,16 +3311,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { // Handle zero-extension from i1 to i8, which is common. if (ArgVT == MVT::i1) { - // In case SrcReg is a K register, COPY to a GPR - if (MRI.getRegClass(ArgReg) == &X86::VK1RegClass) { - unsigned KArgReg = ArgReg; - ArgReg = createResultReg(&X86::GR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(TargetOpcode::COPY), ArgReg) - .addReg(KArgReg); - ArgReg = fastEmitInst_extractsubreg(MVT::i8, ArgReg, /*Kill=*/true, - X86::sub_8bit); - } // Set the high bits to zero. ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false); ArgVT = MVT::i8; @@ -3455,19 +3456,26 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { } else { // Direct call. assert(GV && "Not a direct call"); - unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32; - // See if we need any target-specific flags on the GV operand. unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV); - // Ignore NonLazyBind attribute in FastISel - if (OpFlags == X86II::MO_GOTPCREL) - OpFlags = 0; + + // This will be a direct call, or an indirect call through memory for + // NonLazyBind calls or dllimport calls. + bool NeedLoad = + OpFlags == X86II::MO_DLLIMPORT || OpFlags == X86II::MO_GOTPCREL; + unsigned CallOpc = NeedLoad + ? (Is64Bit ? X86::CALL64m : X86::CALL32m) + : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32); MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc)); + if (NeedLoad) + MIB.addReg(Is64Bit ? X86::RIP : 0).addImm(1).addReg(0); if (Symbol) MIB.addSym(Symbol, OpFlags); else MIB.addGlobalAddress(GV, 0, OpFlags); + if (NeedLoad) + MIB.addReg(0); } // Add a register mask operand representing the call-preserved registers. @@ -3515,16 +3523,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) { report_fatal_error("SSE register return with SSE disabled"); } - // If the return value is an i1 and AVX-512 is enabled, we need - // to do a fixup to make the copy legal. - if (CopyVT == MVT::i1 && SrcReg == X86::AL && Subtarget->hasAVX512()) { - // Need to copy to a GR32 first. - // TODO: MOVZX isn't great here. We don't care about the upper bits. - SrcReg = createResultReg(&X86::GR32RegClass); - BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, - TII.get(X86::MOVZX32rr8), SrcReg).addReg(X86::AL); - } - // If we prefer to use the value in xmm registers, copy it out as f80 and // use a truncate to move it from fp stack reg to xmm reg. if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) && @@ -3577,6 +3575,8 @@ X86FastISel::fastSelectInstruction(const Instruction *I) { return X86SelectCmp(I); case Instruction::ZExt: return X86SelectZExt(I); + case Instruction::SExt: + return X86SelectSExt(I); case Instruction::Br: return X86SelectBranch(I); case Instruction::LShr: @@ -3723,8 +3723,10 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { default: return 0; case MVT::f32: if (X86ScalarSSEf32) { - Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; - RC = &X86::FR32RegClass; + Opc = Subtarget->hasAVX512() + ? X86::VMOVSSZrm + : Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm; + RC = Subtarget->hasAVX512() ? 
&X86::FR32XRegClass : &X86::FR32RegClass; } else { Opc = X86::LD_Fp32m; RC = &X86::RFP32RegClass; @@ -3732,8 +3734,10 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) { break; case MVT::f64: if (X86ScalarSSEf64) { - Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; - RC = &X86::FR64RegClass; + Opc = Subtarget->hasAVX512() + ? X86::VMOVSDZrm + : Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm; + RC = Subtarget->hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass; } else { Opc = X86::LD_Fp64m; RC = &X86::RFP64RegClass; @@ -3871,14 +3875,15 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { return 0; // Get opcode and regclass for the given zero. + bool HasAVX512 = Subtarget->hasAVX512(); unsigned Opc = 0; const TargetRegisterClass *RC = nullptr; switch (VT.SimpleTy) { default: return 0; case MVT::f32: if (X86ScalarSSEf32) { - Opc = X86::FsFLD0SS; - RC = &X86::FR32RegClass; + Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS; + RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass; } else { Opc = X86::LD_Fp032; RC = &X86::RFP32RegClass; @@ -3886,8 +3891,8 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) { break; case MVT::f64: if (X86ScalarSSEf64) { - Opc = X86::FsFLD0SD; - RC = &X86::FR64RegClass; + Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD; + RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass; } else { Opc = X86::LD_Fp064; RC = &X86::RFP64RegClass; @@ -3964,7 +3969,7 @@ unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode, Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs()); Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1); Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2); - Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 3); + Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3); if (II.getNumDefs() >= 1) BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg) diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp index 95c6f2a3fa34..01d10fe4cae4 100644 --- a/lib/Target/X86/X86FixupBWInsts.cpp +++ b/lib/Target/X86/X86FixupBWInsts.cpp @@ -55,9 +55,9 @@ #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; #define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup" @@ -146,12 +146,12 @@ INITIALIZE_PASS(FixupBWInstPass, FIXUPBW_NAME, FIXUPBW_DESC, false, false) FunctionPass *llvm::createX86FixupBWInsts() { return new FixupBWInstPass(); } bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { - if (!FixupBWInsts || skipFunction(*MF.getFunction())) + if (!FixupBWInsts || skipFunction(MF.getFunction())) return false; this->MF = &MF; TII = MF.getSubtarget<X86Subtarget>().getInstrInfo(); - OptForSize = MF.getFunction()->optForSize(); + OptForSize = MF.getFunction().optForSize(); MLI = &getAnalysis<MachineLoopInfo>(); LiveRegs.init(TII->getRegisterInfo()); @@ -166,15 +166,86 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) { return true; } -// TODO: This method of analysis can miss some legal cases, because the -// super-register could be live into the address expression for a memory -// reference for the instruction, and still be killed/last used by the -// instruction. 
However, the existing query interfaces don't seem to -// easily allow that to be checked. -// -// What we'd really like to know is whether after OrigMI, the -// only portion of SuperDestReg that is alive is the portion that -// was the destination register of OrigMI. +/// Check if register \p Reg is live after the \p MI. +/// +/// \p LiveRegs should be in a state describing liveness information in +/// that exact place as this function tries to precise analysis made +/// by \p LiveRegs by exploiting the information about particular +/// instruction \p MI. \p MI is expected to be one of the MOVs handled +/// by the x86FixupBWInsts pass. +/// Note: similar to LivePhysRegs::contains this would state that +/// super-register is not used if only some part of it is used. +/// +/// X86 backend does not have subregister liveness tracking enabled, +/// so liveness information might be overly conservative. However, for +/// some specific instructions (this pass only cares about MOVs) we can +/// produce more precise results by analysing that MOV's operands. +/// +/// Indeed, if super-register is not live before the mov it means that it +/// was originally <read-undef> and so we are free to modify these +/// undef upper bits. That may happen in case where the use is in another MBB +/// and the vreg/physreg corresponding to the move has higher width than +/// necessary (e.g. due to register coalescing with a "truncate" copy). +/// So, it handles pattern like this: +/// +/// %bb.2: derived from LLVM BB %if.then +/// Live Ins: %rdi +/// Predecessors according to CFG: %bb.0 +/// %ax = MOV16rm killed %rdi, 1, %noreg, 0, %noreg, implicit-def %eax; +/// mem:LD2[%p] +/// No implicit %eax +/// Successors according to CFG: %bb.3(?%) +/// +/// %bb.3: derived from LLVM BB %if.end +/// Live Ins: %eax Only %ax is actually live +/// Predecessors according to CFG: %bb.2 %bb.1 +/// %ax = KILL %ax, implicit killed %eax +/// RET 0, %ax +static bool isLive(const MachineInstr &MI, + const LivePhysRegs &LiveRegs, + const TargetRegisterInfo *TRI, + unsigned Reg) { + if (!LiveRegs.contains(Reg)) + return false; + + unsigned Opc = MI.getOpcode(); (void)Opc; + // These are the opcodes currently handled by the pass, if something + // else will be added we need to ensure that new opcode has the same + // properties. + assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr || + Opc == X86::MOV16rr) && + "Unexpected opcode."); + + bool IsDefined = false; + for (auto &MO: MI.implicit_operands()) { + if (!MO.isReg()) + continue; + + assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!"); + + for (MCSuperRegIterator Supers(Reg, TRI, true); Supers.isValid(); ++Supers) { + if (*Supers == MO.getReg()) { + if (MO.isDef()) + IsDefined = true; + else + return true; // SuperReg Imp-used' -> live before the MI + } + } + } + // Reg is not Imp-def'ed -> it's live both before/after the instruction. + if (!IsDefined) + return true; + + // Otherwise, the Reg is not live before the MI and the MOV can't + // make it really live, so it's in fact dead even after the MI. + return false; +} + +/// \brief Check if after \p OrigMI the only portion of super register +/// of the destination register of \p OrigMI that is alive is that +/// destination register. +/// +/// If so, return that super register in \p SuperDestReg. 
bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, unsigned &SuperDestReg) const { auto *TRI = &TII->getRegisterInfo(); @@ -191,7 +262,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, if (SubRegIdx == X86::sub_8bit_hi) return false; - if (LiveRegs.contains(SuperDestReg)) + if (isLive(*OrigMI, LiveRegs, TRI, SuperDestReg)) return false; if (SubRegIdx == X86::sub_8bit) { @@ -201,7 +272,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI, unsigned UpperByteReg = getX86SubSuperRegister(SuperDestReg, 8, /*High=*/true); - if (LiveRegs.contains(UpperByteReg)) + if (isLive(*OrigMI, LiveRegs, TRI, UpperByteReg)) return false; } @@ -328,7 +399,7 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF, for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) { MachineInstr *MI = &*I; - + if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB)) MIReplacements.push_back(std::make_pair(MI, NewMI)); diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp index 9f649dad8bc0..b41bf99f19b2 100644 --- a/lib/Target/X86/X86FixupLEAs.cpp +++ b/lib/Target/X86/X86FixupLEAs.cpp @@ -17,14 +17,12 @@ #include "X86InstrInfo.h" #include "X86Subtarget.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LiveVariables.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; namespace llvm { @@ -193,12 +191,12 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI, FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); } bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) { - if (skipFunction(*Func.getFunction())) + if (skipFunction(Func.getFunction())) return false; MF = &Func; const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>(); - OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize(); + OptIncDec = !ST.slowIncDec() || Func.getFunction().optForMinSize(); OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA(); if (!OptLEA && !OptIncDec) diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp index 5582526541ba..9a72e7114be0 100644 --- a/lib/Target/X86/X86FloatingPoint.cpp +++ b/lib/Target/X86/X86FloatingPoint.cpp @@ -37,13 +37,13 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/InlineAsm.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <algorithm> #include <bitset> using namespace llvm; @@ -349,7 +349,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) { // In regcall convention, some FP registers may not be passed through // the stack, so they will need to be assigned to the stack first - if ((Entry->getParent()->getFunction()->getCallingConv() == + if ((Entry->getParent()->getFunction().getCallingConv() == CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) { // In the register calling convention, up to one FP argument could be // saved in the first FP register. 
@@ -499,7 +499,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) { /// setupBlockStack - Use the live bundles to set up our model of the stack /// to match predecessors' live out stack. void FPS::setupBlockStack() { - DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber() + DEBUG(dbgs() << "\nSetting up live-ins for " << printMBBReference(*MBB) << " derived from " << MBB->getName() << ".\n"); StackTop = 0; // Get the live-in bundle for MBB. @@ -516,7 +516,7 @@ void FPS::setupBlockStack() { // Push the fixed live-in registers. for (unsigned i = Bundle.FixCount; i > 0; --i) { - DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP" + DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %fp" << unsigned(Bundle.FixStack[i-1]) << '\n'); pushReg(Bundle.FixStack[i-1]); } @@ -538,7 +538,7 @@ void FPS::finishBlockStack() { if (MBB->succ_empty()) return; - DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber() + DEBUG(dbgs() << "Setting up live-outs for " << printMBBReference(*MBB) << " derived from " << MBB->getName() << ".\n"); // Get MBB's live-out bundle. @@ -893,7 +893,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { while (Kills && Defs) { unsigned KReg = countTrailingZeros(Kills); unsigned DReg = countTrailingZeros(Defs); - DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n"); + DEBUG(dbgs() << "Renaming %fp" << KReg << " as imp %fp" << DReg << "\n"); std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]); std::swap(RegMap[KReg], RegMap[DReg]); Kills &= ~(1 << KReg); @@ -907,7 +907,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { unsigned KReg = getStackEntry(0); if (!(Kills & (1 << KReg))) break; - DEBUG(dbgs() << "Popping %FP" << KReg << "\n"); + DEBUG(dbgs() << "Popping %fp" << KReg << "\n"); popStackAfter(I2); Kills &= ~(1 << KReg); } @@ -916,7 +916,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { // Manually kill the rest. while (Kills) { unsigned KReg = countTrailingZeros(Kills); - DEBUG(dbgs() << "Killing %FP" << KReg << "\n"); + DEBUG(dbgs() << "Killing %fp" << KReg << "\n"); freeStackSlotBefore(I, KReg); Kills &= ~(1 << KReg); } @@ -924,7 +924,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) { // Load zeros for all the imp-defs. 
while(Defs) { unsigned DReg = countTrailingZeros(Defs); - DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n"); + DEBUG(dbgs() << "Defining %fp" << DReg << " as 0\n"); BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0)); pushReg(DReg); Defs &= ~(1 << DReg); @@ -973,7 +973,7 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) { unsigned R = MO.getReg() - X86::FP0; if (R < 8) { - if (MF->getFunction()->getCallingConv() != CallingConv::X86_RegCall) { + if (MF->getFunction().getCallingConv() != CallingConv::X86_RegCall) { assert(MO.isDef() && MO.isImplicit()); } diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp index f294e819090b..80b1cc192a88 100644 --- a/lib/Target/X86/X86FrameLowering.cpp +++ b/lib/Target/X86/X86FrameLowering.cpp @@ -148,8 +148,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB, const X86RegisterInfo *TRI, bool Is64Bit) { const MachineFunction *MF = MBB.getParent(); - const Function *F = MF->getFunction(); - if (!F || MF->callsEHReturn()) + if (MF->callsEHReturn()) return 0; const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF); @@ -820,7 +819,7 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con const MachineFrameInfo &MFI = MF.getFrameInfo(); uint64_t MaxAlign = MFI.getMaxAlignment(); // Desired stack alignment. unsigned StackAlign = getStackAlignment(); - if (MF.getFunction()->hasFnAttribute("stackrealign")) { + if (MF.getFunction().hasFnAttribute("stackrealign")) { if (MFI.hasCalls()) MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign; else if (MaxAlign < SlotSize) @@ -924,6 +923,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB, Notes: - .seh directives are emitted only for Windows 64 ABI + - .cv_fpo directives are emitted on win32 when emitting CodeView - .cfi directives are emitted for all other ABIs - for 32-bit code, substitute %e?? registers for %r?? */ @@ -934,31 +934,35 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, "MF used frame lowering for wrong subtarget"); MachineBasicBlock::iterator MBBI = MBB.begin(); MachineFrameInfo &MFI = MF.getFrameInfo(); - const Function *Fn = MF.getFunction(); + const Function &Fn = MF.getFunction(); MachineModuleInfo &MMI = MF.getMMI(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment. uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate. bool IsFunclet = MBB.isEHFuncletEntry(); EHPersonality Personality = EHPersonality::Unknown; - if (Fn->hasPersonalityFn()) - Personality = classifyEHPersonality(Fn->getPersonalityFn()); + if (Fn.hasPersonalityFn()) + Personality = classifyEHPersonality(Fn.getPersonalityFn()); bool FnHasClrFunclet = MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR; bool IsClrFunclet = IsFunclet && FnHasClrFunclet; bool HasFP = hasFP(MF); - bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv()); + bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv()); bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); - bool NeedsWinCFI = IsWin64Prologue && Fn->needsUnwindTableEntry(); + bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry(); + // FIXME: Emit FPO data for EH funclets. 
+ bool NeedsWinFPO = + !IsFunclet && STI.isTargetWin32() && MMI.getModule()->getCodeViewFlag(); + bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO; bool NeedsDwarfCFI = - !IsWin64Prologue && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + !IsWin64Prologue && (MMI.hasDebugInfo() || Fn.needsUnwindTableEntry()); unsigned FramePtr = TRI->getFrameRegister(MF); const unsigned MachineFramePtr = STI.isTarget64BitILP32() ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; unsigned BasePtr = TRI->getBaseRegister(); bool HasWinCFI = false; - + // Debug location must be unknown since the first debug location is used // to determine the end of the prologue. DebugLoc DL; @@ -977,16 +981,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // The default stack probe size is 4096 if the function has no stackprobesize // attribute. unsigned StackProbeSize = 4096; - if (Fn->hasFnAttribute("stack-probe-size")) - Fn->getFnAttribute("stack-probe-size") + if (Fn.hasFnAttribute("stack-probe-size")) + Fn.getFnAttribute("stack-probe-size") .getValueAsString() .getAsInteger(0, StackProbeSize); // Re-align the stack on 64-bit if the x86-interrupt calling convention is // used and an error code was pushed, since the x86-64 ABI requires a 16-byte // stack alignment. - if (Fn->getCallingConv() == CallingConv::X86_INTR && Is64Bit && - Fn->arg_size() == 2) { + if (Fn.getCallingConv() == CallingConv::X86_INTR && Is64Bit && + Fn.arg_size() == 2) { StackSize += 8; MFI.setStackSize(StackSize); emitSPUpdate(MBB, MBBI, -8, /*InEpilogue=*/false); @@ -997,7 +1001,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // pointer, calls, or dynamic alloca then we do not need to adjust the // stack pointer (we fit in the Red Zone). We also check that we don't // push and pop from the stack. - if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) && + if (Is64Bit && !Fn.hasFnAttribute(Attribute::NoRedZone) && !TRI->needsStackRealignment(MF) && !MFI.hasVarSizedObjects() && // No dynamic alloca. !MFI.adjustsStack() && // No calls. @@ -1120,6 +1124,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister( nullptr, DwarfFramePtr)); } + + if (NeedsWinFPO) { + // .cv_fpo_setframe $FramePtr + HasWinCFI = true; + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) + .addImm(FramePtr) + .addImm(0) + .setMIFlag(MachineInstr::FrameSetup); + } } } else { assert(!IsFunclet && "funclets without FPs not yet implemented"); @@ -1155,8 +1168,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, if (NeedsWinCFI) { HasWinCFI = true; - BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag( - MachineInstr::FrameSetup); + BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)) + .addImm(Reg) + .setMIFlag(MachineInstr::FrameSetup); } } @@ -1295,6 +1309,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // If this is not a funclet, emit the CFI describing our frame pointer. 
if (NeedsWinCFI && !IsFunclet) { + assert(!NeedsWinFPO && "this setframe incompatible with FPO data"); HasWinCFI = true; BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame)) .addImm(FramePtr) @@ -1333,6 +1348,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, Offset += SEHFrameOffset; HasWinCFI = true; + assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data"); BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM)) .addImm(Reg) .addImm(Offset) @@ -1419,8 +1435,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, } // Emit DWARF info specifying the offsets of the callee-saved registers. - if (PushedRegs) - emitCalleeSavedFrameMoves(MBB, MBBI, DL); + emitCalleeSavedFrameMoves(MBB, MBBI, DL); } // X86 Interrupt handling function cannot assume anything about the direction @@ -1431,7 +1446,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF, // 1. The interrupt handling function uses any of the "rep" instructions. // 2. Interrupt handling function calls another function. // - if (Fn->getCallingConv() == CallingConv::X86_INTR) + if (Fn.getCallingConv() == CallingConv::X86_INTR) BuildMI(MBB, MBBI, DL, TII.get(X86::CLD)) .setMIFlag(MachineInstr::FrameSetup); @@ -1492,7 +1507,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const { // This is the amount of stack a funclet needs to allocate. unsigned UsedSize; EHPersonality Personality = - classifyEHPersonality(MF.getFunction()->getPersonalityFn()); + classifyEHPersonality(MF.getFunction().getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { // CLR funclets need to hold enough space to include the PSPSym, at the // same offset from the stack pointer (immediately after the prolog) as it @@ -1522,10 +1537,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>(); - MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(); - Optional<unsigned> RetOpcode; - if (MBBI != MBB.end()) - RetOpcode = MBBI->getOpcode(); + MachineBasicBlock::iterator Terminator = MBB.getFirstTerminator(); + MachineBasicBlock::iterator MBBI = Terminator; DebugLoc DL; if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); @@ -1536,38 +1549,21 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr; bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); - bool NeedsWinCFI = - IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry(); + bool NeedsWin64CFI = + IsWin64Prologue && MF.getFunction().needsUnwindTableEntry(); bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI); - MachineBasicBlock *TargetMBB = nullptr; // Get the number of bytes to allocate from the FrameInfo. uint64_t StackSize = MFI.getStackSize(); uint64_t MaxAlign = calculateMaxStackAlign(MF); unsigned CSSize = X86FI->getCalleeSavedFrameSize(); + bool HasFP = hasFP(MF); uint64_t NumBytes = 0; - if (RetOpcode && *RetOpcode == X86::CATCHRET) { - // SEH shouldn't use catchret. - assert(!isAsynchronousEHPersonality( - classifyEHPersonality(MF.getFunction()->getPersonalityFn())) && - "SEH should not use CATCHRET"); - - NumBytes = getWinEHFuncletFrameSize(MF); - assert(hasFP(MF) && "EH funclets without FP not yet implemented"); - TargetMBB = MBBI->getOperand(0).getMBB(); - - // Pop EBP. - BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? 
X86::POP64r : X86::POP32r), - MachineFramePtr) - .setMIFlag(MachineInstr::FrameDestroy); - } else if (RetOpcode && *RetOpcode == X86::CLEANUPRET) { + if (IsFunclet) { + assert(HasFP && "EH funclets without FP not yet implemented"); NumBytes = getWinEHFuncletFrameSize(MF); - assert(hasFP(MF) && "EH funclets without FP not yet implemented"); - BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), - MachineFramePtr) - .setMIFlag(MachineInstr::FrameDestroy); - } else if (hasFP(MF)) { + } else if (HasFP) { // Calculate required stack adjustment. uint64_t FrameSize = StackSize - SlotSize; NumBytes = FrameSize - CSSize; @@ -1576,16 +1572,18 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // realigned. if (TRI->needsStackRealignment(MF) && !IsWin64Prologue) NumBytes = alignTo(FrameSize, MaxAlign); - - // Pop EBP. - BuildMI(MBB, MBBI, DL, - TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr) - .setMIFlag(MachineInstr::FrameDestroy); } else { NumBytes = StackSize - CSSize; } uint64_t SEHStackAllocAmt = NumBytes; + if (HasFP) { + // Pop EBP. + BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r), + MachineFramePtr) + .setMIFlag(MachineInstr::FrameDestroy); + } + MachineBasicBlock::iterator FirstCSPop = MBBI; // Skip the callee-saved pop instructions. while (MBBI != MBB.begin()) { @@ -1603,26 +1601,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, } MBBI = FirstCSPop; - if (TargetMBB) { - // Fill EAX/RAX with the address of the target block. - unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX; - if (STI.is64Bit()) { - // LEA64r TargetMBB(%rip), %rax - BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg) - .addReg(X86::RIP) - .addImm(0) - .addReg(0) - .addMBB(TargetMBB) - .addReg(0); - } else { - // MOV32ri $TargetMBB, %eax - BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg) - .addMBB(TargetMBB); - } - // Record that we've taken the address of TargetMBB and no longer just - // reference it in a terminator. - TargetMBB->setHasAddressTaken(); - } + if (IsFunclet && Terminator->getOpcode() == X86::CATCHRET) + emitCatchRetReturnValue(MBB, FirstCSPop, &*Terminator); if (MBBI != MBB.end()) DL = MBBI->getDebugLoc(); @@ -1674,19 +1654,17 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF, // into the epilogue. To cope with that, we insert an epilogue marker here, // then replace it with a 'nop' if it ends up immediately after a CALL in the // final emitted code. - if (NeedsWinCFI && MF.hasWinCFI()) + if (NeedsWin64CFI && MF.hasWinCFI()) BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue)); - if (!RetOpcode || !isTailCallOpcode(*RetOpcode)) { + if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) { // Add the return addr area delta back since we are not tail calling. int Offset = -1 * X86FI->getTCReturnAddrDelta(); assert(Offset >= 0 && "TCDelta should never be positive"); if (Offset) { - MBBI = MBB.getFirstTerminator(); - // Check for possible merge with preceding ADD instruction. - Offset += mergeSPUpdates(MBB, MBBI, true); - emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true); + Offset += mergeSPUpdates(MBB, Terminator, true); + emitSPUpdate(MBB, Terminator, Offset, /*InEpilogue=*/true); } } } @@ -1997,9 +1975,39 @@ bool X86FrameLowering::spillCalleeSavedRegisters( return true; } +void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineInstr *CatchRet) const { + // SEH shouldn't use catchret. 
+ assert(!isAsynchronousEHPersonality(classifyEHPersonality( + MBB.getParent()->getFunction().getPersonalityFn())) && + "SEH should not use CATCHRET"); + DebugLoc DL = CatchRet->getDebugLoc(); + MachineBasicBlock *CatchRetTarget = CatchRet->getOperand(0).getMBB(); + + // Fill EAX/RAX with the address of the target block. + if (STI.is64Bit()) { + // LEA64r CatchRetTarget(%rip), %rax + BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), X86::RAX) + .addReg(X86::RIP) + .addImm(0) + .addReg(0) + .addMBB(CatchRetTarget) + .addReg(0); + } else { + // MOV32ri $CatchRetTarget, %eax + BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX) + .addMBB(CatchRetTarget); + } + + // Record that we've taken the address of CatchRetTarget and no longer just + // reference it in a terminator. + CatchRetTarget->setHasAddressTaken(); +} + bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const { if (CSI.empty()) return false; @@ -2012,9 +2020,9 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB, // Don't restore CSRs before an SEH catchret. SEH except blocks do not form // funclets. emitEpilogue transforms these to normal jumps. if (MI->getOpcode() == X86::CATCHRET) { - const Function *Func = MBB.getParent()->getFunction(); + const Function &F = MBB.getParent()->getFunction(); bool IsSEH = isAsynchronousEHPersonality( - classifyEHPersonality(Func->getPersonalityFn())); + classifyEHPersonality(F.getPersonalityFn())); if (IsSEH) return true; } @@ -2086,8 +2094,8 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF, static bool HasNestArgument(const MachineFunction *MF) { - const Function *F = MF->getFunction(); - for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end(); + const Function &F = MF->getFunction(); + for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; I++) { if (I->hasNestAttr()) return true; @@ -2101,7 +2109,7 @@ HasNestArgument(const MachineFunction *MF) { /// needed. Set primary to true for the first register, false for the second. static unsigned GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) { - CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv(); + CallingConv::ID CallingConvention = MF.getFunction().getCallingConv(); // Erlang stuff. if (CallingConvention == CallingConv::HiPE) { @@ -2151,7 +2159,7 @@ void X86FrameLowering::adjustForSegmentedStacks( assert(!MF.getRegInfo().isLiveIn(ScratchReg) && "Scratch register is live-in"); - if (MF.getFunction()->isVarArg()) + if (MF.getFunction().isVarArg()) report_fatal_error("Segmented stacks do not support vararg functions."); if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() && !STI.isTargetWin64() && !STI.isTargetFreeBSD() && @@ -2425,8 +2433,8 @@ void X86FrameLowering::adjustForHiPEPrologue( Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS"); const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5; const unsigned Guaranteed = HipeLeafWords * SlotSize; - unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ? - MF.getFunction()->arg_size() - CCRegisteredArgs : 0; + unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs ? 
+ MF.getFunction().arg_size() - CCRegisteredArgs : 0; unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize; assert(STI.isTargetLinux() && @@ -2567,6 +2575,7 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, unsigned Regs[2]; unsigned FoundRegs = 0; + auto &MRI = MBB.getParent()->getRegInfo(); auto RegMask = Prev->getOperand(1); auto &RegClass = @@ -2580,6 +2589,10 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB, if (!RegMask.clobbersPhysReg(Candidate)) continue; + // Don't clobber reserved registers + if (MRI.isReserved(Candidate)) + continue; + bool IsDef = false; for (const MachineOperand &MO : Prev->implicit_operands()) { if (MO.isReg() && MO.isDef() && @@ -2635,10 +2648,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, Amount = alignTo(Amount, StackAlign); MachineModuleInfo &MMI = MF.getMMI(); - const Function *Fn = MF.getFunction(); + const Function &F = MF.getFunction(); bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); - bool DwarfCFI = !WindowsCFI && - (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry()); + bool DwarfCFI = !WindowsCFI && + (MMI.hasDebugInfo() || F.needsUnwindTableEntry()); // If we have any exception handlers in this function, and we adjust // the SP before calls, we may need to indicate this to the unwinder @@ -2680,7 +2693,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, StackAdjustment += mergeSPUpdates(MBB, InsertPos, false); if (StackAdjustment) { - if (!(Fn->optForMinSize() && + if (!(F.optForMinSize() && adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment))) BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment, /*InEpilogue=*/false); @@ -2753,13 +2766,13 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const { bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const { // If we may need to emit frameless compact unwind information, give // up as this is currently broken: PR25614. - return (MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) && + return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) && // The lowering of segmented stack and HiPE only support entry blocks // as prologue blocks: PR26107. // This limitation may be lifted if we fix: // - adjustForSegmentedStacks // - adjustForHiPEPrologue - MF.getFunction()->getCallingConv() != CallingConv::HiPE && + MF.getFunction().getCallingConv() != CallingConv::HiPE && !MF.shouldSplitStack(); } @@ -2989,9 +3002,9 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized( // If this function isn't doing Win64-style C++ EH, we don't need to do // anything. 
- const Function *Fn = MF.getFunction(); + const Function &F = MF.getFunction(); if (!STI.is64Bit() || !MF.hasEHFunclets() || - classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX) + classifyEHPersonality(F.getPersonalityFn()) != EHPersonality::MSVC_CXX) return; // Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h index 7d214cabad53..909319fc18fc 100644 --- a/lib/Target/X86/X86FrameLowering.h +++ b/lib/Target/X86/X86FrameLowering.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H #define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H -#include "llvm/Target/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetFrameLowering.h" namespace llvm { @@ -89,7 +89,7 @@ public: bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, - const std::vector<CalleeSavedInfo> &CSI, + std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const override; bool hasFP(const MachineFunction &MF) const override; @@ -157,15 +157,6 @@ public: void orderFrameObjects(const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const override; - /// convertArgMovsToPushes - This method tries to convert a call sequence - /// that uses sub and mov instructions to put the argument onto the stack - /// into a series of pushes. - /// Returns true if the transformation succeeded, false if not. - bool convertArgMovsToPushes(MachineFunction &MF, - MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, - uint64_t Amount) const; - /// Wraps up getting a CFI index and building a MachineInstr for it. void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, const MCCFIInstruction &CFIInst) const; @@ -214,6 +205,11 @@ private: unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const; unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const; + + /// Materialize the catchret target MBB in RAX. + void emitCatchRetReturnValue(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineInstr *CatchRet) const; }; } // End llvm namespace diff --git a/lib/Target/X86/X86GenRegisterBankInfo.def b/lib/Target/X86/X86GenRegisterBankInfo.def index 06be142432f7..9cd3f96f83ac 100644 --- a/lib/Target/X86/X86GenRegisterBankInfo.def +++ b/lib/Target/X86/X86GenRegisterBankInfo.def @@ -11,10 +11,6 @@ /// \todo This should be generated by TableGen. 
//===----------------------------------------------------------------------===// -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif - #ifdef GET_TARGET_REGBANK_INFO_IMPL RegisterBankInfo::PartialMapping X86GenRegisterBankInfo::PartMappings[]{ /* StartIdx, Length, RegBank */ diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp index 8f24f98be681..a6c7c5f22a3a 100644 --- a/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -13,7 +13,6 @@ //===----------------------------------------------------------------------===// #include "X86.h" -#include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" @@ -21,8 +20,6 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" @@ -194,6 +191,7 @@ namespace { bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM); bool matchWrapper(SDValue N, X86ISelAddressMode &AM); bool matchAddress(SDValue N, X86ISelAddressMode &AM); + bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM); bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth); bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM, unsigned Depth); @@ -204,11 +202,6 @@ namespace { bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); - template <class GatherScatterSDNode> - bool selectAddrOfGatherScatterNode(GatherScatterSDNode *Parent, SDValue N, - SDValue &Base, SDValue &Scale, - SDValue &Index, SDValue &Disp, - SDValue &Segment); bool selectMOV64Imm32(SDValue N, SDValue &Imm); bool selectLEAAddr(SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, @@ -226,11 +219,19 @@ namespace { SDValue &NodeWithChain); bool selectRelocImm(SDValue N, SDValue &Op); - bool tryFoldLoad(SDNode *P, SDValue N, + bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment); + // Convience method where P is also root. + bool tryFoldLoad(SDNode *P, SDValue N, + SDValue &Base, SDValue &Scale, + SDValue &Index, SDValue &Disp, + SDValue &Segment) { + return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment); + } + /// Implement addressing mode selection for inline asm expressions. bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID, @@ -366,6 +367,22 @@ namespace { return CurDAG->getTargetConstant(Imm, DL, MVT::i32); } + SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth, + const SDLoc &DL) { + assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); + uint64_t Index = N->getConstantOperandVal(1); + MVT VecVT = N->getOperand(0).getSimpleValueType(); + return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); + } + + SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth, + const SDLoc &DL) { + assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width"); + uint64_t Index = N->getConstantOperandVal(2); + MVT VecVT = N->getSimpleValueType(0); + return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL); + } + /// Return an SDNode that returns the value of the global base register. 
/// Output instructions required to initialize the global base register, /// if necessary. @@ -399,10 +416,71 @@ namespace { return isInt<Width>(CN->getSExtValue()); return isSExtAbsoluteSymbolRef(Width, N); } + + // Indicates we should prefer to use a non-temporal load for this load. + bool useNonTemporalLoad(LoadSDNode *N) const { + if (!N->isNonTemporal()) + return false; + + unsigned StoreSize = N->getMemoryVT().getStoreSize(); + + if (N->getAlignment() < StoreSize) + return false; + + switch (StoreSize) { + default: llvm_unreachable("Unsupported store size"); + case 16: + return Subtarget->hasSSE41(); + case 32: + return Subtarget->hasAVX2(); + case 64: + return Subtarget->hasAVX512(); + } + } + + bool foldLoadStoreIntoMemOperand(SDNode *Node); + + bool matchBEXTRFromAnd(SDNode *Node); + + bool isMaskZeroExtended(SDNode *N) const; }; } +// Returns true if this masked compare can be implemented legally with this +// type. +static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) { + unsigned Opcode = N->getOpcode(); + if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM || + Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM || + Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU || + Opcode == X86ISD::CMPM_RND) { + // We can get 256-bit 8 element types here without VLX being enabled. When + // this happens we will use 512-bit operations and the mask will not be + // zero extended. + EVT OpVT = N->getOperand(0).getValueType(); + if (OpVT == MVT::v8i32 || OpVT == MVT::v8f32) + return Subtarget->hasVLX(); + + return true; + } + + return false; +} + +// Returns true if we can assume the writer of the mask has zero extended it +// for us. +bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const { + // If this is an AND, check if we have a compare on either side. As long as + // one side guarantees the mask is zero extended, the AND will preserve those + // zeros. + if (N->getOpcode() == ISD::AND) + return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) || + isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget); + + return isLegalMaskCompare(N, Subtarget); +} + bool X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const { if (OptLevel == CodeGenOpt::None) return false; @@ -541,8 +619,8 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) { void X86DAGToDAGISel::PreprocessISelDAG() { // OptFor[Min]Size are used in pattern predicates that isel is matching. - OptForSize = MF->getFunction()->optForSize(); - OptForMinSize = MF->getFunction()->optForMinSize(); + OptForSize = MF->getFunction().optForSize(); + OptForMinSize = MF->getFunction().optForMinSize(); assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize"); for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(), @@ -552,7 +630,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() { if (OptLevel != CodeGenOpt::None && // Only does this when target favors doesn't favor register indirect // call. - ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) || + ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) || (N->getOpcode() == X86ISD::TC_RETURN && // Only does this if load can be folded into TC_RETURN. (Subtarget->is64Bit() || @@ -675,9 +753,9 @@ void X86DAGToDAGISel::emitSpecialCodeForMain() { void X86DAGToDAGISel::EmitFunctionEntryCode() { // If this is main, emit special code for main. 
- if (const Function *Fn = MF->getFunction()) - if (Fn->hasExternalLinkage() && Fn->getName() == "main") - emitSpecialCodeForMain(); + const Function &F = MF->getFunction(); + if (F.hasExternalLinkage() && F.getName() == "main") + emitSpecialCodeForMain(); } static bool isDispSafeForFrameIndex(int64_t Val) { @@ -1423,12 +1501,30 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) { return false; } -template <class GatherScatterSDNode> -bool X86DAGToDAGISel::selectAddrOfGatherScatterNode( - GatherScatterSDNode *Mgs, SDValue N, SDValue &Base, SDValue &Scale, - SDValue &Index, SDValue &Disp, SDValue &Segment) { +/// Helper for selectVectorAddr. Handles things that can be folded into a +/// gather scatter address. The index register and scale should have already +/// been handled. +bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) { + // TODO: Support other operations. + switch (N.getOpcode()) { + case X86ISD::Wrapper: + if (!matchWrapper(N, AM)) + return false; + break; + } + + return matchAddressBase(N, AM); +} + +bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, + SDValue &Scale, SDValue &Index, + SDValue &Disp, SDValue &Segment) { X86ISelAddressMode AM; - unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace(); + auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent); + AM.IndexReg = Mgs->getIndex(); + AM.Scale = Mgs->getValue().getScalarValueSizeInBits() / 8; + + unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace(); // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS. if (AddrSpace == 256) AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16); @@ -1437,37 +1533,24 @@ bool X86DAGToDAGISel::selectAddrOfGatherScatterNode( if (AddrSpace == 258) AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16); - SDLoc DL(N); - Base = Mgs->getBasePtr(); - Index = Mgs->getIndex(); - unsigned ScalarSize = Mgs->getValue().getScalarValueSizeInBits(); - Scale = getI8Imm(ScalarSize/8, DL); - // If Base is 0, the whole address is in index and the Scale is 1 - if (isa<ConstantSDNode>(Base)) { - assert(cast<ConstantSDNode>(Base)->isNullValue() && + if (isa<ConstantSDNode>(N)) { + assert(cast<ConstantSDNode>(N)->isNullValue() && "Unexpected base in gather/scatter"); - Scale = getI8Imm(1, DL); - Base = CurDAG->getRegister(0, MVT::i32); + AM.Scale = 1; } - if (AM.Segment.getNode()) - Segment = AM.Segment; - else - Segment = CurDAG->getRegister(0, MVT::i32); - Disp = CurDAG->getTargetConstant(0, DL, MVT::i32); - return true; -} + // Otherwise, try to match into the base and displacement fields. + else if (matchVectorAddress(N, AM)) + return false; -bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base, - SDValue &Scale, SDValue &Index, - SDValue &Disp, SDValue &Segment) { - if (auto Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent)) - return selectAddrOfGatherScatterNode<MaskedGatherScatterSDNode>( - Mgs, N, Base, Scale, Index, Disp, Segment); - if (auto X86Gather = dyn_cast<X86MaskedGatherSDNode>(Parent)) - return selectAddrOfGatherScatterNode<X86MaskedGatherSDNode>( - X86Gather, N, Base, Scale, Index, Disp, Segment); - return false; + MVT VT = N.getSimpleValueType(); + if (AM.BaseType == X86ISelAddressMode::RegBase) { + if (!AM.Base_Reg.getNode()) + AM.Base_Reg = CurDAG->getRegister(0, VT); + } + + getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment); + return true; } /// Returns true if it is able to pattern match an addressing mode. 
@@ -1517,6 +1600,20 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base, return true; } +// We can only fold a load if all nodes between it and the root node have a +// single use. If there are additional uses, we could end up duplicating the +// load. +static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) { + SDNode *User = *N->use_begin(); + while (User != Root) { + if (!User->hasOneUse()) + return false; + User = *User->use_begin(); + } + + return true; +} + /// Match a scalar SSE load. In particular, we want to match a load whose top /// elements are either undef or zeros. The load flavor is derived from the /// type of N, which is either v4f32 or v2f64. @@ -1533,7 +1630,8 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, if (ISD::isNON_EXTLoad(N.getNode())) { PatternNodeWithChain = N; if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) { + IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) && + hasSingleUsesFromRoot(Root, N.getNode())) { LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment); @@ -1544,7 +1642,8 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, if (N.getOpcode() == X86ISD::VZEXT_LOAD) { PatternNodeWithChain = N; if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) { + IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) && + hasSingleUsesFromRoot(Root, N.getNode())) { auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain); return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp, Segment); @@ -1558,7 +1657,8 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, PatternNodeWithChain = N.getOperand(0); if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { + IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) && + hasSingleUsesFromRoot(Root, N.getNode())) { LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, Segment); @@ -1574,7 +1674,8 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, PatternNodeWithChain = N.getOperand(0).getOperand(0); if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) && IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) && - IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) { + IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) && + hasSingleUsesFromRoot(Root, N.getNode())) { // Okay, this is a zero extending load. Fold it. 
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain); return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp, @@ -1589,7 +1690,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) { if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) { uint64_t ImmVal = CN->getZExtValue(); - if ((uint32_t)ImmVal != (uint64_t)ImmVal) + if (!isUInt<32>(ImmVal)) return false; Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64); @@ -1792,13 +1893,13 @@ bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) { return true; } -bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N, +bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N, SDValue &Base, SDValue &Scale, SDValue &Index, SDValue &Disp, SDValue &Segment) { if (!ISD::isNON_EXTLoad(N.getNode()) || - !IsProfitableToFold(N, P, P) || - !IsLegalToFold(N, P, P, OptLevel)) + !IsProfitableToFold(N, P, Root) || + !IsLegalToFold(N, P, Root, OptLevel)) return false; return selectAddr(N.getNode(), @@ -1891,15 +1992,79 @@ static bool hasNoSignedComparisonUses(SDNode *N) { return true; } -/// Check whether or not the chain ending in StoreNode is suitable for doing -/// the {load; increment or decrement; store} to modify transformation. -static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, - SDValue StoredVal, SelectionDAG *CurDAG, - LoadSDNode* &LoadNode, SDValue &InputChain) { - - // is the value stored the result of a DEC or INC? - if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false; +/// Test whether the given node which sets flags has any uses which require the +/// CF flag to be accurate. +static bool hasNoCarryFlagUses(SDNode *N) { + // Examine each user of the node. + for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE; + ++UI) { + // Only check things that use the flags. + if (UI.getUse().getResNo() != 1) + continue; + // Only examine CopyToReg uses. + if (UI->getOpcode() != ISD::CopyToReg) + return false; + // Only examine CopyToReg uses that copy to EFLAGS. + if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS) + return false; + // Examine each user of the CopyToReg use. + for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end(); + FlagUI != FlagUE; ++FlagUI) { + // Only examine the Flag result. + if (FlagUI.getUse().getResNo() != 1) + continue; + // Anything unusual: assume conservatively. + if (!FlagUI->isMachineOpcode()) + return false; + // Examine the opcode of the user. + switch (FlagUI->getMachineOpcode()) { + // Comparisons which don't examine the CF flag. 
+ case X86::SETOr: case X86::SETNOr: case X86::SETEr: case X86::SETNEr: + case X86::SETSr: case X86::SETNSr: case X86::SETPr: case X86::SETNPr: + case X86::SETLr: case X86::SETGEr: case X86::SETLEr: case X86::SETGr: + case X86::JO_1: case X86::JNO_1: case X86::JE_1: case X86::JNE_1: + case X86::JS_1: case X86::JNS_1: case X86::JP_1: case X86::JNP_1: + case X86::JL_1: case X86::JGE_1: case X86::JLE_1: case X86::JG_1: + case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr: + case X86::CMOVO16rm: case X86::CMOVO32rm: case X86::CMOVO64rm: + case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr: + case X86::CMOVNO16rm: case X86::CMOVNO32rm: case X86::CMOVNO64rm: + case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr: + case X86::CMOVE16rm: case X86::CMOVE32rm: case X86::CMOVE64rm: + case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr: + case X86::CMOVNE16rm: case X86::CMOVNE32rm: case X86::CMOVNE64rm: + case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr: + case X86::CMOVS16rm: case X86::CMOVS32rm: case X86::CMOVS64rm: + case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr: + case X86::CMOVNS16rm: case X86::CMOVNS32rm: case X86::CMOVNS64rm: + case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr: + case X86::CMOVP16rm: case X86::CMOVP32rm: case X86::CMOVP64rm: + case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr: + case X86::CMOVNP16rm: case X86::CMOVNP32rm: case X86::CMOVNP64rm: + case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr: + case X86::CMOVL16rm: case X86::CMOVL32rm: case X86::CMOVL64rm: + case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr: + case X86::CMOVGE16rm: case X86::CMOVGE32rm: case X86::CMOVGE64rm: + case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr: + case X86::CMOVLE16rm: case X86::CMOVLE32rm: case X86::CMOVLE64rm: + case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr: + case X86::CMOVG16rm: case X86::CMOVG32rm: case X86::CMOVG64rm: + continue; + // Anything else: assume conservatively. + default: + return false; + } + } + } + return true; +} +/// Check whether or not the chain ending in StoreNode is suitable for doing +/// the {load; op; store} to modify transformation. +static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode, + SDValue StoredVal, SelectionDAG *CurDAG, + LoadSDNode *&LoadNode, + SDValue &InputChain) { // is the stored value result 0 of the load? if (StoredVal.getResNo() != 0) return false; @@ -1916,11 +2081,6 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, // Return LoadNode by reference. LoadNode = cast<LoadSDNode>(Load); - // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8) - EVT LdVT = LoadNode->getMemoryVT(); - if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 && - LdVT != MVT::i8) - return false; // Is store the only read of the loaded value? if (!Load.hasOneUse()) @@ -1978,22 +2138,294 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc, return true; } -/// Get the appropriate X86 opcode for an in-memory increment or decrement. -/// Opc should be X86ISD::DEC or X86ISD::INC. 
-static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) { - if (Opc == X86ISD::DEC) { - if (LdVT == MVT::i64) return X86::DEC64m; - if (LdVT == MVT::i32) return X86::DEC32m; - if (LdVT == MVT::i16) return X86::DEC16m; - if (LdVT == MVT::i8) return X86::DEC8m; +// Change a chain of {load; op; store} of the same value into a simple op +// through memory of that value, if the uses of the modified value and its +// address are suitable. +// +// The tablegen pattern memory operand pattern is currently not able to match +// the case where the EFLAGS on the original operation are used. +// +// To move this to tablegen, we'll need to improve tablegen to allow flags to +// be transferred from a node in the pattern to the result node, probably with +// a new keyword. For example, we have this +// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", +// [(store (add (loadi64 addr:$dst), -1), addr:$dst), +// (implicit EFLAGS)]>; +// but maybe need something like this +// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", +// [(store (add (loadi64 addr:$dst), -1), addr:$dst), +// (transferrable EFLAGS)]>; +// +// Until then, we manually fold these and instruction select the operation +// here. +bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) { + StoreSDNode *StoreNode = cast<StoreSDNode>(Node); + SDValue StoredVal = StoreNode->getOperand(1); + unsigned Opc = StoredVal->getOpcode(); + + // Before we try to select anything, make sure this is memory operand size + // and opcode we can handle. Note that this must match the code below that + // actually lowers the opcodes. + EVT MemVT = StoreNode->getMemoryVT(); + if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 && + MemVT != MVT::i8) + return false; + switch (Opc) { + default: + return false; + case X86ISD::INC: + case X86ISD::DEC: + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::AND: + case X86ISD::OR: + case X86ISD::XOR: + break; + } + + LoadSDNode *LoadNode = nullptr; + SDValue InputChain; + if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadNode, + InputChain)) + return false; + + SDValue Base, Scale, Index, Disp, Segment; + if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp, + Segment)) + return false; + + auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16, + unsigned Opc8) { + switch (MemVT.getSimpleVT().SimpleTy) { + case MVT::i64: + return Opc64; + case MVT::i32: + return Opc32; + case MVT::i16: + return Opc16; + case MVT::i8: + return Opc8; + default: + llvm_unreachable("Invalid size!"); + } + }; + + MachineSDNode *Result; + switch (Opc) { + case X86ISD::INC: + case X86ISD::DEC: { + unsigned NewOpc = + Opc == X86ISD::INC + ? 
SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m) + : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m); + const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain}; + Result = + CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); + break; + } + case X86ISD::ADD: + case X86ISD::SUB: + case X86ISD::AND: + case X86ISD::OR: + case X86ISD::XOR: { + auto SelectRegOpcode = [SelectOpcode](unsigned Opc) { + switch (Opc) { + case X86ISD::ADD: + return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr, + X86::ADD8mr); + case X86ISD::SUB: + return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr, + X86::SUB8mr); + case X86ISD::AND: + return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr, + X86::AND8mr); + case X86ISD::OR: + return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr); + case X86ISD::XOR: + return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr, + X86::XOR8mr); + default: + llvm_unreachable("Invalid opcode!"); + } + }; + auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) { + switch (Opc) { + case X86ISD::ADD: + return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0); + case X86ISD::SUB: + return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0); + case X86ISD::AND: + return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0); + case X86ISD::OR: + return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0); + case X86ISD::XOR: + return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0); + default: + llvm_unreachable("Invalid opcode!"); + } + }; + auto SelectImmOpcode = [SelectOpcode](unsigned Opc) { + switch (Opc) { + case X86ISD::ADD: + return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi, + X86::ADD8mi); + case X86ISD::SUB: + return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi, + X86::SUB8mi); + case X86ISD::AND: + return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi, + X86::AND8mi); + case X86ISD::OR: + return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi, + X86::OR8mi); + case X86ISD::XOR: + return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi, + X86::XOR8mi); + default: + llvm_unreachable("Invalid opcode!"); + } + }; + + unsigned NewOpc = SelectRegOpcode(Opc); + SDValue Operand = StoredVal->getOperand(1); + + // See if the operand is a constant that we can fold into an immediate + // operand. + if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) { + auto OperandV = OperandC->getAPIntValue(); + + // Check if we can shrink the operand enough to fit in an immediate (or + // fit into a smaller immediate) by negating it and switching the + // operation. + if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) && + ((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 && + (-OperandV).getMinSignedBits() <= 8) || + (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 && + (-OperandV).getMinSignedBits() <= 32)) && + hasNoCarryFlagUses(StoredVal.getNode())) { + OperandV = -OperandV; + Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD; + } + + // First try to fit this into an Imm8 operand. If it doesn't fit, then try + // the larger immediate operand. 
+ if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) { + Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); + NewOpc = SelectImm8Opcode(Opc); + } else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() && + (MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) { + Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT); + NewOpc = SelectImmOpcode(Opc); + } + } + + const SDValue Ops[] = {Base, Scale, Index, Disp, + Segment, Operand, InputChain}; + Result = + CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops); + break; + } + default: + llvm_unreachable("Invalid opcode!"); + } + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2); + MemOp[0] = StoreNode->getMemOperand(); + MemOp[1] = LoadNode->getMemOperand(); + Result->setMemRefs(MemOp, MemOp + 2); + + ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); + ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); + CurDAG->RemoveDeadNode(Node); + return true; +} + +// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI. +bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) { + MVT NVT = Node->getSimpleValueType(0); + SDLoc dl(Node); + + SDValue N0 = Node->getOperand(0); + SDValue N1 = Node->getOperand(1); + + if (!Subtarget->hasBMI() && !Subtarget->hasTBM()) + return false; + + // Must have a shift right. + if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA) + return false; + + // Shift can't have additional users. + if (!N0->hasOneUse()) + return false; + + // Only supported for 32 and 64 bits. + if (NVT != MVT::i32 && NVT != MVT::i64) + return false; + + // Shift amount and RHS of the and must be constant. + ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1); + ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1)); + if (!MaskCst || !ShiftCst) + return false; + + // And RHS must be a mask. + uint64_t Mask = MaskCst->getZExtValue(); + if (!isMask_64(Mask)) + return false; + + uint64_t Shift = ShiftCst->getZExtValue(); + uint64_t MaskSize = countPopulation(Mask); + + // Don't interfere with something that can be handled by extracting AH. + // TODO: If we are able to fold a load, BEXTR might still be better than AH. + if (Shift == 8 && MaskSize == 8) + return false; + + // Make sure we are only using bits that were in the original value, not + // shifted in. + if (Shift + MaskSize > NVT.getSizeInBits()) + return false; + + SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT); + unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri; + unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi; + + // BMI requires the immediate to be placed in a register. + if (!Subtarget->hasTBM()) { + ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr; + MOpc = NVT == MVT::i64 ?
X86::BEXTR64rm : X86::BEXTR32rm; + New = SDValue(CurDAG->getMachineNode(X86::MOV32ri, dl, NVT, New), 0); + if (NVT == MVT::i64) { + New = + SDValue(CurDAG->getMachineNode( + TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, + CurDAG->getTargetConstant(0, dl, MVT::i64), New, + CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)), + 0); + } + } + + MachineSDNode *NewNode; + SDValue Input = N0->getOperand(0); + SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4; + if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) { + SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) }; + SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other); + NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops); + // Update the chain. + ReplaceUses(Input.getValue(1), SDValue(NewNode, 1)); + // Record the mem-refs + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<LoadSDNode>(Input)->getMemOperand(); + NewNode->setMemRefs(MemOp, MemOp + 1); } else { - assert(Opc == X86ISD::INC && "unrecognized opcode"); - if (LdVT == MVT::i64) return X86::INC64m; - if (LdVT == MVT::i32) return X86::INC32m; - if (LdVT == MVT::i16) return X86::INC16m; - if (LdVT == MVT::i8) return X86::INC8m; + NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New); } - llvm_unreachable("unrecognized size for LdVT"); + + ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0)); + CurDAG->RemoveDeadNode(Node); + return true; } void X86DAGToDAGISel::Select(SDNode *Node) { @@ -2037,20 +2469,27 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceNode(Node, getGlobalBaseReg()); return; + case X86ISD::SELECT: case X86ISD::SHRUNKBLEND: { - // SHRUNKBLEND selects like a regular VSELECT. + // SHRUNKBLEND selects like a regular VSELECT. Same with X86ISD::SELECT. SDValue VSelect = CurDAG->getNode( ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0), Node->getOperand(1), Node->getOperand(2)); - ReplaceUses(SDValue(Node, 0), VSelect); + ReplaceNode(Node, VSelect.getNode()); SelectCode(VSelect.getNode()); // We already called ReplaceUses. return; } case ISD::AND: + // Try to match BEXTR/BEXTRI instruction. + if (matchBEXTRFromAnd(Node)) + return; + + LLVM_FALLTHROUGH; case ISD::OR: case ISD::XOR: { + // For operations of the form (x << C1) op C2, check if we can use a smaller // encoding for C2 by transforming it into (x op (C2>>C1)) << C1. SDValue N0 = Node->getOperand(0); @@ -2157,7 +2596,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { unsigned LoReg; switch (NVT.SimpleTy) { default: llvm_unreachable("Unsupported VT!"); - case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break; + // MVT::i8 is handled by X86ISD::UMUL8. case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break; case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break; case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break; @@ -2263,12 +2702,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Update the chain. ReplaceUses(N1.getValue(1), Chain); // Record the mem-refs - LoadSDNode *LoadNode = cast<LoadSDNode>(N1); - if (LoadNode) { - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); - MemOp[0] = LoadNode->getMemOperand(); - CNode->setMemRefs(MemOp, MemOp + 1); - } + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<LoadSDNode>(N1)->getMemOperand(); + CNode->setMemRefs(MemOp, MemOp + 1); } else { SDValue Ops[] = { N1, InFlag }; if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) { @@ -2293,7 +2729,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Get the low part if needed. 
Don't use getCopyFromReg for aliasing // registers. if (!SDValue(Node, 0).use_empty()) - ReplaceUses(SDValue(Node, 1), + ReplaceUses(SDValue(Node, 0), CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result)); // Shift AX down 8 bits. @@ -2328,6 +2764,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n'); } + CurDAG->RemoveDeadNode(Node); return; } @@ -2447,11 +2884,15 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (foldedLoad) { SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0), InFlag }; - SDNode *CNode = + MachineSDNode *CNode = CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops); InFlag = SDValue(CNode, 1); // Update the chain. ReplaceUses(N1.getValue(1), SDValue(CNode, 0)); + // Record the mem-refs + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); + MemOp[0] = cast<LoadSDNode>(N1)->getMemOperand(); + CNode->setMemRefs(MemOp, MemOp + 1); } else { InFlag = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0); @@ -2476,19 +2917,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG || Opcode == X86ISD::SDIVREM8_SEXT_HREG) { - if (Node->getValueType(1) == MVT::i64) { - // It's not possible to directly movsx AH to a 64bit register, because - // the latter needs the REX prefix, but the former can't have it. - assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG && - "Unexpected i64 sext of h-register"); - Result = - SDValue(CurDAG->getMachineNode( - TargetOpcode::SUBREG_TO_REG, dl, MVT::i64, - CurDAG->getTargetConstant(0, dl, MVT::i64), Result, - CurDAG->getTargetConstant(X86::sub_32bit, dl, - MVT::i32)), - 0); - } + assert(Node->getValueType(1) == MVT::i32 && "Unexpected result type!"); } else { Result = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result); @@ -2512,6 +2941,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) { ReplaceUses(SDValue(Node, 1), Result); DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n'); } + CurDAG->RemoveDeadNode(Node); return; } @@ -2531,34 +2961,21 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to // use a smaller encoding. // Look past the truncate if CMP is the only use of it. - if ((N0.getNode()->getOpcode() == ISD::AND || - (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) && + if ((N0.getOpcode() == ISD::AND || + (N0.getResNo() == 0 && N0.getOpcode() == X86ISD::AND)) && N0.getNode()->hasOneUse() && N0.getValueType() != MVT::i8 && X86::isZeroNode(N1)) { ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1)); if (!C) break; + uint64_t Mask = C->getZExtValue(); // For example, convert "testl %eax, $8" to "testb %al, $8" - if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 && - (!(C->getZExtValue() & 0x80) || - hasNoSignedComparisonUses(Node))) { - SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8); + if (isUInt<8>(Mask) && + (!(Mask & 0x80) || hasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i8); SDValue Reg = N0.getOperand(0); - // On x86-32, only the ABCD registers have 8-bit subregisters. 
- if (!Subtarget->is64Bit()) { - const TargetRegisterClass *TRC; - switch (N0.getSimpleValueType().SimpleTy) { - case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; - case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; - default: llvm_unreachable("Unsupported TEST operand type!"); - } - SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); - Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, - Reg.getValueType(), Reg, RC), 0); - } - // Extract the l-register. SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Reg); @@ -2570,30 +2987,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); + CurDAG->RemoveDeadNode(Node); return; } // For example, "testl %eax, $2048" to "testb %ah, $8". - if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 && - (!(C->getZExtValue() & 0x8000) || - hasNoSignedComparisonUses(Node))) { + if (isShiftedUInt<8, 8>(Mask) && + (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) { // Shift the immediate right by 8 bits. - SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8, - dl, MVT::i8); + SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8); SDValue Reg = N0.getOperand(0); - // Put the value in an ABCD register. - const TargetRegisterClass *TRC; - switch (N0.getSimpleValueType().SimpleTy) { - case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break; - case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break; - case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break; - default: llvm_unreachable("Unsupported TEST operand type!"); - } - SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32); - Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl, - Reg.getValueType(), Reg, RC), 0); - // Extract the h-register. SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, Reg); @@ -2607,16 +3011,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); + CurDAG->RemoveDeadNode(Node); return; } // For example, "testl %eax, $32776" to "testw %ax, $32776". - if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 && - N0.getValueType() != MVT::i16 && - (!(C->getZExtValue() & 0x8000) || - hasNoSignedComparisonUses(Node))) { - SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, - MVT::i16); + // NOTE: We only want to form TESTW instructions if optimizing for + // min size. Otherwise we only save one byte and possibly get a length + // changing prefix penalty in the decoders. + if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 && + (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16); SDValue Reg = N0.getOperand(0); // Extract the 16-bit subregister. @@ -2630,16 +3035,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); + CurDAG->RemoveDeadNode(Node); return; } // For example, "testq %rax, $268468232" to "testl %eax, $268468232". 
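A quick aside on the TEST-narrowing cases above (plain C++, not part of the patch): the %ah form is sound because a mask confined to bits 8..15 tests the same condition whether it is applied to the whole register or, shifted right by 8, to the high byte. A minimal sketch of that identity, with testFull/testHighByte being names invented here for illustration:

#include <cassert>
#include <cstdint>

// Models "testl %reg, imm32" for a mask that fits in bits 8..15.
static bool testFull(uint32_t Reg, uint32_t Mask) { return (Reg & Mask) != 0; }

// Models "testb %<reg>h, imm8" with imm8 = Mask >> 8, as the selection code above does.
static bool testHighByte(uint32_t Reg, uint32_t Mask) {
  return (((Reg >> 8) & 0xff) & (Mask >> 8)) != 0;
}

int main() {
  // e.g. "testl %eax, $2048" vs. "testb %ah, $8"
  for (uint32_t Reg : {0u, 0x800u, 0x1234u, 0xffffffffu})
    assert(testFull(Reg, 0x800) == testHighByte(Reg, 0x800));
  return 0;
}

The TESTW case is gated on OptForMinSize for the reason the new comment gives: the 66h operand-size prefix makes the immediate length-changing, which several decoders penalize, so the one-byte saving is only taken when minimizing size.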
- if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 && - N0.getValueType() == MVT::i64 && - (!(C->getZExtValue() & 0x80000000) || - hasNoSignedComparisonUses(Node))) { - SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, - MVT::i32); + if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 && + (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) { + SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32); SDValue Reg = N0.getOperand(0); // Extract the 32-bit subregister. @@ -2653,60 +3056,16 @@ void X86DAGToDAGISel::Select(SDNode *Node) { // one, do not call ReplaceAllUsesWith. ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)), SDValue(NewNode, 0)); + CurDAG->RemoveDeadNode(Node); return; } } break; } - case ISD::STORE: { - // Change a chain of {load; incr or dec; store} of the same value into - // a simple increment or decrement through memory of that value, if the - // uses of the modified value and its address are suitable. - // The DEC64m tablegen pattern is currently not able to match the case where - // the EFLAGS on the original DEC are used. (This also applies to - // {INC,DEC}X{64,32,16,8}.) - // We'll need to improve tablegen to allow flags to be transferred from a - // node in the pattern to the result node. probably with a new keyword - // for example, we have this - // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", - // [(store (add (loadi64 addr:$dst), -1), addr:$dst), - // (implicit EFLAGS)]>; - // but maybe need something like this - // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", - // [(store (add (loadi64 addr:$dst), -1), addr:$dst), - // (transferrable EFLAGS)]>; - - StoreSDNode *StoreNode = cast<StoreSDNode>(Node); - SDValue StoredVal = StoreNode->getOperand(1); - unsigned Opc = StoredVal->getOpcode(); - - LoadSDNode *LoadNode = nullptr; - SDValue InputChain; - if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG, - LoadNode, InputChain)) - break; - - SDValue Base, Scale, Index, Disp, Segment; - if (!selectAddr(LoadNode, LoadNode->getBasePtr(), - Base, Scale, Index, Disp, Segment)) - break; - - MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2); - MemOp[0] = StoreNode->getMemOperand(); - MemOp[1] = LoadNode->getMemOperand(); - const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain }; - EVT LdVT = LoadNode->getMemoryVT(); - unsigned newOpc = getFusedLdStOpcode(LdVT, Opc); - MachineSDNode *Result = CurDAG->getMachineNode(newOpc, - SDLoc(Node), - MVT::i32, MVT::Other, Ops); - Result->setMemRefs(MemOp, MemOp + 2); - - ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); - ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); - CurDAG->RemoveDeadNode(Node); - return; - } + case ISD::STORE: + if (foldLoadStoreIntoMemOperand(Node)) + return; + break; } SelectCode(Node); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 607bc4530abb..a72f4daa5e11 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -35,6 +35,7 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/WinEHFuncInfo.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/CallingConv.h" @@ -55,7 +56,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" -#include "llvm/Target/TargetLowering.h" #include 
"llvm/Target/TargetOptions.h" #include <algorithm> #include <bitset> @@ -94,7 +94,7 @@ static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, const char *Msg) { MachineFunction &MF = DAG.getMachineFunction(); DAG.getContext()->diagnose( - DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc())); + DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc())); } X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, @@ -188,6 +188,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setCondCodeAction(ISD::SETUNE, MVT::f64, Expand); setCondCodeAction(ISD::SETUNE, MVT::f80, Expand); + // Integer absolute. + if (Subtarget.hasCMov()) { + setOperationAction(ISD::ABS , MVT::i16 , Custom); + setOperationAction(ISD::ABS , MVT::i32 , Custom); + if (Subtarget.is64Bit()) + setOperationAction(ISD::ABS , MVT::i64 , Custom); + } + // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this // operation. setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote); @@ -372,8 +380,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // Special handling for half-precision floating point conversions. // If we don't have F16C support, then lower half float conversions // into library calls. - if (Subtarget.useSoftFloat() || - (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) { + if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) { setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand); setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand); } @@ -392,7 +399,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTruncStoreAction(MVT::f80, MVT::f16, Expand); if (Subtarget.hasPOPCNT()) { - setOperationAction(ISD::CTPOP , MVT::i8 , Promote); + setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32); } else { setOperationAction(ISD::CTPOP , MVT::i8 , Expand); setOperationAction(ISD::CTPOP , MVT::i16 , Expand); @@ -425,12 +432,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand); setOperationAction(ISD::EH_RETURN , MVT::Other, Custom); - // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support - // SjLj exception handling but a light-weight setjmp/longjmp replacement to - // support continuation, user-level threading, and etc.. As a result, no - // other SjLj exception interfaces are implemented and please don't build - // your own exception handling based on them. - // LLVM/Clang supports zero-cost DWARF exception handling. + // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since + // LLVM/Clang supports zero-cost DWARF and SEH exception handling. setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom); @@ -545,8 +548,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } else if (UseX87 && X86ScalarSSEf32) { // Use SSE for f32, x87 for f64. // Set up the FP register classes. - addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass - : &X86::FR32RegClass); + addRegisterClass(MVT::f32, &X86::FR32RegClass); addRegisterClass(MVT::f64, &X86::RFP64RegClass); // Use ANDPS to simulate FABS. 
@@ -573,11 +575,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS - if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f64, Expand); - setOperationAction(ISD::FCOS , MVT::f64, Expand); - setOperationAction(ISD::FSINCOS, MVT::f64, Expand); - } + // Always expand sin/cos functions even though x87 has an instruction. + setOperationAction(ISD::FSIN , MVT::f64, Expand); + setOperationAction(ISD::FCOS , MVT::f64, Expand); + setOperationAction(ISD::FSINCOS, MVT::f64, Expand); } else if (UseX87) { // f32 and f64 in x87. // Set up the FP register classes. @@ -588,11 +589,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::UNDEF, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); - if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , VT, Expand); - setOperationAction(ISD::FCOS , VT, Expand); - setOperationAction(ISD::FSINCOS, VT, Expand); - } + // Always expand sin/cos functions even though x87 has an instruction. + setOperationAction(ISD::FSIN , VT, Expand); + setOperationAction(ISD::FCOS , VT, Expand); + setOperationAction(ISD::FSINCOS, VT, Expand); } addLegalFPImmediate(APFloat(+0.0)); // FLD0 addLegalFPImmediate(APFloat(+1.0)); // FLD1 @@ -636,11 +636,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addLegalFPImmediate(TmpFlt2); // FLD1/FCHS } - if (!TM.Options.UnsafeFPMath) { - setOperationAction(ISD::FSIN , MVT::f80, Expand); - setOperationAction(ISD::FCOS , MVT::f80, Expand); - setOperationAction(ISD::FSINCOS, MVT::f80, Expand); - } + // Always expand sin/cos functions even though x87 has an instruction. + setOperationAction(ISD::FSIN , MVT::f80, Expand); + setOperationAction(ISD::FCOS , MVT::f80, Expand); + setOperationAction(ISD::FSINCOS, MVT::f80, Expand); setOperationAction(ISD::FFLOOR, MVT::f80, Expand); setOperationAction(ISD::FCEIL, MVT::f80, Expand); @@ -861,8 +860,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom); setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom); // Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion. 
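On the "fast v2f32 UINT_TO_FP(v2i32)" conversion kept as Custom above: a well-known branch-free way to convert an unsigned 32-bit integer exactly is the 2^52 magic-number trick, sketched below in ordinary C++ (u32ToDouble is a name made up for this example; it only shows the scalar idea, not necessarily the exact sequence the vector lowering emits). The double result is exact, and a further double-to-float rounding gives the same value as a direct u32-to-float conversion.

#include <cassert>
#include <cstdint>
#include <cstring>

// Place the integer in the low 32 mantissa bits of a double whose bit pattern
// encodes 2^52; the value is then exactly 2^52 + X, so subtracting 2^52
// recovers X with no rounding.
static double u32ToDouble(uint32_t X) {
  uint64_t Bits = 0x4330000000000000ULL | X; // IEEE-754 bits of 2^52 + X
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 4503599627370496.0;             // minus 2^52
}

int main() {
  assert(u32ToDouble(0) == 0.0);
  assert(u32ToDouble(7) == 7.0);
  assert(u32ToDouble(0xffffffffu) == 4294967295.0);
  return 0;
}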
@@ -944,6 +941,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) { setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal); + setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal); setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal); setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal); @@ -1002,13 +1000,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote); setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom); - for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal); @@ -1104,7 +1098,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, // (result) is 128-bit but the source is 256-bit wide. for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64, MVT::v4f32, MVT::v2f64 }) { - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); } // Custom lower several nodes for 256-bit types. @@ -1131,6 +1125,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64); setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64); } + + if (HasInt256) { + // Custom legalize 2x32 to get a little better code. + setOperationAction(ISD::MGATHER, MVT::v2f32, Custom); + setOperationAction(ISD::MGATHER, MVT::v2i32, Custom); + + for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, + MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) + setOperationAction(ISD::MGATHER, VT, Custom); + } } if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) { @@ -1143,13 +1147,55 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v8i1, &X86::VK8RegClass); addRegisterClass(MVT::v16i1, &X86::VK16RegClass); + setOperationAction(ISD::SELECT, MVT::v1i1, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); + setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); + + setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); + setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom); + + // Extends of v16i1/v8i1 to 128-bit vectors. 
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom); + + for (auto VT : { MVT::v8i1, MVT::v16i1 }) { + setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::TRUNCATE, VT, Custom); + + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Expand); + } + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); + for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, + MVT::v16i1, MVT::v32i1, MVT::v64i1 }) + setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); + for (MVT VT : MVT::fp_vector_valuetypes()) setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal); - for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { + for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal); setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal); - setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal); setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal); @@ -1173,98 +1219,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v16i16, Promote); + setOperationAction(ISD::FP_TO_SINT, MVT::v16i8, Promote); setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i8, Promote); + setOperationAction(ISD::FP_TO_UINT, MVT::v16i16, Promote); setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote); setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom); - setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom); - setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom); - setOperationAction(ISD::UINT_TO_FP, 
MVT::v2i1, Custom); - setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal); - setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal); setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal); setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal); - if (Subtarget.hasVLX()){ - setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); - setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); - setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); - setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); - setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); - - setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); - setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); - setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); - setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); - setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); - } else { + + if (!Subtarget.hasVLX()) { + // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE + // to 512-bit rather than use the AVX2 instructions so that we can use + // k-masks. for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) { setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::MSTORE, VT, Custom); } } - setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); - - if (Subtarget.hasDQI()) { - for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) { - setOperationAction(ISD::SINT_TO_FP, VT, Legal); - setOperationAction(ISD::UINT_TO_FP, VT, Legal); - setOperationAction(ISD::FP_TO_SINT, VT, Legal); - setOperationAction(ISD::FP_TO_UINT, VT, Legal); - } - if (Subtarget.hasVLX()) { - // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion. - setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); - setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); - setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); - } - } - if (Subtarget.hasVLX()) { - setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal); - setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); - setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); - setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal); - setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); - setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); - - // FIXME. This commands are available on SSE/AVX2, add relevant patterns. 
- setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal); - setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal); - } + setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom); setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom); @@ -1272,9 +1252,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom); for (auto VT : { MVT::v16f32, MVT::v8f64 }) { setOperationAction(ISD::FFLOOR, VT, Legal); @@ -1295,38 +1272,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom); setOperationAction(ISD::MUL, MVT::v8i64, Custom); + setOperationAction(ISD::MUL, MVT::v16i32, Legal); + + setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom); + setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom); setOperationAction(ISD::SELECT, MVT::v8f64, Custom); setOperationAction(ISD::SELECT, MVT::v8i64, Custom); setOperationAction(ISD::SELECT, MVT::v16f32, Custom); - setOperationAction(ISD::MUL, MVT::v16i32, Legal); - - // NonVLX sub-targets extend 128/256 vectors to use the 512 version. 
- setOperationAction(ISD::ABS, MVT::v4i64, Legal); - setOperationAction(ISD::ABS, MVT::v2i64, Legal); - - for (auto VT : { MVT::v8i1, MVT::v16i1 }) { - setOperationAction(ISD::ADD, VT, Custom); - setOperationAction(ISD::SUB, VT, Custom); - setOperationAction(ISD::MUL, VT, Custom); - setOperationAction(ISD::SETCC, VT, Custom); - setOperationAction(ISD::SELECT, VT, Custom); - setOperationAction(ISD::TRUNCATE, VT, Custom); - - setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Expand); - } - for (auto VT : { MVT::v16i32, MVT::v8i64 }) { setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); @@ -1338,11 +1294,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::SRA, VT, Custom); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); - } - - // NonVLX sub-targets extend 128/256 vectors to use the 512 version. - for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64, - MVT::v8i64}) { setOperationAction(ISD::ROTL, VT, Custom); setOperationAction(ISD::ROTR, VT, Custom); } @@ -1354,44 +1305,33 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64); setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64); + if (Subtarget.hasDQI()) { + setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal); + + setOperationAction(ISD::MUL, MVT::v8i64, Legal); + } + if (Subtarget.hasCDI()) { // NonVLX sub-targets extend 128/256 vectors to use the 512 version. - for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, - MVT::v4i64, MVT::v8i64}) { + for (auto VT : { MVT::v16i32, MVT::v8i64} ) { setOperationAction(ISD::CTLZ, VT, Legal); setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); } } // Subtarget.hasCDI() - if (Subtarget.hasDQI()) { - // NonVLX sub-targets extend 128/256 vectors to use the 512 version. - setOperationAction(ISD::MUL, MVT::v2i64, Legal); - setOperationAction(ISD::MUL, MVT::v4i64, Legal); - setOperationAction(ISD::MUL, MVT::v8i64, Legal); - } - if (Subtarget.hasVPOPCNTDQ()) { - // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512 - // version of popcntd/q. - for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64, - MVT::v4i32, MVT::v2i64}) + for (auto VT : { MVT::v16i32, MVT::v8i64 }) setOperationAction(ISD::CTPOP, VT, Legal); } - // Custom lower several nodes. - for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, - MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) { - setOperationAction(ISD::MGATHER, VT, Custom); - setOperationAction(ISD::MSCATTER, VT, Custom); - } // Extract subvector is special because the value type // (result) is 256-bit but the source is 512-bit wide. - // 128-bit was made Custom under AVX1. + // 128-bit was made Legal under AVX1. 
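As context for the subvector-extract legality changes here (and for the VEXTRACT immediate helpers this patch deletes further down), the lane immediate for a 128-bit extract is simply the element index divided by the number of elements per 128-bit chunk. A standalone sketch, with extract128Imm invented for illustration and not part of the patch:

#include <cassert>

// For N-bit elements, a 128-bit chunk holds 128/N of them, so the
// VEXTRACTF128/VEXTRACTI128 immediate is ElementIndex / (128 / N).
static unsigned extract128Imm(unsigned EltIndex, unsigned EltBits) {
  unsigned ElemsPerChunk = 128 / EltBits;
  return EltIndex / ElemsPerChunk;
}

int main() {
  assert(extract128Imm(0, 32) == 0); // low v4i32 lane of a v8i32
  assert(extract128Imm(4, 32) == 1); // high v4i32 lane of a v8i32
  assert(extract128Imm(2, 64) == 1); // high v2i64 lane of a v4i64
  return 0;
}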
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64, - MVT::v8f32, MVT::v4f64, MVT::v1i1 }) - setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom); - for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, - MVT::v16i1, MVT::v32i1, MVT::v64i1 }) + MVT::v8f32, MVT::v4f64 }) setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal); for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) { @@ -1404,7 +1344,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); setOperationAction(ISD::MLOAD, VT, Legal); setOperationAction(ISD::MSTORE, VT, Legal); - setOperationAction(ISD::MGATHER, VT, Legal); + setOperationAction(ISD::MGATHER, VT, Custom); setOperationAction(ISD::MSCATTER, VT, Custom); } for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) { @@ -1413,6 +1353,59 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } }// has AVX-512 + if (!Subtarget.useSoftFloat() && + (Subtarget.hasAVX512() || Subtarget.hasVLX())) { + // These operations are handled on non-VLX by artificially widening in + // isel patterns. + // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? + + setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal); + setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom); + setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal); + setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal); + + for (auto VT : { MVT::v2i64, MVT::v4i64 }) { + setOperationAction(ISD::SMAX, VT, Legal); + setOperationAction(ISD::UMAX, VT, Legal); + setOperationAction(ISD::SMIN, VT, Legal); + setOperationAction(ISD::UMIN, VT, Legal); + setOperationAction(ISD::ABS, VT, Legal); + } + + for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { + setOperationAction(ISD::ROTL, VT, Custom); + setOperationAction(ISD::ROTR, VT, Custom); + } + + for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64, + MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) + setOperationAction(ISD::MSCATTER, VT, Custom); + + if (Subtarget.hasDQI()) { + for (auto VT : { MVT::v2i64, MVT::v4i64 }) { + setOperationAction(ISD::SINT_TO_FP, VT, Legal); + setOperationAction(ISD::UINT_TO_FP, VT, Legal); + setOperationAction(ISD::FP_TO_SINT, VT, Legal); + setOperationAction(ISD::FP_TO_UINT, VT, Legal); + + setOperationAction(ISD::MUL, VT, Legal); + } + } + + if (Subtarget.hasCDI()) { + for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) { + setOperationAction(ISD::CTLZ, VT, Legal); + setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom); + } + } // Subtarget.hasCDI() + + if (Subtarget.hasVPOPCNTDQ()) { + for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) + setOperationAction(ISD::CTPOP, VT, Legal); + } + } + if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) { addRegisterClass(MVT::v32i16, &X86::VR512RegClass); addRegisterClass(MVT::v64i8, &X86::VR512RegClass); @@ -1420,77 +1413,62 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, addRegisterClass(MVT::v32i1, &X86::VK32RegClass); addRegisterClass(MVT::v64i1, &X86::VK64RegClass); - setOperationAction(ISD::ADD, MVT::v32i1, Custom); - setOperationAction(ISD::ADD, MVT::v64i1, Custom); - setOperationAction(ISD::SUB, MVT::v32i1, Custom); - setOperationAction(ISD::SUB, MVT::v64i1, Custom); - setOperationAction(ISD::MUL, MVT::v32i1, Custom); - setOperationAction(ISD::MUL, MVT::v64i1, Custom); + for (auto VT : { MVT::v32i1, MVT::v64i1 }) { + 
setOperationAction(ISD::ADD, VT, Custom); + setOperationAction(ISD::SUB, VT, Custom); + setOperationAction(ISD::MUL, VT, Custom); + setOperationAction(ISD::VSELECT, VT, Expand); + + setOperationAction(ISD::TRUNCATE, VT, Custom); + setOperationAction(ISD::SETCC, VT, Custom); + setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); + setOperationAction(ISD::SELECT, VT, Custom); + setOperationAction(ISD::BUILD_VECTOR, VT, Custom); + setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); + } + + setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); + setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); + setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); + + // Extends from v32i1 masks to 256-bit vectors. + setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom); + // Extends from v64i1 masks to 512-bit vectors. + setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom); - setOperationAction(ISD::SETCC, MVT::v32i1, Custom); - setOperationAction(ISD::SETCC, MVT::v64i1, Custom); setOperationAction(ISD::MUL, MVT::v32i16, Legal); setOperationAction(ISD::MUL, MVT::v64i8, Custom); setOperationAction(ISD::MULHS, MVT::v32i16, Legal); setOperationAction(ISD::MULHU, MVT::v32i16, Legal); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom); - setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom); + setOperationAction(ISD::MULHS, MVT::v64i8, Custom); + setOperationAction(ISD::MULHU, MVT::v64i8, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom); - setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom); - setOperationAction(ISD::SELECT, MVT::v32i1, Custom); - setOperationAction(ISD::SELECT, MVT::v64i1, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom); setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom); setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom); - setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom); - setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom); - setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); - 
setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom); - setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom); - setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom); - setOperationAction(ISD::VSELECT, MVT::v32i1, Expand); - setOperationAction(ISD::VSELECT, MVT::v64i1, Expand); setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom); setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom); setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal); - if (Subtarget.hasVLX()) { - setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); - setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); - } - - LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom; - for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { - setOperationAction(ISD::MLOAD, VT, Action); - setOperationAction(ISD::MSTORE, VT, Action); - } - - if (Subtarget.hasCDI()) { - setOperationAction(ISD::CTLZ, MVT::v32i16, Custom); - setOperationAction(ISD::CTLZ, MVT::v64i8, Custom); - } for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); @@ -1503,6 +1481,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::MSTORE, VT, Legal); setOperationAction(ISD::CTPOP, VT, Custom); setOperationAction(ISD::CTTZ, VT, Custom); + setOperationAction(ISD::CTLZ, VT, Custom); setOperationAction(ISD::SMAX, VT, Legal); setOperationAction(ISD::UMAX, VT, Legal); setOperationAction(ISD::SMIN, VT, Legal); @@ -1513,13 +1492,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64); } - for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) { + for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) { setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal); - if (Subtarget.hasVLX()) { - // FIXME. This commands are available on SSE/AVX2, add relevant patterns. - setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal); - setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal); - } + } + + if (Subtarget.hasBITALG()) { + for (auto VT : { MVT::v64i8, MVT::v32i16 }) + setOperationAction(ISD::CTPOP, VT, Legal); + } + } + + if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() && + (Subtarget.hasAVX512() || Subtarget.hasVLX())) { + for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) { + setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom); + setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom); + } + + // These operations are handled on non-VLX by artificially widening in + // isel patterns. + // TODO: Custom widen in lowering on non-VLX and drop the isel patterns? + + if (Subtarget.hasBITALG()) { + for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 }) + setOperationAction(ISD::CTPOP, VT, Legal); } } @@ -1542,16 +1538,47 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); } + // TODO: v8i1 concat should be legal without VLX to support concats of + // v1i1, but we won't legalize it correctly currently without introducing + // a v4i1 concat in the middle. 
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom); setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom); - setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom); - for (auto VT : { MVT::v2i64, MVT::v4i64 }) { - setOperationAction(ISD::SMAX, VT, Legal); - setOperationAction(ISD::UMAX, VT, Legal); - setOperationAction(ISD::SMIN, VT, Legal); - setOperationAction(ISD::UMIN, VT, Legal); + // Extends from v2i1/v4i1 masks to 128-bit vectors. + setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom); + setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom); + + setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal); + setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal); + setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal); + + setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal); + setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal); + setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal); + + if (Subtarget.hasDQI()) { + // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion. + // v2f32 UINT_TO_FP is already custom under SSE2. + setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom); + assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) && + "Unexpected operation action!"); + // v2i64 FP_TO_S/UINT(v2f32) custom conversion. + setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom); + setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom); + } + + if (Subtarget.hasBWI()) { + setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal); + setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal); } } @@ -1592,6 +1619,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setLibcallName(RTLIB::SHL_I128, nullptr); setLibcallName(RTLIB::SRL_I128, nullptr); setLibcallName(RTLIB::SRA_I128, nullptr); + setLibcallName(RTLIB::MUL_I128, nullptr); } // Combine sin / cos into one node or libcall if possible. @@ -1631,6 +1659,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setTargetDAGCombine(ISD::VECTOR_SHUFFLE); setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); setTargetDAGCombine(ISD::INSERT_SUBVECTOR); + setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR); setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::VSELECT); setTargetDAGCombine(ISD::SELECT); @@ -1698,6 +1727,19 @@ bool X86TargetLowering::useLoadStackGuardNode() const { return Subtarget.isTargetMachO() && Subtarget.is64Bit(); } +bool X86TargetLowering::useStackGuardXorFP() const { + // Currently only MSVC CRTs XOR the frame pointer into the stack guard value. + return Subtarget.getTargetTriple().isOSMSVCRT(); +} + +SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, + const SDLoc &DL) const { + EVT PtrTy = getPointerTy(DAG.getDataLayout()); + unsigned XorOp = Subtarget.is64Bit() ? 
X86::XOR64_FP : X86::XOR32_FP; + MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val); + return SDValue(Node, 0); +} + TargetLoweringBase::LegalizeTypeAction X86TargetLowering::getPreferredVectorAction(EVT VT) const { if (ExperimentalVectorWideningLegalization && @@ -1714,40 +1756,26 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL, if (!VT.isVector()) return MVT::i8; - if (VT.isSimple()) { - MVT VVT = VT.getSimpleVT(); - const unsigned NumElts = VVT.getVectorNumElements(); - MVT EltVT = VVT.getVectorElementType(); - if (VVT.is512BitVector()) { - if (Subtarget.hasAVX512()) - if (EltVT == MVT::i32 || EltVT == MVT::i64 || - EltVT == MVT::f32 || EltVT == MVT::f64) - switch(NumElts) { - case 8: return MVT::v8i1; - case 16: return MVT::v16i1; - } - if (Subtarget.hasBWI()) - if (EltVT == MVT::i8 || EltVT == MVT::i16) - switch(NumElts) { - case 32: return MVT::v32i1; - case 64: return MVT::v64i1; - } - } + if (Subtarget.hasAVX512()) { + const unsigned NumElts = VT.getVectorNumElements(); - if (Subtarget.hasBWI() && Subtarget.hasVLX()) - return MVT::getVectorVT(MVT::i1, NumElts); + // Figure out what this type will be legalized to. + EVT LegalVT = VT; + while (getTypeAction(Context, LegalVT) != TypeLegal) + LegalVT = getTypeToTransformTo(Context, LegalVT); - if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) { - EVT LegalVT = getTypeToTransformTo(Context, VT); - EltVT = LegalVT.getVectorElementType().getSimpleVT(); - } + // If we got a 512-bit vector then we'll definitely have a vXi1 compare. + if (LegalVT.getSimpleVT().is512BitVector()) + return EVT::getVectorVT(Context, MVT::i1, NumElts); - if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32) - switch(NumElts) { - case 2: return MVT::v2i1; - case 4: return MVT::v4i1; - case 8: return MVT::v8i1; - } + if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) { + // If we legalized to less than a 512-bit vector, then we will use a vXi1 + // compare for vXi32/vXi64 for sure. If we have BWI we will also support + // vXi16/vXi8. + MVT EltVT = LegalVT.getSimpleVT().getVectorElementType(); + if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32) + return EVT::getVectorVT(Context, MVT::i1, NumElts); + } } return VT.changeVectorElementTypeToInteger(); @@ -1815,8 +1843,8 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, MachineFunction &MF) const { - const Function *F = MF.getFunction(); - if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) { + const Function &F = MF.getFunction(); + if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) { if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() || ((DstAlign == 0 || DstAlign >= 16) && @@ -1912,7 +1940,7 @@ void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC, if (CC != CallingConv::C && CC != CallingConv::X86_StdCall) return; unsigned ParamRegs = 0; - if (auto *M = MF->getFunction()->getParent()) + if (auto *M = MF->getFunction().getParent()) ParamRegs = M->getNumberRegisterParameters(); // Mark the first N int arguments as having reg @@ -2017,7 +2045,7 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const { // sysdeps/{i386,x86_64}/nptl/tls.h) if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) { if (Subtarget.isTargetFuchsia()) { - // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value. + // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value. 
return SegmentOffset(IRB, 0x10, getAddressSpace()); } else { // %fs:0x28, unless we're using a Kernel code model, in which case @@ -2082,7 +2110,7 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const { // Fuchsia is similar. if (Subtarget.isTargetFuchsia()) { - // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value. + // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value. return SegmentOffset(IRB, 0x18, getAddressSpace()); } @@ -2145,8 +2173,7 @@ static void Passv64i1ArgInRegs( const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg, SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA, CCValAssign &NextVA, const X86Subtarget &Subtarget) { - assert((Subtarget.hasBWI() || Subtarget.hasBMI()) && - "Expected AVX512BW or AVX512BMI target!"); + assert(Subtarget.hasBWI() && "Expected AVX512BW target!"); assert(Subtarget.is32Bit() && "Expecting 32 bit target"); assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value"); assert(VA.isRegLoc() && NextVA.isRegLoc() && @@ -2180,7 +2207,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // For example, when they are used for argument passing. bool ShouldDisableCalleeSavedRegister = CallConv == CallingConv::X86_RegCall || - MF.getFunction()->hasFnAttribute("no_caller_saved_registers"); + MF.getFunction().hasFnAttribute("no_caller_saved_registers"); if (CallConv == CallingConv::X86_INTR && !Outs.empty()) report_fatal_error("X86 interrupts may not return any value"); @@ -2862,8 +2889,8 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF, return None; } - const Function *Fn = MF.getFunction(); - bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat); + const Function &F = MF.getFunction(); + bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool isSoftFloat = Subtarget.useSoftFloat(); assert(!(isSoftFloat && NoImplicitFloatOps) && "SSE register cannot be used when SSE is disabled!"); @@ -2896,10 +2923,9 @@ SDValue X86TargetLowering::LowerFormalArguments( X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>(); const TargetFrameLowering &TFI = *Subtarget.getFrameLowering(); - const Function *Fn = MF.getFunction(); - if (Fn->hasExternalLinkage() && - Subtarget.isTargetCygMing() && - Fn->getName() == "main") + const Function &F = MF.getFunction(); + if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() && + F.getName() == "main") FuncInfo->setForceFramePointer(true); MachineFrameInfo &MFI = MF.getFrameInfo(); @@ -3074,7 +3100,7 @@ SDValue X86TargetLowering::LowerFormalArguments( // Figure out if XMM registers are in use. 
assert(!(Subtarget.useSoftFloat() && - Fn->hasFnAttribute(Attribute::NoImplicitFloat)) && + F.hasFnAttribute(Attribute::NoImplicitFloat)) && "SSE register cannot be used when SSE is disabled!"); // 64-bit calling conventions support varargs and register parameters, so we @@ -3231,7 +3257,7 @@ SDValue X86TargetLowering::LowerFormalArguments( FuncInfo->setArgumentStackSize(StackSize); if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) { - EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn()); + EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn()); if (Personality == EHPersonality::CoreCLR) { assert(Is64Bit); // TODO: Add a mechanism to frame lowering that will allow us to indicate @@ -3248,10 +3274,10 @@ SDValue X86TargetLowering::LowerFormalArguments( } if (CallConv == CallingConv::X86_RegCall || - Fn->hasFnAttribute("no_caller_saved_registers")) { - const MachineRegisterInfo &MRI = MF.getRegInfo(); - for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end())) - MF.getRegInfo().disableCalleeSavedRegister(Pair.first); + F.hasFnAttribute("no_caller_saved_registers")) { + MachineRegisterInfo &MRI = MF.getRegInfo(); + for (std::pair<unsigned, unsigned> Pair : MRI.liveins()) + MRI.disableCalleeSavedRegister(Pair.first); } return Chain; @@ -3339,9 +3365,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU()); bool IsSibcall = false; X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>(); - auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls"); - const CallInst *CI = - CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr; + auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls"); + const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction()); const Function *Fn = CI ? CI->getCalledFunction() : nullptr; bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) || (Fn && Fn->hasFnAttribute("no_caller_saved_registers")); @@ -3365,7 +3390,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, isTailCall = false; } - bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall(); + bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall(); if (IsMustTail) { // Force this to be a tail call. The verifier rules are enough to ensure // that we can lower this successfully without moving the return address @@ -3375,7 +3400,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Check if it's really possible to do a tail call. isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg, SR != NotStructReturn, - MF.getFunction()->hasStructRetAttr(), CLI.RetTy, + MF.getFunction().hasStructRetAttr(), CLI.RetTy, Outs, OutVals, Ins, DAG); // Sibcalls are automatically detected tailcalls which do not require @@ -3721,7 +3746,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } } } else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) { - const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); + const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); unsigned char OpFlags = Subtarget.classifyGlobalFunctionReference(nullptr, *Mod); @@ -3769,11 +3794,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // is thrown, the runtime will not restore CSRs. 
// FIXME: Model this more precisely so that we can register allocate across // the normal edge and spill and fill across the exceptional edge. - if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) { - const Function *CallerFn = MF.getFunction(); + if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) { + const Function &CallerFn = MF.getFunction(); EHPersonality Pers = - CallerFn->hasPersonalityFn() - ? classifyEHPersonality(CallerFn->getPersonalityFn()) + CallerFn.hasPersonalityFn() + ? classifyEHPersonality(CallerFn.getPersonalityFn()) : EHPersonality::Unknown; if (isFuncletEHPersonality(Pers)) Mask = RegInfo->getNoPreservedMask(); @@ -4021,15 +4046,15 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization( // If -tailcallopt is specified, make fastcc functions tail-callable. MachineFunction &MF = DAG.getMachineFunction(); - const Function *CallerF = MF.getFunction(); + const Function &CallerF = MF.getFunction(); // If the function return type is x86_fp80 and the callee return type is not, // then the FP_EXTEND of the call result is not a nop. It's not safe to // perform a tailcall optimization here. - if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) + if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty()) return false; - CallingConv::ID CallerCC = CallerF->getCallingConv(); + CallingConv::ID CallerCC = CallerF.getCallingConv(); bool CCMatch = CallerCC == CalleeCC; bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC); bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC); @@ -4243,7 +4268,6 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::VSHLDQ: case X86ISD::VSRLDQ: case X86ISD::MOVLHPS: - case X86ISD::MOVLHPD: case X86ISD::MOVHLPS: case X86ISD::MOVLPS: case X86ISD::MOVLPD: @@ -4491,6 +4515,7 @@ static bool hasFPCMov(unsigned X86CC) { bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, unsigned Intrinsic) const { const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic); @@ -4498,9 +4523,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, return false; Info.opc = ISD::INTRINSIC_W_CHAIN; - Info.readMem = false; - Info.writeMem = false; - Info.vol = false; + Info.flags = MachineMemOperand::MONone; Info.offset = 0; switch (IntrData->Type) { @@ -4508,14 +4531,14 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.ptrVal = I.getArgOperand(0); Info.memVT = MVT::getVT(I.getType()); Info.align = 1; - Info.readMem = true; + Info.flags |= MachineMemOperand::MOLoad; break; } case COMPRESS_TO_MEM: { Info.ptrVal = I.getArgOperand(0); Info.memVT = MVT::getVT(I.getArgOperand(1)->getType()); Info.align = 1; - Info.writeMem = true; + Info.flags |= MachineMemOperand::MOStore; break; } case TRUNCATE_TO_MEM_VI8: @@ -4533,7 +4556,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements()); Info.align = 1; - Info.writeMem = true; + Info.flags |= MachineMemOperand::MOStore; break; } default: @@ -4578,12 +4601,27 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm, return true; } -bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, +bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const { + // TODO: It might be a win to ease or lift this restriction, but the generic + // folds in DAGCombiner conflict with vector folds for an AVX512 target. 
+ if (VT.isVector() && Subtarget.hasAVX512()) + return false; + + return true; +} + +bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, unsigned Index) const { if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT)) return false; - return (Index == 0 || Index == ResVT.getVectorNumElements()); + // Mask vectors support all subregister combinations and operations that + // extract half of vector. + if (ResVT.getVectorElementType() == MVT::i1) + return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) && + (Index == ResVT.getVectorNumElements())); + + return (Index % ResVT.getVectorNumElements()) == 0; } bool X86TargetLowering::isCheapToSpeculateCttz() const { @@ -4596,6 +4634,20 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const { return Subtarget.hasLZCNT(); } +bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT, + const SelectionDAG &DAG) const { + // Do not merge to float value size (128 bytes) if no implicit + // float attribute is set. + bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat); + + if (NoFloat) { + unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32; + return (MemVT.getSizeInBits() <= MaxIntSize); + } + return true; +} + bool X86TargetLowering::isCtlzFast() const { return Subtarget.hasFastLZCNT(); } @@ -4778,123 +4830,6 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask, return true; } -/// Helper function to scale a shuffle or target shuffle mask, replacing each -/// mask index with the scaled sequential indices for an equivalent narrowed -/// mask. This is the reverse process to canWidenShuffleElements, but can always -/// succeed. -static void scaleShuffleMask(int Scale, ArrayRef<int> Mask, - SmallVectorImpl<int> &ScaledMask) { - assert(0 < Scale && "Unexpected scaling factor"); - int NumElts = Mask.size(); - ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1); - - for (int i = 0; i != NumElts; ++i) { - int M = Mask[i]; - - // Repeat sentinel values in every mask element. - if (M < 0) { - for (int s = 0; s != Scale; ++s) - ScaledMask[(Scale * i) + s] = M; - continue; - } - - // Scale mask element and increment across each mask element. - for (int s = 0; s != Scale; ++s) - ScaledMask[(Scale * i) + s] = (Scale * M) + s; - } -} - -/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector -/// extract that is suitable for instruction that extract 128 or 256 bit vectors -static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) { - assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); - if (!isa<ConstantSDNode>(N->getOperand(1).getNode())) - return false; - - // The index should be aligned on a vecWidth-bit boundary. - uint64_t Index = N->getConstantOperandVal(1); - MVT VT = N->getSimpleValueType(0); - unsigned ElSize = VT.getScalarSizeInBits(); - return (Index * ElSize) % vecWidth == 0; -} - -/// Return true if the specified INSERT_SUBVECTOR -/// operand specifies a subvector insert that is suitable for input to -/// insertion of 128 or 256-bit subvectors -static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) { - assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width"); - if (!isa<ConstantSDNode>(N->getOperand(2).getNode())) - return false; - - // The index should be aligned on a vecWidth-bit boundary. 
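The canMergeStoresTo override added in this hunk caps merged-store width at the general-purpose register size (64 bits in 64-bit mode, otherwise 32) whenever the function carries the NoImplicitFloat attribute, so store merging never widens to the 128-bit XMM size. A standalone model of that policy (plain C++, not LLVM code):

    #include <cassert>

    // Returns whether stores totalling MergedBits may be merged under the
    // noimplicitfloat rule modelled from this patch.
    static bool canMergeStores(unsigned MergedBits, bool Is64Bit,
                               bool NoImplicitFloat) {
      if (!NoImplicitFloat)
        return true;
      unsigned MaxIntBits = Is64Bit ? 64 : 32;
      return MergedBits <= MaxIntBits;
    }

    int main() {
      assert(canMergeStores(128, true, false));   // no attribute: merging is fine
      assert(!canMergeStores(128, true, true));   // would need an XMM store: rejected
      assert(canMergeStores(64, true, true));     // still fits in a GPR
      return 0;
    }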
- uint64_t Index = N->getConstantOperandVal(2); - MVT VT = N->getSimpleValueType(0); - unsigned ElSize = VT.getScalarSizeInBits(); - return (Index * ElSize) % vecWidth == 0; -} - -bool X86::isVINSERT128Index(SDNode *N) { - return isVINSERTIndex(N, 128); -} - -bool X86::isVINSERT256Index(SDNode *N) { - return isVINSERTIndex(N, 256); -} - -bool X86::isVEXTRACT128Index(SDNode *N) { - return isVEXTRACTIndex(N, 128); -} - -bool X86::isVEXTRACT256Index(SDNode *N) { - return isVEXTRACTIndex(N, 256); -} - -static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) { - assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); - assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) && - "Illegal extract subvector for VEXTRACT"); - - uint64_t Index = N->getConstantOperandVal(1); - MVT VecVT = N->getOperand(0).getSimpleValueType(); - unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits(); - return Index / NumElemsPerChunk; -} - -static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) { - assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width"); - assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) && - "Illegal insert subvector for VINSERT"); - - uint64_t Index = N->getConstantOperandVal(2); - MVT VecVT = N->getSimpleValueType(0); - unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits(); - return Index / NumElemsPerChunk; -} - -/// Return the appropriate immediate to extract the specified -/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions. -unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) { - return getExtractVEXTRACTImmediate(N, 128); -} - -/// Return the appropriate immediate to extract the specified -/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions. -unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) { - return getExtractVEXTRACTImmediate(N, 256); -} - -/// Return the appropriate immediate to insert at the specified -/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions. -unsigned X86::getInsertVINSERT128Immediate(SDNode *N) { - return getInsertVINSERTImmediate(N, 128); -} - -/// Return the appropriate immediate to insert at the specified -/// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions. -unsigned X86::getInsertVINSERT256Immediate(SDNode *N) { - return getInsertVINSERTImmediate(N, 256); -} - /// Returns true if Elt is a constant zero or a floating point constant +0.0. bool X86::isZeroNode(SDValue Elt) { return isNullConstant(Elt) || isNullFPConstant(Elt); @@ -5018,8 +4953,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG, // If the input is a buildvector just emit a smaller one. 
if (Vec.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getBuildVector( - ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk)); + return DAG.getBuildVector(ResultVT, dl, + Vec->ops().slice(IdxVal, ElemsPerChunk)); SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx); @@ -5093,10 +5028,13 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) { switch (Opcode) { default: return false; + case X86ISD::TESTM: + case X86ISD::TESTNM: case X86ISD::PCMPEQM: case X86ISD::PCMPGTM: case X86ISD::CMPM: case X86ISD::CMPMU: + case X86ISD::CMPM_RND: return true; } } @@ -5113,113 +5051,128 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG, if (!isa<ConstantSDNode>(Idx)) return SDValue(); + // Inserting undef is a nop. We can just return the original vector. + if (SubVec.isUndef()) + return Vec; + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - if (IdxVal == 0 && Vec.isUndef()) // the operation is legal + if (IdxVal == 0 && Vec.isUndef()) // the operation is legal return Op; MVT OpVT = Op.getSimpleValueType(); - MVT SubVecVT = SubVec.getSimpleValueType(); unsigned NumElems = OpVT.getVectorNumElements(); + + SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); + + // Extend to natively supported kshift. + MVT WideOpVT = OpVT; + if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8) + WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1; + + // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts + // if necessary. + if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) { + // May need to promote to a legal type. + Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + getZeroVector(WideOpVT, Subtarget, DAG, dl), + SubVec, Idx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + } + + MVT SubVecVT = SubVec.getSimpleValueType(); unsigned SubVecNumElems = SubVecVT.getVectorNumElements(); assert(IdxVal + SubVecNumElems <= NumElems && IdxVal % SubVecVT.getSizeInBits() == 0 && "Unexpected index value in INSERT_SUBVECTOR"); - // There are 3 possible cases: - // 1. Subvector should be inserted in the lower part (IdxVal == 0) - // 2. Subvector should be inserted in the upper part - // (IdxVal + SubVecNumElems == NumElems) - // 3. Subvector should be inserted in the middle (for example v2i1 - // to v16i1, index 2) - - // If this node widens - by concatenating zeroes - the type of the result - // of a node with instruction that zeroes all upper (irrelevant) bits of the - // output register, mark this node as legal to enable replacing them with - // the v8i1 version of the previous instruction during instruction selection. - // For example, VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg, - // while zeroing all the upper remaining 60 bits of the register. if the - // result of such instruction is inserted into an allZeroVector, then we can - // safely remove insert_vector (in instruction selection) as the cmp instr - // already zeroed the rest of the register. - if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 && - (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) || - (SubVec.getOpcode() == ISD::AND && - (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) || - isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode()))))) - return Op; - - // extend to natively supported kshift - MVT MinVT = Subtarget.hasDQI() ? 
MVT::v8i1 : MVT::v16i1; - MVT WideOpVT = OpVT; - if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits()) - WideOpVT = MinVT; - - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); SDValue Undef = DAG.getUNDEF(WideOpVT); - SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - Undef, SubVec, ZeroIdx); - // Extract sub-vector if require. - auto ExtractSubVec = [&](SDValue V) { - return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, - OpVT, V, ZeroIdx); - }; + if (IdxVal == 0) { + // Zero lower bits of the Vec + SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, + ZeroIdx); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); + // Merge them together, SubVec should be zero extended. + SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + getZeroVector(WideOpVT, Subtarget, DAG, dl), + SubVec, ZeroIdx); + Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + } + + SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + Undef, SubVec, ZeroIdx); if (Vec.isUndef()) { - if (IdxVal != 0) { - SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8); - WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, - ShiftBits); - } - return ExtractSubVec(WideSubVec); + assert(IdxVal != 0 && "Unexpected index"); + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } if (ISD::isBuildVectorAllZeros(Vec.getNode())) { + assert(IdxVal != 0 && "Unexpected index"); NumElems = WideOpVT.getVectorNumElements(); unsigned ShiftLeft = NumElems - SubVecNumElems; unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; - Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, - DAG.getConstant(ShiftLeft, dl, MVT::i8)); - Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, - DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec; - return ExtractSubVec(Vec); - } - - if (IdxVal == 0) { - // Zero lower bits of the Vec - SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); - // Merge them together, SubVec should be zero extended. 
- WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, - getZeroVector(WideOpVT, Subtarget, DAG, dl), - SubVec, ZeroIdx); - Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); - return ExtractSubVec(Vec); + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + if (ShiftRight != 0) + SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec, + DAG.getConstant(ShiftRight, dl, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx); } // Simple case when we put subvector in the upper part if (IdxVal + SubVecNumElems == NumElems) { - // Zero upper bits of the Vec - WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec, - DAG.getConstant(IdxVal, dl, MVT::i8)); - SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8); - Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); - Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); - Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec); - return ExtractSubVec(Vec); - } - // Subvector should be inserted in the middle - use shuffle - WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef, - SubVec, ZeroIdx); - SmallVector<int, 64> Mask; - for (unsigned i = 0; i < NumElems; ++i) - Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ? - i : i + NumElems); - return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask); + SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + if (SubVecNumElems * 2 == NumElems) { + // Special case, use legal zero extending insert_subvector. This allows + // isel to opimitize when bits are known zero. + Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx); + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + getZeroVector(WideOpVT, Subtarget, DAG, dl), + Vec, ZeroIdx); + } else { + // Otherwise use explicit shifts to zero the bits. + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, + Undef, Vec, ZeroIdx); + NumElems = WideOpVT.getVectorNumElements(); + SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8); + Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits); + Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits); + } + Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); + } + + // Inserting into the middle is more complicated. + + NumElems = WideOpVT.getVectorNumElements(); + + // Widen the vector if needed. + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx); + // Move the current value of the bit to be replace to the lsbs. + Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + // Xor with the new bit. + Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec); + // Shift to MSB, filling bottom bits with 0. + unsigned ShiftLeft = NumElems - SubVecNumElems; + Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op, + DAG.getConstant(ShiftLeft, dl, MVT::i8)); + // Shift to the final position, filling upper bits with 0. + unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal; + Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op, + DAG.getConstant(ShiftRight, dl, MVT::i8)); + // Xor with original vector leaving the new value. + Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op); + // Reduce to original width if needed. 
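The rewritten middle-insertion path just above replaces the old shuffle with a short mask-register sequence: shift the destination right so the field being replaced sits in the low bits, XOR it with the new subvector, shift the difference up to the MSBs (zeroing below and discarding any garbage above), shift it back down into position (zeroing above), and XOR it into the original vector. A standalone bit-twiddling model of that sequence (plain C++ on a uint64_t, not LLVM code; assumes the wide mask type has fewer than 64 elements):

    #include <cassert>
    #include <cstdint>

    static uint64_t insertBitsMiddle(uint64_t Vec, uint64_t Sub, unsigned NumElems,
                                     unsigned SubNumElems, unsigned Idx) {
      uint64_t WidthMask = (1ULL << NumElems) - 1;           // k-register width
      uint64_t Diff = (Vec >> Idx) ^ Sub;                    // KSHIFTR + XOR
      Diff = (Diff << (NumElems - SubNumElems)) & WidthMask; // KSHIFTL: to the MSBs
      Diff >>= (NumElems - SubNumElems - Idx);               // KSHIFTR: into place
      return (Vec ^ Diff) & WidthMask;                       // XOR with the original
    }

    int main() {
      // Insert the 2-bit value 0b01 into the 8-bit mask 0b11111111 at index 2:
      // bits 2..3 become 0b01, every other bit is preserved.
      assert(insertBitsMiddle(0xFF, 0x1, 8, 2, 2) == 0xF7);
      return 0;
    }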
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx); } /// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128 @@ -5273,22 +5226,6 @@ static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In, return DAG.getNode(Opc, DL, VT, In); } -/// Generate unpacklo/unpackhi shuffle mask. -static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo, - bool Unary) { - assert(Mask.empty() && "Expected an empty shuffle mask vector"); - int NumElts = VT.getVectorNumElements(); - int NumEltsInLane = 128 / VT.getScalarSizeInBits(); - - for (int i = 0; i < NumElts; ++i) { - unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; - int Pos = (i % NumEltsInLane) / 2 + LaneStart; - Pos += (Unary ? 0 : NumElts * (i % 2)); - Pos += (Lo ? 0 : NumEltsInLane / 2); - Mask.push_back(Pos); - } -} - /// Returns a vector_shuffle node for an unpackl operation. static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT, SDValue V1, SDValue V2) { @@ -5448,6 +5385,20 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits, return false; }; + // Handle UNDEFs. + if (Op.isUndef()) { + APInt UndefSrcElts = APInt::getAllOnesValue(NumElts); + SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0)); + return CastBitData(UndefSrcElts, SrcEltBits); + } + + // Extract scalar constant bits. + if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) { + APInt UndefSrcElts = APInt::getNullValue(1); + SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue()); + return CastBitData(UndefSrcElts, SrcEltBits); + } + // Extract constant bits from build vector. if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { unsigned SrcEltSizeInBits = VT.getScalarSizeInBits(); @@ -5542,6 +5493,24 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode, return true; } +/// Create a shuffle mask that matches the PACKSS/PACKUS truncation. +/// Note: This ignores saturation, so inputs must be checked first. +static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, + bool Unary) { + assert(Mask.empty() && "Expected an empty shuffle mask vector"); + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits(); + unsigned Offset = Unary ? 0 : NumElts; + + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) + Mask.push_back(Elt + (Lane * NumEltsPerLane)); + for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2) + Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset); + } +} + /// Calculates the shuffle mask corresponding to the target-specific opcode. /// If the mask could be calculated, returns it in \p Mask, returns the shuffle /// operands in \p Ops, and returns true. 
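The createPackShuffleMask helper added above expresses a PACKSS/PACKUS as an ordinary shuffle: per 128-bit lane, the result takes the even (low) sub-elements of the first operand followed by those of the second, or of the first again in the unary case. A standalone copy of the mask construction, with the v16i8-from-two-v8i16 case checked (plain C++, not LLVM code):

    #include <cassert>
    #include <vector>

    static std::vector<int> packShuffleMask(unsigned NumElts, unsigned EltBits,
                                            unsigned VecBits, bool Unary) {
      std::vector<int> Mask;
      unsigned NumLanes = VecBits / 128;
      unsigned NumEltsPerLane = 128 / EltBits;
      unsigned Offset = Unary ? 0 : NumElts;
      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
        for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
          Mask.push_back(Elt + Lane * NumEltsPerLane);           // from operand 0
        for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
          Mask.push_back(Elt + Lane * NumEltsPerLane + Offset);  // from operand 1
      }
      return Mask;
    }

    int main() {
      // PACKUSWB of two v8i16 inputs, viewed as a v16i8 shuffle of both inputs.
      std::vector<int> M = packShuffleMask(16, 8, 128, /*Unary=*/false);
      std::vector<int> Expected = {0, 2, 4, 6, 8, 10, 12, 14,
                                   16, 18, 20, 22, 24, 26, 28, 30};
      assert(M == Expected);
      return 0;
    }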
@@ -5562,21 +5531,28 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, bool IsFakeUnary = false; switch(N->getOpcode()) { case X86ISD::BLENDI: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::SHUFP: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::INSERTPS: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::EXTRQI: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); if (isa<ConstantSDNode>(N->getOperand(1)) && isa<ConstantSDNode>(N->getOperand(2))) { int BitLen = N->getConstantOperandVal(1); @@ -5586,6 +5562,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, } break; case X86ISD::INSERTQI: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); if (isa<ConstantSDNode>(N->getOperand(2)) && isa<ConstantSDNode>(N->getOperand(3))) { int BitLen = N->getConstantOperandVal(2); @@ -5595,23 +5573,33 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, } break; case X86ISD::UNPCKH: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::UNPCKL: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeUNPCKLMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVHLPS: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVLHPS: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::PALIGNR: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); @@ -5620,33 +5608,39 @@ static bool getTargetShuffleMask(SDNode *N, 
MVT VT, bool AllowSentinelZero, break; case X86ISD::VSHLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VSRLDQ: assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands() - 1); DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFHW: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::PSHUFLW: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::VZEXT_MOVL: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeZeroMoveLowMask(VT, Mask); IsUnary = true; break; @@ -5670,6 +5664,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return false; } case X86ISD::VPERMILPV: { + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N->getOperand(1); unsigned MaskEltSize = VT.getScalarSizeInBits(); @@ -5685,6 +5680,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return false; } case X86ISD::PSHUFB: { + assert(VT.getScalarType() == MVT::i8 && "Byte vector expected"); + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; SDValue MaskNode = N->getOperand(1); SmallVector<uint64_t, 32> RawMask; @@ -5699,37 +5697,46 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return false; } case X86ISD::VPERMI: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = true; break; case X86ISD::MOVSS: case X86ISD::MOVSD: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); break; case X86ISD::VPERM2X128: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); break; case X86ISD::MOVSLDUP: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSLDUPMask(VT, Mask); IsUnary = true; break; case X86ISD::MOVSHDUP: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVSHDUPMask(VT, Mask); 
IsUnary = true; break; case X86ISD::MOVDDUP: + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); DecodeMOVDDUPMask(VT, Mask); IsUnary = true; break; - case X86ISD::MOVLHPD: case X86ISD::MOVLPD: case X86ISD::MOVLPS: // Not yet implemented return false; case X86ISD::VPERMIL2: { + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); unsigned MaskEltSize = VT.getScalarSizeInBits(); SDValue MaskNode = N->getOperand(2); @@ -5749,6 +5756,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return false; } case X86ISD::VPPERM: { + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); SDValue MaskNode = N->getOperand(2); SmallVector<uint64_t, 32> RawMask; @@ -5763,6 +5772,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return false; } case X86ISD::VPERMV: { + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); IsUnary = true; // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. Ops.push_back(N->getOperand(1)); @@ -5780,6 +5790,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return false; } case X86ISD::VPERMV3: { + assert(N->getOperand(0).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(2).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2); // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. Ops.push_back(N->getOperand(0)); @@ -5793,6 +5805,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, return false; } case X86ISD::VPERMIV3: { + assert(N->getOperand(1).getValueType() == VT && "Unexpected value type"); + assert(N->getOperand(2).getValueType() == VT && "Unexpected value type"); IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2); // Unlike most shuffle nodes, VPERMIV3's mask operand is the first one. 
Ops.push_back(N->getOperand(1)); @@ -5965,19 +5979,13 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, SDValue N0 = N.getOperand(0); SDValue SrcExtract; - if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && - N0.getOperand(0).getValueType() == VT) { + if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT && + N0.getOperand(0).getValueType() == VT) || + (N0.getOpcode() == X86ISD::PEXTRW && + N0.getOperand(0).getValueType() == MVT::v8i16) || + (N0.getOpcode() == X86ISD::PEXTRB && + N0.getOperand(0).getValueType() == MVT::v16i8)) { SrcExtract = N0; - } else if (N0.getOpcode() == ISD::AssertZext && - N0.getOperand(0).getOpcode() == X86ISD::PEXTRW && - cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) { - SrcExtract = N0.getOperand(0); - assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16); - } else if (N0.getOpcode() == ISD::AssertZext && - N0.getOperand(0).getOpcode() == X86ISD::PEXTRB && - cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) { - SrcExtract = N0.getOperand(0); - assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8); } if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1))) @@ -6013,16 +6021,15 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, return true; } - // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern. + // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern. // TODO: Expand this to support INSERT_VECTOR_ELT/etc. unsigned ExOp = (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW); - if (InScl.getOpcode() != ISD::AssertZext || - InScl.getOperand(0).getOpcode() != ExOp) + if (InScl.getOpcode() != ExOp) return false; - SDValue ExVec = InScl.getOperand(0).getOperand(0); - uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1); + SDValue ExVec = InScl.getOperand(0); + uint64_t ExIdx = InScl.getConstantOperandVal(1); assert(ExIdx < NumElts && "Illegal extraction index"); Ops.push_back(InVec); Ops.push_back(ExVec); @@ -6030,17 +6037,34 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask, Mask.push_back(i == InIdx ? NumElts + ExIdx : i); return true; } - case X86ISD::PACKSS: { + case X86ISD::PACKSS: + case X86ISD::PACKUS: { + SDValue N0 = N.getOperand(0); + SDValue N1 = N.getOperand(1); + assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) && + N1.getValueType().getVectorNumElements() == (NumElts / 2) && + "Unexpected input value type"); + // If we know input saturation won't happen we can treat this // as a truncation shuffle. 
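Packing is only equivalent to a truncating shuffle when saturation cannot fire; the checks that follow verify this per operand, requiring more sign bits than the narrow element width for PACKSS and known-zero upper bits for PACKUS. A plain-scalar model for the i16-to-i8 case (standalone C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // PACKSSWB behaves as a plain truncation only if the value already fits in
    // a signed 8-bit range (at least 9 redundant sign bits in the i16).
    static bool packssIsTruncate(int16_t V) { return V >= -128 && V <= 127; }

    // PACKUSWB behaves as a plain truncation only if the upper 8 bits are zero.
    static bool packusIsTruncate(uint16_t V) { return (V & 0xFF00) == 0; }

    int main() {
      assert(packssIsTruncate(-5) && !packssIsTruncate(300));
      assert(packusIsTruncate(200) && !packusIsTruncate(256));
      return 0;
    }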
- if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt || - DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt) - return false; + if (Opcode == X86ISD::PACKSS) { + if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) || + (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt)) + return false; + } else { + APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt); + if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) || + (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask))) + return false; + } - Ops.push_back(N.getOperand(0)); - Ops.push_back(N.getOperand(1)); - for (unsigned i = 0; i != NumElts; ++i) - Mask.push_back(i * 2); + bool IsUnary = (N0 == N1); + + Ops.push_back(N0); + if (!IsUnary) + Ops.push_back(N1); + + createPackShuffleMask(VT, Mask, IsUnary); return true; } case X86ISD::VSHLI: @@ -6099,6 +6123,14 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs, for (int i = 0, e = Inputs.size(); i < e; ++i) { int lo = UsedInputs.size() * MaskWidth; int hi = lo + MaskWidth; + + // Strip UNDEF input usage. + if (Inputs[i].isUndef()) + for (int &M : Mask) + if ((lo <= M) && (M < hi)) + M = SM_SentinelUndef; + + // Check for unused inputs. if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) { UsedInputs.push_back(Inputs[i]); continue; @@ -6196,6 +6228,49 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG, return SDValue(); } +// Use PINSRB/PINSRW/PINSRD to create a build vector. +static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros, + unsigned NumNonZero, unsigned NumZero, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + MVT VT = Op.getSimpleValueType(); + unsigned NumElts = VT.getVectorNumElements(); + assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) || + ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) && + "Illegal vector insertion"); + + SDLoc dl(Op); + SDValue V; + bool First = true; + + for (unsigned i = 0; i < NumElts; ++i) { + bool IsNonZero = (NonZeros & (1 << i)) != 0; + if (!IsNonZero) + continue; + + // If the build vector contains zeros or our first insertion is not the + // first index then insert into zero vector to break any register + // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. + if (First) { + First = false; + if (NumZero || 0 != i) + V = getZeroVector(VT, Subtarget, DAG, dl); + else { + assert(0 == i && "Expected insertion into zero-index"); + V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); + V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); + V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); + V = DAG.getBitcast(VT, V); + continue; + } + } + V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i), + DAG.getIntPtrConstant(i, dl)); + } + + return V; +} + /// Custom lower build_vector of v16i8. static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, unsigned NumNonZero, unsigned NumZero, @@ -6204,39 +6279,15 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros, if (NumNonZero > 8 && !Subtarget.hasSSE41()) return SDValue(); + // SSE4.1 - use PINSRB to insert each byte directly. + if (Subtarget.hasSSE41()) + return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, + Subtarget); + SDLoc dl(Op); SDValue V; bool First = true; - // SSE4.1 - use PINSRB to insert each byte directly. 
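The new LowerBuildVectorAsInsert helper above unifies the PINSRB/PINSRW/PINSRD emission: the chain is seeded with a zero vector when any lane is zero or the first defined lane is not lane 0 (breaking the false dependency on the previous register value), otherwise it starts from a scalar move, and the remaining lanes are inserted one at a time. A plain-array model of that behaviour (standalone C++, not LLVM code):

    #include <array>
    #include <cassert>

    static std::array<int, 8> buildVectorByInsertion(const std::array<int, 8> &Elts,
                                                     unsigned NumZero) {
      std::array<int, 8> V{};                    // the getZeroVector seed
      bool First = true;
      for (unsigned i = 0; i != 8; ++i) {
        if (Elts[i] == 0)
          continue;                              // zero/undef lanes are never inserted
        if (First) {
          First = false;
          if (NumZero == 0 && i == 0) {
            V[0] = Elts[0];                      // SCALAR_TO_VECTOR / VZEXT_MOVL case
            continue;
          }
        }
        V[i] = Elts[i];                          // one PINSR* per remaining lane
      }
      return V;
    }

    int main() {
      std::array<int, 8> In = {0, 7, 0, 0, 3, 0, 0, 0};
      assert(buildVectorByInsertion(In, /*NumZero=*/6) == In);
      return 0;
    }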
- if (Subtarget.hasSSE41()) { - for (unsigned i = 0; i < 16; ++i) { - bool IsNonZero = (NonZeros & (1 << i)) != 0; - if (IsNonZero) { - // If the build vector contains zeros or our first insertion is not the - // first index then insert into zero vector to break any register - // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. - if (First) { - First = false; - if (NumZero || 0 != i) - V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl); - else { - assert(0 == i && "Expected insertion into zero-index"); - V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); - V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); - V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); - V = DAG.getBitcast(MVT::v16i8, V); - continue; - } - } - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V, - Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); - } - } - - return V; - } - // Pre-SSE4.1 - merge byte pairs and insert with PINSRW. for (unsigned i = 0; i < 16; ++i) { bool ThisIsNonZero = (NonZeros & (1 << i)) != 0; @@ -6292,34 +6343,9 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros, if (NumNonZero > 4 && !Subtarget.hasSSE41()) return SDValue(); - SDLoc dl(Op); - SDValue V; - bool First = true; - for (unsigned i = 0; i < 8; ++i) { - bool IsNonZero = (NonZeros & (1 << i)) != 0; - if (IsNonZero) { - // If the build vector contains zeros or our first insertion is not the - // first index then insert into zero vector to break any register - // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL. - if (First) { - First = false; - if (NumZero || 0 != i) - V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl); - else { - assert(0 == i && "Expected insertion into zero-index"); - V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32); - V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V); - V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V); - V = DAG.getBitcast(MVT::v8i16, V); - continue; - } - } - V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, - Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); - } - } - - return V; + // Use PINSRW to insert each byte directly. + return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG, + Subtarget); } /// Custom lower build_vector of v4i32 or v4f32. 
@@ -6589,14 +6615,20 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, } } - auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) { + SmallVector<LoadSDNode *, 8> Loads; + for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i) + if (LoadMask[i]) + Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i]))); + + auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) { auto MMOFlags = LDBase->getMemOperand()->getFlags(); assert(!(MMOFlags & MachineMemOperand::MOVolatile) && "Cannot merge volatile loads."); SDValue NewLd = DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(), LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags); - DAG.makeEquivalentMemoryOrdering(LDBase, NewLd); + for (auto *LD : Loads) + DAG.makeEquivalentMemoryOrdering(LD, NewLd); return NewLd; }; @@ -6659,9 +6691,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts, DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(), LDBase->getAlignment(), - false/*isVolatile*/, true/*ReadMem*/, - false/*WriteMem*/); - DAG.makeEquivalentMemoryOrdering(LDBase, ResNode); + MachineMemOperand::MOLoad); + for (auto *LD : Loads) + DAG.makeEquivalentMemoryOrdering(LD, ResNode); return DAG.getBitcast(VT, ResNode); } } @@ -6702,6 +6734,43 @@ static bool isUseOfShuffle(SDNode *N) { return false; } +// Check if the current node of build vector is a zero extended vector. +// // If so, return the value extended. +// // For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a. +// // NumElt - return the number of zero extended identical values. +// // EltType - return the type of the value include the zero extend. +static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op, + unsigned &NumElt, MVT &EltType) { + SDValue ExtValue = Op->getOperand(0); + unsigned NumElts = Op->getNumOperands(); + unsigned Delta = NumElts; + + for (unsigned i = 1; i < NumElts; i++) { + if (Op->getOperand(i) == ExtValue) { + Delta = i; + break; + } + if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i)))) + return SDValue(); + } + if (!isPowerOf2_32(Delta) || Delta == 1) + return SDValue(); + + for (unsigned i = Delta; i < NumElts; i++) { + if (i % Delta == 0) { + if (Op->getOperand(i) != ExtValue) + return SDValue(); + } else if (!(isNullConstant(Op->getOperand(i)) || + Op->getOperand(i).isUndef())) + return SDValue(); + } + unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits(); + unsigned ExtVTSize = EltSize * Delta; + EltType = MVT::getIntegerVT(ExtVTSize); + NumElt = NumElts / Delta; + return ExtValue; +} + /// Attempt to use the vbroadcast instruction to generate a splat value /// from a splat BUILD_VECTOR which uses: /// a. A single scalar load, or a constant. @@ -6727,6 +6796,39 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, BitVector UndefElements; SDValue Ld = BVOp->getSplatValue(&UndefElements); + // Attempt to use VBROADCASTM + // From this paterrn: + // a. t0 = (zext_i64 (bitcast_i8 v2i1 X)) + // b. 
t1 = (build_vector t0 t0) + // + // Create (VBROADCASTM v2i1 X) + if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) { + MVT EltType = VT.getScalarType(); + unsigned NumElts = VT.getVectorNumElements(); + SDValue BOperand; + SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType); + if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) || + (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND && + Ld.getOperand(0).getOpcode() == ISD::BITCAST)) { + if (ZeroExtended) + BOperand = ZeroExtended.getOperand(0); + else + BOperand = Ld.getOperand(0).getOperand(0); + if (BOperand.getValueType().isVector() && + BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) { + if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 || + NumElts == 8)) || // for broadcastmb2q + (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 || + NumElts == 16))) { // for broadcastmw2d + SDValue Brdcst = + DAG.getNode(X86ISD::VBROADCASTM, dl, + MVT::getVectorVT(EltType, NumElts), BOperand); + return DAG.getBitcast(VT, Brdcst); + } + } + } + } + // We need a splat of a single value to use broadcast, and it doesn't // make any sense if the value is only in one element of the vector. if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) { @@ -6824,7 +6926,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp, // TODO: If multiple splats are generated to load the same constant, // it may be detrimental to overall size. There needs to be a way to detect // that condition to know if this is truly a size win. - bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); // Handle broadcasting a single constant scalar from the constant pool // into a vector. @@ -6902,10 +7004,10 @@ static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec, // For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already // lowered this: - // (extract_vector_elt (v8f32 %vreg1), Constant<6>) + // (extract_vector_elt (v8f32 %1), Constant<6>) // to: // (extract_vector_elt (vector_shuffle<2,u,u,u> - // (extract_subvector (v8f32 %vreg0), Constant<4>), + // (extract_subvector (v8f32 %0), Constant<4>), // undef) // Constant<0>) // In this case the vector is the extract_subvector expression and the index @@ -7020,10 +7122,10 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); if (ISD::isBuildVectorAllZeros(Op.getNode())) - return DAG.getTargetConstant(0, dl, VT); + return Op; if (ISD::isBuildVectorAllOnes(Op.getNode())) - return DAG.getTargetConstant(1, dl, VT); + return Op; if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) { if (VT == MVT::v64i1 && !Subtarget.is64Bit()) { @@ -7272,7 +7374,8 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1, /// are written to the parameters \p Opnd0 and \p Opnd1. static bool isAddSub(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG, - SDValue &Opnd0, SDValue &Opnd1) { + SDValue &Opnd0, SDValue &Opnd1, + unsigned &NumExtracts) { MVT VT = BV->getSimpleValueType(0); if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) && @@ -7284,6 +7387,8 @@ static bool isAddSub(const BuildVectorSDNode *BV, SDValue InVec0 = DAG.getUNDEF(VT); SDValue InVec1 = DAG.getUNDEF(VT); + NumExtracts = 0; + // Odd-numbered elements in the input build vector are obtained from // adding two integer/float elements. 
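For the VBROADCASTM path added above: isSplatZeroExtended recognises a build_vector whose operand 0 repeats every Delta lanes (Delta a power of two greater than 1) with zeros or undefs in between, i.e. a splat of operand 0 zero-extended to Delta-lane granularity; when that value is a bitcast of a v*i1 mask, the whole node becomes a broadcastmb2q/broadcastmw2d. A standalone model of the recognition (plain C++ treating undef lanes as 0, not LLVM code):

    #include <cassert>
    #include <vector>

    static bool isSplatZeroExtended(const std::vector<int> &Ops, unsigned &Delta) {
      Delta = Ops.size();
      for (unsigned i = 1; i < Ops.size(); ++i) {
        if (Ops[i] == Ops[0]) { Delta = i; break; }
        if (Ops[i] != 0) return false;
      }
      if (Delta == 1 || (Delta & (Delta - 1)) != 0)
        return false;                            // repeat distance must be a power of 2
      for (unsigned i = Delta; i < Ops.size(); ++i) {
        if (i % Delta == 0) {
          if (Ops[i] != Ops[0]) return false;    // every Delta-th lane repeats the value
        } else if (Ops[i] != 0) {
          return false;                          // all other lanes must be zero/undef
        }
      }
      return true;
    }

    int main() {
      unsigned Delta;
      // e.g. v8i8: the value 5 zero-extended to two i32-sized groups of lanes.
      std::vector<int> V = {5, 0, 0, 0, 5, 0, 0, 0};
      assert(isSplatZeroExtended(V, Delta) && Delta == 4);
      return 0;
    }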
// Even-numbered elements in the input build vector are obtained from @@ -7360,6 +7465,9 @@ static bool isAddSub(const BuildVectorSDNode *BV, // Update the pair of expected opcodes. std::swap(ExpectedOpcode, NextExpectedOpcode); + + // Increment the number of extractions done. + ++NumExtracts; } // Don't try to fold this build_vector into an ADDSUB if the inputs are undef. @@ -7398,8 +7506,10 @@ static bool isAddSub(const BuildVectorSDNode *BV, /// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit /// FMADDSUB is. static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG, - SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) { - if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 || + SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2, + unsigned ExpectedUses) { + if (Opnd0.getOpcode() != ISD::FMUL || + !Opnd0->hasNUsesOfValue(ExpectedUses, 0) || !Subtarget.hasAnyFMA()) return false; @@ -7426,7 +7536,8 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, const X86Subtarget &Subtarget, SelectionDAG &DAG) { SDValue Opnd0, Opnd1; - if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1)) + unsigned NumExtracts; + if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts)) return SDValue(); MVT VT = BV->getSimpleValueType(0); @@ -7434,7 +7545,9 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV, // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; - if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2)) + // TODO: According to coverage reports, the FMADDSUB transform is not + // triggered by any tests. + if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts)) return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); // Do not generate X86ISD::ADDSUB node for 512-bit types even though @@ -7658,6 +7771,111 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG, return SDValue(); } +// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be +// reasoned to be a permutation of a vector by indices in a non-constant vector. +// (build_vector (extract_elt V, (extract_elt I, 0)), +// (extract_elt V, (extract_elt I, 1)), +// ... +// -> +// (vpermv I, V) +// +// TODO: Handle undefs +// TODO: Utilize pshufb and zero mask blending to support more efficient +// construction of vectors with constant-0 elements. +// TODO: Use smaller-element vectors of same width, and "interpolate" the indices, +// when no native operation available. +static SDValue +LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Look for VPERMV and PSHUFB opportunities. 
+ MVT VT = V.getSimpleValueType(); + switch (VT.SimpleTy) { + default: + return SDValue(); + case MVT::v16i8: + if (!Subtarget.hasSSE3()) + return SDValue(); + break; + case MVT::v8f32: + case MVT::v8i32: + if (!Subtarget.hasAVX2()) + return SDValue(); + break; + case MVT::v4i64: + case MVT::v4f64: + if (!Subtarget.hasVLX()) + return SDValue(); + break; + case MVT::v16f32: + case MVT::v8f64: + case MVT::v16i32: + case MVT::v8i64: + if (!Subtarget.hasAVX512()) + return SDValue(); + break; + case MVT::v32i16: + if (!Subtarget.hasBWI()) + return SDValue(); + break; + case MVT::v8i16: + case MVT::v16i16: + if (!Subtarget.hasVLX() || !Subtarget.hasBWI()) + return SDValue(); + break; + case MVT::v64i8: + if (!Subtarget.hasVBMI()) + return SDValue(); + break; + case MVT::v32i8: + if (!Subtarget.hasVLX() || !Subtarget.hasVBMI()) + return SDValue(); + break; + } + SDValue SrcVec, IndicesVec; + // Check for a match of the permute source vector and permute index elements. + // This is done by checking that the i-th build_vector operand is of the form: + // (extract_elt SrcVec, (extract_elt IndicesVec, i)). + for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) { + SDValue Op = V.getOperand(Idx); + if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // If this is the first extract encountered in V, set the source vector, + // otherwise verify the extract is from the previously defined source + // vector. + if (!SrcVec) + SrcVec = Op.getOperand(0); + else if (SrcVec != Op.getOperand(0)) + return SDValue(); + SDValue ExtractedIndex = Op->getOperand(1); + // Peek through extends. + if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND || + ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND) + ExtractedIndex = ExtractedIndex.getOperand(0); + if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + // If this is the first extract from the index vector candidate, set the + // indices vector, otherwise verify the extract is from the previously + // defined indices vector. + if (!IndicesVec) + IndicesVec = ExtractedIndex.getOperand(0); + else if (IndicesVec != ExtractedIndex.getOperand(0)) + return SDValue(); + + auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1)); + if (!PermIdx || PermIdx->getZExtValue() != Idx) + return SDValue(); + } + MVT IndicesVT = VT; + if (VT.isFloatingPoint()) + IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()), + VT.getVectorNumElements()); + IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT); + return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV, + SDLoc(V), VT, IndicesVec, SrcVec); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { SDLoc dl(Op); @@ -7674,6 +7892,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return VectorConstant; BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode()); + // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB + // transform here. 
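The function above matches a build_vector whose lane i is (extract_elt V, (extract_elt I, i)) for a single source V and a single index vector I, and folds the whole node into one variable shuffle, VPERMV in general or PSHUFB for v16i8. A plain-array model of the resulting operation (standalone C++, not LLVM code; the low-bit masking of each index mimics vpermd-style lane selection):

    #include <array>
    #include <cassert>

    static std::array<int, 8> variablePermute(const std::array<int, 8> &Src,
                                              const std::array<int, 8> &Indices) {
      std::array<int, 8> Result{};
      for (unsigned i = 0; i != 8; ++i)
        Result[i] = Src[Indices[i] & 7];         // one lane select per element
      return Result;
    }

    int main() {
      std::array<int, 8> Src = {10, 11, 12, 13, 14, 15, 16, 17};
      std::array<int, 8> Idx = {3, 3, 0, 7, 1, 1, 6, 2};
      std::array<int, 8> Out = variablePermute(Src, Idx);
      assert(Out[0] == 13 && Out[3] == 17 && Out[7] == 12);
      return 0;
    }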
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG)) return AddSub; if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG)) @@ -7690,14 +7910,16 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { uint64_t NonZeros = 0; bool IsAllConstants = true; SmallSet<SDValue, 8> Values; + unsigned NumConstants = NumElems; for (unsigned i = 0; i < NumElems; ++i) { SDValue Elt = Op.getOperand(i); if (Elt.isUndef()) continue; Values.insert(Elt); - if (Elt.getOpcode() != ISD::Constant && - Elt.getOpcode() != ISD::ConstantFP) + if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) { IsAllConstants = false; + NumConstants--; + } if (X86::isZeroNode(Elt)) NumZero++; else { @@ -7711,6 +7933,52 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (NumNonZero == 0) return DAG.getUNDEF(VT); + // If we are inserting one variable into a vector of non-zero constants, try + // to avoid loading each constant element as a scalar. Load the constants as a + // vector and then insert the variable scalar element. If insertion is not + // supported, we assume that we will fall back to a shuffle to get the scalar + // blended with the constants. Insertion into a zero vector is handled as a + // special-case somewhere below here. + LLVMContext &Context = *DAG.getContext(); + if (NumConstants == NumElems - 1 && NumNonZero != 1 && + (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) || + isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) { + // Create an all-constant vector. The variable element in the old + // build vector is replaced by undef in the constant vector. Save the + // variable scalar element and its index for use in the insertelement. + Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context); + SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType)); + SDValue VarElt; + SDValue InsIndex; + for (unsigned i = 0; i != NumElems; ++i) { + SDValue Elt = Op.getOperand(i); + if (auto *C = dyn_cast<ConstantSDNode>(Elt)) + ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue()); + else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt)) + ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF()); + else if (!Elt.isUndef()) { + assert(!VarElt.getNode() && !InsIndex.getNode() && + "Expected one variable element in this vector"); + VarElt = Elt; + InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout())); + } + } + Constant *CV = ConstantVector::get(ConstVecOps); + SDValue DAGConstVec = DAG.getConstantPool(CV, VT); + + // The constants we just created may not be legal (eg, floating point). We + // must lower the vector right here because we can not guarantee that we'll + // legalize it before loading it. This is also why we could not just create + // a new build vector here. If the build vector contains illegal constants, + // it could get split back up into a series of insert elements. + // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD. + SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG); + MachineFunction &MF = DAG.getMachineFunction(); + MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF); + SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI); + return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex); + } + // Special case for single non-zero, non-undef, element. 
if (NumNonZero == 1) { unsigned Idx = countTrailingZeros(NonZeros); @@ -7825,6 +8093,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { if (IsAllConstants) return SDValue(); + if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget)) + return V; + // See if we can use a vector load to get all of the elements. if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) { SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems); @@ -7836,15 +8107,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { // For AVX-length vectors, build the individual 128-bit pieces and use // shuffles to put them in place. if (VT.is256BitVector() || VT.is512BitVector()) { - SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems); - - EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2); + EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2); // Build both the lower and upper subvector. SDValue Lower = - DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2)); + DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2)); SDValue Upper = DAG.getBuildVector( - HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2)); + HVT, dl, Op->ops().slice(NumElems / 2, NumElems /2)); // Recreate the wider vector with the lower and upper part. if (VT.is256BitVector()) @@ -7892,8 +8161,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } for (unsigned i = 0; i < 2; ++i) { - switch ((NonZeros & (0x3 << i*2)) >> (i*2)) { - default: break; + switch ((NonZeros >> (i*2)) & 0x3) { + default: llvm_unreachable("Unexpected NonZero count"); case 0: Ops[i] = Ops[i*2]; // Must be a zero vector. break; @@ -7920,57 +8189,56 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec); } - if (Values.size() > 1 && VT.is128BitVector()) { - // Check for a build vector from mostly shuffle plus few inserting. - if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) - return Sh; + assert(Values.size() > 1 && "Expected non-undef and non-splat vector"); - // For SSE 4.1, use insertps to put the high elements into the low element. - if (Subtarget.hasSSE41()) { - SDValue Result; - if (!Op.getOperand(0).isUndef()) - Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); - else - Result = DAG.getUNDEF(VT); + // Check for a build vector from mostly shuffle plus few inserting. + if (SDValue Sh = buildFromShuffleMostly(Op, DAG)) + return Sh; - for (unsigned i = 1; i < NumElems; ++i) { - if (Op.getOperand(i).isUndef()) continue; - Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, - Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); - } - return Result; - } + // For SSE 4.1, use insertps to put the high elements into the low element. + if (Subtarget.hasSSE41()) { + SDValue Result; + if (!Op.getOperand(0).isUndef()) + Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); + else + Result = DAG.getUNDEF(VT); - // Otherwise, expand into a number of unpckl*, start by extending each of - // our (non-undef) elements to the full vector width with the element in the - // bottom slot of the vector (which generates no code for SSE). 
- SmallVector<SDValue, 8> Ops(NumElems); - for (unsigned i = 0; i < NumElems; ++i) { - if (!Op.getOperand(i).isUndef()) - Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); - else - Ops[i] = DAG.getUNDEF(VT); + for (unsigned i = 1; i < NumElems; ++i) { + if (Op.getOperand(i).isUndef()) continue; + Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result, + Op.getOperand(i), DAG.getIntPtrConstant(i, dl)); } + return Result; + } + + // Otherwise, expand into a number of unpckl*, start by extending each of + // our (non-undef) elements to the full vector width with the element in the + // bottom slot of the vector (which generates no code for SSE). + SmallVector<SDValue, 8> Ops(NumElems); + for (unsigned i = 0; i < NumElems; ++i) { + if (!Op.getOperand(i).isUndef()) + Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i)); + else + Ops[i] = DAG.getUNDEF(VT); + } - // Next, we iteratively mix elements, e.g. for v4f32: - // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0> - // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2> - // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> - for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { - // Generate scaled UNPCKL shuffle mask. - SmallVector<int, 16> Mask; - for(unsigned i = 0; i != Scale; ++i) - Mask.push_back(i); - for (unsigned i = 0; i != Scale; ++i) - Mask.push_back(NumElems+i); - Mask.append(NumElems - Mask.size(), SM_SentinelUndef); + // Next, we iteratively mix elements, e.g. for v4f32: + // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0> + // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2> + // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0> + for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) { + // Generate scaled UNPCKL shuffle mask. + SmallVector<int, 16> Mask; + for(unsigned i = 0; i != Scale; ++i) + Mask.push_back(i); + for (unsigned i = 0; i != Scale; ++i) + Mask.push_back(NumElems+i); + Mask.append(NumElems - Mask.size(), SM_SentinelUndef); - for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) - Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); - } - return Ops[0]; + for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i) + Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask); } - return SDValue(); + return Ops[0]; } // 256-bit AVX can use the vinsertf128 instruction @@ -8060,87 +8328,74 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op, SelectionDAG & DAG) { SDLoc dl(Op); MVT ResVT = Op.getSimpleValueType(); - unsigned NumOfOperands = Op.getNumOperands(); + unsigned NumOperands = Op.getNumOperands(); - assert(isPowerOf2_32(NumOfOperands) && + assert(NumOperands > 1 && isPowerOf2_32(NumOperands) && "Unexpected number of operands in CONCAT_VECTORS"); // If this node promotes - by concatenating zeroes - the type of the result // of a node with instruction that zeroes all upper (irrelevant) bits of the // output register, mark it as legal and catch the pattern in instruction - // selection to avoid emitting extra insturctions (for zeroing upper bits). + // selection to avoid emitting extra instructions (for zeroing upper bits). 
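The reworked vXi1 CONCAT_VECTORS lowering that follows counts non-zero operands first: with at most one non-zero operand, the concat is just that operand placed at its element offset over a zero (or undef) background, i.e. a single zero-extending insert_subvector. A plain-bitmask model of that simple case (standalone C++, not LLVM code):

    #include <cassert>
    #include <cstdint>

    // Concatenate two 8-element mask vectors; operand 0 provides elements 0..7.
    static uint16_t concatMasks(uint8_t Lo, uint8_t Hi) {
      return static_cast<uint16_t>(Lo) | (static_cast<uint16_t>(Hi) << 8);
    }

    int main() {
      // concat(v8i1 zero, v8i1 X) places X at element offset 8 over zeroes.
      uint8_t X = 0x0B;
      assert(concatMasks(0, X) == (static_cast<uint16_t>(X) << 8));
      return 0;
    }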
if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) { - SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64); - SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC); + SDValue ZeroC = DAG.getIntPtrConstant(0, dl); + SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl); return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted, ZeroC); } - SDValue Undef = DAG.getUNDEF(ResVT); - if (NumOfOperands > 2) { - // Specialize the cases when all, or all but one, of the operands are undef. - unsigned NumOfDefinedOps = 0; - unsigned OpIdx = 0; - for (unsigned i = 0; i < NumOfOperands; i++) - if (!Op.getOperand(i).isUndef()) { - NumOfDefinedOps++; - OpIdx = i; - } - if (NumOfDefinedOps == 0) - return Undef; - if (NumOfDefinedOps == 1) { - unsigned SubVecNumElts = - Op.getOperand(OpIdx).getValueType().getVectorNumElements(); - SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, - Op.getOperand(OpIdx), IdxVal); + unsigned NumZero = 0; + unsigned NumNonZero = 0; + uint64_t NonZeros = 0; + for (unsigned i = 0; i != NumOperands; ++i) { + SDValue SubVec = Op.getOperand(i); + if (SubVec.isUndef()) + continue; + if (ISD::isBuildVectorAllZeros(SubVec.getNode())) + ++NumZero; + else { + assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range. + NonZeros |= (uint64_t)1 << i; + ++NumNonZero; } + } + + // If there are zero or one non-zeros we can handle this very simply. + if (NumNonZero <= 1) { + SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl) + : DAG.getUNDEF(ResVT); + if (!NumNonZero) + return Vec; + unsigned Idx = countTrailingZeros(NonZeros); + SDValue SubVec = Op.getOperand(Idx); + unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements(); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec, + DAG.getIntPtrConstant(Idx * SubVecNumElts, dl)); + } + + if (NumOperands > 2) { MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(), ResVT.getVectorNumElements()/2); - SmallVector<SDValue, 2> Ops; - for (unsigned i = 0; i < NumOfOperands/2; i++) - Ops.push_back(Op.getOperand(i)); - SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); - Ops.clear(); - for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++) - Ops.push_back(Op.getOperand(i)); - SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops); + ArrayRef<SDUse> Ops = Op->ops(); + SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, + Ops.slice(0, NumOperands/2)); + SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, + Ops.slice(NumOperands/2)); return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi); } - // 2 operands - SDValue V1 = Op.getOperand(0); - SDValue V2 = Op.getOperand(1); - unsigned NumElems = ResVT.getVectorNumElements(); - assert(V1.getValueType() == V2.getValueType() && - V1.getValueType().getVectorNumElements() == NumElems/2 && - "Unexpected operands in CONCAT_VECTORS"); + assert(NumNonZero == 2 && "Simple cases not handled?"); - if (ResVT.getSizeInBits() >= 16) + if (ResVT.getVectorNumElements() >= 16) return Op; // The operation is legal with KUNPCK - bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode()); - bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode()); - SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl); - if (IsZeroV1 && IsZeroV2) - return ZeroVec; - - SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl); - if (V2.isUndef()) - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); - if (IsZeroV2) - 
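// Illustrative sketch, separate from the patch: the rewritten
// LowerCONCAT_VECTORSvXi1 summarises its operands with one bit per non-zero,
// non-undef subvector, so a single interesting operand can be located with a
// trailing-zero count (countTrailingZeros(NonZeros) above). The helpers below
// are hand-rolled stand-ins to keep the sketch self-contained.
#include <cstdint>
struct ConcatSummary { unsigned NumZero = 0, NumNonZero = 0; uint64_t NonZeros = 0; };
// IsUndef[i] / IsZero[i] describe operand i; IsZero corresponds to
// ISD::isBuildVectorAllZeros in the patch.
static ConcatSummary summarize(const bool *IsUndef, const bool *IsZero,
                               unsigned NumOperands) {
  ConcatSummary S;
  for (unsigned i = 0; i != NumOperands; ++i) {
    if (IsUndef[i]) continue;
    if (IsZero[i]) ++S.NumZero;
    else { S.NonZeros |= uint64_t(1) << i; ++S.NumNonZero; }
  }
  return S;
}
static unsigned firstSetBit(uint64_t Mask) { // precondition: Mask != 0
  unsigned Idx = 0;
  while (!(Mask & 1)) { Mask >>= 1; ++Idx; }
  return Idx;
}
// With NumNonZero <= 1 the whole concat becomes one INSERT_SUBVECTOR of the
// operand at index firstSetBit(NonZeros) into a zero or undef base; with more
// operands the node is split in half and retried.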
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx); - - SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl); - if (V1.isUndef()) - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal); - - if (IsZeroV1) - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal); - - V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx); - return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal); + SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, + DAG.getUNDEF(ResVT), Op.getOperand(0), + DAG.getIntPtrConstant(0, dl)); + unsigned NumElems = ResVT.getVectorNumElements(); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1), + DAG.getIntPtrConstant(NumElems/2, dl)); } static SDValue LowerCONCAT_VECTORS(SDValue Op, @@ -8723,6 +8978,76 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT, return SDValue(); } +// X86 has dedicated pack instructions that can handle specific truncation +// operations: PACKSS and PACKUS. +static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, + SDValue &V2, unsigned &PackOpcode, + ArrayRef<int> TargetMask, + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned BitSize = VT.getScalarSizeInBits(); + MVT PackSVT = MVT::getIntegerVT(BitSize * 2); + MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2); + + auto MatchPACK = [&](SDValue N1, SDValue N2) { + SDValue VV1 = DAG.getBitcast(PackVT, N1); + SDValue VV2 = DAG.getBitcast(PackVT, N2); + if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) && + (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) { + V1 = VV1; + V2 = VV2; + SrcVT = PackVT; + PackOpcode = X86ISD::PACKSS; + return true; + } + + if (Subtarget.hasSSE41() || PackSVT == MVT::i16) { + APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize); + if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) && + (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) { + V1 = VV1; + V2 = VV2; + SrcVT = PackVT; + PackOpcode = X86ISD::PACKUS; + return true; + } + } + + return false; + }; + + // Try binary shuffle. + SmallVector<int, 32> BinaryMask; + createPackShuffleMask(VT, BinaryMask, false); + if (isTargetShuffleEquivalent(TargetMask, BinaryMask)) + if (MatchPACK(V1, V2)) + return true; + + // Try unary shuffle. + SmallVector<int, 32> UnaryMask; + createPackShuffleMask(VT, UnaryMask, true); + if (isTargetShuffleEquivalent(TargetMask, UnaryMask)) + if (MatchPACK(V1, V1)) + return true; + + return false; +} + +static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT, + ArrayRef<int> Mask, SDValue V1, + SDValue V2, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + MVT PackVT; + unsigned PackOpcode; + if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG, + Subtarget)) + return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1), + DAG.getBitcast(PackVT, V2)); + + return SDValue(); +} + /// \brief Try to emit a bitmask instruction for a shuffle. 
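// Illustrative sketch, separate from the patch: MatchPACK above accepts
// PACKSS when every source element has more sign bits than the packed width,
// and PACKUS when the discarded high half is known zero, because in those
// cases the pack's saturation cannot change the value and the instruction is
// a pure 2:1 truncation. A one-lane scalar model of the i32 to i16 case:
#include <cstdint>
static int16_t packss_lane(int32_t V) {           // signed saturation
  if (V > INT16_MAX) return INT16_MAX;
  if (V < INT16_MIN) return INT16_MIN;
  return (int16_t)V;
}
static uint16_t packus_lane(int32_t V) {          // unsigned saturation
  if (V > 0xFFFF) return 0xFFFF;
  if (V < 0) return 0;
  return (uint16_t)V;
}
// If V already sign-extends from 16 bits (ComputeNumSignBits > 16) then
// packss_lane(V) == (int16_t)V; if the top 16 bits are known zero
// (MaskedValueIsZero) then packus_lane(V) == (uint16_t)V. Otherwise the pack
// would saturate and matchVectorShuffleWithPACK rightly bails out.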
/// /// This handles cases where we can model a blend exactly as a bitmask due to @@ -8834,7 +9159,8 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2, return true; } -uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) { +static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, + int Scale) { uint64_t ScaledMask = 0; for (int i = 0; i != Size; ++i) if (BlendMask & (1ull << i)) @@ -9869,7 +10195,7 @@ static SDValue lowerVectorShuffleAsElementInsertion( return SDValue(); // Zero-extend directly to i32. - ExtVT = MVT::v4i32; + ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32); V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S); } V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S); @@ -9891,10 +10217,7 @@ static SDValue lowerVectorShuffleAsElementInsertion( V1Mask[V2Index] = -1; if (!isNoopShuffleMask(V1Mask)) return SDValue(); - // This is essentially a special case blend operation, but if we have - // general purpose blend operations, they are always faster. Bail and let - // the rest of the lowering handle these as blends. - if (Subtarget.hasSSE41()) + if (!VT.is128BitVector()) return SDValue(); // Otherwise, use MOVSD or MOVSS. @@ -10005,7 +10328,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise // we can only broadcast from a register with AVX2. unsigned NumElts = Mask.size(); - unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST; + unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2()) + ? X86ISD::MOVDDUP + : X86ISD::VBROADCAST; bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2(); // Check that the mask is a broadcast. @@ -10030,9 +10355,16 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, for (;;) { switch (V.getOpcode()) { case ISD::BITCAST: { + // Peek through bitcasts as long as BroadcastIdx can be adjusted. SDValue VSrc = V.getOperand(0); - MVT SrcVT = VSrc.getSimpleValueType(); - if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits()) + unsigned NumEltBits = V.getScalarValueSizeInBits(); + unsigned NumSrcBits = VSrc.getScalarValueSizeInBits(); + if ((NumEltBits % NumSrcBits) == 0) + BroadcastIdx *= (NumEltBits / NumSrcBits); + else if ((NumSrcBits % NumEltBits) == 0 && + (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0) + BroadcastIdx /= (NumSrcBits / NumEltBits); + else break; V = VSrc; continue; @@ -10064,6 +10396,23 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, break; } + // Ensure the source vector and BroadcastIdx are for a suitable type. + if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) { + unsigned NumEltBits = VT.getScalarSizeInBits(); + unsigned NumSrcBits = V.getScalarValueSizeInBits(); + if ((NumSrcBits % NumEltBits) == 0) + BroadcastIdx *= (NumSrcBits / NumEltBits); + else if ((NumEltBits % NumSrcBits) == 0 && + (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0) + BroadcastIdx /= (NumEltBits / NumSrcBits); + else + return SDValue(); + + unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits; + MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts); + V = DAG.getBitcast(SrcVT, V); + } + // Check if this is a broadcast of a scalar. We special case lowering // for scalars so that we can more effectively fold with loads. 
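// Illustrative sketch, separate from the patch (helper name invented here):
// the bitcast-peeking above keeps the broadcast index meaningful by rescaling
// it whenever the element width changes, and gives up when the broadcast
// element would straddle a wider element.
static bool rescaleBroadcastIdx(unsigned &Idx, unsigned FromBits,
                                unsigned ToBits) {
  if (FromBits % ToBits == 0) {          // splitting into narrower elements
    Idx *= FromBits / ToBits;
    return true;
  }
  if (ToBits % FromBits == 0 &&          // merging into wider elements,
      Idx % (ToBits / FromBits) == 0) {  // only if Idx sits on a boundary
    Idx /= ToBits / FromBits;
    return true;
  }
  return false;
}
// e.g. element 1 of a v4i32 becomes element 2 when the same bits are viewed
// as v8i16, while element 3 of a v8i16 cannot be renumbered as a v4i32
// element, so the broadcast lowering bails out for that case.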
// First, look through bitcast: if the original value has a larger element @@ -10091,7 +10440,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // 32-bit targets need to load i64 as a f64 and then bitcast the result. if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) { BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements()); - Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode); + Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2()) + ? X86ISD::MOVDDUP + : Opcode; } // If we are broadcasting a load that is only used by the shuffle @@ -10127,15 +10478,11 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // The shuffle input might have been a bitcast we looked through; look at // the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll // later bitcast it to BroadcastVT. - MVT SrcVT = V.getSimpleValueType(); - assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() && + assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() && "Unexpected vector element size"); - assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) && + assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) && "Unexpected vector size"); - - MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize); - V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V, - DAG.getIntPtrConstant(BroadcastIdx, DL)); + V = extract128BitVector(V, BroadcastIdx, DAG, DL); } if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector()) @@ -10165,9 +10512,13 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT, // We only support broadcasting from 128-bit vectors to minimize the // number of patterns we need to deal with in isel. So extract down to - // 128-bits. - if (SrcVT.getSizeInBits() > 128) - V = extract128BitVector(V, 0, DAG, DL); + // 128-bits, removing as many bitcasts as possible. + if (SrcVT.getSizeInBits() > 128) { + MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), + 128 / SrcVT.getScalarSizeInBits()); + V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL); + V = DAG.getBitcast(ExtVT, V); + } return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V)); } @@ -10517,26 +10868,6 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(Mask[0] < 2 && "We sort V1 to be the first input."); assert(Mask[1] >= 2 && "We sort V2 to be the second input."); - // If we have a blend of two same-type PACKUS operations and the blend aligns - // with the low and high halves, we can just merge the PACKUS operations. - // This is particularly important as it lets us merge shuffles that this - // routine itself creates. - auto GetPackNode = [](SDValue V) { - V = peekThroughBitcasts(V); - return V.getOpcode() == X86ISD::PACKUS ? V : SDValue(); - }; - if (SDValue V1Pack = GetPackNode(V1)) - if (SDValue V2Pack = GetPackNode(V2)) { - EVT PackVT = V1Pack.getValueType(); - if (PackVT == V2Pack.getValueType()) - return DAG.getBitcast(MVT::v2i64, - DAG.getNode(X86ISD::PACKUS, DL, PackVT, - Mask[0] == 0 ? V1Pack.getOperand(0) - : V1Pack.getOperand(1), - Mask[1] == 2 ? V2Pack.getOperand(0) - : V2Pack.getOperand(1))); - } - // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -10569,10 +10900,16 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use byte rotation instructions. 
// Its more profitable for pre-SSSE3 to use shuffles/unpacks. - if (Subtarget.hasSSSE3()) + if (Subtarget.hasSSSE3()) { + if (Subtarget.hasVLX()) + if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG)) return Rotate; + } // If we have direct support for blends, we should lower by decomposing into // a permute. That will be faster than the domain cross. @@ -10736,6 +11073,15 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(Mask, DL, DAG)); } + // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid + // in SSE1 because otherwise they are widened to v2f64 and never get here. + if (!Subtarget.hasSSE2()) { + if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1})) + return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1); + if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3})) + return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1); + } + // Otherwise, use a straight shuffle of a single input vector. We pass the // input vector to both operands to simulate this with a SHUFPS. return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1, @@ -10768,11 +11114,14 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, return BlendPerm; } - // Use low/high mov instructions. - if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) - return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); - if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) - return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); + // Use low/high mov instructions. These are only valid in SSE1 because + // otherwise they are widened to v2f64 and never get here. + if (!Subtarget.hasSSE2()) { + if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) + return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2); + if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7})) + return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1); + } // Use dedicated unpack instructions for masks that match their pattern. if (SDValue V = @@ -10857,10 +11206,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Try to use byte rotation instructions. // Its more profitable for pre-SSSE3 to use shuffles/unpacks. - if (Subtarget.hasSSSE3()) + if (Subtarget.hasSSSE3()) { + if (Subtarget.hasVLX()) + if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2, + Mask, Subtarget, DAG)) + return Rotate; + if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG)) return Rotate; + } // Assume that a single SHUFPS is faster than an alternative sequence of // multiple instructions (even if the CPU has a domain penalty). @@ -11449,6 +11804,11 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; + // Use dedicated pack instructions for masks that match their pattern. + if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, + DAG, Subtarget)) + return V; + // Try to use byte rotation instructions. 
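// Illustrative sketch, separate from the patch: the SSE1-only cases above map
// directly onto the MOVLHPS/MOVHLPS semantics. MOVLHPS copies the low two
// floats of the source into the high half of the destination; MOVHLPS copies
// the high two floats of the source into the low half. With the same register
// for both operands they realise the unary {0,1,0,1} and {2,3,2,3} masks, and
// with two inputs they realise {0,1,4,5} and {2,3,6,7}.
#include <array>
using V4 = std::array<float, 4>;
static V4 movlhps(V4 Dst, V4 Src) { return {Dst[0], Dst[1], Src[0], Src[1]}; }
static V4 movhlps(V4 Dst, V4 Src) { return {Src[2], Src[3], Dst[2], Dst[3]}; }
// movlhps(V, V) == {V[0], V[1], V[0], V[1]}   -> mask {0,1,0,1}
// movhlps(V, V) == {V[2], V[3], V[2], V[3]}   -> mask {2,3,2,3}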
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1, Mask, Subtarget, DAG)) @@ -11499,6 +11859,11 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG)) return V; + // Use dedicated pack instructions for masks that match their pattern. + if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG, + Subtarget)) + return V; + // Try to use byte rotation instructions. if (SDValue Rotate = lowerVectorShuffleAsByteRotate( DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG)) @@ -11619,6 +11984,11 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG)) return Rotate; + // Use dedicated pack instructions for masks that match their pattern. + if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG, + Subtarget)) + return V; + // Try to use a zext lowering. if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend( DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -12105,7 +12475,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask, - SelectionDAG &DAG) { + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // FIXME: This should probably be generalized for 512-bit vectors as well. assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!"); int Size = Mask.size(); @@ -12114,12 +12485,21 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, // If there are only inputs from one 128-bit lane, splitting will in fact be // less expensive. The flags track whether the given lane contains an element // that crosses to another lane. - bool LaneCrossing[2] = {false, false}; - for (int i = 0; i < Size; ++i) - if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) - LaneCrossing[(Mask[i] % Size) / LaneSize] = true; - if (!LaneCrossing[0] || !LaneCrossing[1]) - return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + if (!Subtarget.hasAVX2()) { + bool LaneCrossing[2] = {false, false}; + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize) + LaneCrossing[(Mask[i] % Size) / LaneSize] = true; + if (!LaneCrossing[0] || !LaneCrossing[1]) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + } else { + bool LaneUsed[2] = {false, false}; + for (int i = 0; i < Size; ++i) + if (Mask[i] >= 0) + LaneUsed[(Mask[i] / LaneSize)] = true; + if (!LaneUsed[0] || !LaneUsed[1]) + return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG); + } assert(V2.isUndef() && "This last part of this routine only works on single input shuffles"); @@ -12132,14 +12512,12 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT, : Mask[i] % LaneSize + (i / LaneSize) * LaneSize + Size); - // Flip the vector, and blend the results which should now be in-lane. The - // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and - // 5 for the high source. The value 3 selects the high half of source 2 and - // the value 2 selects the low half of source 2. We only use source 2 to - // allow folding it into a memory operand. 
- unsigned PERMMask = 3 | 2 << 4; - SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT), - V1, DAG.getConstant(PERMMask, DL, MVT::i8)); + // Flip the vector, and blend the results which should now be in-lane. + MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64; + SDValue Flipped = DAG.getBitcast(PVT, V1); + Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT), + { 2, 3, 0, 1 }); + Flipped = DAG.getBitcast(VT, Flipped); return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask); } @@ -12149,6 +12527,10 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding. + if (Subtarget.hasAVX2() && V2.isUndef()) + return SDValue(); + SmallVector<int, 4> WidenedMask; if (!canWidenShuffleElements(Mask, WidenedMask)) return SDValue(); @@ -12162,19 +12544,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, Zeroable, Subtarget, DAG)) return Blend; - bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode()); - bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode()); + bool IsLowZero = (Zeroable & 0x3) == 0x3; + bool IsHighZero = (Zeroable & 0xc) == 0xc; // If either input operand is a zero vector, use VPERM2X128 because its mask // allows us to replace the zero input with an implicit zero. - if (!IsV1Zero && !IsV2Zero) { + if (!IsLowZero && !IsHighZero) { // Check for patterns which can be matched with a single insert of a 128-bit // subvector. bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}); if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) { - // With AVX2, use VPERMQ/VPERMPD to allow memory folding. - if (Subtarget.hasAVX2() && V2.isUndef()) - return SDValue(); // With AVX1, use vperm2f128 (below) to allow load folding. Otherwise, // this will likely become vinsertf128 which can't fold a 256-bit memop. @@ -12189,6 +12568,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV); } } + + // Try to use SHUF128 if possible. + if (Subtarget.hasVLX()) { + if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) { + unsigned PermMask = ((WidenedMask[0] % 2) << 0) | + ((WidenedMask[1] % 2) << 1); + return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2, + DAG.getConstant(PermMask, DL, MVT::i8)); + } + } } // Otherwise form a 128-bit permutation. After accounting for undefs, @@ -12204,30 +12593,17 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1, // [6] - ignore // [7] - zero high half of destination - int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0]; - int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1]; + assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?"); - unsigned PermMask = MaskLO | (MaskHI << 4); + unsigned PermMask = 0; + PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0); + PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4); - // If either input is a zero vector, replace it with an undef input. - // Shuffle mask values < 4 are selecting elements of V1. - // Shuffle mask values >= 4 are selecting elements of V2. - // Adjust each half of the permute mask by clearing the half that was - // selecting the zero vector and setting the zero mask bit. - if (IsV1Zero) { + // Check the immediate mask and replace unused sources with undef. 
+ if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00) V1 = DAG.getUNDEF(VT); - if (MaskLO < 2) - PermMask = (PermMask & 0xf0) | 0x08; - if (MaskHI < 2) - PermMask = (PermMask & 0x0f) | 0x80; - } - if (IsV2Zero) { + if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20) V2 = DAG.getUNDEF(VT); - if (MaskLO >= 2) - PermMask = (PermMask & 0xf0) | 0x08; - if (MaskHI >= 2) - PermMask = (PermMask & 0x0f) | 0x80; - } return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2, DAG.getConstant(PermMask, DL, MVT::i8)); @@ -12311,7 +12687,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes( return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask); } -/// Lower shuffles where an entire half of a 256-bit vector is UNDEF. +/// Lower shuffles where an entire half of a 256 or 512-bit vector is UNDEF. /// This allows for fast cases such as subvector extraction/insertion /// or shuffling smaller vector types which can lower more efficiently. static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, @@ -12319,7 +12695,8 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, ArrayRef<int> Mask, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(VT.is256BitVector() && "Expected 256-bit vector"); + assert((VT.is256BitVector() || VT.is512BitVector()) && + "Expected 256-bit or 512-bit vector"); unsigned NumElts = VT.getVectorNumElements(); unsigned HalfNumElts = NumElts / 2; @@ -12415,6 +12792,10 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT, } } + // AVX512 - XXXXuuuu - always extract lowers. + if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0)) + return SDValue(); + auto GetHalfVector = [&](int HalfIdx) { if (HalfIdx < 0) return DAG.getUNDEF(HalfVT); @@ -12729,7 +13110,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask, - DAG); + DAG, Subtarget); } // Use dedicated unpack instructions for masks that match their pattern. @@ -12810,7 +13191,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, SmallVector<int, 2> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) { SmallVector<int, 4> PSHUFDMask; - scaleShuffleMask(2, RepeatedMask, PSHUFDMask); + scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask); return DAG.getBitcast( MVT::v4i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32, @@ -12932,7 +13313,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Otherwise, fall back. return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask, - DAG); + DAG, Subtarget); } // Try to simplify this by merging 128-bit lanes to enable a lane-based @@ -13112,6 +13493,11 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG)) return V; + // Use dedicated pack instructions for masks that match their pattern. + if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG, + Subtarget)) + return V; + // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -13133,7 +13519,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // element types. 
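// Illustrative sketch, separate from the patch: an emulation of the
// VPERM2X128 immediate used above. Bits [1:0] pick the 128-bit half written
// to the low half of the result (0/1 = halves of V1, 2/3 = halves of V2) and
// bit 3 zeroes the low half instead; bits [5:4] and bit 7 do the same for the
// high half. That is why known-zero halves OR in 0x08 / 0x80, why the low
// half reads V1 only when (Imm & 0x0a) == 0x00, and why it reads V2 only when
// (Imm & 0x0a) == 0x02 (likewise 0xa0 / 0x20 for the high half).
#include <array>
#include <cstdint>
using V4I64 = std::array<uint64_t, 4>;  // two 128-bit halves, two u64 each
static V4I64 vperm2x128(const V4I64 &V1, const V4I64 &V2, unsigned Imm) {
  auto Half = [&](unsigned Sel) -> std::array<uint64_t, 2> {
    const V4I64 &Src = (Sel & 2) ? V2 : V1;   // selector bit 1: which source
    unsigned Base = (Sel & 1) ? 2 : 0;        // selector bit 0: low/high half
    return {Src[Base], Src[Base + 1]};
  };
  std::array<uint64_t, 2> Zero = {0, 0};
  std::array<uint64_t, 2> Lo = (Imm & 0x08) ? Zero : Half(Imm & 0x3);
  std::array<uint64_t, 2> Hi = (Imm & 0x80) ? Zero : Half((Imm >> 4) & 0x3);
  return {Lo[0], Lo[1], Hi[0], Hi[1]};
}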
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2, - Mask, DAG); + Mask, DAG, Subtarget); SmallVector<int, 8> RepeatedMask; if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) { @@ -13198,6 +13584,11 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG)) return V; + // Use dedicated pack instructions for masks that match their pattern. + if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG, + Subtarget)) + return V; + // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -13218,7 +13609,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // element types. if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask, - DAG); + DAG, Subtarget); if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB( DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG)) @@ -13485,6 +13876,15 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, // Otherwise, fall back to a SHUFPS sequence. return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG); } + + // If we have a single input shuffle with different shuffle patterns in the + // 128-bit lanes and don't lane cross, use variable mask VPERMILPS. + if (V2.isUndef() && + !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) { + SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true); + return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask); + } + // If we have AVX512F support, we can use VEXPAND. if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask, V1, V2, DAG, Subtarget)) @@ -13503,10 +13903,6 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!"); assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!"); - if (SDValue Shuf128 = - lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) - return Shuf128; - if (V2.isUndef()) { // When the shuffle is mirrored between the 128-bit lanes of the unit, we // can use lower latency instructions that will operate on all four @@ -13514,7 +13910,7 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, SmallVector<int, 2> Repeated128Mask; if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) { SmallVector<int, 4> PSHUFDMask; - scaleShuffleMask(2, Repeated128Mask, PSHUFDMask); + scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask); return DAG.getBitcast( MVT::v8i64, DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32, @@ -13528,6 +13924,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG)); } + if (SDValue Shuf128 = + lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG)) + return Shuf128; + // Try to use shift instructions. if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask, Zeroable, Subtarget, DAG)) @@ -13758,6 +14158,11 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask, DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG)) return Insertion; + // Handle special cases where the lower or upper half is UNDEF. 
+ if (SDValue V = + lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG)) + return V; + // Check for being able to broadcast a single element. if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG)) @@ -14046,16 +14451,16 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); - // If this VSELECT has a vector if i1 as a mask, it will be directly matched - // with patterns on the mask registers on AVX-512. - if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) - return Op; - // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) return BlendOp; + // If this VSELECT has a vector if i1 as a mask, it will be directly matched + // with patterns on the mask registers on AVX-512. + if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) + return Op; + // Variable blends are only legal from SSE4.1 onward. if (!Subtarget.hasSSE41()) return SDValue(); @@ -14097,10 +14502,6 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { case MVT::v8i16: case MVT::v16i16: - // AVX-512 BWI and VLX features support VSELECT with i16 elements. - if (Subtarget.hasBWI() && Subtarget.hasVLX()) - return Op; - // FIXME: We should custom lower this by fixing the condition and using i8 // blends. return SDValue(); @@ -14117,9 +14518,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { if (VT.getSizeInBits() == 8) { SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); - SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, - DAG.getValueType(VT)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (VT == MVT::f32) { @@ -14153,8 +14552,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) { /// Extract one bit from mask vector, like v16i1 or v8i1. /// AVX-512 feature. -SDValue -X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const { +static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDValue Vec = Op.getOperand(0); SDLoc dl(Vec); MVT VecVT = Vec.getSimpleValueType(); @@ -14171,30 +14570,42 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const // Extending v8i1/v16i1 to 512-bit get better performance on KNL // than extending to 128/256bit. unsigned VecSize = (NumElts <= 4 ? 128 : 512); - MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts); + MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts); SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec); SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ExtVT.getVectorElementType(), Ext, Idx); return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt); } + // Canonicalize result type to MVT::i32. + if (EltVT != MVT::i32) { + SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, + Vec, Idx); + return DAG.getAnyExtOrTrunc(Extract, dl, EltVT); + } + unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); + + // Extracts from element 0 are always allowed. 
+ if (IdxVal == 0) + return Op; + + // If the kshift instructions of the correct width aren't natively supported + // then we need to promote the vector to the native size to get the correct + // zeroing behavior. if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) || (VecVT.getVectorNumElements() < 8)) { - // Use kshiftlw/rw instruction. VecVT = MVT::v16i1; Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, DAG.getUNDEF(VecVT), Vec, DAG.getIntPtrConstant(0, dl)); } - unsigned MaxSift = VecVT.getVectorNumElements() - 1; - if (MaxSift - IdxVal) - Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, - DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8)); + + // Use kshiftr instruction to move to the lower element. Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, - DAG.getConstant(MaxSift, dl, MVT::i8)); - return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec, DAG.getIntPtrConstant(0, dl)); } @@ -14207,7 +14618,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, SDValue Idx = Op.getOperand(1); if (VecVT.getVectorElementType() == MVT::i1) - return ExtractBitFromMaskVector(Op, DAG); + return ExtractBitFromMaskVector(Op, DAG, Subtarget); if (!isa<ConstantSDNode>(Idx)) { // Its more profitable to go through memory (1 cycles throughput) @@ -14278,9 +14689,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, // Transform it so it match pextrw which produces a 32-bit result. SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Op.getOperand(0), Op.getOperand(1)); - SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract, - DAG.getValueType(VT)); - return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract); } if (Subtarget.hasSSE41()) @@ -14347,8 +14756,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, /// Insert one bit to mask vector, like v16i1 or v8i1. /// AVX-512 feature. -SDValue -X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { +static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDLoc dl(Op); SDValue Vec = Op.getOperand(0); SDValue Elt = Op.getOperand(1); @@ -14358,8 +14767,10 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { if (!isa<ConstantSDNode>(Idx)) { // Non constant index. Extend source and destination, // insert element and then truncate the result. - MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32); - MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32); + unsigned NumElts = VecVT.getVectorNumElements(); + unsigned VecSize = (NumElts <= 4 ? 128 : 512); + MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts); + MVT ExtEltVT = ExtVecVT.getVectorElementType(); SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT, DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec), DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx); @@ -14367,10 +14778,24 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { } unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); unsigned NumElems = VecVT.getVectorNumElements(); - if(Vec.isUndef()) { + // If the kshift instructions of the correct width aren't natively supported + // then we need to promote the vector to the native size to get the correct + // zeroing behavior. 
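// Illustrative sketch, separate from the patch: a vXi1 mask register behaves
// like an X-bit integer, so the extract path above is a logical right shift
// by the element index followed by reading bit 0. Promoting narrow masks to
// v16i1 first only ensures the KSHIFT zero-fill width matches a real
// kshiftrw.
#include <cstdint>
static unsigned extractMaskBit(uint16_t Mask, unsigned Idx) {
  return (Mask >> Idx) & 1;   // KSHIFTR by Idx + extract element 0
}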
+ if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) { + // Need to promote to v16i1, do the insert, then extract back. + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1, + DAG.getUNDEF(MVT::v16i1), Vec, + DAG.getIntPtrConstant(0, dl)); + Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op, + DAG.getIntPtrConstant(0, dl)); + } + + SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt); + + if (Vec.isUndef()) { if (IdxVal) EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); @@ -14393,25 +14818,33 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } // Insertion of one bit into last position - if (IdxVal == NumElems -1) { + if (IdxVal == NumElems - 1) { // Move the bit to the last position inside the vector. EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec, DAG.getConstant(IdxVal, dl, MVT::i8)); // Clean the last bit in the source vector. Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec, - DAG.getConstant(1, dl, MVT::i8)); + DAG.getConstant(1, dl, MVT::i8)); Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, - DAG.getConstant(1 , dl, MVT::i8)); + DAG.getConstant(1 , dl, MVT::i8)); return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec); } - // Use shuffle to insert element. - SmallVector<int, 64> MaskVec(NumElems); - for (unsigned i = 0; i != NumElems; ++i) - MaskVec[i] = (i == IdxVal) ? NumElems : i; - - return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec); + // Move the current value of the bit to be replace to bit 0. + SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec, + DAG.getConstant(IdxVal, dl, MVT::i8)); + // Xor with the new bit. + Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec); + // Shift to MSB, filling bottom bits with 0. + Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged, + DAG.getConstant(NumElems - 1, dl, MVT::i8)); + // Shift to the final position, filling upper bits with 0. + Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged, + DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8)); + // Xor with original vector to cancel out the original bit value that's still + // present. + return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec); } SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, @@ -14421,7 +14854,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, unsigned NumElts = VT.getVectorNumElements(); if (EltVT == MVT::i1) - return InsertBitToMaskVector(Op, DAG); + return InsertBitToMaskVector(Op, DAG, Subtarget); SDLoc dl(Op); SDValue N0 = Op.getOperand(0); @@ -14444,7 +14877,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, for (unsigned i = 0; i != NumElts; ++i) BlendMask.push_back(i == IdxVal ? i + NumElts : i); SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl) - : DAG.getConstant(-1, dl, VT); + : getOnesVector(VT, DAG, dl); return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask); } @@ -14513,7 +14946,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, // Bits [3:0] of the constant are the zero mask. The DAG Combiner may // combine either bitwise AND or insert of float 0.0 to set these bits. 
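// Illustrative sketch, separate from the patch: a scalar model of the
// shift/xor insertion sequence above, treating a vXi1 mask of up to 16
// elements as a 16-bit integer. It shows that the new bit really does replace
// bit Idx while every other bit cancels out.
#include <cassert>
#include <cstdint>
static uint16_t insertMaskBit(uint16_t Vec, unsigned Bit, unsigned Idx,
                              unsigned N /* number of mask elements */) {
  uint16_t EltInVec = Bit & 1;                 // SCALAR_TO_VECTOR: bit 0
  uint16_t Merged = Vec >> Idx;                // KSHIFTR: old bit -> bit 0
  Merged ^= EltInVec;                          // bit 0 = old ^ new
  Merged = (uint16_t)(Merged << (N - 1));      // KSHIFTL: keep only bit 0
  Merged >>= (N - 1 - Idx);                    // KSHIFTR: move it to Idx
  return Merged ^ Vec;                         // (old ^ new) ^ old == new
}
// Quick check against the naive formulation.
static void checkInsertMaskBit() {
  for (unsigned Idx = 0; Idx != 16; ++Idx)
    for (unsigned Bit = 0; Bit != 2; ++Bit) {
      uint16_t Vec = 0xA5C3;
      uint16_t Expected = (uint16_t)((Vec & ~(1u << Idx)) | (Bit << Idx));
      assert(insertMaskBit(Vec, Bit, Idx, 16) == Expected);
    }
}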
- bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); + bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize(); if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) { // If this is an insertion of 32-bits into the low 32-bits of // a vector, we prefer to generate a blend with immediate rather @@ -14574,48 +15007,6 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget, OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt)); } -// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in -// a simple subregister reference or explicit instructions to grab -// upper bits of a vector. -static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { - assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX"); - - SDLoc dl(Op); - SDValue In = Op.getOperand(0); - SDValue Idx = Op.getOperand(1); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - MVT ResVT = Op.getSimpleValueType(); - - // When v1i1 is legal a scalarization of a vselect with a vXi1 Cond - // would result with: v1i1 = extract_subvector(vXi1, idx). - // Lower these into extract_vector_elt which is already selectable. - if (ResVT == MVT::v1i1) { - assert(Subtarget.hasAVX512() && - "Boolean EXTRACT_SUBVECTOR requires AVX512"); - - MVT EltVT = ResVT.getVectorElementType(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - MVT LegalVT = - (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT(); - SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx); - return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res); - } - - assert((In.getSimpleValueType().is256BitVector() || - In.getSimpleValueType().is512BitVector()) && - "Can only extract from 256-bit or 512-bit vectors"); - - // If the input is a buildvector just emit a smaller one. - unsigned ElemsPerChunk = ResVT.getVectorNumElements(); - if (In.getOpcode() == ISD::BUILD_VECTOR) - return DAG.getBuildVector( - ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk)); - - // Everything else is legal. - return Op; -} - // Lower a node with an INSERT_SUBVECTOR opcode. This may result in a // simple superregister reference or explicit instructions to insert // the upper bits of a vector. @@ -14696,7 +15087,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const { // In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the // global base reg. 
- const Module *Mod = DAG.getMachineFunction().getFunction()->getParent(); + const Module *Mod = DAG.getMachineFunction().getFunction().getParent(); unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod); auto PtrVT = getPointerTy(DAG.getDataLayout()); @@ -15516,24 +15907,12 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op, switch (SrcVT.SimpleTy) { default: llvm_unreachable("Custom UINT_TO_FP is not supported!"); - case MVT::v4i8: - case MVT::v4i16: - case MVT::v8i8: - case MVT::v8i16: { - MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements()); - return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0)); - } case MVT::v2i32: return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl); case MVT::v4i32: case MVT::v8i32: + assert(!Subtarget.hasAVX512()); return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget); - case MVT::v16i8: - case MVT::v16i16: - assert(Subtarget.hasAVX512()); - return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(), - DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0)); } } @@ -15543,12 +15922,6 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op, SDLoc dl(Op); auto PtrVT = getPointerTy(DAG.getDataLayout()); - // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't - // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform - // the optimization here. - if (DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0); - if (Op.getSimpleValueType().isVector()) return lowerUINT_TO_FP_vec(Op, DAG); @@ -15827,8 +16200,18 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); - if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) - return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In); + if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && + (VT != MVT::v8i32 || InVT != MVT::v8i16) && + (VT != MVT::v16i16 || InVT != MVT::v16i8) && + (VT != MVT::v8i64 || InVT != MVT::v8i32) && + (VT != MVT::v8i64 || InVT != MVT::v8i16) && + (VT != MVT::v16i32 || InVT != MVT::v16i16) && + (VT != MVT::v16i32 || InVT != MVT::v16i8) && + (VT != MVT::v32i16 || InVT != MVT::v32i8)) + return SDValue(); + + if (Subtarget.hasInt256()) + return DAG.getNode(X86ISD::VZEXT, dl, VT, In); // Optimize vectors in AVX mode: // @@ -15843,14 +16226,6 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, // Concat upper and lower parts. 
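// Illustrative sketch, separate from the patch: the pre-AVX2 extend path kept
// below appears to widen an integer vector by interleaving it with a zero
// vector (undef for ANY_EXTEND): unpcklbw supplies the zero-extended low
// half, unpckhbw the high half, and the two halves are concatenated. A
// byte-level model of the two interleaves (little-endian lane order):
#include <array>
#include <cstdint>
static std::array<uint8_t, 16> unpcklbw(const std::array<uint8_t, 16> &A,
                                        const std::array<uint8_t, 16> &B) {
  std::array<uint8_t, 16> R;
  for (unsigned i = 0; i != 8; ++i) { R[2 * i] = A[i]; R[2 * i + 1] = B[i]; }
  return R;
}
static std::array<uint8_t, 16> unpckhbw(const std::array<uint8_t, 16> &A,
                                        const std::array<uint8_t, 16> &B) {
  std::array<uint8_t, 16> R;
  for (unsigned i = 0; i != 8; ++i) { R[2 * i] = A[8 + i]; R[2 * i + 1] = B[8 + i]; }
  return R;
}
// With B = all-zeros, each source byte gains a zero byte above it, so the two
// results reinterpreted as v8i16 are the low and high halves of the
// zero-extended v16i16.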
// - if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) && - ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) && - ((VT != MVT::v4i64) || (InVT != MVT::v4i32))) - return SDValue(); - - if (Subtarget.hasInt256()) - return DAG.getNode(X86ISD::VZEXT, dl, VT, In); - SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl); SDValue Undef = DAG.getUNDEF(InVT); bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND; @@ -15866,39 +16241,60 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi); } -static SDValue LowerZERO_EXTEND_AVX512(SDValue Op, - const X86Subtarget &Subtarget, SelectionDAG &DAG) { +static SDValue LowerZERO_EXTEND_Mask(SDValue Op, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); + assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); SDLoc DL(Op); unsigned NumElts = VT.getVectorNumElements(); - if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 && - (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) - return DAG.getNode(X86ISD::VZEXT, DL, VT, In); + // Extend VT if the scalar type is v8/v16 and BWI is not supported. + MVT ExtVT = VT; + if (!Subtarget.hasBWI() && + (VT.getVectorElementType().getSizeInBits() <= 16)) + ExtVT = MVT::getVectorVT(MVT::i32, NumElts); + + // Widen to 512-bits if VLX is not supported. + MVT WideVT = ExtVT; + if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { + NumElts *= 512 / ExtVT.getSizeInBits(); + InVT = MVT::getVectorVT(MVT::i1, NumElts); + In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT), + In, DAG.getIntPtrConstant(0, DL)); + WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), + NumElts); + } - if (InVT.getVectorElementType() != MVT::i1) - return SDValue(); + SDValue One = DAG.getConstant(1, DL, WideVT); + SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL); - // Extend VT if the target is 256 or 128bit vector and VLX is not supported. - MVT ExtVT = VT; - if (!VT.is512BitVector() && !Subtarget.hasVLX()) - ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero); - SDValue One = - DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT); - SDValue Zero = - DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT); + // Truncate if we had to extend i16/i8 above. + if (VT != ExtVT) { + WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts); + SelectedVal = DAG.getNode(X86ISD::VTRUNC, DL, WideVT, SelectedVal); + } + + // Extract back to 128/256-bit if we widened. 
+ if (WideVT != VT) + SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal, + DAG.getIntPtrConstant(0, DL)); - SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero); - if (VT == ExtVT) - return SelectedVal; - return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal); + return SelectedVal; } static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { + SDValue In = Op->getOperand(0); + MVT InVT = In.getSimpleValueType(); + + if (InVT.getVectorElementType() == MVT::i1) + return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); + if (Subtarget.hasFp256()) if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; @@ -15908,32 +16304,33 @@ static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget, static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - SDLoc DL(Op); - MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT SVT = In.getSimpleValueType(); - if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1) - return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG); + if (SVT.getVectorElementType() == MVT::i1) + return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG); if (Subtarget.hasFp256()) if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget)) return Res; - assert(!VT.is256BitVector() || !SVT.is128BitVector() || - VT.getVectorNumElements() != SVT.getVectorNumElements()); + assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() || + Op.getSimpleValueType().getVectorNumElements() != + SVT.getVectorNumElements()); return SDValue(); } -/// Helper to recursively truncate vector elements in half with PACKSS. -/// It makes use of the fact that vector comparison results will be all-zeros -/// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types. -/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates +/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS. +/// It makes use of the fact that vectors with enough leading sign/zero bits +/// prevent the PACKSS/PACKUS from saturating the results. +/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates /// within each 128-bit lane. -static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In, - const SDLoc &DL, - SelectionDAG &DAG, - const X86Subtarget &Subtarget) { +static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In, + const SDLoc &DL, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) && + "Unexpected PACK opcode"); + // Requires SSE2 but AVX512 has fast truncate. if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) return SDValue(); @@ -15946,40 +16343,52 @@ static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In, // We only support vector truncation to 128bits or greater from a // 256bits or greater source. 
- if ((DstVT.getSizeInBits() % 128) != 0) - return SDValue(); - if ((SrcVT.getSizeInBits() % 256) != 0) + unsigned DstSizeInBits = DstVT.getSizeInBits(); + unsigned SrcSizeInBits = SrcVT.getSizeInBits(); + if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0) return SDValue(); + LLVMContext &Ctx = *DAG.getContext(); unsigned NumElems = SrcVT.getVectorNumElements(); assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation"); - assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation"); + assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation"); - EVT PackedSVT = - EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2); + EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2); // Extract lower/upper subvectors. unsigned NumSubElts = NumElems / 2; - unsigned SrcSizeInBits = SrcVT.getSizeInBits(); SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2); SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2); - // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors. + // Pack to the largest type possible: + // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB. + EVT InVT = MVT::i16, OutVT = MVT::i8; + if (DstVT.getScalarSizeInBits() > 8 && + (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) { + InVT = MVT::i32; + OutVT = MVT::i16; + } + + unsigned SubSizeInBits = SrcSizeInBits / 2; + InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits()); + OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits()); + + // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors. if (SrcVT.is256BitVector()) { - Lo = DAG.getBitcast(MVT::v8i16, Lo); - Hi = DAG.getBitcast(MVT::v8i16, Hi); - SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi); + Lo = DAG.getBitcast(InVT, Lo); + Hi = DAG.getBitcast(InVT, Hi); + SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); return DAG.getBitcast(DstVT, Res); } - // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors. - // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS). + // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors. + // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK). if (SrcVT.is512BitVector() && Subtarget.hasInt256()) { - Lo = DAG.getBitcast(MVT::v16i16, Lo); - Hi = DAG.getBitcast(MVT::v16i16, Hi); - SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi); + Lo = DAG.getBitcast(InVT, Lo); + Hi = DAG.getBitcast(InVT, Hi); + SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi); - // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), + // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)), // so we need to shuffle to get ((LO0,HI0),(LO1,HI1)). Res = DAG.getBitcast(MVT::v4i64, Res); Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3}); @@ -15988,20 +16397,20 @@ static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In, return DAG.getBitcast(DstVT, Res); // If 512bit -> 128bit truncate another stage. - EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems); + EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); Res = DAG.getBitcast(PackedVT, Res); - return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget); + return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } // Recursively pack lower/upper subvectors, concat result and pack again. 
- assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater"); - EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2); - Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget); - Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget); + assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater"); + EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts); + Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget); + Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget); - PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems); + PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems); SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi); - return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget); + return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget); } static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG, @@ -16047,15 +16456,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); MVT InVT = In.getSimpleValueType(); + unsigned InNumEltBits = InVT.getScalarSizeInBits(); - if (VT == MVT::i1) { - assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) && - "Invalid scalar TRUNCATE operation"); - if (InVT.getSizeInBits() >= 32) - return SDValue(); - In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In); - return DAG.getNode(ISD::TRUNCATE, DL, VT, In); - } assert(VT.getVectorNumElements() == InVT.getVectorNumElements() && "Invalid TRUNCATE operation"); @@ -16071,9 +16473,23 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const { return DAG.getNode(X86ISD::VTRUNC, DL, VT, In); } - // Truncate with PACKSS if we are truncating a vector zero/all-bits result. - if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In)) - if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget)) + // Truncate with PACKSS if we are truncating a vector with sign-bits that + // extend all the way to the packed/truncated value. + unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16); + if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In)) + if (SDValue V = + truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget)) + return V; + + // Truncate with PACKUS if we are truncating a vector with leading zero bits + // that extend all the way to the packed/truncated value. + // Pre-SSE41 we can only use PACKUSWB. + KnownBits Known; + DAG.computeKnownBits(In, Known); + NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8; + if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros()) + if (SDValue V = + truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget)) return V; if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) { @@ -16579,16 +16995,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, // non-casted variable when we check for possible users. switch (ArithOp.getOpcode()) { case ISD::ADD: - // Due to an isel shortcoming, be conservative if this add is likely to be - // selected as part of a load-modify-store instruction. When the root node - // in a match is a store, isel doesn't know how to remap non-chain non-flag - // uses of other nodes in the match, such as the ADD in this case. This - // leads to the ADD being left around and reselected, with the result being - // two adds in the output. 
Alas, even if none our users are stores, that - // doesn't prove we're O.K. Ergo, if we have any parents that aren't - // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require - // climbing the DAG back to the root, and it doesn't seem to be worth the - // effort. + // We only want to rewrite this as a target-specific node with attached + // flags if there is a reasonable chance of either using that to do custom + // instructions selection that can fold some of the memory operands, or if + // only the flags are used. If there are other uses, leave the node alone + // and emit a test instruction. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) if (UI->getOpcode() != ISD::CopyToReg && @@ -16596,17 +17007,20 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, UI->getOpcode() != ISD::STORE) goto default_case; - if (ConstantSDNode *C = - dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) { + if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) { // An add of one will be selected as an INC. - if (C->isOne() && !Subtarget.slowIncDec()) { + if (C->isOne() && + (!Subtarget.slowIncDec() || + DAG.getMachineFunction().getFunction().optForSize())) { Opcode = X86ISD::INC; NumOperands = 1; break; } // An add of negative one (subtract of one) will be selected as a DEC. - if (C->isAllOnesValue() && !Subtarget.slowIncDec()) { + if (C->isAllOnesValue() && + (!Subtarget.slowIncDec() || + DAG.getMachineFunction().getFunction().optForSize())) { Opcode = X86ISD::DEC; NumOperands = 1; break; @@ -16699,11 +17113,13 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl, case ISD::SUB: case ISD::OR: case ISD::XOR: - // Due to the ISEL shortcoming noted above, be conservative if this op is - // likely to be selected as part of a load-modify-store instruction. + // Similar to ISD::ADD above, check if the uses will preclude useful + // lowering of the target-specific node. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() == ISD::STORE) + if (UI->getOpcode() != ISD::CopyToReg && + UI->getOpcode() != ISD::SETCC && + UI->getOpcode() != ISD::STORE) goto default_case; // Otherwise use a regular EFLAGS-setting instruction. @@ -16799,7 +17215,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, // with an immediate. 16 bit immediates are to be avoided. if ((Op0.getValueType() == MVT::i16 && (isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) && - !DAG.getMachineFunction().getFunction()->optForMinSize() && + !DAG.getMachineFunction().getFunction().optForMinSize() && !Subtarget.isAtom()) { unsigned ExtendOp = isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND; @@ -16808,8 +17224,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, } // Use SUB instead of CMP to enable CSE between SUB and CMP. SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32); - SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, - Op0, Op1); + SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1); return SDValue(Sub.getNode(), 1); } return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1); @@ -16871,8 +17286,11 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op, // instructions: convert to single, rsqrtss, convert back to double, refine // (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA // along with FMA, this could be a throughput win. 
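[Editor's illustration of the "one refinement step" the estimate code above requests, not code from the patch: a single Newton-Raphson iteration on the hardware reciprocal square-root estimate. It uses the standard SSE1 intrinsics; refined_rsqrt is a made-up name, and the snippet is a sketch rather than the lowering itself.]

#include <xmmintrin.h>

// One Newton-Raphson step on the roughly 12-bit RSQRTPS estimate:
//   est' = est * (1.5 - 0.5 * a * est * est)
// which approximately doubles the number of correct bits.
static inline __m128 refined_rsqrt(__m128 a) {
  const __m128 half = _mm_set1_ps(0.5f);
  const __m128 threeHalves = _mm_set1_ps(1.5f);
  __m128 est = _mm_rsqrt_ps(a);
  __m128 est2 = _mm_mul_ps(est, est);
  __m128 corr =
      _mm_sub_ps(threeHalves, _mm_mul_ps(_mm_mul_ps(half, a), est2));
  return _mm_mul_ps(est, corr);
}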
+ // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32 + // after legalize types. if ((VT == MVT::f32 && Subtarget.hasSSE1()) || - (VT == MVT::v4f32 && Subtarget.hasSSE1()) || + (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) || + (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) || (VT == MVT::v8f32 && Subtarget.hasAVX())) { if (RefinementSteps == ReciprocalEstimate::Unspecified) RefinementSteps = 1; @@ -16965,6 +17383,7 @@ static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC, /// Result of 'and' is compared against zero. Change to a BT node if possible. static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, SelectionDAG &DAG) { + assert(And.getOpcode() == ISD::AND && "Expected AND node!"); SDValue Op0 = And.getOperand(0); SDValue Op1 = And.getOperand(1); if (Op0.getOpcode() == ISD::TRUNCATE) @@ -17013,36 +17432,10 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC, return SDValue(); } -// Convert (truncate (srl X, N) to i1) to (bt X, N) -static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC, - const SDLoc &dl, SelectionDAG &DAG) { - - assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 && - "Expected TRUNCATE to i1 node"); - - if (Op.getOperand(0).getOpcode() != ISD::SRL) - return SDValue(); - - SDValue ShiftRight = Op.getOperand(0); - return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1), - CC, dl, DAG); -} - -/// Result of 'and' or 'trunc to i1' is compared against zero. -/// Change to a BT node if possible. -SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC, - const SDLoc &dl, SelectionDAG &DAG) const { - if (Op.getOpcode() == ISD::AND) - return LowerAndToBT(Op, CC, dl, DAG); - if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1) - return LowerTruncateToBT(Op, CC, dl, DAG); - return SDValue(); -} - /// Turns an ISD::CondCode into a value suitable for SSE floating-point mask /// CMPs. -static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, - SDValue &Op1) { +static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, + SDValue &Op1) { unsigned SSECC; bool Swap = false; @@ -17075,8 +17468,8 @@ static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0, case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH; case ISD::SETUGT: SSECC = 6; break; case ISD::SETO: SSECC = 7; break; - case ISD::SETUEQ: - case ISD::SETONE: SSECC = 8; break; + case ISD::SETUEQ: SSECC = 8; break; + case ISD::SETONE: SSECC = 12; break; } if (Swap) std::swap(Op0, Op1); @@ -17189,6 +17582,20 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) { if (Swap) std::swap(Op0, Op1); + + // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM. + if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) { + SDValue A = peekThroughBitcasts(Op0); + if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) && + ISD::isBuildVectorAllZeros(Op1.getNode())) { + MVT VT0 = Op0.getSimpleValueType(); + SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0)); + SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1)); + return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM, + dl, VT, RHS, LHS); + } + } + if (Opc) return DAG.getNode(Opc, dl, VT, Op0, Op1); Opc = Unsigned ? 
X86ISD::CMPMU: X86ISD::CMPM; @@ -17256,25 +17663,21 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE), // emit two comparisons and a logic op to tie them together. - // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is - // available. SDValue Cmp; unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1); - if (SSECC == 8) { + if (SSECC >= 8 && !Subtarget.hasAVX()) { // LLVM predicate is SETUEQ or SETONE. unsigned CC0, CC1; unsigned CombineOpc; if (Cond == ISD::SETUEQ) { CC0 = 3; // UNORD CC1 = 0; // EQ - CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) : - static_cast<unsigned>(ISD::OR); + CombineOpc = X86ISD::FOR; } else { assert(Cond == ISD::SETONE); CC0 = 7; // ORD CC1 = 4; // NEQ - CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) : - static_cast<unsigned>(ISD::AND); + CombineOpc = X86ISD::FAND; } SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1, @@ -17379,6 +17782,24 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, DAG.getConstant(CmpMode, dl, MVT::i8)); } + // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2. + // Revert part of the simplifySetCCWithAnd combine, to avoid an invert. + if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) { + SDValue BC0 = peekThroughBitcasts(Op0); + if (BC0.getOpcode() == ISD::AND) { + APInt UndefElts; + SmallVector<APInt, 64> EltBits; + if (getTargetConstantBitsFromNode(BC0.getOperand(1), + VT.getScalarSizeInBits(), UndefElts, + EltBits, false, false)) { + if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) { + Cond = ISD::SETEQ; + Op1 = DAG.getBitcast(VT, BC0.getOperand(1)); + } + } + } + } + // We are handling one of the integer comparisons here. Since SSE only has // GT and EQ comparisons for integer, swapping operands and multiple // operations may be required for some comparisons. @@ -17399,7 +17820,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget, // Special case: Use min/max operations for SETULE/SETUGE MVT VET = VT.getVectorElementType(); bool HasMinMax = - (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) || + (Subtarget.hasAVX512() && VET == MVT::i64) || + (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) || (Subtarget.hasSSE2() && (VET == MVT::i8)); bool MinMax = false; if (HasMinMax) { @@ -17560,14 +17982,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { // Lower (X & (1 << N)) == 0 to BT(X, N). // Lower ((X >>u N) & 1) != 0 to BT(X, N). // Lower ((X >>s N) & 1) != 0 to BT(X, N). - // Lower (trunc (X >> N) to i1) to BT(X, N). - if (Op0.hasOneUse() && isNullConstant(Op1) && + if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) { - if (VT == MVT::i1) - return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC); + if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG)) return NewSetCC; - } } // Look for X == 0, X == 1, X != 0, or X != 1. 
We can simplify some forms of @@ -17584,20 +18002,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { return Op0; CCode = X86::GetOppositeBranchCondition(CCode); - SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG); - if (VT == MVT::i1) - return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); - return SetCC; - } - } - if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) { - if (isOneConstant(Op1)) { - ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true); - return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC); - } - if (!isNullConstant(Op1)) { - SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1); - return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC); + return getSETCC(CCode, Op0.getOperand(1), dl, DAG); } } @@ -17608,10 +18013,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const { SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG); EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG); - SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG); - if (VT == MVT::i1) - return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC); - return SetCC; + return getSETCC(X86CC, EFLAGS, dl, DAG); } SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const { @@ -17632,10 +18034,7 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32); SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1)); - SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG); - if (Op.getSimpleValueType() == MVT::i1) - return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC); - return SetCC; + return getSETCC(CC, Cmp.getValue(1), DL, DAG); } /// Return true if opcode is a X86 logical comparison. @@ -17646,7 +18045,7 @@ static bool isX86LogicalCmp(SDValue Op) { return true; if (Op.getResNo() == 1 && (Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC || - Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL || + Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR || Opc == X86ISD::XOR || Opc == X86ISD::AND)) return true; @@ -17684,17 +18083,17 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { (Subtarget.hasSSE1() && VT == MVT::f32)) && VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) { SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1); - int SSECC = translateX86FSETCC( + unsigned SSECC = translateX86FSETCC( cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1); - if (SSECC != 8) { - if (Subtarget.hasAVX512()) { - SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, - CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); - return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS, - DL, VT, Cmp, Op1, Op2); - } + if (Subtarget.hasAVX512()) { + SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0, + CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); + assert(!VT.isVector() && "Not a scalar type?"); + return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2); + } + if (SSECC < 8 || Subtarget.hasAVX()) { SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1, DAG.getConstant(SSECC, DL, MVT::i8)); @@ -17941,7 +18340,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { // We know the result of AND is compared against zero. Try to match // it to BT. 
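[Editorial note, not from the patch: the scalar identity behind LowerAndToBT is that comparing a single-bit mask of X against zero is exactly a test of that one bit, which is what BT computes into CF. A tiny self-contained check; both function names are made up for illustration.]

#include <cassert>
#include <cstdint>

bool viaAndMask(uint32_t X, unsigned N) { return (X & (1u << N)) != 0; }
bool viaBitTest(uint32_t X, unsigned N) { return (X >> N) & 1u; } // "BT"-like

int main() {
  for (unsigned N = 0; N != 32; ++N)
    assert(viaAndMask(0xA5A5A5A5u, N) == viaBitTest(0xA5A5A5A5u, N));
  return 0;
}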
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { - if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) { + if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); AddTest = false; @@ -17983,66 +18382,68 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const { if (T1.getValueType() == T2.getValueType() && // Blacklist CopyFromReg to avoid partial register stalls. T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){ - SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue); - SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond); + SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1, + CC, Cond); return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov); } } // X86ISD::CMOV means set the result (which is operand 1) to the RHS if // condition is true. - SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue); SDValue Ops[] = { Op2, Op1, CC, Cond }; - return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops); + return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops); } -static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op, - const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +static SDValue LowerSIGN_EXTEND_Mask(SDValue Op, + const X86Subtarget &Subtarget, + SelectionDAG &DAG) { MVT VT = Op->getSimpleValueType(0); SDValue In = Op->getOperand(0); MVT InVT = In.getSimpleValueType(); + assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!"); MVT VTElt = VT.getVectorElementType(); - MVT InVTElt = InVT.getVectorElementType(); SDLoc dl(Op); - // SKX processor - if ((InVTElt == MVT::i1) && - (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) || - - ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32)))) - - return DAG.getNode(X86ISD::VSEXT, dl, VT, In); - unsigned NumElts = VT.getVectorNumElements(); - if (VT.is512BitVector() && InVTElt != MVT::i1 && - (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) { - if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT) - return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG); - return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG); - } - - if (InVTElt != MVT::i1) - return SDValue(); - + // Extend VT if the scalar type is v8/v16 and BWI is not supported. MVT ExtVT = VT; - if (!VT.is512BitVector() && !Subtarget.hasVLX()) - ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts); + if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16) + ExtVT = MVT::getVectorVT(MVT::i32, NumElts); + + // Widen to 512-bits if VLX is not supported. 
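[For orientation only, not LLVM code: per lane, sign-extending a vXi1 mask element simply produces all-ones or all-zeros, which is why the fallback path above can use a select between an all-ones vector and a zero vector. A scalar model; sextMask8 is a made-up name.]

#include <array>
#include <cstdint>

std::array<int32_t, 8> sextMask8(uint8_t mask) {
  std::array<int32_t, 8> out{};
  for (int i = 0; i != 8; ++i)
    out[i] = ((mask >> i) & 1) ? -1 : 0; // i1 -> 0xFFFFFFFF or 0x00000000
  return out;
}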
+ MVT WideVT = ExtVT; + if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) { + NumElts *= 512 / ExtVT.getSizeInBits(); + InVT = MVT::getVectorVT(MVT::i1, NumElts); + In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT), + In, DAG.getIntPtrConstant(0, dl)); + WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts); + } SDValue V; - if (Subtarget.hasDQI()) { - V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG); - assert(!VT.is512BitVector() && "Unexpected vector type"); + MVT WideEltVT = WideVT.getVectorElementType(); + if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) || + (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) { + V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG); } else { - SDValue NegOne = getOnesVector(ExtVT, DAG, dl); - SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl); - V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero); - if (ExtVT == VT) - return V; + SDValue NegOne = getOnesVector(WideVT, DAG, dl); + SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl); + V = DAG.getSelect(dl, WideVT, In, NegOne, Zero); } - return DAG.getNode(X86ISD::VTRUNC, dl, VT, V); + // Truncate if we had to extend i16/i8 above. + if (VT != ExtVT) { + WideVT = MVT::getVectorVT(VTElt, NumElts); + V = DAG.getNode(X86ISD::VTRUNC, dl, WideVT, V); + } + + // Extract back to 128/256-bit if we widened. + if (WideVT != VT) + V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V, + DAG.getIntPtrConstant(0, dl)); + + return V; } // Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG. @@ -18139,12 +18540,17 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget, MVT InVT = In.getSimpleValueType(); SDLoc dl(Op); - if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1) - return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG); + if (InVT.getVectorElementType() == MVT::i1) + return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG); - if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && - (VT != MVT::v8i32 || InVT != MVT::v8i16) && - (VT != MVT::v16i16 || InVT != MVT::v16i8)) + if ((VT != MVT::v4i64 || InVT != MVT::v4i32) && + (VT != MVT::v8i32 || InVT != MVT::v8i16) && + (VT != MVT::v16i16 || InVT != MVT::v16i8) && + (VT != MVT::v8i64 || InVT != MVT::v8i32) && + (VT != MVT::v8i64 || InVT != MVT::v8i16) && + (VT != MVT::v16i32 || InVT != MVT::v16i16) && + (VT != MVT::v16i32 || InVT != MVT::v16i8) && + (VT != MVT::v32i16 || InVT != MVT::v32i8)) return SDValue(); if (Subtarget.hasInt256()) @@ -18311,13 +18717,10 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op, assert(VT == MVT::v32i8 && "Unexpected extload type"); - SmallVector<SDValue, 2> Chains; - SDValue BasePtr = Ld->getBasePtr(); SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), Ld->getBasePtr(), Ld->getMemOperand()); - Chains.push_back(LoadLo.getValue(1)); SDValue BasePtrHi = DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr, @@ -18326,8 +18729,9 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op, SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(), BasePtrHi, Ld->getMemOperand()); - Chains.push_back(LoadHi.getValue(1)); - SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains); + + SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, + LoadLo.getValue(1), LoadHi.getValue(1)); DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain); SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo); @@ -18443,6 +18847,12 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget, if (Ext == 
ISD::SEXTLOAD && RegSz >= 256) loadRegZize = 128; + // If we don't have BWI we won't be able to create the shuffle needed for + // v8i8->v8i64. + if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && + MemVT == MVT::v8i8) + loadRegZize = 128; + // Represent our vector as a sequence of elements which are the // largest scalar that we can load. EVT LoadUnitVecVT = EVT::getVectorVT( @@ -18509,6 +18919,13 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget, return Shuff; } + if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 && + MemVT == MVT::v8i8) { + SDValue Sext = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG); + DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF); + return Sext; + } + // Redistribute the loaded elements into the different locations. SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1); for (unsigned i = 0; i != NumElems; ++i) @@ -18796,9 +19213,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const { if (isTruncWithZeroHighBitsInput(Cond, DAG)) Cond = Cond.getOperand(0); - // We know the result is compared against zero. Try to match it to BT. - if (Cond.hasOneUse()) { - if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) { + // We know the result of AND is compared against zero. Try to match + // it to BT. + if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) { + if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) { CC = NewSetCC.getOperand(0); Cond = NewSetCC.getOperand(1); addTest = false; @@ -18867,8 +19285,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, if (Is64Bit) { // The 64 bit implementation of segmented stacks needs to clobber both r10 // r11. This makes it impossible to use it along with nested parameters. - const Function *F = MF.getFunction(); - for (const auto &A : F->args()) { + const Function &F = MF.getFunction(); + for (const auto &A : F.args()) { if (A.hasNestAttr()) report_fatal_error("Cannot use segmented stacks with functions that " "have nested arguments."); @@ -18915,7 +19333,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const { SDLoc DL(Op); if (!Subtarget.is64Bit() || - Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) { + Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) { // vastart just stores the address of the VarArgsFrameIndex slot into the // memory location argument. SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT); @@ -18969,7 +19387,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { assert(Op.getNumOperands() == 4); MachineFunction &MF = DAG.getMachineFunction(); - if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) + if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) // The Win64 ABI uses char* instead of a structure. return DAG.expandVAArg(Op.getNode()); @@ -19000,7 +19418,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { if (ArgMode == 2) { // Sanity Check: Make sure using fp_offset makes sense. 
assert(!Subtarget.useSoftFloat() && - !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) && + !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) && Subtarget.hasSSE1()); } @@ -19010,13 +19428,12 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const { DAG.getConstant(ArgMode, dl, MVT::i8), DAG.getConstant(Align, dl, MVT::i32)}; SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other); - SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl, - VTs, InstOps, MVT::i64, - MachinePointerInfo(SV), - /*Align=*/0, - /*Volatile=*/false, - /*ReadMem=*/true, - /*WriteMem=*/true); + SDValue VAARG = DAG.getMemIntrinsicNode( + X86ISD::VAARG_64, dl, + VTs, InstOps, MVT::i64, + MachinePointerInfo(SV), + /*Align=*/0, + MachineMemOperand::MOLoad | MachineMemOperand::MOStore); Chain = VAARG.getValue(1); // Load the next argument and return it @@ -19029,7 +19446,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget, // where a va_list is still an i8*. assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!"); if (Subtarget.isCallingConvWin64( - DAG.getMachineFunction().getFunction()->getCallingConv())) + DAG.getMachineFunction().getFunction().getCallingConv())) // Probably a Win64 va_copy. return DAG.expandVACopy(Op.getNode()); @@ -19172,8 +19589,8 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT, ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt); ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64); } else { - SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT), - DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; + SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT), + DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)}; ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps); } @@ -19193,9 +19610,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT, const SDLoc &dl) { if (isAllOnesConstant(Mask)) - return DAG.getTargetConstant(1, dl, MaskVT); + return DAG.getConstant(1, dl, MaskVT); if (X86::isZeroNode(Mask)) - return DAG.getTargetConstant(0, dl, MaskVT); + return DAG.getConstant(0, dl, MaskVT); if (MaskVT.bitsGT(Mask.getSimpleValueType())) { // Mask should be extended @@ -19255,13 +19672,12 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask, switch (Op.getOpcode()) { default: break; - case X86ISD::PCMPEQM: - case X86ISD::PCMPGTM: case X86ISD::CMPM: + case X86ISD::CMPM_RND: case X86ISD::CMPMU: + case X86ISD::VPSHUFBITQMB: return DAG.getNode(ISD::AND, dl, VT, Op, VMask); case X86ISD::VFPCLASS: - case X86ISD::VFPCLASSS: return DAG.getNode(ISD::OR, dl, VT, Op, VMask); case X86ISD::VTRUNC: case X86ISD::VTRUNCS: @@ -19370,8 +19786,8 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn, return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset); } -static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget, - SelectionDAG &DAG) { +SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, + SelectionDAG &DAG) const { // Helper to detect if the operand is CUR_DIRECTION rounding mode. 
auto isRoundModeCurDirection = [](SDValue Rnd) { if (!isa<ConstantSDNode>(Rnd)) @@ -19442,14 +19858,36 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue passThru = Op.getOperand(3); SDValue Mask = Op.getOperand(4); unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; - if (IntrWithRoundingModeOpcode != 0) { - SDValue Rnd = Op.getOperand(5); - if (!isRoundModeCurDirection(Rnd)) + // There are 2 kinds of intrinsics in this group: + // (1) With suppress-all-exceptions (sae) or rounding mode- 6 operands + // (2) With rounding mode and sae - 7 operands. + bool HasRounding = IntrWithRoundingModeOpcode != 0; + if (Op.getNumOperands() == (5U + HasRounding)) { + if (HasRounding) { + SDValue Rnd = Op.getOperand(5); + if (!isRoundModeCurDirection(Rnd)) + return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, VT, Src1, Src2, Rnd), + Mask, passThru, Subtarget, DAG); + } + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + Src2), + Mask, passThru, Subtarget, DAG); + } + + assert(Op.getNumOperands() == (6U + HasRounding) && + "Unexpected intrinsic form"); + SDValue RoundingMode = Op.getOperand(5); + if (HasRounding) { + SDValue Sae = Op.getOperand(6); + if (!isRoundModeCurDirection(Sae)) return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, - dl, VT, Src1, Src2, Rnd), + dl, VT, Src1, Src2, + RoundingMode, Sae), Mask, passThru, Subtarget, DAG); } - return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2), + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, + Src2, RoundingMode), Mask, passThru, Subtarget, DAG); } case INTR_TYPE_SCALAR_MASK_RM: { @@ -19518,16 +19956,23 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget Src1, Src2, Rnd), Mask, PassThru, Subtarget, DAG); } - case INTR_TYPE_3OP_SCALAR_MASK_RM: { + case INTR_TYPE_3OP_SCALAR_MASK: { SDValue Src1 = Op.getOperand(1); SDValue Src2 = Op.getOperand(2); SDValue Src3 = Op.getOperand(3); SDValue PassThru = Op.getOperand(4); SDValue Mask = Op.getOperand(5); - SDValue Sae = Op.getOperand(6); + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(6); + if (!isRoundModeCurDirection(Rnd)) + return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, + dl, VT, Src1, Src2, Src3, Rnd), + Mask, PassThru, Subtarget, DAG); + } return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, - Src2, Src3, Sae), + Src2, Src3), Mask, PassThru, Subtarget, DAG); } case INTR_TYPE_3OP_MASK_RM: { @@ -19664,10 +20109,39 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget else PassThru = Src1; - SDValue Rnd = Op.getOperand(5); + unsigned IntrWithRoundingModeOpcode = IntrData->Opc1; + if (IntrWithRoundingModeOpcode != 0) { + SDValue Rnd = Op.getOperand(5); + if (!isRoundModeCurDirection(Rnd)) + return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl, + Op.getValueType(), Src1, Src2, + Src3, Rnd), + Mask, PassThru, Subtarget, DAG); + } + return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Src1, Src2, - Src3, Rnd), + Src3), + Mask, PassThru, Subtarget, DAG); + } + case IFMA_OP_MASKZ: + case IFMA_OP_MASK: { + SDValue Src1 = Op.getOperand(1); + SDValue Src2 = Op.getOperand(2); + SDValue Src3 = Op.getOperand(3); + SDValue Mask = Op.getOperand(4); + MVT VT = Op.getSimpleValueType(); + SDValue PassThru = Src1; + + // set PassThru element + if (IntrData->Type 
== IFMA_OP_MASKZ) + PassThru = getZeroVector(VT, Subtarget, DAG, dl); + + // Node we need to swizzle the operands to pass the multiply operands + // first. + return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, + dl, Op.getValueType(), + Src2, Src3, Src1), Mask, PassThru, Subtarget, DAG); } case TERLOG_OP_MASK: @@ -19726,9 +20200,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget MVT BitcastVT = MVT::getVectorVT(MVT::i1, Mask.getSimpleValueType().getSizeInBits()); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm); - SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, - DAG.getTargetConstant(0, dl, MaskVT), - Subtarget, DAG); + SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(), + Subtarget, DAG); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), FPclassMask, DAG.getIntPtrConstant(0, dl)); @@ -19739,9 +20212,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget SDValue Imm = Op.getOperand(2); SDValue Mask = Op.getOperand(3); SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm); - SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, - DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG); - return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask, + SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(), + Subtarget, DAG); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask, DAG.getIntPtrConstant(0, dl)); } case CMP_MASK: @@ -19783,9 +20256,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1), Op.getOperand(2)); } - SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, - DAG.getTargetConstant(0, dl, - MaskVT), + SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG); SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT, DAG.getUNDEF(BitcastVT), CmpMask, @@ -19808,11 +20279,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget if(!Cmp.getNode()) Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC); - SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, - DAG.getTargetConstant(0, dl, - MVT::i1), + SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(), Subtarget, DAG); - return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask, DAG.getIntPtrConstant(0, dl)); } case COMI: { // Comparison intrinsics @@ -19866,7 +20335,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget else FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS, DAG.getConstant(CondVal, dl, MVT::i8), Sae); - return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp, + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp, DAG.getIntPtrConstant(0, dl)); } case VSHIFT: @@ -19891,18 +20360,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget Mask = DAG.getBitcast(MaskVT, Mask); return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask); } - case KUNPCK: { - MVT VT = Op.getSimpleValueType(); - MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2); - - SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl); - SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl); - // Arguments should be swapped. 
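[Editorial sketch of the IFMA arithmetic handled just above, based on my reading of the VPMADD52LUQ semantics rather than on the patch itself: the low 52 bits of a 52x52-bit product are added to a 64-bit accumulator, and the hunk only rearranges which operand acts as the accumulator/passthru versus the multiplicands. The scalar model below assumes the GCC/Clang unsigned __int128 extension; madd52lo is a made-up name.]

#include <cstdint>

// acc + low52(low52(b) * low52(c)), added as a full 64-bit value.
static inline uint64_t madd52lo(uint64_t acc, uint64_t b, uint64_t c) {
  const uint64_t mask52 = (1ULL << 52) - 1;
  unsigned __int128 prod =
      static_cast<unsigned __int128>(b & mask52) * (c & mask52);
  return acc + static_cast<uint64_t>(prod & mask52);
}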
- SDValue Res = DAG.getNode(IntrData->Opc0, dl, - MVT::getVectorVT(MVT::i1, VT.getSizeInBits()), - Src2, Src1); - return DAG.getBitcast(VT, Res); - } case MASK_BINOP: { MVT VT = Op.getSimpleValueType(); MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()); @@ -19953,37 +20410,25 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget DAG.getIntPtrConstant(0, dl)); return DAG.getBitcast(Op.getValueType(), Res); } - case BRCST_SUBVEC_TO_VEC: { - SDValue Src = Op.getOperand(1); - SDValue Passthru = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); - EVT resVT = Passthru.getValueType(); - SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT, - DAG.getUNDEF(resVT), Src, - DAG.getIntPtrConstant(0, dl)); - SDValue immVal; - if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector()) - immVal = DAG.getConstant(0x44, dl, MVT::i8); - else - immVal = DAG.getConstant(0, dl, MVT::i8); - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, - subVec, subVec, immVal), - Mask, Passthru, Subtarget, DAG); - } - case BRCST32x2_TO_VEC: { - SDValue Src = Op.getOperand(1); - SDValue PassThru = Op.getOperand(2); - SDValue Mask = Op.getOperand(3); - - assert((VT.getScalarType() == MVT::i32 || - VT.getScalarType() == MVT::f32) && "Unexpected type!"); - //bitcast Src to packed 64 - MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64; - MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64); - Src = DAG.getBitcast(BitcastVT, Src); - - return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src), - Mask, PassThru, Subtarget, DAG); + case ROUNDP: { + assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode"); + // Clear the upper bits of the rounding immediate so that the legacy + // intrinsic can't trigger the scaling behavior of VRNDSCALE. + SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, + Op.getOperand(2), + DAG.getConstant(0xf, dl, MVT::i32)); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), + Op.getOperand(1), RoundingMode); + } + case ROUNDS: { + assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode"); + // Clear the upper bits of the rounding immediate so that the legacy + // intrinsic can't trigger the scaling behavior of VRNDSCALE. + SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32, + Op.getOperand(3), + DAG.getConstant(0xf, dl, MVT::i32)); + return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2), RoundingMode); } default: break; @@ -20187,7 +20632,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget auto &Context = MF.getMMI().getContext(); MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") + Twine(MF.getFunctionNumber())); - return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT)); + return DAG.getNode(getGlobalWrapperKind(), dl, VT, + DAG.getMCSymbol(S, PtrVT)); } case Intrinsic::x86_seh_lsda: { @@ -20589,18 +21035,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget, case RDSEED: case RDRAND: { // Emit the node with the right value type. - SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other); + SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other); SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0)); // If the value returned by RDRAND/RDSEED was valid (CF=1), return 1. // Otherwise return the value from Rand, which is always 0, casted to i32. 
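[Editorial sketch, not part of the diff: at the source level, the CF-driven CMOV described above corresponds to the return value of the RDRAND intrinsic, which is 1 when the hardware produced a valid value (CF set) and 0 otherwise. Assumes a toolchain providing _rdrand32_step and compiled with RDRND support (e.g. -mrdrnd).]

#include <immintrin.h>
#include <cstdio>

int main() {
  unsigned int value = 0;
  int valid = _rdrand32_step(&value); // 1 iff CF was set by RDRAND
  std::printf("valid=%d value=%u\n", valid, value);
  return valid ? 0 : 1;
}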
SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)), DAG.getConstant(1, dl, Op->getValueType(1)), - DAG.getConstant(X86::COND_B, dl, MVT::i32), + DAG.getConstant(X86::COND_B, dl, MVT::i8), SDValue(Result.getNode(), 1) }; - SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, - DAG.getVTList(Op->getValueType(1), MVT::Glue), - Ops); + SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops); // Return { result, isValid, chain }. return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid, @@ -21292,7 +21736,14 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT); SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask); SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift); - SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ); + SDValue HiZ; + if (CurrVT.is512BitVector()) { + MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements()); + HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ); + HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ); + } else { + HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ); + } Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo); Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi); @@ -21312,8 +21763,15 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL, SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT); // Check if the upper half of the input element is zero. - SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0), - DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); + if (CurrVT.is512BitVector()) { + MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements()); + HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0), + DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); + HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ); + } else { + HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0), + DAG.getBitcast(CurrVT, Zero), ISD::SETEQ); + } HiZ = DAG.getBitcast(NextVT, HiZ); // Move the upper/lower halves to the lower bits as we'll be extending to @@ -21505,6 +21963,19 @@ static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) { } static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) { + MVT VT = Op.getSimpleValueType(); + if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) { + // Since X86 does not have CMOV for 8-bit integer, we don't convert + // 8-bit integer abs to NEG and CMOV. + SDLoc DL(Op); + SDValue N0 = Op.getOperand(0); + SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), + DAG.getConstant(0, DL, VT), N0); + SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8), + SDValue(Neg.getNode(), 1)}; + return DAG.getNode(X86ISD::CMOV, DL, VT, Ops); + } + assert(Op.getSimpleValueType().is256BitVector() && Op.getSimpleValueType().isInteger() && "Only handle AVX 256-bit vector integer operation"); @@ -21700,7 +22171,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, return Lower256IntArith(Op, DAG); // Only i8 vectors should need custom lowering after this. 
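[Editor's aside on the i8 high-multiply lowering that follows, not code from the patch: per lane the computation is "widen to 16 bits, multiply, keep the high 8 bits", and the vector code only differs in how it widens, shifts, and packs. A scalar model; mulhu8/mulhs8 are made-up names.]

#include <cassert>
#include <cstdint>

uint8_t mulhu8(uint8_t a, uint8_t b) {
  return static_cast<uint8_t>((static_cast<uint16_t>(a) * b) >> 8);
}

int8_t mulhs8(int8_t a, int8_t b) {
  // Arithmetic right shift of the widened product (well-defined in C++20).
  return static_cast<int8_t>((static_cast<int16_t>(a) * b) >> 8);
}

int main() {
  assert(mulhu8(200, 100) == ((200 * 100) >> 8)); // 78
  assert(mulhs8(-100, 100) == ((-100 * 100) >> 8));
  return 0;
}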
- assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) && + assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) || + (VT == MVT::v64i8 && Subtarget.hasBWI())) && "Unsupported vector type"); // Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply, @@ -21712,22 +22184,36 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, // and then ashr/lshr the upper bits down to the lower bits before multiply. unsigned Opcode = Op.getOpcode(); unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA); - unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT); + unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND); + + // For 512-bit vectors, split into 256-bit vectors to allow the + // sign-extension to occur. + if (VT == MVT::v64i8) + return Lower512IntArith(Op, DAG); // AVX2 implementations - extend xmm subvectors to ymm. if (Subtarget.hasInt256()) { + unsigned NumElems = VT.getVectorNumElements(); SDValue Lo = DAG.getIntPtrConstant(0, dl); - SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl); + SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl); if (VT == MVT::v32i8) { - SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo); - SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo); - SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi); - SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi); - ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo); - BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo); - AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi); - BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi); + if (Subtarget.hasBWI()) { + SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A); + SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B); + SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB); + Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul, + DAG.getConstant(8, dl, MVT::v32i16)); + return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); + } + SDValue ALo = extract128BitVector(A, 0, DAG, dl); + SDValue BLo = extract128BitVector(B, 0, DAG, dl); + SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl); + SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl); + ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo); + BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo); + AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi); + BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi); Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16, DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo), DAG.getConstant(8, dl, MVT::v16i16)); @@ -21745,19 +22231,23 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget, DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask)); } - SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG); - SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG); + SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A); + SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B); SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB); - SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul, - DAG.getConstant(8, dl, MVT::v16i16)); - Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo); - Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi); + Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul, + DAG.getConstant(8, dl, MVT::v16i16)); + // If we have BWI we can use truncate instruction. 
+ if (Subtarget.hasBWI()) + return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul); + Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo); + Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi); return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi); } assert(VT == MVT::v16i8 && "Pre-AVX2 support only supports v16i8 multiplication"); MVT ExVT = MVT::v8i16; + unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT); // Extract the lo parts and zero/sign extend to i16. SDValue ALo, BLo; @@ -21885,7 +22375,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget, } assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) || - (VT == MVT::v8i32 && Subtarget.hasInt256())); + (VT == MVT::v8i32 && Subtarget.hasInt256()) || + (VT == MVT::v16i32 && Subtarget.hasAVX512())); + + int NumElts = VT.getVectorNumElements(); // PMULxD operations multiply each even value (starting at 0) of LHS with // the related value of RHS and produce a widen result. @@ -21899,17 +22392,17 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget, // // Place the odd value at an even position (basically, shift all values 1 // step to the left): - const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1}; + const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1}; // <a|b|c|d> => <b|undef|d|undef> SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0, - makeArrayRef(&Mask[0], VT.getVectorNumElements())); + makeArrayRef(&Mask[0], NumElts)); // <e|f|g|h> => <f|undef|h|undef> SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1, - makeArrayRef(&Mask[0], VT.getVectorNumElements())); + makeArrayRef(&Mask[0], NumElts)); // Emit two multiplies, one for the lower 2 ints and one for the higher 2 // ints. - MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64; + MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2); bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI; unsigned Opcode = (!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ; @@ -21921,19 +22414,16 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget, SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1)); // Shuffle it back into the right order. - SDValue Highs, Lows; - if (VT == MVT::v8i32) { - const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15}; - Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14}; - Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); - } else { - const int HighMask[] = {1, 5, 3, 7}; - Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); - const int LowMask[] = {0, 4, 2, 6}; - Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + SmallVector<int, 16> HighMask(NumElts); + SmallVector<int, 16> LowMask(NumElts); + for (int i = 0; i != NumElts; ++i) { + HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1; + LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts); } + SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask); + SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask); + // If we have a signed multiply but no PMULDQ fix up the high parts of a // unsigned multiply. if (IsSigned && !Subtarget.hasSSE41()) { @@ -22123,9 +22613,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG, } } - // Special case in 32-bit mode, where i64 is expanded into high and low parts. + // Check cases (mainly 32-bit) where i64 is expanded into high and low parts. 
// TODO: Replace constant extraction with getTargetConstantBitsFromNode. - if (!Subtarget.is64Bit() && !Subtarget.hasXOP() && + if (!Subtarget.hasXOP() && (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) || (Subtarget.hasAVX512() && VT == MVT::v8i64))) { @@ -22252,9 +22742,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG, } } - // Special case in 32-bit mode, where i64 is expanded into high and low parts. - if (!Subtarget.is64Bit() && VT == MVT::v2i64 && - Amt.getOpcode() == ISD::BITCAST && + // Check cases (mainly 32-bit) where i64 is expanded into high and low parts. + if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST && Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) { Amt = Amt.getOperand(0); unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() / @@ -22389,7 +22878,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, // the vector shift into four scalar shifts plus four pairs of vector // insert/extract. if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) { - unsigned TargetOpcode = X86ISD::MOVSS; + bool UseMOVSD = false; bool CanBeSimplified; // The splat value for the first packed shift (the 'X' from the example). SDValue Amt1 = Amt->getOperand(0); @@ -22406,7 +22895,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, // Otherwise, check if we can still simplify this node using a MOVSD. CanBeSimplified = Amt1 == Amt->getOperand(1) && Amt->getOperand(2) == Amt->getOperand(3); - TargetOpcode = X86ISD::MOVSD; + UseMOVSD = true; Amt2 = Amt->getOperand(2); } } else { @@ -22417,7 +22906,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, CanBeSimplified = Amt2 == Amt->getOperand(i); if (!CanBeSimplified) { - TargetOpcode = X86ISD::MOVSD; + UseMOVSD = true; CanBeSimplified = true; Amt2 = Amt->getOperand(4); for (unsigned i=0; i != 4 && CanBeSimplified; ++i) @@ -22430,19 +22919,18 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, if (CanBeSimplified && isa<ConstantSDNode>(Amt1) && isa<ConstantSDNode>(Amt2)) { // Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND. - MVT CastVT = MVT::v4i32; SDValue Splat1 = DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT); SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1); SDValue Splat2 = DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT); SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2); - SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1); - SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2); - if (TargetOpcode == X86ISD::MOVSD) - return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1, + SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1); + SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2); + if (UseMOVSD) + return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1, BitCast2, {0, 1, 6, 7})); - return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1, + return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1, BitCast2, {0, 5, 6, 7})); } } @@ -22752,7 +23240,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, assert((Opcode == ISD::ROTL) && "Only ROTL supported"); // XOP has 128-bit vector variable + immediate rotates. - // +ve/-ve Amt = rotate left/right. + // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL. // Split 256-bit integers. 
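[Not from the patch: a scalar sketch of the per-element operation the XOP rotate path emits, a left rotate by an immediate amount; a right rotate is the same operation with (bits - amt). rotl32 is a made-up name.]

#include <cassert>
#include <cstdint>

uint32_t rotl32(uint32_t x, unsigned amt) {
  amt &= 31;
  return amt ? ((x << amt) | (x >> (32 - amt))) : x;
}

int main() {
  assert(rotl32(0x80000001u, 1) == 0x00000003u); // wrap the top bit around
  assert(rotl32(0x12345678u, 0) == 0x12345678u);
  return 0;
}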
if (VT.is256BitVector()) @@ -22765,13 +23253,13 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget, if (auto *RotateConst = BVAmt->getConstantSplatNode()) { uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue(); assert(RotateAmt < EltSizeInBits && "Rotation out of range"); - return DAG.getNode(X86ISD::VPROTI, DL, VT, R, + return DAG.getNode(X86ISD::VROTLI, DL, VT, R, DAG.getConstant(RotateAmt, DL, MVT::i8)); } } // Use general rotate by variable (per-element). - return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt); + return Op; } static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) { @@ -23319,15 +23807,14 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget, // TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions. if (Subtarget.hasVPOPCNTDQ()) { - if (VT == MVT::v8i16) { - Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0); - Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op); - return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op); - } - if (VT == MVT::v16i8 || VT == MVT::v16i16) { - Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0); - Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op); - return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op); + unsigned NumElems = VT.getVectorNumElements(); + assert((VT.getVectorElementType() == MVT::i8 || + VT.getVectorElementType() == MVT::i16) && "Unexpected type"); + if (NumElems <= 16) { + MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems); + Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0); + Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op); + return DAG.getNode(ISD::TRUNCATE, DL, VT, Op); } } @@ -23402,12 +23889,13 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) { static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - if (Subtarget.hasXOP()) + MVT VT = Op.getSimpleValueType(); + + if (Subtarget.hasXOP() && !VT.is512BitVector()) return LowerBITREVERSE_XOP(Op, DAG); assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE"); - MVT VT = Op.getSimpleValueType(); SDValue In = Op.getOperand(0); SDLoc DL(Op); @@ -23450,7 +23938,9 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget, return DAG.getNode(ISD::OR, DL, VT, Lo, Hi); } -static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) { +static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG, + const X86Subtarget &Subtarget, + bool AllowIncDec = true) { unsigned NewOpc = 0; switch (N->getOpcode()) { case ISD::ATOMIC_LOAD_ADD: @@ -23473,6 +23963,26 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) { } MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand(); + + if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) { + // Convert to inc/dec if they aren't slow or we are optimizing for size. 
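[Editorial note, not part of the diff: a plausible source-level pattern that reaches the inc/dec conversion below is an atomic add of +1 or -1 whose result is unused, which the new LINC/LDEC nodes let the backend emit as `lock inc`/`lock dec` instead of `lock add`/`lock sub` with an immediate, unless inc/dec are slow and we are not optimizing for size.]

#include <atomic>

std::atomic<int> counter{0};

void bump()   { counter.fetch_add(1, std::memory_order_relaxed); }
void unbump() { counter.fetch_sub(1, std::memory_order_relaxed); }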
+ if (AllowIncDec && (!Subtarget.slowIncDec() || + DAG.getMachineFunction().getFunction().optForSize())) { + if ((NewOpc == X86ISD::LADD && C->isOne()) || + (NewOpc == X86ISD::LSUB && C->isAllOnesValue())) + return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N), + DAG.getVTList(MVT::i32, MVT::Other), + {N->getOperand(0), N->getOperand(1)}, + /*MemVT=*/N->getSimpleValueType(0), MMO); + if ((NewOpc == X86ISD::LSUB && C->isOne()) || + (NewOpc == X86ISD::LADD && C->isAllOnesValue())) + return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N), + DAG.getVTList(MVT::i32, MVT::Other), + {N->getOperand(0), N->getOperand(1)}, + /*MemVT=*/N->getSimpleValueType(0), MMO); + } + } + return DAG.getMemIntrinsicNode( NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other), {N->getOperand(0), N->getOperand(1), N->getOperand(2)}, @@ -23506,7 +24016,7 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG, return N; } - SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG); + SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget); // RAUW the chain, but don't worry about the result, as it's unused. assert(!N->hasAnyUseOfValue(0)); DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1)); @@ -23675,19 +24185,12 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, assert(Subtarget.hasAVX512() && "MGATHER/MSCATTER are supported on AVX-512 arch only"); - // X86 scatter kills mask register, so its type should be added to - // the list of return values. - // If the "scatter" has 2 return values, it is already handled. - if (Op.getNode()->getNumValues() == 2) - return Op; - MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode()); SDValue Src = N->getValue(); MVT VT = Src.getSimpleValueType(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op"); SDLoc dl(Op); - SDValue NewScatter; SDValue Index = N->getIndex(); SDValue Mask = N->getMask(); SDValue Chain = N->getChain(); @@ -23758,8 +24261,8 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget, // The mask is killed by scatter, add it to the values SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other); SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index}; - NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops, - N->getMemOperand()); + SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>( + VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand()); DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1)); return SDValue(NewScatter.getNode(), 1); } @@ -23874,8 +24377,8 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget, static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, SelectionDAG &DAG) { - assert(Subtarget.hasAVX512() && - "MGATHER/MSCATTER are supported on AVX-512 arch only"); + assert(Subtarget.hasAVX2() && + "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only"); MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode()); SDLoc dl(Op); @@ -23889,17 +24392,22 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, unsigned NumElts = VT.getVectorNumElements(); assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op"); - if (!Subtarget.hasVLX() && !VT.is512BitVector() && + // If the index is v2i32, we're being called by type legalization. + if (IndexVT == MVT::v2i32) + return SDValue(); + + if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() && !Index.getSimpleValueType().is512BitVector()) { // AVX512F supports only 512-bit vectors. 
Or data or index should // be 512 bit wide. If now the both index and data are 256-bit, but // the vector contains 8 elements, we just sign-extend the index if (NumElts == 8) { Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index); - SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2), - N->getOperand(3), Index }; - DAG.UpdateNodeOperands(N, Ops); - return Op; + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(), + N->getMemOperand()); + return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl); } // Minimal number of elements in Gather @@ -23923,67 +24431,21 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget, Src0 = ExtendToType(Src0, NewVT, DAG); SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; - SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other), - N->getMemoryVT(), dl, Ops, - N->getMemOperand()); - SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, - NewGather.getValue(0), - DAG.getIntPtrConstant(0, dl)); - SDValue RetOps[] = {Exract, NewGather.getValue(1)}; - return DAG.getMergeValues(RetOps, dl); - } - if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) { - // There is a special case when the return type is v2i32 is illegal and - // the type legaizer extended it to v2i64. Without this conversion we end up - // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD. - // In order to avoid this situation, we'll build an X86 specific Gather node - // with index v2i64 and value type v4i32. - assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 && - "Unexpected type in masked gather"); - Src0 = DAG.getVectorShuffle(MVT::v4i32, dl, - DAG.getBitcast(MVT::v4i32, Src0), - DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 }); - // The mask should match the destination type. Extending mask with zeroes - // is not necessary since instruction itself reads only two values from - // memory. - Mask = ExtendToType(Mask, MVT::v4i1, DAG, false); - SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(), - N->getMemOperand()); - - SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64, - NewGather.getValue(0), DAG); - SDValue RetOps[] = { Sext, NewGather.getValue(1) }; + DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(), + N->getMemOperand()); + SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, + NewGather.getValue(0), + DAG.getIntPtrConstant(0, dl)); + SDValue RetOps[] = {Extract, NewGather.getValue(2)}; return DAG.getMergeValues(RetOps, dl); } - if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) { - // This transformation is for optimization only. - // The type legalizer extended mask and index to 4 elements vector - // in order to match requirements of the common gather node - same - // vector width of index and value. X86 Gather node allows mismatch - // of vector width in order to select more optimal instruction at the - // end. 
- assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 && - "Unexpected type in masked gather"); - if (Mask.getOpcode() == ISD::CONCAT_VECTORS && - ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) && - Index.getOpcode() == ISD::CONCAT_VECTORS && - Index.getOperand(1).isUndef()) { - Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false); - Index = Index.getOperand(0); - } else - return Op; - SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; - SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( - DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(), - N->getMemOperand()); - - SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) }; - return DAG.getMergeValues(RetOps, dl); - } - return Op; + SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index }; + SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(), + N->getMemOperand()); + return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl); } SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op, @@ -24049,7 +24511,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VSELECT: return LowerVSELECT(Op, DAG); case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); - case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG); case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG); case ISD::ConstantPool: return LowerConstantPool(Op, DAG); @@ -24085,7 +24546,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::VASTART: return LowerVASTART(Op, DAG); case ISD::VAARG: return LowerVAARG(Op, DAG); case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG); - case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG); + case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG); case ISD::INTRINSIC_VOID: case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG); case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG); @@ -24203,8 +24664,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops); SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1); - Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, - DAG.getIntPtrConstant(0, dl))); + if (!ExperimentalVectorWideningLegalization) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); return; } // We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32. @@ -24242,11 +24705,21 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, assert(Subtarget.hasSSE2() && "Requires at least SSE2!"); SDValue Src = N->getOperand(0); if (Src.getValueType() == MVT::v2f64) { - SDValue Idx = DAG.getIntPtrConstant(0, dl); - SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI - : X86ISD::CVTTP2UI, - dl, MVT::v4i32, Src); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); + MVT ResVT = MVT::v4i32; + unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI; + if (!IsSigned && !Subtarget.hasVLX()) { + // Widen to 512-bits. 
+ ResVT = MVT::v8i32; + Opc = ISD::FP_TO_UINT; + Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, + DAG.getUNDEF(MVT::v8f64), + Src, DAG.getIntPtrConstant(0, dl)); + } + SDValue Res = DAG.getNode(Opc, dl, ResVT, Src); + ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32 + : MVT::v2i32; + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res, + DAG.getIntPtrConstant(0, dl)); Results.push_back(Res); return; } @@ -24256,7 +24729,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, DAG.getUNDEF(MVT::v2f32)); Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT : ISD::FP_TO_UINT, dl, MVT::v4i32, Res); - Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); + if (!ExperimentalVectorWideningLegalization) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx); Results.push_back(Res); return; } @@ -24345,7 +24819,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, } } case ISD::INTRINSIC_WO_CHAIN: { - if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG)) + if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG)) Results.push_back(V); return; } @@ -24480,6 +24954,89 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N, ToVecInt, DAG.getIntPtrConstant(i, dl))); Results.push_back(DAG.getBuildVector(DstVT, dl, Elts)); + return; + } + case ISD::MGATHER: { + EVT VT = N->getValueType(0); + if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { + auto *Gather = cast<MaskedGatherSDNode>(N); + SDValue Index = Gather->getIndex(); + if (Index.getValueType() != MVT::v2i64) + return; + SDValue Mask = Gather->getMask(); + assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); + SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, + Gather->getValue(), + DAG.getUNDEF(MVT::v2f32)); + if (!Subtarget.hasVLX()) { + // We need to widen the mask, but the instruction will only use 2 + // of its elements. So we can use undef. + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getUNDEF(MVT::v2i1)); + Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); + } + SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), + Index }; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl, + Gather->getMemoryVT(), Gather->getMemOperand()); + Results.push_back(Res); + Results.push_back(Res.getValue(2)); + return; + } + if (VT == MVT::v2i32) { + auto *Gather = cast<MaskedGatherSDNode>(N); + SDValue Index = Gather->getIndex(); + SDValue Mask = Gather->getMask(); + assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type"); + SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, + Gather->getValue(), + DAG.getUNDEF(MVT::v2i32)); + // If the index is v2i64 we can use it directly. + if (Index.getValueType() == MVT::v2i64 && + (Subtarget.hasVLX() || !Subtarget.hasAVX512())) { + if (!Subtarget.hasVLX()) { + // We need to widen the mask, but the instruction will only use 2 + // of its elements. So we can use undef. 
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getUNDEF(MVT::v2i1)); + Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask); + } + SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), + Index }; + SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>( + DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl, + Gather->getMemoryVT(), Gather->getMemOperand()); + SDValue Chain = Res.getValue(2); + if (!ExperimentalVectorWideningLegalization) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Chain); + return; + } + EVT IndexVT = Index.getValueType(); + EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(), + IndexVT.getScalarType(), 4); + // Otherwise we need to custom widen everything to avoid promotion. + Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index, + DAG.getUNDEF(IndexVT)); + Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask, + DAG.getConstant(0, dl, MVT::v2i1)); + SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(), + Index }; + SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other), + Gather->getMemoryVT(), dl, Ops, + Gather->getMemOperand()); + SDValue Chain = Res.getValue(1); + if (!ExperimentalVectorWideningLegalization) + Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, + DAG.getIntPtrConstant(0, dl)); + Results.push_back(Res); + Results.push_back(Chain); + return; + } + break; } } } @@ -24557,9 +25114,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FMAXC: return "X86ISD::FMAXC"; case X86ISD::FMINC: return "X86ISD::FMINC"; case X86ISD::FRSQRT: return "X86ISD::FRSQRT"; - case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS"; case X86ISD::FRCP: return "X86ISD::FRCP"; - case X86ISD::FRCPS: return "X86ISD::FRCPS"; case X86ISD::EXTRQI: return "X86ISD::EXTRQI"; case X86ISD::INSERTQI: return "X86ISD::INSERTQI"; case X86ISD::TLSADDR: return "X86ISD::TLSADDR"; @@ -24585,6 +25140,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::LOR: return "X86ISD::LOR"; case X86ISD::LXOR: return "X86ISD::LXOR"; case X86ISD::LAND: return "X86ISD::LAND"; + case X86ISD::LINC: return "X86ISD::LINC"; + case X86ISD::LDEC: return "X86ISD::LDEC"; case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL"; case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD"; case X86ISD::VZEXT: return "X86ISD::VZEXT"; @@ -24620,6 +25177,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PCMPGT: return "X86ISD::PCMPGT"; case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM"; case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM"; + case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS"; case X86ISD::ADD: return "X86ISD::ADD"; case X86ISD::SUB: return "X86ISD::SUB"; case X86ISD::ADC: return "X86ISD::ADC"; @@ -24635,7 +25193,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::OR: return "X86ISD::OR"; case X86ISD::XOR: return "X86ISD::XOR"; case X86ISD::AND: return "X86ISD::AND"; - case X86ISD::BEXTR: return "X86ISD::BEXTR"; case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM"; case X86ISD::MOVMSK: return "X86ISD::MOVMSK"; case X86ISD::PTEST: return "X86ISD::PTEST"; @@ -24650,13 +25207,16 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PACKUS: return "X86ISD::PACKUS"; case X86ISD::PALIGNR: return "X86ISD::PALIGNR"; case X86ISD::VALIGN: 
return "X86ISD::VALIGN"; + case X86ISD::VSHLD: return "X86ISD::VSHLD"; + case X86ISD::VSHRD: return "X86ISD::VSHRD"; + case X86ISD::VSHLDV: return "X86ISD::VSHLDV"; + case X86ISD::VSHRDV: return "X86ISD::VSHRDV"; case X86ISD::PSHUFD: return "X86ISD::PSHUFD"; case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW"; case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW"; case X86ISD::SHUFP: return "X86ISD::SHUFP"; case X86ISD::SHUF128: return "X86ISD::SHUF128"; case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS"; - case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD"; case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS"; case X86ISD::MOVLPS: return "X86ISD::MOVLPS"; case X86ISD::MOVLPD: return "X86ISD::MOVLPD"; @@ -24670,7 +25230,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST"; case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM"; case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST"; - case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT"; case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV"; case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI"; case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128"; @@ -24680,8 +25239,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::VPERMI: return "X86ISD::VPERMI"; case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG"; case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM"; - case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; + case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS"; case X86ISD::VRANGE: return "X86ISD::VRANGE"; + case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND"; + case X86ISD::VRANGES: return "X86ISD::VRANGES"; + case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND"; case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ"; case X86ISD::PMULDQ: return "X86ISD::PMULDQ"; case X86ISD::PSADBW: return "X86ISD::PSADBW"; @@ -24697,14 +25259,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::RDSEED: return "X86ISD::RDSEED"; case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW"; case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD"; - case X86ISD::VPROT: return "X86ISD::VPROT"; - case X86ISD::VPROTI: return "X86ISD::VPROTI"; case X86ISD::VPSHA: return "X86ISD::VPSHA"; case X86ISD::VPSHL: return "X86ISD::VPSHL"; case X86ISD::VPCOM: return "X86ISD::VPCOM"; case X86ISD::VPCOMU: return "X86ISD::VPCOMU"; case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2"; - case X86ISD::FMADD: return "X86ISD::FMADD"; case X86ISD::FMSUB: return "X86ISD::FMSUB"; case X86ISD::FNMADD: return "X86ISD::FNMADD"; case X86ISD::FNMSUB: return "X86ISD::FNMSUB"; @@ -24716,22 +25275,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND"; case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND"; case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND"; + case X86ISD::FMADDS1: return "X86ISD::FMADDS1"; + case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1"; + case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1"; + case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1"; case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND"; case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND"; case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND"; case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND"; + case X86ISD::FMADDS3: return "X86ISD::FMADDS3"; + case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3"; + case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3"; + case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3"; case 
X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND"; case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND"; case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND"; case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND"; + case X86ISD::FMADD4S: return "X86ISD::FMADD4S"; + case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S"; + case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S"; + case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S"; case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H"; case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L"; case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE"; + case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND"; case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES"; + case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND"; case X86ISD::VREDUCE: return "X86ISD::VREDUCE"; + case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND"; case X86ISD::VREDUCES: return "X86ISD::VREDUCES"; + case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND"; case X86ISD::VGETMANT: return "X86ISD::VGETMANT"; + case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND"; case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS"; + case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND"; case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI"; case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI"; case X86ISD::XTEST: return "X86ISD::XTEST"; @@ -24740,9 +25317,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SELECT: return "X86ISD::SELECT"; case X86ISD::SELECTS: return "X86ISD::SELECTS"; case X86ISD::ADDSUB: return "X86ISD::ADDSUB"; + case X86ISD::RCP14: return "X86ISD::RCP14"; + case X86ISD::RCP14S: return "X86ISD::RCP14S"; case X86ISD::RCP28: return "X86ISD::RCP28"; case X86ISD::RCP28S: return "X86ISD::RCP28S"; case X86ISD::EXP2: return "X86ISD::EXP2"; + case X86ISD::RSQRT14: return "X86ISD::RSQRT14"; + case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S"; case X86ISD::RSQRT28: return "X86ISD::RSQRT28"; case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S"; case X86ISD::FADD_RND: return "X86ISD::FADD_RND"; @@ -24780,6 +25361,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND"; case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH"; case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS"; + case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND"; case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI"; case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI"; case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND"; @@ -24788,6 +25370,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND"; case X86ISD::LWPINS: return "X86ISD::LWPINS"; case X86ISD::MGATHER: return "X86ISD::MGATHER"; + case X86ISD::MSCATTER: return "X86ISD::MSCATTER"; + case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD"; + case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS"; + case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD"; + case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS"; + case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB"; + case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB"; + case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB"; + case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB"; } return nullptr; } @@ -24796,7 +25387,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { /// target, for a load/store of the specified type. 
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, - unsigned AS) const { + unsigned AS, + Instruction *I) const { // X86 supports extremely general addressing modes. CodeModel::Model M = getTargetMachine().getCodeModel(); @@ -24853,9 +25445,9 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const { if (Bits == 8) return false; - // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make - // variable shifts just as cheap as scalar ones. - if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64)) + // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable + // shifts just as cheap as scalar ones. + if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64)) return false; // Otherwise, it's significantly cheaper to shift by a scalar amount than by a @@ -24968,9 +25560,7 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const { /// VECTOR_SHUFFLE operations, those with specific masks. /// By default, if a target supports the VECTOR_SHUFFLE node, all mask values /// are assumed to be legal. -bool -X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, - EVT VT) const { +bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const { if (!VT.isSimple()) return false; @@ -25522,7 +26112,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter( int64_t RegSaveFrameIndex = MI.getOperand(1).getImm(); int64_t VarArgsFPOffset = MI.getOperand(2).getImm(); - if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) { + if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) { // If %al is 0, branch around the XMM save block. BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg); BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB); @@ -25627,65 +26217,76 @@ static bool isCMOVPseudo(MachineInstr &MI) { } } -MachineBasicBlock * -X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, - MachineBasicBlock *BB) const { - const TargetInstrInfo *TII = Subtarget.getInstrInfo(); - DebugLoc DL = MI.getDebugLoc(); +// Helper function, which inserts PHI functions into SinkMBB: +// %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ], +// where %FalseValue(i) and %TrueValue(i) are taken from the consequent CMOVs +// in [MIItBegin, MIItEnd) range. It returns the last MachineInstrBuilder for +// the last PHI function inserted. +static MachineInstrBuilder createPHIsForCMOVsInSinkBB( + MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd, + MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB, + MachineBasicBlock *SinkMBB) { + MachineFunction *MF = TrueMBB->getParent(); + const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo(); + DebugLoc DL = MIItBegin->getDebugLoc(); - // To "insert" a SELECT_CC instruction, we actually have to insert the - // diamond control-flow pattern. The incoming instruction knows the - // destination vreg to set, the condition code register to branch on, the - // true/false values to select between, and a branch opcode to use. - const BasicBlock *LLVM_BB = BB->getBasicBlock(); - MachineFunction::iterator It = ++BB->getIterator(); + X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm()); + X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); - // thisMBB: - // ... - // TrueVal = ... 
- // cmpTY ccX, r1, r2 - // bCC copy1MBB - // fallthrough --> copy0MBB - MachineBasicBlock *thisMBB = BB; - MachineFunction *F = BB->getParent(); + MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin(); - // This code lowers all pseudo-CMOV instructions. Generally it lowers these - // as described above, by inserting a BB, and then making a PHI at the join - // point to select the true and false operands of the CMOV in the PHI. - // - // The code also handles two different cases of multiple CMOV opcodes - // in a row. - // - // Case 1: - // In this case, there are multiple CMOVs in a row, all which are based on - // the same condition setting (or the exact opposite condition setting). - // In this case we can lower all the CMOVs using a single inserted BB, and - // then make a number of PHIs at the join point to model the CMOVs. The only - // trickiness here, is that in a case like: - // - // t2 = CMOV cond1 t1, f1 - // t3 = CMOV cond1 t2, f2 - // - // when rewriting this into PHIs, we have to perform some renaming on the - // temps since you cannot have a PHI operand refer to a PHI result earlier - // in the same block. The "simple" but wrong lowering would be: - // - // t2 = PHI t1(BB1), f1(BB2) - // t3 = PHI t2(BB1), f2(BB2) - // - // but clearly t2 is not defined in BB1, so that is incorrect. The proper - // renaming is to note that on the path through BB1, t2 is really just a - // copy of t1, and do that renaming, properly generating: - // - // t2 = PHI t1(BB1), f1(BB2) - // t3 = PHI t1(BB1), f2(BB2) - // - // Case 2, we lower cascaded CMOVs such as + // As we are creating the PHIs, we have to be careful if there is more than + // one. Later CMOVs may reference the results of earlier CMOVs, but later + // PHIs have to reference the individual true/false inputs from earlier PHIs. + // That also means that PHI construction must work forward from earlier to + // later, and that the code must maintain a mapping from earlier PHI's + // destination registers, and the registers that went into the PHI. + DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; + MachineInstrBuilder MIB; + + for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { + unsigned DestReg = MIIt->getOperand(0).getReg(); + unsigned Op1Reg = MIIt->getOperand(1).getReg(); + unsigned Op2Reg = MIIt->getOperand(2).getReg(); + + // If this CMOV we are generating is the opposite condition from + // the jump we generated, then we have to swap the operands for the + // PHI that is going to be generated. + if (MIIt->getOperand(3).getImm() == OppCC) + std::swap(Op1Reg, Op2Reg); + + if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) + Op1Reg = RegRewriteTable[Op1Reg].first; + + if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) + Op2Reg = RegRewriteTable[Op2Reg].second; + + MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg) + .addReg(Op1Reg) + .addMBB(FalseMBB) + .addReg(Op2Reg) + .addMBB(TrueMBB); + + // Add this PHI to the rewrite table. + RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); + } + + return MIB; +} + +// Lower cascaded selects in form of (SecondCmov (FirstCMOV F, T, cc1), T, cc2). 
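createPHIsForCMOVsInSinkBB above centralizes the renaming described in its comments: when a later CMOV consumes the result of an earlier one, its PHI must name the earlier PHI's original true/false inputs rather than the earlier PHI itself. A rough standalone model of the RegRewriteTable bookkeeping, before the cascaded-select helper that follows (plain unsigned values stand in for virtual registers; this is not the MachineInstr API):

#include <map>
#include <utility>
#include <vector>

struct CMovLike { unsigned Dest, TrueReg, FalseReg; };
struct PhiLike  { unsigned Dest, TrueIn, FalseIn; };

// Build one PHI per CMOV, in order. An operand that names an earlier CMOV's
// destination is rewritten to that earlier PHI's corresponding input, so no
// PHI ever refers to another PHI defined in the same block.
std::vector<PhiLike> createPhis(const std::vector<CMovLike> &CMovs) {
  std::map<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
  std::vector<PhiLike> Phis;
  for (const CMovLike &C : CMovs) {
    unsigned T = C.TrueReg, F = C.FalseReg;
    auto IT = RegRewriteTable.find(T);
    if (IT != RegRewriteTable.end())
      T = IT->second.first;              // earlier PHI's true-side input
    auto IF = RegRewriteTable.find(F);
    if (IF != RegRewriteTable.end())
      F = IF->second.second;             // earlier PHI's false-side input
    Phis.push_back({C.Dest, T, F});
    RegRewriteTable[C.Dest] = {T, F};
  }
  return Phis;
}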
+MachineBasicBlock * +X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV, + MachineInstr &SecondCascadedCMOV, + MachineBasicBlock *ThisMBB) const { + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = FirstCMOV.getDebugLoc(); + + // We lower cascaded CMOVs such as // - // (CMOV (CMOV F, T, cc1), T, cc2) + // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2) // - // to two successive branches. For that, we look for another CMOV as the - // following instruction. + // to two successive branches. // // Without this, we would add a PHI between the two jumps, which ends up // creating a few copies all around. For instance, for @@ -25749,10 +26350,145 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, // .LBB5_4: // retq // - MachineInstr *CascadedCMOV = nullptr; - MachineInstr *LastCMOV = &MI; + + // We lower cascaded CMOV into two successive branches to the same block. + // EFLAGS is used by both, so mark it as live in the second. + const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); + MachineFunction *F = ThisMBB->getParent(); + MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); + + MachineFunction::iterator It = ++ThisMBB->getIterator(); + F->insert(It, FirstInsertedMBB); + F->insert(It, SecondInsertedMBB); + F->insert(It, SinkMBB); + + // For a cascaded CMOV, we lower it to two successive branches to + // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in + // the FirstInsertedMBB. + FirstInsertedMBB->addLiveIn(X86::EFLAGS); + + // If the EFLAGS register isn't dead in the terminator, then claim that it's + // live into the sink and copy blocks. + const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); + if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) { + SecondInsertedMBB->addLiveIn(X86::EFLAGS); + SinkMBB->addLiveIn(X86::EFLAGS); + } + + // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. + SinkMBB->splice(SinkMBB->begin(), ThisMBB, + std::next(MachineBasicBlock::iterator(FirstCMOV)), + ThisMBB->end()); + SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); + + // Fallthrough block for ThisMBB. + ThisMBB->addSuccessor(FirstInsertedMBB); + // The true block target of the first branch is always SinkMBB. + ThisMBB->addSuccessor(SinkMBB); + // Fallthrough block for FirstInsertedMBB. + FirstInsertedMBB->addSuccessor(SecondInsertedMBB); + // The true block for the branch of FirstInsertedMBB. + FirstInsertedMBB->addSuccessor(SinkMBB); + // This is fallthrough. + SecondInsertedMBB->addSuccessor(SinkMBB); + + // Create the conditional branch instructions. 
+ X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm()); + unsigned Opc = X86::GetCondBranchFromCond(FirstCC); + BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); + + X86::CondCode SecondCC = + X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm()); + unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC); + BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB); + + // SinkMBB: + // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ] + unsigned DestReg = FirstCMOV.getOperand(0).getReg(); + unsigned Op1Reg = FirstCMOV.getOperand(1).getReg(); + unsigned Op2Reg = FirstCMOV.getOperand(2).getReg(); + MachineInstrBuilder MIB = + BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg) + .addReg(Op1Reg) + .addMBB(SecondInsertedMBB) + .addReg(Op2Reg) + .addMBB(ThisMBB); + + // The second SecondInsertedMBB provides the same incoming value as the + // FirstInsertedMBB (the True operand of the SELECT_CC/CMOV nodes). + MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB); + // Copy the PHI result to the register defined by the second CMOV. + BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL, + TII->get(TargetOpcode::COPY), + SecondCascadedCMOV.getOperand(0).getReg()) + .addReg(FirstCMOV.getOperand(0).getReg()); + + // Now remove the CMOVs. + FirstCMOV.eraseFromParent(); + SecondCascadedCMOV.eraseFromParent(); + + return SinkMBB; +} + +MachineBasicBlock * +X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, + MachineBasicBlock *ThisMBB) const { + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + DebugLoc DL = MI.getDebugLoc(); + + // To "insert" a SELECT_CC instruction, we actually have to insert the + // diamond control-flow pattern. The incoming instruction knows the + // destination vreg to set, the condition code register to branch on, the + // true/false values to select between and a branch opcode to use. + + // ThisMBB: + // ... + // TrueVal = ... + // cmpTY ccX, r1, r2 + // bCC copy1MBB + // fallthrough --> FalseMBB + + // This code lowers all pseudo-CMOV instructions. Generally it lowers these + // as described above, by inserting a BB, and then making a PHI at the join + // point to select the true and false operands of the CMOV in the PHI. + // + // The code also handles two different cases of multiple CMOV opcodes + // in a row. + // + // Case 1: + // In this case, there are multiple CMOVs in a row, all which are based on + // the same condition setting (or the exact opposite condition setting). + // In this case we can lower all the CMOVs using a single inserted BB, and + // then make a number of PHIs at the join point to model the CMOVs. The only + // trickiness here, is that in a case like: + // + // t2 = CMOV cond1 t1, f1 + // t3 = CMOV cond1 t2, f2 + // + // when rewriting this into PHIs, we have to perform some renaming on the + // temps since you cannot have a PHI operand refer to a PHI result earlier + // in the same block. The "simple" but wrong lowering would be: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t2(BB1), f2(BB2) + // + // but clearly t2 is not defined in BB1, so that is incorrect. The proper + // renaming is to note that on the path through BB1, t2 is really just a + // copy of t1, and do that renaming, properly generating: + // + // t2 = PHI t1(BB1), f1(BB2) + // t3 = PHI t1(BB1), f2(BB2) + // + // Case 2: + // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate + // function - EmitLoweredCascadedSelect. 
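As a sanity check on the cascaded case just described: since the inner and outer CMOV share the same true value, testing cc1 and then cc2, each branching to the common sink block, computes the same result as the nested selects. A scalar analogue, illustrative only, with the block roles noted in comments:

// (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2) evaluates to
// (cc1 || cc2) ? T : F, so two successive branches suffice.
int cascadedSelect(bool cc1, bool cc2, int T, int F) {
  if (cc1)      // ThisMBB: first conditional branch straight to the sink.
    return T;
  if (cc2)      // FirstInsertedMBB: second branch to the same sink block.
    return T;
  return F;     // SecondInsertedMBB: fallthrough supplies the false value.
}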
+ X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm()); X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC); + MachineInstr *LastCMOV = &MI; MachineBasicBlock::iterator NextMIIt = std::next(MachineBasicBlock::iterator(MI)); @@ -25762,7 +26498,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, if (isCMOVPseudo(MI)) { // See if we have a string of CMOVS with the same condition. - while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) && + while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) && (NextMIIt->getOperand(3).getImm() == CC || NextMIIt->getOperand(3).getImm() == OppCC)) { LastCMOV = &*NextMIIt; @@ -25772,136 +26508,61 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI, // This checks for case 2, but only do this if we didn't already find // case 1, as indicated by LastCMOV == MI. - if (LastCMOV == &MI && NextMIIt != BB->end() && + if (LastCMOV == &MI && NextMIIt != ThisMBB->end() && NextMIIt->getOpcode() == MI.getOpcode() && NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() && NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() && NextMIIt->getOperand(1).isKill()) { - CascadedCMOV = &*NextMIIt; + return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB); } - MachineBasicBlock *jcc1MBB = nullptr; + const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock(); + MachineFunction *F = ThisMBB->getParent(); + MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB); + MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - // If we have a cascaded CMOV, we lower it to two successive branches to - // the same block. EFLAGS is used by both, so mark it as live in the second. - if (CascadedCMOV) { - jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(It, jcc1MBB); - jcc1MBB->addLiveIn(X86::EFLAGS); - } - - MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB); - MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB); - F->insert(It, copy0MBB); - F->insert(It, sinkMBB); + MachineFunction::iterator It = ++ThisMBB->getIterator(); + F->insert(It, FalseMBB); + F->insert(It, SinkMBB); // If the EFLAGS register isn't dead in the terminator, then claim that it's // live into the sink and copy blocks. const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo(); - - MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV; - if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) && - !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) { - copy0MBB->addLiveIn(X86::EFLAGS); - sinkMBB->addLiveIn(X86::EFLAGS); + if (!LastCMOV->killsRegister(X86::EFLAGS) && + !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) { + FalseMBB->addLiveIn(X86::EFLAGS); + SinkMBB->addLiveIn(X86::EFLAGS); } - // Transfer the remainder of BB and its successor edges to sinkMBB. - sinkMBB->splice(sinkMBB->begin(), BB, - std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end()); - sinkMBB->transferSuccessorsAndUpdatePHIs(BB); - - // Add the true and fallthrough blocks as its successors. - if (CascadedCMOV) { - // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV. - BB->addSuccessor(jcc1MBB); - - // In that case, jcc1MBB will itself fallthrough the copy0MBB, and - // jump to the sinkMBB. - jcc1MBB->addSuccessor(copy0MBB); - jcc1MBB->addSuccessor(sinkMBB); - } else { - BB->addSuccessor(copy0MBB); - } + // Transfer the remainder of ThisMBB and its successor edges to SinkMBB. 
+ SinkMBB->splice(SinkMBB->begin(), ThisMBB, + std::next(MachineBasicBlock::iterator(LastCMOV)), + ThisMBB->end()); + SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB); - // The true block target of the first (or only) branch is always sinkMBB. - BB->addSuccessor(sinkMBB); + // Fallthrough block for ThisMBB. + ThisMBB->addSuccessor(FalseMBB); + // The true block target of the first (or only) branch is always a SinkMBB. + ThisMBB->addSuccessor(SinkMBB); + // Fallthrough block for FalseMBB. + FalseMBB->addSuccessor(SinkMBB); // Create the conditional branch instruction. unsigned Opc = X86::GetCondBranchFromCond(CC); - BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB); - - if (CascadedCMOV) { - unsigned Opc2 = X86::GetCondBranchFromCond( - (X86::CondCode)CascadedCMOV->getOperand(3).getImm()); - BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB); - } + BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB); - // copy0MBB: - // %FalseValue = ... - // # fallthrough to sinkMBB - copy0MBB->addSuccessor(sinkMBB); - - // sinkMBB: - // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ] + // SinkMBB: + // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ] // ... MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI); MachineBasicBlock::iterator MIItEnd = - std::next(MachineBasicBlock::iterator(LastCMOV)); - MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin(); - DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable; - MachineInstrBuilder MIB; - - // As we are creating the PHIs, we have to be careful if there is more than - // one. Later CMOVs may reference the results of earlier CMOVs, but later - // PHIs have to reference the individual true/false inputs from earlier PHIs. - // That also means that PHI construction must work forward from earlier to - // later, and that the code must maintain a mapping from earlier PHI's - // destination registers, and the registers that went into the PHI. - - for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) { - unsigned DestReg = MIIt->getOperand(0).getReg(); - unsigned Op1Reg = MIIt->getOperand(1).getReg(); - unsigned Op2Reg = MIIt->getOperand(2).getReg(); - - // If this CMOV we are generating is the opposite condition from - // the jump we generated, then we have to swap the operands for the - // PHI that is going to be generated. - if (MIIt->getOperand(3).getImm() == OppCC) - std::swap(Op1Reg, Op2Reg); - - if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end()) - Op1Reg = RegRewriteTable[Op1Reg].first; - - if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end()) - Op2Reg = RegRewriteTable[Op2Reg].second; - - MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL, - TII->get(X86::PHI), DestReg) - .addReg(Op1Reg).addMBB(copy0MBB) - .addReg(Op2Reg).addMBB(thisMBB); - - // Add this PHI to the rewrite table. - RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg); - } - - // If we have a cascaded CMOV, the second Jcc provides the same incoming - // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes). - if (CascadedCMOV) { - MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB); - // Copy the PHI result to the register defined by the second CMOV. 
- BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), - DL, TII->get(TargetOpcode::COPY), - CascadedCMOV->getOperand(0).getReg()) - .addReg(MI.getOperand(0).getReg()); - CascadedCMOV->eraseFromParent(); - } + std::next(MachineBasicBlock::iterator(LastCMOV)); + createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB); // Now remove the CMOV(s). - for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ) - (MIIt++)->eraseFromParent(); + ThisMBB->erase(MIItBegin, MIItEnd); - return sinkMBB; + return SinkMBB; } MachineBasicBlock * @@ -26094,7 +26755,7 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI, DebugLoc DL = MI.getDebugLoc(); assert(!isAsynchronousEHPersonality( - classifyEHPersonality(MF->getFunction()->getPersonalityFn())) && + classifyEHPersonality(MF->getFunction().getPersonalityFn())) && "SEH does not use catchret!"); // Only 32-bit EH needs to worry about manually restoring stack pointers. @@ -26121,7 +26782,7 @@ MachineBasicBlock * X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI, MachineBasicBlock *BB) const { MachineFunction *MF = BB->getParent(); - const Constant *PerFn = MF->getFunction()->getPersonalityFn(); + const Constant *PerFn = MF->getFunction().getPersonalityFn(); bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn)); // Only 32-bit SEH requires special handling for catchpad. if (IsSEH && Subtarget.is32Bit()) { @@ -26480,7 +27141,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI, } MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op)); - addFrameReference(MIB, FI, 36); + addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36); if (UseImmLabel) MIB.addMBB(DispatchBB); else @@ -26562,8 +27223,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI); // Create the jump table and associated information - MachineJumpTableInfo *JTI = - MF->getOrCreateJumpTableInfo(getJumpTableEncoding()); + unsigned JTE = getJumpTableEncoding(); + MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE); unsigned MJTI = JTI->createJumpTableIndex(LPadList); const X86RegisterInfo &RI = TII->getRegisterInfo(); @@ -26586,25 +27247,76 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI, .addRegMask(RI.getNoPreservedMask()); } - unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass); + // IReg is used as an index in a memory operand and therefore can't be SP + unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass); addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI, - 4); + Subtarget.is64Bit() ? 8 : 4); BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri)) .addReg(IReg) .addImm(LPadList.size()); - BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB); + BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB); - unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass); - BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg) - .addReg(IReg) - .addImm(1); - BuildMI(DispContBB, DL, - TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m)) - .addReg(0) - .addImm(Subtarget.is64Bit() ? 
8 : 4) - .addReg(JReg) - .addJumpTableIndex(MJTI) - .addReg(0); + if (Subtarget.is64Bit()) { + unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass); + unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass); + + // leaq .LJTI0_0(%rip), BReg + BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg) + .addReg(X86::RIP) + .addImm(1) + .addReg(0) + .addJumpTableIndex(MJTI) + .addReg(0); + // movzx IReg64, IReg + BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64) + .addImm(0) + .addReg(IReg) + .addImm(X86::sub_32bit); + + switch (JTE) { + case MachineJumpTableInfo::EK_BlockAddress: + // jmpq *(BReg,IReg64,8) + BuildMI(DispContBB, DL, TII->get(X86::JMP64m)) + .addReg(BReg) + .addImm(8) + .addReg(IReg64) + .addImm(0) + .addReg(0); + break; + case MachineJumpTableInfo::EK_LabelDifference32: { + unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass); + unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass); + unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass); + + // movl (BReg,IReg64,4), OReg + BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg) + .addReg(BReg) + .addImm(4) + .addReg(IReg64) + .addImm(0) + .addReg(0); + // movsx OReg64, OReg + BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg); + // addq BReg, OReg64, TReg + BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg) + .addReg(OReg64) + .addReg(BReg); + // jmpq *TReg + BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg); + break; + } + default: + llvm_unreachable("Unexpected jump table encoding"); + } + } else { + // jmpl *.LJTI0_0(,IReg,4) + BuildMI(DispContBB, DL, TII->get(X86::JMP32m)) + .addReg(0) + .addImm(4) + .addReg(IReg) + .addJumpTableIndex(MJTI) + .addReg(0); + } // Add the jump table entries as successors to the MBB. SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs; @@ -26975,21 +27687,6 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.resetAll(); switch (Opc) { default: break; - case X86ISD::ADD: - case X86ISD::SUB: - case X86ISD::ADC: - case X86ISD::SBB: - case X86ISD::SMUL: - case X86ISD::UMUL: - case X86ISD::INC: - case X86ISD::DEC: - case X86ISD::OR: - case X86ISD::XOR: - case X86ISD::AND: - // These nodes' second result is a boolean. - if (Op.getResNo() == 0) - break; - LLVM_FALLTHROUGH; case X86ISD::SETCC: Known.Zero.setBitsFrom(1); break; @@ -26998,6 +27695,17 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.Zero.setBitsFrom(NumLoBits); break; } + case X86ISD::PEXTRB: + case X86ISD::PEXTRW: { + SDValue Src = Op.getOperand(0); + EVT SrcVT = Src.getValueType(); + APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(), + Op.getConstantOperandVal(1)); + DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1); + Known = Known.zextOrTrunc(BitWidth); + Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits()); + break; + } case X86ISD::VSHLI: case X86ISD::VSRLI: { if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) { @@ -27006,7 +27714,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; } - DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1); + DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); unsigned ShAmt = ShiftImm->getZExtValue(); if (Opc == X86ISD::VSHLI) { Known.Zero <<= ShAmt; @@ -27023,6 +27731,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, break; } case X86ISD::VZEXT: { + // TODO: Add DemandedElts support. 
SDValue N0 = Op.getOperand(0); unsigned NumElts = VT.getVectorNumElements(); @@ -27038,6 +27747,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op, Known.Zero.setBitsFrom(InBitWidth); break; } + case X86ISD::CMOV: { + DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1); + // If we don't know any bits, early out. + if (Known.isUnknown()) + break; + KnownBits Known2; + DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1); + + // Only known if known in both the LHS and RHS. + Known.One &= Known2.One; + Known.Zero &= Known2.Zero; + break; + } + case X86ISD::UDIVREM8_ZEXT_HREG: + // TODO: Support more than just the zero extended bits? + if (Op.getResNo() != 1) + break; + // The remainder is zero extended. + Known.Zero.setBitsFrom(8); + break; } } @@ -27052,18 +27781,42 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( return VTBits; case X86ISD::VSEXT: { + // TODO: Add DemandedElts support. SDValue Src = Op.getOperand(0); unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); Tmp += VTBits - Src.getScalarValueSizeInBits(); return Tmp; } - case X86ISD::VSHLI: { + case X86ISD::VTRUNC: { + // TODO: Add DemandedElts support. SDValue Src = Op.getOperand(0); + unsigned NumSrcBits = Src.getScalarValueSizeInBits(); + assert(VTBits < NumSrcBits && "Illegal truncation input type"); unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); + if (Tmp > (NumSrcBits - VTBits)) + return Tmp - (NumSrcBits - VTBits); + return 1; + } + + case X86ISD::PACKSS: { + // PACKSS is just a truncation if the sign bits extend to the packed size. + // TODO: Add DemandedElts support. + unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits(); + unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1); + unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1); + unsigned Tmp = std::min(Tmp0, Tmp1); + if (Tmp > (SrcBits - VTBits)) + return Tmp - (SrcBits - VTBits); + return 1; + } + + case X86ISD::VSHLI: { + SDValue Src = Op.getOperand(0); APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); if (ShiftVal.uge(VTBits)) return VTBits; // Shifted all bits out --> zero. + unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); if (ShiftVal.uge(Tmp)) return 1; // Shifted all sign bits out --> unknown. return Tmp - ShiftVal.getZExtValue(); @@ -27071,8 +27824,10 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( case X86ISD::VSRAI: { SDValue Src = Op.getOperand(0); - unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1); APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue(); + if (ShiftVal.uge(VTBits - 1)) + return VTBits; // Sign splat. + unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1); ShiftVal += Tmp; return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue(); } @@ -27084,12 +27839,31 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode( case X86ISD::VPCOMU: // Vector compares return zero/all-bits result values. return VTBits; + + case X86ISD::CMOV: { + unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1); + if (Tmp0 == 1) return 1; // Early out. + unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1); + return std::min(Tmp0, Tmp1); + } + case X86ISD::SDIVREM8_SEXT_HREG: + // TODO: Support more than just the sign extended bits? + if (Op.getResNo() != 1) + break; + // The remainder is sign extended. + return VTBits - 7; } // Fallback case. 
return 1; } +SDValue X86TargetLowering::unwrapAddress(SDValue N) const { + if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP) + return N->getOperand(0); + return N; +} + /// Returns true (and the GlobalValue and the offset) if the node is a /// GlobalAddress + offset. bool X86TargetLowering::isGAPlusOffset(SDNode *N, @@ -27130,13 +27904,18 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, } if (Match) { unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize); - SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize); - if (SrcVT != MaskVT) + MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() : + MVT::getIntegerVT(MaskEltSize); + SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize); + + if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) { V1 = extractSubVector(V1, 0, DAG, DL, SrcSize); + Shuffle = unsigned(X86ISD::VZEXT); + } else + Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); + DstVT = MVT::getIntegerVT(Scale * MaskEltSize); DstVT = MVT::getVectorVT(DstVT, NumDstElts); - Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT) - : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG); return true; } } @@ -27155,7 +27934,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // instructions are no slower than UNPCKLPD but has the option to // fold the input operand into even an unaligned memory load. if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) { - if (isTargetShuffleEquivalent(Mask, {0, 0})) { + if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) { Shuffle = X86ISD::MOVDDUP; SrcVT = DstVT = MVT::v2f64; return true; @@ -27290,7 +28069,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, // Narrow the repeated mask to create 32-bit element permutes. SmallVector<int, 4> WordMask = RepeatedMask; if (MaskScalarSizeInBits == 64) - scaleShuffleMask(2, RepeatedMask, WordMask); + scaleShuffleMask<int>(2, RepeatedMask, WordMask); Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI); ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32); @@ -27356,7 +28135,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, SDValue &V1, SDValue &V2, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget, - unsigned &Shuffle, MVT &ShuffleVT, + unsigned &Shuffle, MVT &SrcVT, MVT &DstVT, bool IsUnary) { unsigned EltSizeInBits = MaskVT.getScalarSizeInBits(); @@ -27364,26 +28143,36 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVLHPS; - ShuffleVT = MVT::v4f32; + SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) { V2 = V1; Shuffle = X86ISD::MOVHLPS; - ShuffleVT = MVT::v4f32; + SrcVT = DstVT = MVT::v4f32; return true; } if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() && (AllowFloatDomain || !Subtarget.hasSSE41())) { std::swap(V1, V2); Shuffle = X86ISD::MOVSD; - ShuffleVT = MaskVT; + SrcVT = DstVT = MaskVT; return true; } if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) && (AllowFloatDomain || !Subtarget.hasSSE41())) { Shuffle = X86ISD::MOVSS; - ShuffleVT = MaskVT; + SrcVT = DstVT = MaskVT; + return true; + } + } + + // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle. + // TODO add support for 256/512-bit types. 
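Stepping back to the computeKnownBitsForTargetNode and ComputeNumSignBitsForTargetNode hunks above: the new X86ISD::CMOV handling uses the standard rule for selects, namely that a bit of the result is known only if it is known to the same value in both possible inputs (and the guaranteed sign-bit count is the minimum of the two inputs). A tiny standalone illustration of that intersection, using simplified Zero/One masks rather than LLVM's KnownBits:

#include <cassert>
#include <cstdint>

struct Known {
  uint64_t Zero; // bits known to be 0
  uint64_t One;  // bits known to be 1
};

// Known bits of (cond ? A : B): keep only the facts that hold for both inputs.
Known knownBitsOfSelect(Known A, Known B) {
  return {A.Zero & B.Zero, A.One & B.One};
}

int main() {
  Known A{~0x0Full, 0x0Full}; // A is exactly 0x0F
  Known B{~0x03ull, 0x03ull}; // B is exactly 0x03
  Known R = knownBitsOfSelect(A, B);
  // Bits 2 and 3 disagree between A and B, so they become unknown; everything
  // else stays known.
  assert(R.Zero == ~0x0Full && R.One == 0x03);
  return 0;
}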
+ if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) { + if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG, + Subtarget)) { + DstVT = MaskVT; return true; } } @@ -27396,9 +28185,9 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, (MaskVT.is512BitVector() && Subtarget.hasAVX512())) { if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG, Subtarget)) { - ShuffleVT = MaskVT; - if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2()) - ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); + SrcVT = DstVT = MaskVT; + if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) + SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64); return true; } } @@ -27572,11 +28361,11 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask, /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. -static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, - ArrayRef<int> BaseMask, int Depth, - bool HasVariableMask, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, + ArrayRef<int> BaseMask, int Depth, + bool HasVariableMask, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!"); assert((Inputs.size() == 1 || Inputs.size() == 2) && "Unexpected number of shuffle inputs!"); @@ -27601,9 +28390,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, unsigned NumBaseMaskElts = BaseMask.size(); if (NumBaseMaskElts == 1) { assert(BaseMask[0] == 0 && "Invalid shuffle index found!"); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, V1); } unsigned RootSizeInBits = RootVT.getSizeInBits(); @@ -27621,16 +28408,19 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, bool IsEVEXShuffle = RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128); if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits)) - return false; + return SDValue(); // TODO - handle 128/256-bit lane shuffles of 512-bit vectors. // Handle 128-bit lane shuffles of 256-bit vectors. + // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless + // we need to use the zeroing feature. // TODO - this should support binary shuffles. if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 && + !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) && !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) { if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128) - return false; // Nothing to do! + return SDValue(); // Nothing to do! MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64); unsigned PermMask = 0; PermMask |= ((BaseMask[0] < 0 ? 
0x8 : (BaseMask[0] & 1)) << 0); @@ -27642,9 +28432,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DAG.getUNDEF(ShuffleVT), DAG.getConstant(PermMask, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } // For masks that have been widened to 128-bit elements or more, @@ -27653,7 +28441,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (BaseMaskEltSizeInBits > 64) { assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size"); int MaskScale = BaseMaskEltSizeInBits / 64; - scaleShuffleMask(MaskScale, BaseMask, Mask); + scaleShuffleMask<int>(MaskScale, BaseMask, Mask); } else { Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end()); } @@ -27669,7 +28457,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Only allow legal mask types. if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) - return false; + return SDValue(); // Attempt to match the mask against known shuffle patterns. MVT ShuffleSrcVT, ShuffleVT; @@ -27678,7 +28466,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Which shuffle domains are permitted? // Permit domain crossing at higher combine depths. bool AllowFloatDomain = FloatDomain || (Depth > 3); - bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && + bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() && (!MaskVT.is256BitVector() || Subtarget.hasAVX2()); // Determine zeroable mask elements. @@ -27697,9 +28485,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale); if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) && isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) { - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, V1); } } @@ -27707,52 +28493,46 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, ShuffleVT)) { if (Depth == 1 && Root.getOpcode() == Shuffle) - return false; // Nothing to do! + return SDValue(); // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return false; // AVX512 Writemask clash. + return SDValue(); // AVX512 Writemask clash. Res = DAG.getBitcast(ShuffleSrcVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, Subtarget, Shuffle, ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) - return false; // Nothing to do! + return SDValue(); // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return false; // AVX512 Writemask clash. + return SDValue(); // AVX512 Writemask clash. 
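The mask-widening calls above now go through the templated scaleShuffleMask<int>. The operation itself is simple: scaling a mask by N turns each coarse element index into N consecutive fine indices, e.g. a 64-bit-element mask {1, 0} becomes the 32-bit-element mask {2, 3, 0, 1}. A small standalone version of that expansion (simplified: it ignores the undef/zero sentinel values the real helper also propagates):

#include <cassert>
#include <vector>

// Widen a shuffle mask by Scale: coarse element i, which selected source
// element Mask[i], now selects the Scale consecutive fine elements
// Mask[i]*Scale .. Mask[i]*Scale + Scale-1.
std::vector<int> scaleShuffleMask(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Scaled;
  Scaled.reserve(Mask.size() * Scale);
  for (int M : Mask)
    for (int i = 0; i != Scale; ++i)
      Scaled.push_back(M * Scale + i);
  return Scaled;
}

int main() {
  // A 2 x 64-bit mask {1, 0} viewed as a 4 x 32-bit mask.
  std::vector<int> Coarse{1, 0};
  std::vector<int> Fine = scaleShuffleMask(2, Coarse);
  assert((Fine == std::vector<int>{2, 3, 0, 1}));
  return 0;
}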
Res = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res, DAG.getConstant(PermuteImm, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } } if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, - V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT, - UnaryShuffle)) { + V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT, + ShuffleVT, UnaryShuffle)) { if (Depth == 1 && Root.getOpcode() == Shuffle) - return false; // Nothing to do! + return SDValue(); // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return false; // AVX512 Writemask clash. - V1 = DAG.getBitcast(ShuffleVT, V1); + return SDValue(); // AVX512 Writemask clash. + V1 = DAG.getBitcast(ShuffleSrcVT, V1); DCI.AddToWorklist(V1.getNode()); - V2 = DAG.getBitcast(ShuffleVT, V2); + V2 = DAG.getBitcast(ShuffleSrcVT, V2); DCI.AddToWorklist(V2.getNode()); Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain, @@ -27760,9 +28540,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, Subtarget, Shuffle, ShuffleVT, PermuteImm)) { if (Depth == 1 && Root.getOpcode() == Shuffle) - return false; // Nothing to do! + return SDValue(); // Nothing to do! if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements())) - return false; // AVX512 Writemask clash. + return SDValue(); // AVX512 Writemask clash. V1 = DAG.getBitcast(ShuffleVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(ShuffleVT, V2); @@ -27770,9 +28550,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2, DAG.getConstant(PermuteImm, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } // Typically from here on, we need an integer version of MaskVT. @@ -27785,21 +28563,19 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx, Zeroable)) { if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI) - return false; // Nothing to do! + return SDValue(); // Nothing to do! V1 = DAG.getBitcast(IntMaskVT, V1); DCI.AddToWorklist(V1.getNode()); Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) { if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI) - return false; // Nothing to do! + return SDValue(); // Nothing to do! 
V1 = DAG.getBitcast(IntMaskVT, V1); DCI.AddToWorklist(V1.getNode()); V2 = DAG.getBitcast(IntMaskVT, V2); @@ -27808,23 +28584,25 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DAG.getConstant(BitLen, DL, MVT::i8), DAG.getConstant(BitIdx, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } } // Don't try to re-form single instruction chains under any circumstances now // that we've done encoding canonicalization for them. if (Depth < 2) - return false; + return SDValue(); + + // Depth threshold above which we can efficiently use variable mask shuffles. + // TODO This should probably be target specific. + bool AllowVariableMask = (Depth >= 3) || HasVariableMask; bool MaskContainsZeros = any_of(Mask, [](int M) { return M == SM_SentinelZero; }); if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) { // If we have a single input lane-crossing shuffle then lower to VPERMV. - if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros && + if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX2() && (MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) || (Subtarget.hasAVX512() && @@ -27840,14 +28618,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } // Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero // vector as the second source. - if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && + if (UnaryShuffle && AllowVariableMask && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || @@ -27871,13 +28647,11 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DCI.AddToWorklist(Zero.getNode()); Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } // If we have a dual input lane-crossing shuffle then lower to VPERMV3. - if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros && + if (AllowVariableMask && !MaskContainsZeros && ((Subtarget.hasAVX512() && (MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 || MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) || @@ -27896,16 +28670,14 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DCI.AddToWorklist(V2.getNode()); Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } - return false; + return SDValue(); } // See if we can combine a single input shuffle with zeros to a bit-mask, // which is much simpler than any shuffle. 
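For the lane-crossing cases handled above, the code falls back to fully variable permutes once AllowVariableMask holds: VPERMV takes one source, and VPERMV3 takes two sources, with indices >= NumElts selecting from the second one (a zero vector in the unary+zero case). A rough scalar model of that selection, not the actual instruction semantics in every detail:

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of a two-source variable permute over N elements: an index
// below N picks from A, an index of N or above picks from B (e.g. zeros).
template <std::size_t N>
std::array<uint32_t, N> permv3(const std::array<uint32_t, N> &A,
                               const std::array<uint32_t, N> &B,
                               const std::array<unsigned, N> &Idx) {
  std::array<uint32_t, N> R{};
  for (std::size_t i = 0; i != N; ++i)
    R[i] = Idx[i] < N ? A[Idx[i]] : B[Idx[i] - N];
  return R;
}

int main() {
  std::array<uint32_t, 4> A{10, 11, 12, 13}, Zero{0, 0, 0, 0};
  std::array<unsigned, 4> Idx{2, 4, 0, 1};   // <2, zero, 0, 1>
  auto R = permv3<4>(A, Zero, Idx);
  assert(R[0] == 12 && R[1] == 0 && R[2] == 10 && R[3] == 11);
  return 0;
}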
- if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) && + if (UnaryShuffle && MaskContainsZeros && AllowVariableMask && isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) && DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) { APInt Zero = APInt::getNullValue(MaskEltSizeInBits); @@ -27930,15 +28702,13 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND); Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } // If we have a single input shuffle with different shuffle patterns in the // the 128-bit lanes use the variable mask to VPERMILPS. // TODO Combine other mask types at higher depths. - if (UnaryShuffle && HasVariableMask && !MaskContainsZeros && + if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros && ((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) || (MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) { SmallVector<SDValue, 16> VPermIdx; @@ -27953,14 +28723,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DCI.AddToWorklist(Res.getNode()); Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } // With XOP, binary shuffles of 128/256-bit floating point vectors can combine // to VPERMIL2PD/VPERMIL2PS. - if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() && + if (AllowVariableMask && Subtarget.hasXOP() && (MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 || MaskVT == MVT::v8f32)) { // VPERMIL2 Operation. @@ -27994,9 +28762,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp, DAG.getConstant(M2ZImm, DL, MVT::i8)); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } // If we have 3 or more shuffle instructions or a chain involving a variable @@ -28004,7 +28770,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, // Intel's manuals suggest only using PSHUFB if doing so replacing 5 // instructions, but in practice PSHUFB tends to be *very* fast so we're // more aggressive. 
- if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && + if (UnaryShuffle && AllowVariableMask && ((RootVT.is128BitVector() && Subtarget.hasSSSE3()) || (RootVT.is256BitVector() && Subtarget.hasAVX2()) || (RootVT.is512BitVector() && Subtarget.hasBWI()))) { @@ -28022,7 +28788,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, continue; } M = Ratio * M + i % Ratio; - assert ((M / 16) == (i / 16) && "Lane crossing detected"); + assert((M / 16) == (i / 16) && "Lane crossing detected"); PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); @@ -28032,16 +28798,13 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DCI.AddToWorklist(PSHUFBMaskOp.getNode()); Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } // With XOP, if we have a 128-bit binary input shuffle we can always combine // to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never // slower than PSHUFB on targets that support both. - if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() && - Subtarget.hasXOP()) { + if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) { // VPPERM Mask Operation // Bits[4:0] - Byte Index (0 - 31) // Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO) @@ -28070,23 +28833,22 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root, DCI.AddToWorklist(VPPERMMaskOp.getNode()); Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp); DCI.AddToWorklist(Res.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res), - /*AddTo*/ true); - return true; + return DAG.getBitcast(RootVT, Res); } // Failed to find any combines. - return false; + return SDValue(); } // Attempt to constant fold all of the constant source ops. // Returns true if the entire shuffle is folded to a constant. // TODO: Extend this to merge multiple constant Ops and update the mask. -static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, - ArrayRef<int> Mask, SDValue Root, - bool HasVariableMask, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, + ArrayRef<int> Mask, SDValue Root, + bool HasVariableMask, + SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { MVT VT = Root.getSimpleValueType(); unsigned SizeInBits = VT.getSizeInBits(); @@ -28103,14 +28865,14 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, OneUseConstantOp |= SrcOp.hasOneUse(); if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i], RawBitsOps[i])) - return false; + return SDValue(); } // Only fold if at least one of the constants is only used once or // the combined shuffle has included a variable mask shuffle, this // is to avoid constant pool bloat. if (!OneUseConstantOp && !HasVariableMask) - return false; + return SDValue(); // Shuffle the constant bits according to the mask. 
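The PSHUFB lowering above builds one mask byte per destination byte; the assert that (M / 16) == (i / 16) reflects that PSHUFB can only pick bytes from within the same 128-bit lane, and a mask byte with bit 7 set yields zero. A small scalar model of a single lane:

#include <array>
#include <cassert>
#include <cstdint>

// One 128-bit lane of PSHUFB: each mask byte selects a source byte from the
// same lane (low 4 bits), or produces zero if bit 7 of the mask byte is set.
std::array<uint8_t, 16> pshufb_lane(const std::array<uint8_t, 16> &Src,
                                    const std::array<uint8_t, 16> &Mask) {
  std::array<uint8_t, 16> R{};
  for (int i = 0; i != 16; ++i)
    R[i] = (Mask[i] & 0x80) ? 0 : Src[Mask[i] & 0x0F];
  return R;
}

int main() {
  std::array<uint8_t, 16> Src{};
  for (int i = 0; i != 16; ++i) Src[i] = uint8_t(100 + i);
  std::array<uint8_t, 16> Mask{1, 0, 0x80, 3};   // remaining bytes are zero
  auto R = pshufb_lane(Src, Mask);
  assert(R[0] == 101 && R[1] == 100 && R[2] == 0 && R[3] == 103);
  return 0;
}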
APInt UndefElts(NumMaskElts, 0); @@ -28162,8 +28924,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, SDLoc DL(Root); SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL); DCI.AddToWorklist(CstOp.getNode()); - DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp)); - return true; + return DAG.getBitcast(VT, CstOp); } /// \brief Fully generic combining of x86 shuffle instructions. @@ -28195,18 +28956,15 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops, /// would simplify under the threshold for PSHUFB formation because of /// combine-ordering. To fix this, we should do the redundant instruction /// combining in this recursive walk. -static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, - int SrcOpIndex, SDValue Root, - ArrayRef<int> RootMask, - ArrayRef<const SDNode*> SrcNodes, - int Depth, bool HasVariableMask, - SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { +static SDValue combineX86ShufflesRecursively( + ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root, + ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth, + bool HasVariableMask, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { // Bound the depth of our recursive combine because this is ultimately // quadratic in nature. if (Depth > 8) - return false; + return SDValue(); // Directly rip through bitcasts to find the underlying operand. SDValue Op = SrcOps[SrcOpIndex]; @@ -28214,7 +28972,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, MVT VT = Op.getSimpleValueType(); if (!VT.isVector()) - return false; // Bail if we hit a non-vector. + return SDValue(); // Bail if we hit a non-vector. assert(Root.getSimpleValueType().isVector() && "Shuffles operate on vector types!"); @@ -28225,7 +28983,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, SmallVector<int, 64> OpMask; SmallVector<SDValue, 2> OpInputs; if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG)) - return false; + return SDValue(); assert(OpInputs.size() <= 2 && "Too many shuffle inputs"); SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue()); @@ -28334,18 +29092,15 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, } // Handle the all undef/zero cases early. - if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) { - DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType())); - return true; - } - if (all_of(Mask, [](int Idx) { return Idx < 0; })) { - // TODO - should we handle the mixed zero/undef case as well? Just returning - // a zero mask will lose information on undef elements possibly reducing - // future combine possibilities. - DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(), - Subtarget, DAG, SDLoc(Root))); - return true; - } + if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) + return DAG.getUNDEF(Root.getValueType()); + + // TODO - should we handle the mixed zero/undef case as well? Just returning + // a zero mask will lose information on undef elements possibly reducing + // future combine possibilities. + if (all_of(Mask, [](int Idx) { return Idx < 0; })) + return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, + SDLoc(Root)); // Remove unused shuffle source ops. 
resolveTargetShuffleInputsAndMask(Ops, Mask); @@ -28364,19 +29119,19 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, for (int i = 0, e = Ops.size(); i < e; ++i) if (Ops[i].getNode()->hasOneUse() || SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode())) - if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes, - Depth + 1, HasVariableMask, DAG, DCI, - Subtarget)) - return true; + if (SDValue Res = combineX86ShufflesRecursively( + Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask, + DAG, DCI, Subtarget)) + return Res; // Attempt to constant fold all of the constant source ops. - if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI, - Subtarget)) - return true; + if (SDValue Cst = combineX86ShufflesConstants( + Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget)) + return Cst; // We can only combine unary and binary shuffle mask cases. if (Ops.size() > 2) - return false; + return SDValue(); // Minor canonicalization of the accumulated shuffle mask to make it easier // to match below. All this does is detect masks with sequential pairs of @@ -28395,6 +29150,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps, std::swap(Ops[0], Ops[1]); } + // Finally, try to combine into a single shuffle instruction. return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG, DCI, Subtarget); } @@ -28650,8 +29406,37 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, SDLoc DL(N); MVT VT = N.getSimpleValueType(); SmallVector<int, 4> Mask; - unsigned Opcode = N.getOpcode(); + + // Combine binary shuffle of 2 similar 'Horizontal' instructions into a + // single instruction. + if (VT.getScalarSizeInBits() == 64 && + (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH || + Opcode == X86ISD::UNPCKL)) { + auto BC0 = peekThroughBitcasts(N.getOperand(0)); + auto BC1 = peekThroughBitcasts(N.getOperand(1)); + EVT VT0 = BC0.getValueType(); + EVT VT1 = BC1.getValueType(); + unsigned Opcode0 = BC0.getOpcode(); + unsigned Opcode1 = BC1.getOpcode(); + if (Opcode0 == Opcode1 && VT0 == VT1 && + (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD || + Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB || + Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) { + SDValue Lo, Hi; + if (Opcode == X86ISD::MOVSD) { + Lo = BC1.getOperand(0); + Hi = BC0.getOperand(1); + } else { + Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); + Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0); + } + SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi); + DCI.AddToWorklist(Horiz.getNode()); + return DAG.getBitcast(VT, Horiz); + } + } + switch (Opcode) { case X86ISD::PSHUFD: case X86ISD::PSHUFLW: @@ -28660,17 +29445,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, assert(Mask.size() == 4); break; case X86ISD::UNPCKL: { - auto Op0 = N.getOperand(0); - auto Op1 = N.getOperand(1); - unsigned Opcode0 = Op0.getOpcode(); - unsigned Opcode1 = Op1.getOpcode(); - - // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single - // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization. - // TODO: Add other horizontal operations as required. 
- if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD) - return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0)); - // Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in // which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE // moves upper half elements into the lower half part. For example: @@ -28688,7 +29462,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG, if (!VT.is128BitVector()) return SDValue(); - if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) { + auto Op0 = N.getOperand(0); + auto Op1 = N.getOperand(1); + if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) { ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask(); unsigned NumElts = VT.getVectorNumElements(); @@ -28999,7 +29775,7 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N, // Try to generate X86ISD::FMADDSUB node here. SDValue Opnd2; - if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2)) + if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2)) return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2); // Do not generate X86ISD::ADDSUB node for 512-bit types even though @@ -29056,6 +29832,40 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG, return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask); } +/// Eliminate a redundant shuffle of a horizontal math op. +static SDValue foldShuffleOfHorizOp(SDNode *N) { + if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef()) + return SDValue(); + + SDValue HOp = N->getOperand(0); + if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD && + HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB) + return SDValue(); + + // 128-bit horizontal math instructions are defined to operate on adjacent + // lanes of each operand as: + // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3] + // ...similarly for v2f64 and v8i16. + // TODO: 256-bit is not the same because...x86. + if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128) + return SDValue(); + + // When the operands of a horizontal math op are identical, the low half of + // the result is the same as the high half. If the shuffle is also replicating + // low and high halves, we don't need the shuffle. + // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X + ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask(); + // TODO: Other mask possibilities like {1,1} and {1,0} could be added here, + // but this should be tied to whatever horizontal op matching and shuffle + // canonicalization are producing. + if (isTargetShuffleEquivalent(Mask, { 0, 0 }) || + isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) || + isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 })) + return HOp; + + return SDValue(); +} + static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -29064,10 +29874,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // If we have legalized the vector types, look for blends of FADD and FSUB // nodes that we can fuse into an ADDSUB node. 
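The new foldShuffleOfHorizOp above relies on the property its comments describe: a 128-bit horizontal op with identical operands produces the same value in both halves, so a half-replicating shuffle on top of it is redundant. A scalar check of that property for a 4-element HADD:

#include <array>
#include <cassert>

// 4-element horizontal add: {A0+A1, A2+A3, B0+B1, B2+B3}.
std::array<int, 4> hadd4(const std::array<int, 4> &A,
                         const std::array<int, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}

int main() {
  std::array<int, 4> X{1, 2, 3, 4};
  auto H = hadd4(X, X);
  // Low half equals high half, so shuffle(H, undef, {0,1,0,1}) == H.
  assert(H[0] == H[2] && H[1] == H[3]);
  return 0;
}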
- if (TLI.isTypeLegal(VT)) + if (TLI.isTypeLegal(VT)) { if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG)) return AddSub; + if (SDValue HAddSub = foldShuffleOfHorizOp(N)) + return HAddSub; + } + // During Type Legalization, when promoting illegal vector types, // the backend might introduce new shuffle dag nodes and bitcasts. // @@ -29165,12 +29979,12 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG, // specific PSHUF instruction sequences into their minimal form so that we // can evaluate how many specialized shuffle instructions are involved in // a particular chain. - SmallVector<int, 1> NonceMask; // Just a placeholder. - NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, - /*Depth*/ 1, /*HasVarMask*/ false, DAG, - DCI, Subtarget)) - return SDValue(); // This routine will use CombineTo to replace N. + if (SDValue Res = combineX86ShufflesRecursively( + {Op}, 0, Op, {0}, {}, /*Depth*/ 1, + /*HasVarMask*/ false, DAG, DCI, Subtarget)) { + DCI.CombineTo(N, Res); + return SDValue(); + } } return SDValue(); @@ -29287,6 +30101,53 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, SDValue N0 = BitCast.getOperand(0); EVT VecVT = N0->getValueType(0); + if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() && + N0->getOpcode() == ISD::OR) { + SDValue Op0 = N0->getOperand(0); + SDValue Op1 = N0->getOperand(1); + MVT TrunckVT; + MVT BitcastVT; + switch (VT.getSimpleVT().SimpleTy) { + default: + return SDValue(); + case MVT::v16i1: + TrunckVT = MVT::i8; + BitcastVT = MVT::v8i1; + break; + case MVT::v32i1: + TrunckVT = MVT::i16; + BitcastVT = MVT::v16i1; + break; + case MVT::v64i1: + TrunckVT = MVT::i32; + BitcastVT = MVT::v32i1; + break; + } + bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL; + bool isArg0UndefLeft = + Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND; + bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL; + bool isArg1UndefLeft = + Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND; + SDValue OpLeft; + SDValue OpRight; + if (isArg0UndefRight && isArg1UndefLeft) { + OpLeft = Op0; + OpRight = Op1; + } else if (isArg1UndefRight && isArg0UndefLeft) { + OpLeft = Op1; + OpRight = Op0; + } else + return SDValue(); + SDLoc DL(BitCast); + SDValue Shr = OpLeft->getOperand(0); + SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr); + SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1); + SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight); + SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2); + return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2); + } + if (!VT.isScalarInteger() || !VecVT.isSimple()) return SDValue(); @@ -29300,7 +30161,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // v8i16 and v16i16. // For these two cases, we can shuffle the upper element bytes to a // consecutive sequence at the start of the vector and treat the results as - // v16i8 or v32i8, and for v61i8 this is the preferable solution. However, + // v16i8 or v32i8, and for v16i8 this is the preferable solution. However, // for v16i16 this is not the case, because the shuffle is expensive, so we // avoid sign-extending to this type entirely. 
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as: @@ -29319,9 +30180,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, FPCastVT = MVT::v4f32; // For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2)) // sign-extend to a 256-bit operation to avoid truncation. - if (N0->getOpcode() == ISD::SETCC && - N0->getOperand(0)->getValueType(0).is256BitVector() && - Subtarget.hasInt256()) { + if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && + N0->getOperand(0)->getValueType(0).is256BitVector()) { SExtVT = MVT::v4i64; FPCastVT = MVT::v4f64; } @@ -29333,9 +30193,9 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // If the setcc operand is 128-bit, prefer sign-extending to 128-bit over // 256-bit because the shuffle is cheaper than sign extending the result of // the compare. - if (N0->getOpcode() == ISD::SETCC && - N0->getOperand(0)->getValueType(0).is256BitVector() && - Subtarget.hasInt256()) { + if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() && + (N0->getOperand(0)->getValueType(0).is256BitVector() || + N0->getOperand(0)->getValueType(0).is512BitVector())) { SExtVT = MVT::v8i32; FPCastVT = MVT::v8f32; } @@ -29348,23 +30208,34 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast, // truncating the result of the compare to 128-bits. break; case MVT::v32i1: - // TODO: Handle pre-AVX2 cases by splitting to two v16i1's. - if (!Subtarget.hasInt256()) - return SDValue(); SExtVT = MVT::v32i8; break; }; SDLoc DL(BitCast); SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT); + + if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) { + // Handle pre-AVX2 cases by splitting to two v16i1's. + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32); + SDValue Lo = extract128BitVector(V, 0, DAG, DL); + SDValue Hi = extract128BitVector(V, 16, DAG, DL); + Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo); + Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi); + Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi, + DAG.getConstant(16, DL, ShiftTy)); + V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi); + return DAG.getZExtOrTrunc(V, DL, VT); + } + if (SExtVT == MVT::v8i16) { - V = DAG.getBitcast(MVT::v16i8, V); - V = DAG.getVectorShuffle( - MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8), - {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1}); + assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector"); + V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V, + DAG.getUNDEF(MVT::v8i16)); } else assert(SExtVT.getScalarType() != MVT::i16 && - "Vectors of i16 must be shuffled"); + "Vectors of i16 must be packed"); if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE) V = DAG.getBitcast(FPCastVT, V); V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V); @@ -29463,16 +30334,22 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG, // the elements of a vector. // Returns the vector that is being reduced on, or SDValue() if a reduction // was not matched. -static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) { +static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp, + ArrayRef<ISD::NodeType> CandidateBinOps) { // The pattern must end in an extract from index 0. 
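The pre-AVX2 v32i1 path added above splits the 256-bit sign-extended value, takes a byte MOVMSK of each 128-bit half and stitches the two 16-bit masks together as lo | (hi << 16). A scalar model of that stitching (PMOVMSKB itself just collects the sign bit of every byte):

#include <array>
#include <cassert>
#include <cstdint>

// PMOVMSKB on one 128-bit half: bit i of the result is the sign bit of byte i.
uint32_t movmskb16(const std::array<int8_t, 16> &Bytes) {
  uint32_t M = 0;
  for (int i = 0; i != 16; ++i)
    M |= uint32_t(uint8_t(Bytes[i]) >> 7) << i;
  return M;
}

int main() {
  std::array<int8_t, 16> Lo{}, Hi{};
  Lo[0] = -1;    // becomes bit 0 of the combined mask
  Hi[15] = -1;   // becomes bit 31 of the combined mask
  uint32_t Mask32 = movmskb16(Lo) | (movmskb16(Hi) << 16);
  assert(Mask32 == ((1u << 31) | 1u));
  return 0;
}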
if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) || !isNullConstant(Extract->getOperand(1))) return SDValue(); - unsigned Stages = - Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements()); - SDValue Op = Extract->getOperand(0); + unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements()); + + // Match against one of the candidate binary ops. + if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) { + return Op.getOpcode() == unsigned(BinOp); + })) + return SDValue(); + // At each stage, we're looking for something that looks like: // %s = shufflevector <8 x i32> %op, <8 x i32> undef, // <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, @@ -29483,8 +30360,9 @@ static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) { // <4,5,6,7,u,u,u,u> // <2,3,u,u,u,u,u,u> // <1,u,u,u,u,u,u,u> + unsigned CandidateBinOp = Op.getOpcode(); for (unsigned i = 0; i < Stages; ++i) { - if (Op.getOpcode() != BinOp) + if (Op.getOpcode() != CandidateBinOp) return SDValue(); ShuffleVectorSDNode *Shuffle = @@ -29497,8 +30375,8 @@ static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) { } // The first operand of the shuffle should be the same as the other operand - // of the add. - if (!Shuffle || (Shuffle->getOperand(0) != Op)) + // of the binop. + if (!Shuffle || Shuffle->getOperand(0) != Op) return SDValue(); // Verify the shuffle has the expected (at this stage of the pyramid) mask. @@ -29507,6 +30385,7 @@ static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) { return SDValue(); } + BinOp = CandidateBinOp; return Op; } @@ -29552,8 +30431,7 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0, // In SetLT case, The second operand of the comparison can be either 1 or 0. APInt SplatVal; if ((CC == ISD::SETLT) && - !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal, - /*AllowShrink*/false) && + !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) && SplatVal.isOneValue()) || (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode())))) return false; @@ -29606,6 +30484,66 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0, return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1); } +// Attempt to replace an min/max v8i16 horizontal reduction with PHMINPOSUW. +static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + // Bail without SSE41. + if (!Subtarget.hasSSE41()) + return SDValue(); + + EVT ExtractVT = Extract->getValueType(0); + if (ExtractVT != MVT::i16) + return SDValue(); + + // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns. + unsigned BinOp; + SDValue Src = matchBinOpReduction( + Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN}); + if (!Src) + return SDValue(); + + EVT SrcVT = Src.getValueType(); + EVT SrcSVT = SrcVT.getScalarType(); + if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0) + return SDValue(); + + SDLoc DL(Extract); + SDValue MinPos = Src; + + // First, reduce the source down to 128-bit, applying BinOp to lo/hi. 
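matchBinOpReduction, now generalized above to take a list of candidate opcodes, recognises the log2(N)-stage shuffle+binop pyramid that scalar reductions are typically expanded into before the final extract of element 0. A scalar sketch of the shape it is matching, for any associative and commutative op:

#include <cassert>
#include <cstddef>
#include <functional>
#include <vector>

// The "pyramid" a vector reduction lowers to: at each stage, combine the
// vector with itself shifted down by half of the remaining width.
int reducePyramid(std::vector<int> V, const std::function<int(int, int)> &Op) {
  for (std::size_t Half = V.size() / 2; Half != 0; Half /= 2)
    for (std::size_t i = 0; i != Half; ++i)
      V[i] = Op(V[i], V[i + Half]);   // shuffle <Half, Half+1, ...> + binop
  return V[0];                        // extractelement index 0
}

int main() {
  assert(reducePyramid({1, 2, 3, 4, 5, 6, 7, 8}, std::plus<int>()) == 36);
  return 0;
}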
+ while (SrcVT.getSizeInBits() > 128) { + unsigned NumElts = SrcVT.getVectorNumElements(); + unsigned NumSubElts = NumElts / 2; + SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts); + unsigned SubSizeInBits = SrcVT.getSizeInBits(); + SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits); + SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits); + MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi); + } + assert(SrcVT == MVT::v8i16 && "Unexpected value type"); + + // PHMINPOSUW applies to UMIN(v8i16), for SMIN/SMAX/UMAX we must apply a mask + // to flip the value accordingly. + SDValue Mask; + if (BinOp == ISD::SMAX) + Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT); + else if (BinOp == ISD::SMIN) + Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT); + else if (BinOp == ISD::UMAX) + Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT); + + if (Mask) + MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); + + MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos); + + if (Mask) + MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos); + + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos, + DAG.getIntPtrConstant(0, DL)); +} + // Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK. static SDValue combineHorizontalPredicateResult(SDNode *Extract, SelectionDAG &DAG, @@ -29621,66 +30559,63 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract, return SDValue(); // Check for OR(any_of) and AND(all_of) horizontal reduction patterns. - for (ISD::NodeType Op : {ISD::OR, ISD::AND}) { - SDValue Match = matchBinOpReduction(Extract, Op); - if (!Match) - continue; - - // EXTRACT_VECTOR_ELT can require implicit extension of the vector element - // which we can't support here for now. - if (Match.getScalarValueSizeInBits() != BitWidth) - continue; + unsigned BinOp = 0; + SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND}); + if (!Match) + return SDValue(); - // We require AVX2 for PMOVMSKB for v16i16/v32i8; - unsigned MatchSizeInBits = Match.getValueSizeInBits(); - if (!(MatchSizeInBits == 128 || - (MatchSizeInBits == 256 && - ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) - return SDValue(); + // EXTRACT_VECTOR_ELT can require implicit extension of the vector element + // which we can't support here for now. + if (Match.getScalarValueSizeInBits() != BitWidth) + return SDValue(); - // Don't bother performing this for 2-element vectors. - if (Match.getValueType().getVectorNumElements() <= 2) - return SDValue(); + // We require AVX2 for PMOVMSKB for v16i16/v32i8; + unsigned MatchSizeInBits = Match.getValueSizeInBits(); + if (!(MatchSizeInBits == 128 || + (MatchSizeInBits == 256 && + ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2())))) + return SDValue(); - // Check that we are extracting a reduction of all sign bits. - if (DAG.ComputeNumSignBits(Match) != BitWidth) - return SDValue(); + // Don't bother performing this for 2-element vectors. + if (Match.getValueType().getVectorNumElements() <= 2) + return SDValue(); - // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. 
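combineHorizontalMinMaxResult only has PHMINPOSUW (an unsigned v8i16 minimum) available, so the code above maps SMIN/SMAX/UMAX onto it by XOR-ing with a constant before and after: 0x8000 biases signed order into unsigned order, 0x7FFF additionally reverses it, and 0xFFFF (bitwise not) turns a max into a min. A scalar check of the SMAX case:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Signed max via unsigned min:
//   umin(a ^ 0x7FFF, b ^ 0x7FFF) ^ 0x7FFF == smax(a, b)
int16_t smax_via_umin(int16_t A, int16_t B) {
  uint16_t UA = uint16_t(uint16_t(A) ^ 0x7FFF);
  uint16_t UB = uint16_t(uint16_t(B) ^ 0x7FFF);
  return int16_t(std::min(UA, UB) ^ 0x7FFF);
}

int main() {
  assert(smax_via_umin(-5, 3) == 3);
  assert(smax_via_umin(-32768, 32767) == 32767);
  assert(smax_via_umin(-7, -9) == -7);
  return 0;
}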
- MVT MaskVT; - if (64 == BitWidth || 32 == BitWidth) - MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), - MatchSizeInBits / BitWidth); - else - MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); - - APInt CompareBits; - ISD::CondCode CondCode; - if (Op == ISD::OR) { - // any_of -> MOVMSK != 0 - CompareBits = APInt::getNullValue(32); - CondCode = ISD::CondCode::SETNE; - } else { - // all_of -> MOVMSK == ((1 << NumElts) - 1) - CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); - CondCode = ISD::CondCode::SETEQ; - } + // Check that we are extracting a reduction of all sign bits. + if (DAG.ComputeNumSignBits(Match) != BitWidth) + return SDValue(); - // Perform the select as i32/i64 and then truncate to avoid partial register - // stalls. - unsigned ResWidth = std::max(BitWidth, 32u); - EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); - SDLoc DL(Extract); - SDValue Zero = DAG.getConstant(0, DL, ResVT); - SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); - SDValue Res = DAG.getBitcast(MaskVT, Match); - Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); - Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), - Ones, Zero, CondCode); - return DAG.getSExtOrTrunc(Res, DL, ExtractVT); + // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB. + MVT MaskVT; + if (64 == BitWidth || 32 == BitWidth) + MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth), + MatchSizeInBits / BitWidth); + else + MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8); + + APInt CompareBits; + ISD::CondCode CondCode; + if (BinOp == ISD::OR) { + // any_of -> MOVMSK != 0 + CompareBits = APInt::getNullValue(32); + CondCode = ISD::CondCode::SETNE; + } else { + // all_of -> MOVMSK == ((1 << NumElts) - 1) + CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements()); + CondCode = ISD::CondCode::SETEQ; } - return SDValue(); + // Perform the select as i32/i64 and then truncate to avoid partial register + // stalls. + unsigned ResWidth = std::max(BitWidth, 32u); + EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth); + SDLoc DL(Extract); + SDValue Zero = DAG.getConstant(0, DL, ResVT); + SDValue Ones = DAG.getAllOnesConstant(DL, ResVT); + SDValue Res = DAG.getBitcast(MaskVT, Match); + Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res); + Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32), + Ones, Zero, CondCode); + return DAG.getSExtOrTrunc(Res, DL, ExtractVT); } static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, @@ -29707,7 +30642,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, return SDValue(); // Match shuffle + add pyramid. - SDValue Root = matchBinOpReduction(Extract, ISD::ADD); + unsigned BinOp = 0; + SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD}); // The operand is expected to be zero extended from i8 // (verified in detectZextAbsDiff). @@ -29758,7 +30694,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG, unsigned TypeSizeInBits = Type.getSizeInBits(); // Return the lowest TypeSizeInBits bits. 
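combineHorizontalPredicateResult, restructured above now that matchBinOpReduction reports which opcode it matched, turns an OR/AND reduction of all-sign-bit lanes into a single MOVMSK plus a scalar compare: any_of becomes "mask != 0" and all_of becomes "mask == (1 << NumElts) - 1". In scalar form:

#include <array>
#include <cassert>
#include <cstdint>

// MOVMSKPS-style mask: one bit per 32-bit lane, taken from the lane's sign bit.
uint32_t movmskps(const std::array<uint32_t, 4> &V) {
  uint32_t M = 0;
  for (int i = 0; i != 4; ++i)
    M |= (V[i] >> 31) << i;
  return M;
}

int main() {
  std::array<uint32_t, 4> AnyTrue{0, 0xFFFFFFFFu, 0, 0};
  std::array<uint32_t, 4> AllTrue{0xFFFFFFFFu, 0xFFFFFFFFu, 0xFFFFFFFFu,
                                  0xFFFFFFFFu};
  assert(movmskps(AnyTrue) != 0);               // any_of -> MOVMSK != 0
  assert(movmskps(AllTrue) == (1u << 4) - 1);   // all_of -> MOVMSK == 0b1111
  return 0;
}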
MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits); - SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD); + SAD = DAG.getBitcast(ResVT, SAD); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD, Extract->getOperand(1)); } @@ -29794,7 +30730,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, if ((NumSrcElts % Mask.size()) == 0) { SmallVector<int, 16> ScaledMask; int Scale = NumSrcElts / Mask.size(); - scaleShuffleMask(Scale, Mask, ScaledMask); + scaleShuffleMask<int>(Scale, Mask, ScaledMask); Mask = std::move(ScaledMask); } else if ((Mask.size() % NumSrcElts) == 0) { SmallVector<int, 16> WidenedMask; @@ -29843,9 +30779,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB); SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp, DAG.getIntPtrConstant(SrcIdx, dl)); - SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp, - DAG.getValueType(SrcSVT)); - return DAG.getZExtOrTrunc(Assert, dl, VT); + return DAG.getZExtOrTrunc(ExtOp, dl, VT); } return SDValue(); @@ -29858,10 +30792,17 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG, static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { - if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) + if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) return NewOp; - if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget)) + // TODO - Remove this once we can handle the implicit zero-extension of + // X86ISD::PEXTRW/X86ISD::PEXTRB in: + // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and + // combineBasicSADPattern. + if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT) + return SDValue(); + + if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI)) return NewOp; SDValue InputVector = N->getOperand(0); @@ -29910,6 +30851,10 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget)) return Cmp; + // Attempt to replace min/max v8i16 reductions with PHMINPOSUW. + if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget)) + return MinMax; + // Only operate on vectors of 4 elements, where the alternative shuffling // gets to be more expensive. if (SrcVT != MVT::v4i32) @@ -30008,18 +30953,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG, return SDValue(); } -// TODO - merge with combineExtractVectorElt once it can handle the implicit -// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in: -// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and -// combineBasicSADPattern. -static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const X86Subtarget &Subtarget) { - return combineExtractWithShuffle(N, DAG, DCI, Subtarget); -} - /// If a vector select has an operand that is -1 or 0, try to simplify the /// select to a bitwise logic operation. +/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()? 
static SDValue combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, @@ -30037,10 +30973,10 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, assert(CondVT.isVector() && "Vector select expects a vector selector!"); - bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); + bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); // Check if the first operand is all zeros and Cond type is vXi1. // This situation only applies to avx512. - if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && + if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) { // Invert the cond to not(cond) : xor(op,allones)=not(op) SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond, @@ -30058,7 +30994,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, return SDValue(); bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode()); - FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); + bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode()); // Try to invert the condition if true value is not all 1s and false value is // not all 0s. @@ -30068,7 +31004,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, // Check if SETCC has already been promoted. TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) == CondVT) { - bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode()); bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode()); if (TValIsAllZeros || FValIsAllOnes) { @@ -30084,6 +31019,10 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, } } + // Cond value must be 'sign splat' to be converted to a logical op. + if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits()) + return SDValue(); + // vselect Cond, 111..., 000... -> Cond if (TValIsAllOnes && FValIsAllZeros) return DAG.getBitcast(VT, Cond); @@ -30105,6 +31044,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(VT, And); } + // vselect Cond, 000..., X -> andn Cond, X + if (TValIsAllZeros) { + MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64); + SDValue CastCond = DAG.getBitcast(AndNVT, Cond); + SDValue CastRHS = DAG.getBitcast(AndNVT, RHS); + SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS); + return DAG.getBitcast(VT, AndN); + } + return SDValue(); } @@ -30120,78 +31068,52 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) { return SDValue(); // Don't do this for crazy integer types. - if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType())) + EVT VT = N->getValueType(0); + if (!DAG.getTargetLoweringInfo().isTypeLegal(VT)) return SDValue(); - // If this is efficiently invertible, canonicalize the LHSC/RHSC values - // so that TrueC (the true value) is larger than FalseC. - bool NeedsCondInvert = false; - if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) && - // Efficiently invertible. - (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible. - (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible. - isa<ConstantSDNode>(Cond.getOperand(1))))) { - NeedsCondInvert = true; - std::swap(TrueC, FalseC); - } - - // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0. - if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) { - if (NeedsCondInvert) // Invert the condition if needed. 
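With the new requirement just below that the condition is a "sign splat" (each lane all-ones or all-zeros), the vselect folds are plain lane-wise bit tricks: select(C, -1, 0) is C itself, select(C, X, 0) is AND, and the newly added select(C, 0, X) is ANDN. A scalar model over one lane:

#include <cassert>
#include <cstdint>

// One lane of a vselect where the condition lane is all-ones or all-zeros.
uint32_t vselect_lane(uint32_t C, uint32_t T, uint32_t F) {
  return (C & T) | (~C & F);
}

int main() {
  for (uint32_t C : {0u, 0xFFFFFFFFu}) {
    uint32_t X = 0x12345678u;
    assert(vselect_lane(C, 0xFFFFFFFFu, 0) == C);   // select C, -1, 0 --> C
    assert(vselect_lane(C, X, 0) == (C & X));       // select C, X, 0  --> and
    assert(vselect_lane(C, 0, X) == (~C & X));      // select C, 0, X  --> andn
  }
  return 0;
}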
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); - - // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond); + // We're going to use the condition bit in math or logic ops. We could allow + // this with a wider condition value (post-legalization it becomes an i8), + // but if nothing is creating selects that late, it doesn't matter. + if (Cond.getValueType() != MVT::i1) + return SDValue(); - unsigned ShAmt = TrueC->getAPIntValue().logBase2(); - return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond, - DAG.getConstant(ShAmt, DL, MVT::i8)); - } + // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by + // 3, 5, or 9 with i32/i64, so those get transformed too. + // TODO: For constants that overflow or do not differ by power-of-2 or small + // multiplier, convert to 'and' + 'add'. + const APInt &TrueVal = TrueC->getAPIntValue(); + const APInt &FalseVal = FalseC->getAPIntValue(); + bool OV; + APInt Diff = TrueVal.ssub_ov(FalseVal, OV); + if (OV) + return SDValue(); - // Optimize cases that will turn into an LEA instruction. This requires - // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9). - if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) { - uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue(); - if (N->getValueType(0) == MVT::i32) - Diff = (unsigned)Diff; + APInt AbsDiff = Diff.abs(); + if (AbsDiff.isPowerOf2() || + ((VT == MVT::i32 || VT == MVT::i64) && + (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) { - bool IsFastMultiplier = false; - if (Diff < 10) { - switch ((unsigned char)Diff) { - default: - break; - case 1: // result = add base, cond - case 2: // result = lea base( , cond*2) - case 3: // result = lea base(cond, cond*2) - case 4: // result = lea base( , cond*4) - case 5: // result = lea base(cond, cond*4) - case 8: // result = lea base( , cond*8) - case 9: // result = lea base(cond, cond*8) - IsFastMultiplier = true; - break; - } + // We need a positive multiplier constant for shift/LEA codegen. The 'not' + // of the condition can usually be folded into a compare predicate, but even + // without that, the sequence should be cheaper than a CMOV alternative. + if (TrueVal.slt(FalseVal)) { + Cond = DAG.getNOT(DL, Cond, MVT::i1); + std::swap(TrueC, FalseC); } - if (IsFastMultiplier) { - APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue(); - if (NeedsCondInvert) // Invert the condition if needed. - Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond, - DAG.getConstant(1, DL, Cond.getValueType())); + // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC + SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond); - // Zero extend the condition if needed. - Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond); - // Scale the condition by the difference. - if (Diff != 1) - Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond, - DAG.getConstant(Diff, DL, Cond.getValueType())); + // Multiply condition by the difference if non-one. + if (!AbsDiff.isOneValue()) + R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT)); - // Add the base if non-zero. - if (FalseC->getAPIntValue() != 0) - Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, - SDValue(FalseC, 0)); - return Cond; - } + // Add the base if non-zero. 
+ if (!FalseC->isNullValue()) + R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0)); + + return R; } return SDValue(); @@ -30231,26 +31153,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, unsigned Opcode = Op.getOpcode(); switch (Opcode) { - case X86ISD::PALIGNR: - // PALIGNR can be converted to VALIGND/Q for 128-bit vectors. - if (!VT.is128BitVector()) - return false; - Opcode = X86ISD::VALIGN; - LLVM_FALLTHROUGH; - case X86ISD::VALIGN: { - if (EltVT != MVT::i32 && EltVT != MVT::i64) - return false; - uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); - unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits(); - unsigned EltSize = EltVT.getSizeInBits(); - // Make sure we can represent the same shift with the new VT. - if ((ShiftAmt % EltSize) != 0) - return false; - Imm = ShiftAmt / EltSize; - return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), - DAG.getConstant(Imm, DL, MVT::i8)); - } case X86ISD::SHUF128: { if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64) return false; @@ -30260,50 +31162,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG, return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1), Op.getOperand(2)); } - case ISD::INSERT_SUBVECTOR: { - unsigned EltSize = EltVT.getSizeInBits(); - if (EltSize != 32 && EltSize != 64) - return false; - MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); - // Only change element size, not type. - if (EltVT.isInteger() != OpEltVT.isInteger()) - return false; - uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue(); - Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; - SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0)); - DCI.AddToWorklist(Op0.getNode()); - // Op1 needs to be bitcasted to a smaller vector with the same element type. - SDValue Op1 = Op.getOperand(1); - MVT Op1VT = MVT::getVectorVT(EltVT, - Op1.getSimpleValueType().getSizeInBits() / EltSize); - Op1 = DAG.getBitcast(Op1VT, Op1); - DCI.AddToWorklist(Op1.getNode()); - DCI.CombineTo(OrigOp.getNode(), - DAG.getNode(Opcode, DL, VT, Op0, Op1, - DAG.getIntPtrConstant(Imm, DL))); - return true; - } - case ISD::EXTRACT_SUBVECTOR: { - unsigned EltSize = EltVT.getSizeInBits(); - if (EltSize != 32 && EltSize != 64) - return false; - MVT OpEltVT = Op.getSimpleValueType().getVectorElementType(); - // Only change element size, not type. - if (EltVT.isInteger() != OpEltVT.isInteger()) - return false; - uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue(); - Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize; - // Op0 needs to be bitcasted to a larger vector with the same element type. - SDValue Op0 = Op.getOperand(0); - MVT Op0VT = MVT::getVectorVT(EltVT, - Op0.getSimpleValueType().getSizeInBits() / EltSize); - Op0 = DAG.getBitcast(Op0VT, Op0); - DCI.AddToWorklist(Op0.getNode()); - DCI.CombineTo(OrigOp.getNode(), - DAG.getNode(Opcode, DL, VT, Op0, - DAG.getIntPtrConstant(Imm, DL))); - return true; - } case X86ISD::SUBV_BROADCAST: { unsigned EltSize = EltVT.getSizeInBits(); if (EltSize != 32 && EltSize != 64) @@ -30717,7 +31575,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, /// i.e., reusing the EFLAGS produced by the LOCKed instruction. /// Note that this is only legal for some op/cc combinations. 
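The rewritten combineSelectOfTwoConstants above replaces the old pow2/LEA special cases with the single identity its comment states: select(C, TC, FC) == zext(C) * (TC - FC) + FC, inverting the condition and swapping the constants first when TC < FC so the multiplier stays positive (and shift/LEA friendly). A scalar check, using an illustrative helper name of my own:

#include <cassert>
#include <cstdint>
#include <utility>

// select C, TC, FC  -->  zext(C) * (TC - FC) + FC, with the condition
// inverted and the constants swapped when TC < FC.
int64_t selectConsts(bool C, int64_t TC, int64_t FC) {
  if (TC < FC) {
    C = !C;
    std::swap(TC, FC);
  }
  return int64_t(C) * (TC - FC) + FC;
}

int main() {
  assert(selectConsts(true, 8, 0) == 8);    // the old "zext(C) << 3" case
  assert(selectConsts(false, 8, 0) == 0);
  assert(selectConsts(true, 2, 10) == 2);   // needs the swap + inverted cond
  assert(selectConsts(false, 2, 10) == 10);
  return 0;
}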
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, - SelectionDAG &DAG) { + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { // This combine only operates on CMP-like nodes. if (!(Cmp.getOpcode() == X86ISD::CMP || (Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0)))) @@ -30747,12 +31606,7 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, if (!CmpLHS.hasOneUse()) return SDValue(); - auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS); - if (!CmpRHSC || CmpRHSC->getZExtValue() != 0) - return SDValue(); - - const unsigned Opc = CmpLHS.getOpcode(); - + unsigned Opc = CmpLHS.getOpcode(); if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB) return SDValue(); @@ -30765,6 +31619,44 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, if (Opc == ISD::ATOMIC_LOAD_SUB) Addend = -Addend; + auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS); + if (!CmpRHSC) + return SDValue(); + + APInt Comparison = CmpRHSC->getAPIntValue(); + + // If the addend is the negation of the comparison value, then we can do + // a full comparison by emitting the atomic arithmetic as a locked sub. + if (Comparison == -Addend) { + // The CC is fine, but we need to rewrite the LHS of the comparison as an + // atomic sub. + auto *AN = cast<AtomicSDNode>(CmpLHS.getNode()); + auto AtomicSub = DAG.getAtomic( + ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(), + /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1), + /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()), + AN->getMemOperand()); + // If the comparision uses the CF flag we can't use INC/DEC instructions. + bool NeedCF = false; + switch (CC) { + default: break; + case X86::COND_A: case X86::COND_AE: + case X86::COND_B: case X86::COND_BE: + NeedCF = true; + break; + } + auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF); + DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), + DAG.getUNDEF(CmpLHS.getValueType())); + DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); + return LockOp; + } + + // We can handle comparisons with zero in a number of cases by manipulating + // the CC used. + if (!Comparison.isNullValue()) + return SDValue(); + if (CC == X86::COND_S && Addend == 1) CC = X86::COND_LE; else if (CC == X86::COND_NS && Addend == 1) @@ -30776,7 +31668,7 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC, else return SDValue(); - SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG); + SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0), DAG.getUNDEF(CmpLHS.getValueType())); DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1)); @@ -30983,14 +31875,15 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS) { /// into a simpler EFLAGS value, potentially returning a new \p CC and replacing /// uses of chain values. static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC, - SelectionDAG &DAG) { + SelectionDAG &DAG, + const X86Subtarget &Subtarget) { if (CC == X86::COND_B) if (SDValue Flags = combineCarryThroughADD(EFLAGS)) return Flags; if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC)) return R; - return combineSetCCAtomicArith(EFLAGS, CC, DAG); + return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget); } /// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. 
X86::COND_NE), CONDVAL] @@ -30999,10 +31892,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDLoc DL(N); - // If the flag operand isn't dead, don't touch this CMOV. - if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty()) - return SDValue(); - SDValue FalseOp = N->getOperand(0); SDValue TrueOp = N->getOperand(1); X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2); @@ -31021,11 +31910,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // Try to simplify the EFLAGS and condition code operands. // We can't always do this as FCMOV only supports a subset of X86 cond. - if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) { + if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) { if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) { SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8), Flags}; - return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); + return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } @@ -31054,8 +31943,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, unsigned ShAmt = TrueC->getAPIntValue().logBase2(); Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond, DAG.getConstant(ShAmt, DL, MVT::i8)); - if (N->getNumValues() == 2) // Dead flag value? - return DCI.CombineTo(N, Cond, SDValue()); return Cond; } @@ -31069,9 +31956,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, FalseC->getValueType(0), Cond); Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); - - if (N->getNumValues() == 2) // Dead flag value? - return DCI.CombineTo(N, Cond, SDValue()); return Cond; } @@ -31112,8 +31996,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, if (FalseC->getAPIntValue() != 0) Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond, SDValue(FalseC, 0)); - if (N->getNumValues() == 2) // Dead flag value? - return DCI.CombineTo(N, Cond, SDValue()); return Cond; } } @@ -31153,7 +32035,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) { SDValue Ops[] = { FalseOp, Cond.getOperand(0), DAG.getConstant(CC, DL, MVT::i8), Cond }; - return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops); + return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); } } } @@ -31188,10 +32070,9 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8), Flags}; - SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps); + SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps); SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags}; - SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops); - DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1)); + SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops); return CMOV; } } @@ -31307,7 +32188,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, // pmulld is supported since SSE41. It is better to use pmulld // instead of pmullw+pmulhw, except for subtargets where pmulld is slower than // the expansion. 
- bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize(); + bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize(); if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow())) return SDValue(); @@ -31319,15 +32200,19 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N->getOperand(0).getValueType(); + unsigned NumElts = VT.getVectorNumElements(); + if ((NumElts % 2) != 0) + return SDValue(); + unsigned RegSize = 128; MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16); - EVT ReducedVT = - EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements()); + EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts); + // Shrink the operands of mul. SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0); SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1); - if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) { + if (NumElts >= OpsVT.getVectorNumElements()) { // Generate the lower part of mul: pmullw. For MULU8/MULS8, only the // lower part is needed. SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1); @@ -31335,7 +32220,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND, DL, VT, MulLo); } else { - MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2); + MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2); // Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16, // the higher part is also needed. SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL, @@ -31344,22 +32229,22 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, // Repack the lower part and higher part result of mul into a wider // result. // Generate shuffle functioning as punpcklwd. - SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements()); - for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) { + SmallVector<int, 16> ShuffleMask(NumElts); + for (unsigned i = 0, e = NumElts / 2; i < e; i++) { ShuffleMask[2 * i] = i; - ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements(); + ShuffleMask[2 * i + 1] = i + NumElts; } SDValue ResLo = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo); + ResLo = DAG.getBitcast(ResVT, ResLo); // Generate shuffle functioning as punpckhwd. - for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) { - ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2; - ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2; + for (unsigned i = 0, e = NumElts / 2; i < e; i++) { + ShuffleMask[2 * i] = i + NumElts / 2; + ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2; } SDValue ResHi = DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask); - ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi); + ResHi = DAG.getBitcast(ResVT, ResHi); return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi); } } else { @@ -31405,8 +32290,8 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG, // Repack the lower part and higher part result of mul into a wider // result. Make sure the type of mul result is VT. 
MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32); - SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi); - Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res); + SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi); + Res = DAG.getBitcast(ResVT, Res); return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res, DAG.getIntPtrConstant(0, DL)); } @@ -31496,7 +32381,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG, if (!MulConstantOptimization) return SDValue(); // An imul is usually smaller than the alternative sequence. - if (DAG.getMachineFunction().getFunction()->optForMinSize()) + if (DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer()) @@ -31653,7 +32538,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) { return SDValue(); } -static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) { +static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); EVT VT = N0.getValueType(); @@ -31706,6 +32591,41 @@ static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) { return SDValue(); } +static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + + // Try to improve a sequence of srl (and X, C1), C2 by inverting the order. + // TODO: This is a generic DAG combine that became an x86-only combine to + // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and + // and-not ('andn'). + if (N0.getOpcode() != ISD::AND || !N0.hasOneUse()) + return SDValue(); + + auto *ShiftC = dyn_cast<ConstantSDNode>(N1); + auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1)); + if (!ShiftC || !AndC) + return SDValue(); + + // If we can shrink the constant mask below 8-bits or 32-bits, then this + // transform should reduce code size. It may also enable secondary transforms + // from improved known-bits analysis or instruction selection. + APInt MaskVal = AndC->getAPIntValue(); + APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue()); + unsigned OldMaskSize = MaskVal.getMinSignedBits(); + unsigned NewMaskSize = NewMaskVal.getMinSignedBits(); + if ((OldMaskSize > 8 && NewMaskSize <= 8) || + (OldMaskSize > 32 && NewMaskSize <= 32)) { + // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC) + SDLoc DL(N); + SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT); + SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1); + return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask); + } + return SDValue(); +} + /// \brief Returns a vector of 0s if the node in input is a vector logical /// shift by a constant amount which is known to be bigger than or equal /// to the vector element size in bits. @@ -31745,7 +32665,11 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG, return V; if (N->getOpcode() == ISD::SRA) - if (SDValue V = combineShiftRightAlgebraic(N, DAG)) + if (SDValue V = combineShiftRightArithmetic(N, DAG)) + return V; + + if (N->getOpcode() == ISD::SRL) + if (SDValue V = combineShiftRightLogical(N, DAG)) return V; // Try to fold this logical shift into a zero vector. 
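The new combineShiftRightLogical fold reorders srl (and X, C1), C2 so the mask constant can shrink to an 8-bit or 32-bit immediate. A standalone scalar sketch of the identity and of the code-size effect (illustrative constants only; not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t C1 = 0xFF00u; // AND mask that needs a 32-bit immediate
  const uint32_t C2 = 8;
  for (uint64_t x = 0; x <= 0xFFFFFFFFull; x += 0x10001ull) {
    uint32_t v = static_cast<uint32_t>(x);
    // srl (and X, C1), C2  ==  and (srl X, C2), (C1 >> C2)
    assert(((v & C1) >> C2) == ((v >> C2) & (C1 >> C2)));
  }
  // The shifted mask fits in 8 bits, so it can be encoded as an imm8.
  assert((C1 >> C2) == 0xFFu);
  return 0;
}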
@@ -31756,6 +32680,90 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + unsigned Opcode = N->getOpcode(); + assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) && + "Unexpected shift opcode"); + + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + unsigned DstBitsPerElt = VT.getScalarSizeInBits(); + unsigned SrcBitsPerElt = 2 * DstBitsPerElt; + assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt && + N1.getScalarValueSizeInBits() == SrcBitsPerElt && + "Unexpected PACKSS/PACKUS input type"); + + // Constant Folding. + APInt UndefElts0, UndefElts1; + SmallVector<APInt, 32> EltBits0, EltBits1; + if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) && + (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) && + getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) && + getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) { + unsigned NumLanes = VT.getSizeInBits() / 128; + unsigned NumDstElts = VT.getVectorNumElements(); + unsigned NumSrcElts = NumDstElts / 2; + unsigned NumDstEltsPerLane = NumDstElts / NumLanes; + unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes; + bool IsSigned = (X86ISD::PACKSS == Opcode); + + APInt Undefs(NumDstElts, 0); + SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt)); + for (unsigned Lane = 0; Lane != NumLanes; ++Lane) { + for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) { + unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane; + auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0); + auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0); + + if (UndefElts[SrcIdx]) { + Undefs.setBit(Lane * NumDstEltsPerLane + Elt); + continue; + } + + APInt &Val = EltBits[SrcIdx]; + if (IsSigned) { + // PACKSS: Truncate signed value with signed saturation. + // Source values less than dst minint are saturated to minint. + // Source values greater than dst maxint are saturated to maxint. + if (Val.isSignedIntN(DstBitsPerElt)) + Val = Val.trunc(DstBitsPerElt); + else if (Val.isNegative()) + Val = APInt::getSignedMinValue(DstBitsPerElt); + else + Val = APInt::getSignedMaxValue(DstBitsPerElt); + } else { + // PACKUS: Truncate signed value with unsigned saturation. + // Source values less than zero are saturated to zero. + // Source values greater than dst maxuint are saturated to maxuint. + if (Val.isIntN(DstBitsPerElt)) + Val = Val.trunc(DstBitsPerElt); + else if (Val.isNegative()) + Val = APInt::getNullValue(DstBitsPerElt); + else + Val = APInt::getAllOnesValue(DstBitsPerElt); + } + Bits[Lane * NumDstEltsPerLane + Elt] = Val; + } + } + + return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N)); + } + + // Attempt to combine as shuffle. 
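The PACKSS/PACKUS constant folding added above follows the saturation rules spelled out in its comments: signed saturation clamps into the destination's signed range, while unsigned saturation sends negatives to zero and overflow to the unsigned maximum. A standalone sketch of those per-element rules for the 16-to-8-bit case (helper names are illustrative; not part of the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Signed saturation used by PACKSSWB: clamp into the int8 range.
static int8_t packss16to8(int16_t v) {
  return static_cast<int8_t>(std::min<int16_t>(127, std::max<int16_t>(-128, v)));
}

// Unsigned saturation used by PACKUSWB: negatives to 0, overflow to 255.
static uint8_t packus16to8(int16_t v) {
  return static_cast<uint8_t>(std::min<int16_t>(255, std::max<int16_t>(0, v)));
}

int main() {
  assert(packss16to8(300) == 127 && packss16to8(-300) == -128 && packss16to8(5) == 5);
  assert(packus16to8(300) == 255 && packus16to8(-300) == 0 && packus16to8(5) == 5);
  return 0;
}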
+ SDValue Op(N, 0); + if (SDValue Res = combineX86ShufflesRecursively( + {Op}, 0, Op, {0}, {}, /*Depth*/ 1, + /*HasVarMask*/ false, DAG, DCI, Subtarget)) { + DCI.CombineTo(N, Res); + return SDValue(); + } + + return SDValue(); +} + static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { @@ -31796,15 +32804,24 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG, N0.getOpcode() == X86ISD::VSRAI) return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1); + // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1 + if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI && + N1 == N0.getOperand(1)) { + SDValue N00 = N0.getOperand(0); + unsigned NumSignBits = DAG.ComputeNumSignBits(N00); + if (ShiftVal.ult(NumSignBits)) + return N00; + } + // We can decode 'whole byte' logical bit shifts as shuffles. if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) { SDValue Op(N, 0); - SmallVector<int, 1> NonceMask; // Just a placeholder. - NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, - /*Depth*/ 1, /*HasVarMask*/ false, DAG, - DCI, Subtarget)) - return SDValue(); // This routine will use CombineTo to replace N. + if (SDValue Res = combineX86ShufflesRecursively( + {Op}, 0, Op, {0}, {}, /*Depth*/ 1, + /*HasVarMask*/ false, DAG, DCI, Subtarget)) { + DCI.CombineTo(N, Res); + return SDValue(); + } } // Constant Folding. @@ -31840,11 +32857,13 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG, // Attempt to combine PINSRB/PINSRW patterns to a shuffle. SDValue Op(N, 0); - SmallVector<int, 1> NonceMask; // Just a placeholder. - NonceMask.push_back(0); - combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, - /*Depth*/ 1, /*HasVarMask*/ false, DAG, - DCI, Subtarget); + if (SDValue Res = combineX86ShufflesRecursively( + {Op}, 0, Op, {0}, {}, /*Depth*/ 1, + /*HasVarMask*/ false, DAG, DCI, Subtarget)) { + DCI.CombineTo(N, Res); + return SDValue(); + } + return SDValue(); } @@ -31911,8 +32930,9 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG, SDValue FSetCC = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01, DAG.getConstant(x86cc, DL, MVT::i8)); - return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0), - FSetCC, DAG.getIntPtrConstant(0, DL)); + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, + N->getSimpleValueType(0), FSetCC, + DAG.getIntPtrConstant(0, DL)); } SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL, CMP00.getValueType(), CMP00, CMP01, @@ -32103,8 +33123,7 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, return SDValue(); APInt SplatVal; - if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal, - /*AllowShrink*/false) || + if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || !SplatVal.isMask()) return SDValue(); @@ -32122,9 +33141,137 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG, return DAG.getBitcast(N->getValueType(0), Shift); } +// Get the index node from the lowered DAG of a GEP IR instruction with one +// indexing dimension. 
+static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) { + if (Ld->isIndexed()) + return SDValue(); + + SDValue Base = Ld->getBasePtr(); + + if (Base.getOpcode() != ISD::ADD) + return SDValue(); + + SDValue ShiftedIndex = Base.getOperand(0); + + if (ShiftedIndex.getOpcode() != ISD::SHL) + return SDValue(); + + return ShiftedIndex.getOperand(0); + +} + +static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) { + if (Subtarget.hasBMI2() && VT.isScalarInteger()) { + switch (VT.getSizeInBits()) { + default: return false; + case 64: return Subtarget.is64Bit() ? true : false; + case 32: return true; + } + } + return false; +} + +// This function recognizes cases where X86 bzhi instruction can replace and +// 'and-load' sequence. +// In case of loading integer value from an array of constants which is defined +// as follows: +// +// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF ..., 2^(SIZE-1) - 1} +// +// then applying a bitwise and on the result with another input. +// It's equivalent to performing bzhi (zero high bits) on the input, with the +// same index of the load. +static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + MVT VT = Node->getSimpleValueType(0); + SDLoc dl(Node); + + // Check if subtarget has BZHI instruction for the node's type + if (!hasBZHI(Subtarget, VT)) + return SDValue(); + + // Try matching the pattern for both operands. + for (unsigned i = 0; i < 2; i++) { + SDValue N = Node->getOperand(i); + LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode()); + + // continue if the operand is not a load instruction + if (!Ld) + return SDValue(); + + const Value *MemOp = Ld->getMemOperand()->getValue(); + + if (!MemOp) + return SDValue(); + + if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) { + if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) { + if (GV->isConstant() && GV->hasDefinitiveInitializer()) { + + Constant *Init = GV->getInitializer(); + Type *Ty = Init->getType(); + if (!isa<ConstantDataArray>(Init) || + !Ty->getArrayElementType()->isIntegerTy() || + Ty->getArrayElementType()->getScalarSizeInBits() != + VT.getSizeInBits() || + Ty->getArrayNumElements() > + Ty->getArrayElementType()->getScalarSizeInBits()) + continue; + + // Check if the array's constant elements are suitable to our case. + uint64_t ArrayElementCount = Init->getType()->getArrayNumElements(); + bool ConstantsMatch = true; + for (uint64_t j = 0; j < ArrayElementCount; j++) { + ConstantInt *Elem = + dyn_cast<ConstantInt>(Init->getAggregateElement(j)); + if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) { + ConstantsMatch = false; + break; + } + } + if (!ConstantsMatch) + continue; + + // Do the transformation (For 32-bit type): + // -> (and (load arr[idx]), inp) + // <- (and (srl 0xFFFFFFFF, (sub 32, idx))) + // that will be replaced with one bzhi instruction. + SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0); + SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT); + + // Get the Node which indexes into the array. 
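combineAndLoadToBZHI looks for a load from a constant table whose entry i is 2^i - 1, ANDed with another value; the result keeps only the low "index" bits of the input, which is exactly what BZHI produces for that index. A standalone scalar sketch of the equivalence (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  // The constant table the combine looks for: table[i] == 2^i - 1.
  uint32_t table[32];
  for (uint32_t i = 0; i < 32; ++i)
    table[i] = static_cast<uint32_t>((1ull << i) - 1);

  const uint32_t x = 0xDEADBEEFu;
  for (uint32_t idx = 0; idx < 32; ++idx) {
    // x & table[idx] keeps only the low idx bits of x -- the BZHI result.
    uint32_t expected =
        static_cast<uint32_t>(static_cast<uint64_t>(x) & ((1ull << idx) - 1));
    assert((x & table[idx]) == expected);
  }
  return 0;
}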
+ SDValue Index = getIndexFromUnindexedLoad(Ld); + if (!Index) + return SDValue(); + Index = DAG.getZExtOrTrunc(Index, dl, VT); + + SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index); + + SDValue AllOnes = DAG.getAllOnesConstant(dl, VT); + SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub); + + return DAG.getNode(ISD::AND, dl, VT, Inp, LShr); + } + } + } + } + return SDValue(); +} + static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + EVT VT = N->getValueType(0); + + // If this is SSE1 only convert to FAND to avoid scalarization. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { + return DAG.getBitcast( + MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32, + DAG.getBitcast(MVT::v4f32, N->getOperand(0)), + DAG.getBitcast(MVT::v4f32, N->getOperand(1)))); + } + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -32140,45 +33287,59 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG, if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget)) return ShiftRight; - EVT VT = N->getValueType(0); - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc DL(N); + if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget)) + return R; // Attempt to recursively combine a bitmask AND with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); - SmallVector<int, 1> NonceMask; // Just a placeholder. - NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, - /*Depth*/ 1, /*HasVarMask*/ false, DAG, - DCI, Subtarget)) - return SDValue(); // This routine will use CombineTo to replace N. + if (SDValue Res = combineX86ShufflesRecursively( + {Op}, 0, Op, {0}, {}, /*Depth*/ 1, + /*HasVarMask*/ false, DAG, DCI, Subtarget)) { + DCI.CombineTo(N, Res); + return SDValue(); + } } - // Create BEXTR instructions - // BEXTR is ((X >> imm) & (2**size-1)) - if (VT != MVT::i32 && VT != MVT::i64) - return SDValue(); + // Attempt to combine a scalar bitmask AND with an extracted shuffle. + if ((VT.getScalarSizeInBits() % 8) == 0 && + N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT && + isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) { + SDValue BitMask = N->getOperand(1); + SDValue SrcVec = N->getOperand(0).getOperand(0); + EVT SrcVecVT = SrcVec.getValueType(); - if (!Subtarget.hasBMI() && !Subtarget.hasTBM()) - return SDValue(); - if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL) - return SDValue(); + // Check that the constant bitmask masks whole bytes. + APInt UndefElts; + SmallVector<APInt, 64> EltBits; + if (VT == SrcVecVT.getScalarType() && + N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) && + getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) && + llvm::all_of(EltBits, [](APInt M) { + return M.isNullValue() || M.isAllOnesValue(); + })) { + unsigned NumElts = SrcVecVT.getVectorNumElements(); + unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8; + unsigned Idx = N->getOperand(0).getConstantOperandVal(1); + + // Create a root shuffle mask from the byte mask and the extracted index. + SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef); + for (unsigned i = 0; i != Scale; ++i) { + if (UndefElts[i]) + continue; + int VecIdx = Scale * Idx + i; + ShuffleMask[VecIdx] = + EltBits[i].isNullValue() ? 
SM_SentinelZero : VecIdx; + } - ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1); - ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1)); - if (MaskNode && ShiftNode) { - uint64_t Mask = MaskNode->getZExtValue(); - uint64_t Shift = ShiftNode->getZExtValue(); - if (isMask_64(Mask)) { - uint64_t MaskSize = countPopulation(Mask); - if (Shift + MaskSize <= VT.getSizeInBits()) - return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0), - DAG.getConstant(Shift | (MaskSize << 8), DL, - VT)); + if (SDValue Shuffle = combineX86ShufflesRecursively( + {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2, + /*HasVarMask*/ false, DAG, DCI, Subtarget)) + return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle, + N->getOperand(0).getOperand(1)); } } + return SDValue(); } @@ -32411,6 +33572,18 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, static SDValue combineOr(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // If this is SSE1 only convert to FOR to avoid scalarization. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) { + return DAG.getBitcast(MVT::v4i32, + DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32, + DAG.getBitcast(MVT::v4f32, N0), + DAG.getBitcast(MVT::v4f32, N1))); + } + if (DCI.isBeforeLegalizeOps()) return SDValue(); @@ -32423,15 +33596,11 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget)) return R; - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - EVT VT = N->getValueType(0); - if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64) return SDValue(); // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c) - bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize(); + bool OptForSize = DAG.getMachineFunction().getFunction().optForSize(); // SHLD/SHRD instructions have lower register pressure, but on some // platforms they have higher latency than the equivalent @@ -32521,38 +33690,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Generate NEG and CMOV for integer abs. -static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) { - EVT VT = N->getValueType(0); - - // Since X86 does not have CMOV for 8-bit integer, we don't convert - // 8-bit integer abs to NEG and CMOV. - if (VT.isInteger() && VT.getSizeInBits() == 8) - return SDValue(); - - SDValue N0 = N->getOperand(0); - SDValue N1 = N->getOperand(1); - SDLoc DL(N); - - // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1) - // and change it to SUB and CMOV. - if (VT.isInteger() && N->getOpcode() == ISD::XOR && - N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 && - N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) { - auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1)); - if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) { - // Generate SUB & CMOV. 
- SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32), - DAG.getConstant(0, DL, VT), N0.getOperand(0)); - SDValue Ops[] = {N0.getOperand(0), Neg, - DAG.getConstant(X86::COND_GE, DL, MVT::i8), - SDValue(Neg.getNode(), 1)}; - return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops); - } - } - return SDValue(); -} - /// Try to turn tests against the signbit in the form of: /// XOR(TRUNCATE(SRL(X, size(X)-1)), 1) /// into: @@ -32688,8 +33825,7 @@ static SDValue detectUSatPattern(SDValue In, EVT VT) { "Unexpected types for truncate operation"); APInt C; - if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C, - /*AllowShrink*/false)) { + if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) { // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according // the element size of the destination type. return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) : @@ -33081,6 +34217,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0, DAG.getUNDEF(WideVecVT), ShuffleVec); } + // Prepare the new mask. SDValue NewMask; SDValue Mask = Mld->getMask(); @@ -33103,12 +34240,9 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG, WidenNumElts); unsigned NumConcat = WidenNumElts / MaskNumElts; - SmallVector<SDValue, 16> Ops(NumConcat); SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); + SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal); Ops[0] = Mask; - for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = ZeroVal; - NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } @@ -33154,8 +34288,33 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, if (Mst->isCompressingStore()) return SDValue(); - if (!Mst->isTruncatingStore()) - return reduceMaskedStoreToScalarStore(Mst, DAG); + if (!Mst->isTruncatingStore()) { + if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG)) + return ScalarStore; + + // If the mask is checking (0 > X), we're creating a vector with all-zeros + // or all-ones elements based on the sign bits of X. AVX1 masked store only + // cares about the sign bit of each mask element, so eliminate the compare: + // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X + // Note that by waiting to match an x86-specific PCMPGT node, we're + // eliminating potentially more complex matching of a setcc node which has + // a full range of predicates. + SDValue Mask = Mst->getMask(); + if (Mask.getOpcode() == X86ISD::PCMPGT && + ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) { + assert(Mask.getValueType() == Mask.getOperand(1).getValueType() && + "Unexpected type for PCMPGT"); + return DAG.getMaskedStore( + Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(), + Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand()); + } + + // TODO: AVX512 targets should also be able to simplify something like the + // pattern above, but that pattern will be different. It will either need to + // match setcc more generally or match PCMPGTM later (in tablegen?). + + return SDValue(); + } // Resolve truncating stores. 
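The masked-store change above drops a pcmpgt 0, X used as the mask because a VMASKMOV-style consumer only reads each element's sign bit, and that bit is the same for the compare result as for X itself. A standalone scalar sketch of the sign-bit argument (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const int32_t vals[] = {0, 1, -1, 42, -42, INT32_MIN, INT32_MAX};
  for (int32_t x : vals) {
    // pcmpgt 0, X yields all-ones (-1) for negative lanes and 0 otherwise.
    int32_t cmp = (0 > x) ? -1 : 0;
    // A consumer that only reads the sign bit sees the same bit either way.
    assert((static_cast<uint32_t>(cmp) >> 31) == (static_cast<uint32_t>(x) >> 31));
  }
  return 0;
}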
EVT VT = Mst->getValue().getValueType(); @@ -33226,12 +34385,9 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG, WidenNumElts); unsigned NumConcat = WidenNumElts / MaskNumElts; - SmallVector<SDValue, 16> Ops(NumConcat); SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType()); + SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal); Ops[0] = Mask; - for (unsigned i = 1; i != NumConcat; ++i) - Ops[i] = ZeroVal; - NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops); } @@ -33384,8 +34540,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, if (VT.getSizeInBits() != 64) return SDValue(); - const Function *F = DAG.getMachineFunction().getFunction(); - bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat); + const Function &F = DAG.getMachineFunction().getFunction(); + bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat); bool F64IsLegal = !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2(); if ((VT.isVector() || @@ -33393,28 +34549,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, isa<LoadSDNode>(St->getValue()) && !cast<LoadSDNode>(St->getValue())->isVolatile() && St->getChain().hasOneUse() && !St->isVolatile()) { - SDNode* LdVal = St->getValue().getNode(); - LoadSDNode *Ld = nullptr; - int TokenFactorIndex = -1; + LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode()); SmallVector<SDValue, 8> Ops; - SDNode* ChainVal = St->getChain().getNode(); - // Must be a store of a load. We currently handle two cases: the load - // is a direct child, and it's under an intervening TokenFactor. It is - // possible to dig deeper under nested TokenFactors. - if (ChainVal == LdVal) - Ld = cast<LoadSDNode>(St->getChain()); - else if (St->getValue().hasOneUse() && - ChainVal->getOpcode() == ISD::TokenFactor) { - for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) { - if (ChainVal->getOperand(i).getNode() == LdVal) { - TokenFactorIndex = i; - Ld = cast<LoadSDNode>(St->getValue()); - } else - Ops.push_back(ChainVal->getOperand(i)); - } - } - if (!Ld || !ISD::isNormalLoad(Ld)) + if (!ISD::isNormalLoad(Ld)) return SDValue(); // If this is not the MMX case, i.e. we are just turning i64 load/store @@ -33431,17 +34569,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, if (Subtarget.is64Bit() || F64IsLegal) { MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64; SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(), - Ld->getPointerInfo(), Ld->getAlignment(), - Ld->getMemOperand()->getFlags()); + Ld->getMemOperand()); + // Make sure new load is placed in same chain order. - SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd); - if (TokenFactorIndex >= 0) { - Ops.push_back(NewChain); - NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); - } - return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(), - St->getPointerInfo(), St->getAlignment(), - St->getMemOperand()->getFlags()); + DAG.makeEquivalentMemoryOrdering(Ld, NewLd); + return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(), + St->getMemOperand()); } // Otherwise, lower to two pairs of 32-bit loads / stores. @@ -33456,23 +34589,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG, MinAlign(Ld->getAlignment(), 4), Ld->getMemOperand()->getFlags()); // Make sure new loads are placed in same chain order. 
- SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd); - NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd); - - if (TokenFactorIndex >= 0) { - Ops.push_back(NewChain); - NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops); - } + DAG.makeEquivalentMemoryOrdering(Ld, LoLd); + DAG.makeEquivalentMemoryOrdering(Ld, HiLd); LoAddr = St->getBasePtr(); HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL); SDValue LoSt = - DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(), + DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(), St->getAlignment(), St->getMemOperand()->getFlags()); - SDValue HiSt = DAG.getStore( - NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4), - MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags()); + SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr, + St->getPointerInfo().getWithOffset(4), + MinAlign(St->getAlignment(), 4), + St->getMemOperand()->getFlags()); return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt); } @@ -33726,6 +34855,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1)); LLVM_FALLTHROUGH; case ISD::ADD: { + // TODO: ISD::SUB should be here but interferes with combineSubToSubus. SDValue Op0 = Src.getOperand(0); SDValue Op1 = Src.getOperand(1); if (TLI.isOperationLegal(Opcode, VT) && @@ -33882,8 +35012,9 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// This function transforms vector truncation of 'all or none' bits values. -/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations. +/// This function transforms vector truncation of 'extended sign-bits' or +/// 'extended zero-bits' values. +/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS/PACKUS operations. static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { @@ -33904,12 +35035,6 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL, MVT InVT = In.getValueType().getSimpleVT(); MVT InSVT = InVT.getScalarType(); - // Use PACKSS if the input is a splatted sign bit. - // e.g. Comparison result, sext_in_reg, etc. - unsigned NumSignBits = DAG.ComputeNumSignBits(In); - if (NumSignBits != InSVT.getSizeInBits()) - return SDValue(); - // Check we have a truncation suited for PACKSS. if (!VT.is128BitVector() && !VT.is256BitVector()) return SDValue(); @@ -33918,7 +35043,23 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL, if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64) return SDValue(); - return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget); + // Use PACKSS if the input has sign-bits that extend all the way to the + // packed/truncated value. e.g. Comparison result, sext_in_reg, etc. + unsigned NumSignBits = DAG.ComputeNumSignBits(In); + unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16); + if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits)) + return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget); + + // Use PACKUS if the input has zero-bits that extend all the way to the + // packed/truncated value. e.g. masks, zext_in_reg, etc. + KnownBits Known; + DAG.computeKnownBits(In, Known); + unsigned NumLeadingZeroBits = Known.countMinLeadingZeros(); + NumPackedBits = Subtarget.hasSSE41() ? 
NumPackedBits : 8; + if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits)) + return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget); + + return SDValue(); } static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, @@ -33947,7 +35088,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG, return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc); } - // Try to truncate extended sign bits with PACKSS. + // Try to truncate extended sign/zero bits with PACKSS/PACKUS. if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget)) return V; @@ -34038,10 +35179,10 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG, unsigned NewOpcode = 0; if (Arg.hasOneUse()) { switch (Arg.getOpcode()) { - case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break; + case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break; - case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break; + case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break; case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break; case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break; case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break; @@ -34083,22 +35224,47 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG, return SDValue(); } + +/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val) +static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) { + if (N->getOpcode() != ISD::XOR) + return SDValue(); + + SDValue LHS = N->getOperand(0); + auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1)); + if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC) + return SDValue(); + + X86::CondCode NewCC = X86::GetOppositeBranchCondition( + X86::CondCode(LHS->getConstantOperandVal(0))); + SDLoc DL(N); + return getSETCC(NewCC, LHS->getOperand(1), DL, DAG); +} + static SDValue combineXor(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { + // If this is SSE1 only convert to FXOR to avoid scalarization. + if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && + N->getValueType(0) == MVT::v4i32) { + return DAG.getBitcast( + MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32, + DAG.getBitcast(MVT::v4f32, N->getOperand(0)), + DAG.getBitcast(MVT::v4f32, N->getOperand(1)))); + } + if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget)) return Cmp; if (DCI.isBeforeLegalizeOps()) return SDValue(); + if (SDValue SetCC = foldXor1SetCC(N, DAG)) + return SetCC; + if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG)) return RV; - if (Subtarget.hasCMov()) - if (SDValue RV = combineIntegerAbs(N, DAG)) - return RV; - if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget)) return FPLogic; @@ -34138,10 +35304,13 @@ static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG, // Vector types are handled in combineANDXORWithAllOnesIntoANDNP(). 
if (!((VT == MVT::f32 && Subtarget.hasSSE1()) || - (VT == MVT::f64 && Subtarget.hasSSE2()))) + (VT == MVT::f64 && Subtarget.hasSSE2()) || + (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2()))) return SDValue(); auto isAllOnesConstantFP = [](SDValue V) { + if (V.getSimpleValueType().isVector()) + return ISD::isBuildVectorAllOnes(V.getNode()); auto *C = dyn_cast<ConstantFPSDNode>(V); return C && C->getConstantFPValue()->isAllOnesValue(); }; @@ -34247,7 +35416,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, // This takes at least 3 instructions, so favor a library call when operating // on a scalar and minimizing code size. - if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize()) + if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize()) return SDValue(); SDValue Op0 = N->getOperand(0); @@ -34301,12 +35470,12 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, // Attempt to recursively combine a bitmask ANDNP with shuffles. if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) { SDValue Op(N, 0); - SmallVector<int, 1> NonceMask; // Just a placeholder. - NonceMask.push_back(0); - if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {}, - /*Depth*/ 1, /*HasVarMask*/ false, DAG, - DCI, Subtarget)) - return SDValue(); // This routine will use CombineTo to replace N. + if (SDValue Res = combineX86ShufflesRecursively( + {Op}, 0, Op, {0}, {}, /*Depth*/ 1, + /*HasVarMask*/ false, DAG, DCI, Subtarget)) { + DCI.CombineTo(N, Res); + return SDValue(); + } } return SDValue(); @@ -34314,19 +35483,15 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, static SDValue combineBT(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + // BT ignores high bits in the bit index operand. - SDValue Op1 = N->getOperand(1); - if (Op1.hasOneUse()) { - unsigned BitWidth = Op1.getValueSizeInBits(); - APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); - KnownBits Known; - TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), - !DCI.isBeforeLegalizeOps()); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); - if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) || - TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO)) - DCI.CommitTargetLoweringOpt(TLO); - } + unsigned BitWidth = N1.getValueSizeInBits(); + APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth)); + if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask)) + return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1); + return SDValue(); } @@ -34444,18 +35609,152 @@ static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) { EVT VT = N->getValueType(0); EVT InVT = N0.getValueType(); - if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32) + if (N0.getResNo() != 1 || InVT != MVT::i8 || + !(VT == MVT::i32 || VT == MVT::i64)) return SDValue(); - SDVTList NodeTys = DAG.getVTList(MVT::i8, VT); + SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32); auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG : X86ISD::UDIVREM8_ZEXT_HREG; SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0), N0.getOperand(1)); DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0)); + // If this was a 64-bit extend, complete it. 
+ if (VT == MVT::i64) + return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1)); return R.getValue(1); } +// If we face {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant +// operands and the result of CMOV is not used anywhere else - promote CMOV +// itself instead of promoting its result. This could be beneficial, because: +// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two +// (or more) pseudo-CMOVs only when they go one-after-another and +// getting rid of result extension code after CMOV will help that. +// 2) Promotion of constant CMOV arguments is free, hence the +// {ANY,SIGN,ZERO}_EXTEND will just be deleted. +// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3-byte, so this +// promotion is also good in terms of code-size. +// (64-bit CMOV is 4-bytes, that's why we don't do 32-bit => 64-bit +// promotion). +static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) { + SDValue CMovN = Extend->getOperand(0); + if (CMovN.getOpcode() != X86ISD::CMOV) + return SDValue(); + + EVT TargetVT = Extend->getValueType(0); + unsigned ExtendOpcode = Extend->getOpcode(); + SDLoc DL(Extend); + + EVT VT = CMovN.getValueType(); + SDValue CMovOp0 = CMovN.getOperand(0); + SDValue CMovOp1 = CMovN.getOperand(1); + + bool DoPromoteCMOV = + (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) && + CMovN.hasOneUse() && + (isa<ConstantSDNode>(CMovOp0.getNode()) && + isa<ConstantSDNode>(CMovOp1.getNode())); + + if (!DoPromoteCMOV) + return SDValue(); + + CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0); + CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1); + + return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1, + CMovN.getOperand(2), CMovN.getOperand(3)); +} + +// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)). +// This is more or less the reverse of combineBitcastvxi1. +static SDValue +combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + unsigned Opcode = N->getOpcode(); + if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND && + Opcode != ISD::ANY_EXTEND) + return SDValue(); + if (!DCI.isBeforeLegalizeOps()) + return SDValue(); + if (!Subtarget.hasSSE2() || Subtarget.hasAVX512()) + return SDValue(); + + SDValue N0 = N->getOperand(0); + EVT VT = N->getValueType(0); + EVT SVT = VT.getScalarType(); + EVT InSVT = N0.getValueType().getScalarType(); + unsigned EltSizeInBits = SVT.getSizeInBits(); + + // Input type must be extending a bool vector (bit-casted from a scalar + // integer) to legal integer types. + if (!VT.isVector()) + return SDValue(); + if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8) + return SDValue(); + if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST) + return SDValue(); + + SDValue N00 = N0.getOperand(0); + EVT SclVT = N0.getOperand(0).getValueType(); + if (!SclVT.isScalarInteger()) + return SDValue(); + + SDLoc DL(N); + SDValue Vec; + SmallVector<int, 32> ShuffleMask; + unsigned NumElts = VT.getVectorNumElements(); + assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size"); + + // Broadcast the scalar integer to the vector elements. + if (NumElts > EltSizeInBits) { + // If the scalar integer is greater than the vector element size, then we + // must split it down into sub-sections for broadcasting. For example: + // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections. + // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections. 
+ assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale"); + unsigned Scale = NumElts / EltSizeInBits; + EVT BroadcastVT = + EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00); + Vec = DAG.getBitcast(VT, Vec); + + for (unsigned i = 0; i != Scale; ++i) + ShuffleMask.append(EltSizeInBits, i); + } else { + // For smaller scalar integers, we can simply any-extend it to the vector + // element size (we don't care about the upper bits) and broadcast it to all + // elements. + SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT); + Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl); + ShuffleMask.append(NumElts, 0); + } + Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask); + + // Now, mask the relevant bit in each element. + SmallVector<SDValue, 32> Bits; + for (unsigned i = 0; i != NumElts; ++i) { + int BitIdx = (i % EltSizeInBits); + APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1); + Bits.push_back(DAG.getConstant(Bit, DL, SVT)); + } + SDValue BitMask = DAG.getBuildVector(VT, DL, Bits); + Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask); + + // Compare against the bitmask and extend the result. + EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts); + Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ); + Vec = DAG.getSExtOrTrunc(Vec, DL, VT); + + // For SEXT, this is now done, otherwise shift the result down for + // zero-extension. + if (Opcode == ISD::SIGN_EXTEND) + return Vec; + return DAG.getNode(ISD::SRL, DL, VT, Vec, + DAG.getConstant(EltSizeInBits - 1, DL, VT)); +} + /// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or /// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating /// with UNDEFs) of the input to vectors of the same size as the target type @@ -34570,6 +35869,9 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (SDValue DivRem8 = getDivRem8(N, DAG)) return DivRem8; + if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) + return NewCMov; + if (!DCI.isBeforeLegalizeOps()) { if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); @@ -34592,6 +35894,9 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; + if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) + return V; + if (Subtarget.hasAVX() && VT.is256BitVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; @@ -34604,6 +35909,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode. SDLoc dl(N); EVT VT = N->getValueType(0); @@ -34629,48 +35935,112 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG, // Do not convert the passthru input of scalar intrinsics. // FIXME: We could allow negations of the lower element only. 
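combineToExtendBoolVectorInReg, added above, expands an extension of a bit-cast i1 vector by broadcasting the scalar mask, ANDing each lane with its own bit, comparing against that bit, and (for zero extension) shifting the all-ones result down. A standalone scalar model of one 8-lane step (not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint8_t mask = 0xB2; // i8 bit mask standing in for a bitcast v8i1
  for (unsigned i = 0; i < 8; ++i) {
    uint8_t lane = mask;                              // broadcast to each lane
    uint8_t bit = static_cast<uint8_t>(1u << i);      // per-lane bit constant
    uint8_t sext = (static_cast<uint8_t>(lane & bit) == bit) ? 0xFF : 0x00;
    uint8_t zext = static_cast<uint8_t>(sext >> 7);   // SRL for zero extension
    assert(sext == (((mask >> i) & 1) ? 0xFF : 0x00));
    assert(zext == ((mask >> i) & 1));
  }
  return 0;
}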
- bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A); + bool NegA = N->getOpcode() != X86ISD::FMADDS1 && + N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A); bool NegB = invertIfNegative(B); - bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C); + bool NegC = N->getOpcode() != X86ISD::FMADDS3 && + N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C); // Negative multiplication when NegA xor NegB bool NegMul = (NegA != NegB); + bool HasNeg = NegA || NegB || NegC; unsigned NewOpcode; if (!NegMul) - NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB; + NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB); else NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB; + // For FMA, we risk reconstructing the node we started with. + // In order to avoid this, we check for negation or opcode change. If + // one of the two happened, then it is a new node and we return it. + if (N->getOpcode() == ISD::FMA) { + if (HasNeg || NewOpcode != N->getOpcode()) + return DAG.getNode(NewOpcode, dl, VT, A, B, C); + return SDValue(); + } if (N->getOpcode() == X86ISD::FMADD_RND) { switch (NewOpcode) { - case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break; + case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break; case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break; } + } else if (N->getOpcode() == X86ISD::FMADDS1) { + switch (NewOpcode) { + case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break; + case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break; + } + } else if (N->getOpcode() == X86ISD::FMADDS3) { + switch (NewOpcode) { + case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break; + case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break; + } } else if (N->getOpcode() == X86ISD::FMADDS1_RND) { switch (NewOpcode) { - case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break; + case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break; case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break; } } else if (N->getOpcode() == X86ISD::FMADDS3_RND) { switch (NewOpcode) { - case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break; + case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break; case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break; case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break; case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break; } + } else if (N->getOpcode() == X86ISD::FMADD4S) { + switch (NewOpcode) { + case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break; + case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break; + case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break; + case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break; + } } else { - assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) && - "Unexpected opcode!"); + llvm_unreachable("Unexpected opcode!"); + } + + // Only return the node is the opcode was changed or one of the + // operand was negated. If not, we'll just recreate the same node. 
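The opcode remapping in combineFMA follows the usual FMA sign algebra: negating the addend gives FMSUB, negating one multiplicand gives FNMADD, and negating both gives FNMSUB. A standalone check of those identities, with values chosen so the double-precision arithmetic is exact (not part of the patch):

#include <cassert>
#include <cmath>

int main() {
  const double a = 1.5, b = -2.25, c = 0.75;
  // Negating operands of FMA maps onto the dedicated variants:
  assert(std::fma(a, b, -c) == a * b - c);      // FMSUB
  assert(std::fma(-a, b, c) == -(a * b) + c);   // FNMADD
  assert(std::fma(-a, b, -c) == -(a * b) - c);  // FNMSUB
  return 0;
}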
+ if (HasNeg || NewOpcode != N->getOpcode()) { + if (N->getNumOperands() == 4) + return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); return DAG.getNode(NewOpcode, dl, VT, A, B, C); } - return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3)); + return SDValue(); +} + +// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C) +static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDLoc dl(N); + EVT VT = N->getValueType(0); + + SDValue NegVal = isFNEG(N->getOperand(2).getNode()); + if (!NegVal) + return SDValue(); + + unsigned NewOpcode; + switch (N->getOpcode()) { + default: llvm_unreachable("Unexpected opcode!"); + case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break; + case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break; + case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break; + case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break; + } + + if (N->getNumOperands() == 4) + return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), + NegVal, N->getOperand(3)); + return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1), + NegVal); } static SDValue combineZext(SDNode *N, SelectionDAG &DAG, @@ -34710,9 +36080,15 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG, } } + if (SDValue NewCMov = combineToExtendCMOV(N, DAG)) + return NewCMov; + if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget)) return V; + if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget)) + return V; + if (VT.is256BitVector()) if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget)) return R; @@ -34804,23 +36180,19 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, return V; } - if (VT.getScalarType() == MVT::i1 && + if (VT.isVector() && VT.getVectorElementType() == MVT::i1 && (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) { - bool IsSEXT0 = - (LHS.getOpcode() == ISD::SIGN_EXTEND) && - (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); - bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); - - if (!IsSEXT0 || !IsVZero1) { - // Swap the operands and update the condition code. + // Put build_vectors on the right. + if (LHS.getOpcode() == ISD::BUILD_VECTOR) { std::swap(LHS, RHS); CC = ISD::getSetCCSwappedOperands(CC); - - IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) && - (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1); - IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); } + bool IsSEXT0 = + (LHS.getOpcode() == ISD::SIGN_EXTEND) && + (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1); + bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode()); + if (IsSEXT0 && IsVZero1) { assert(VT == LHS.getOperand(0).getValueType() && "Uexpected operand type"); @@ -34846,17 +36218,92 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG, return SDValue(); } -static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) { +static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI) { + SDValue Src = N->getOperand(0); + MVT SrcVT = Src.getSimpleValueType(); + + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + + // MOVMSK only uses the MSB from each vector element. 
+ KnownBits Known; + APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits())); + if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) { + DCI.AddToWorklist(Src.getNode()); + DCI.CommitTargetLoweringOpt(TLO); + return SDValue(N, 0); + } + + return SDValue(); +} + +static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { SDLoc DL(N); + + // Pre-shrink oversized index elements to avoid triggering scalarization. + if (DCI.isBeforeLegalize()) { + SDValue Index = N->getOperand(4); + if (Index.getScalarValueSizeInBits() > 64) { + EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64, + Index.getValueType().getVectorNumElements()); + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index); + SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); + NewOps[4] = Trunc; + DAG.UpdateNodeOperands(N, NewOps); + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + } + + // Try to remove sign extends from i32 to i64 on the index. + // Only do this before legalize in case we are relying on it for + // legalization. + // TODO: We should maybe remove any sign extend once we learn how to sign + // extend narrow index during lowering. + if (DCI.isBeforeLegalizeOps()) { + SDValue Index = N->getOperand(4); + if (Index.getScalarValueSizeInBits() == 64 && + Index.getOpcode() == ISD::SIGN_EXTEND && + Index.getOperand(0).getScalarValueSizeInBits() == 32) { + SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); + NewOps[4] = Index.getOperand(0); + DAG.UpdateNodeOperands(N, NewOps); + // The original sign extend has less users, add back to worklist in case + // it needs to be removed. + DCI.AddToWorklist(Index.getNode()); + DCI.AddToWorklist(N); + return SDValue(N, 0); + } + } + // Gather and Scatter instructions use k-registers for masks. The type of // the masks is v*i1. So the mask will be truncated anyway. // The SIGN_EXTEND_INREG my be dropped. SDValue Mask = N->getOperand(2); - if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { + if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) { SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end()); NewOps[2] = Mask.getOperand(0); DAG.UpdateNodeOperands(N, NewOps); } + + // With AVX2 we only demand the upper bit of the mask. + if (!Subtarget.hasAVX512()) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(), + !DCI.isBeforeLegalizeOps()); + KnownBits Known; + APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits())); + if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) { + DCI.AddToWorklist(Mask.getNode()); + DCI.CommitTargetLoweringOpt(TLO); + return SDValue(N, 0); + } + } + return SDValue(); } @@ -34868,7 +36315,7 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG, SDValue EFLAGS = N->getOperand(1); // Try to simplify the EFLAGS and condition code operands. - if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) + if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) return getSETCC(CC, Flags, DL, DAG); return SDValue(); @@ -34884,7 +36331,7 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG, // Try to simplify the EFLAGS and condition code operands. // Make sure to not keep references to operands, as combineSetCCEFLAGS can // RAUW them under us. 
- if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) { + if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) { SDValue Cond = DAG.getConstant(CC, DL, MVT::i8); return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0), N->getOperand(1), Cond, Flags); @@ -34945,7 +36392,6 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, EVT VT = N->getValueType(0); EVT InVT = Op0.getValueType(); EVT InSVT = InVT.getScalarType(); - const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32)) // UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32)) @@ -34955,9 +36401,7 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG, InVT.getVectorNumElements()); SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0); - if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT)) - return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P); - + // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP. return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P); } @@ -35049,7 +36493,7 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) { // Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS static SDValue combineADC(SDNode *N, SelectionDAG &DAG, - X86TargetLowering::DAGCombinerInfo &DCI) { + TargetLowering::DAGCombinerInfo &DCI) { // If the LHS and RHS of the ADC node are zero, then it can't overflow and // the result is either zero or one (depending on the input carry bit). // Strength reduce this down to a "set on carry" aka SETCC_CARRY&1. @@ -35260,6 +36704,9 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) { static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + if (!Subtarget.hasSSE2()) + return SDValue(); + SDValue MulOp = N->getOperand(0); SDValue Phi = N->getOperand(1); @@ -35305,6 +36752,9 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG, static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { + if (!Subtarget.hasSSE2()) + return SDValue(); + SDLoc DL(N); EVT VT = N->getValueType(0); SDValue Op0 = N->getOperand(0); @@ -35362,16 +36812,13 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG, Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad); if (VT.getSizeInBits() > ResVT.getSizeInBits()) { - // Update part of elements of the reduction vector. This is done by first - // extracting a sub-vector from it, updating this sub-vector, and inserting - // it back. - SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi, - DAG.getIntPtrConstant(0, DL)); - SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi); - return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res, - DAG.getIntPtrConstant(0, DL)); - } else - return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); + // Fill the upper elements with zero to match the add width. 
+ SDValue Zero = DAG.getConstant(0, DL, VT); + Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad, + DAG.getIntPtrConstant(0, DL)); + } + + return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi); } /// Convert vector increment or decrement to sub/add with an all-ones constant: @@ -35392,7 +36839,7 @@ static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) { SDNode *N1 = N->getOperand(1).getNode(); APInt SplatVal; - if (!ISD::isConstantSplatVector(N1, SplatVal, /*AllowShrink*/false) || + if (!ISD::isConstantSplatVector(N1, SplatVal) || !SplatVal.isOneValue()) return SDValue(); @@ -35426,6 +36873,89 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG, return combineAddOrSubToADCOrSBB(N, DAG); } +static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { + SDValue Op0 = N->getOperand(0); + SDValue Op1 = N->getOperand(1); + EVT VT = N->getValueType(0); + + // PSUBUS is supported, starting from SSE2, but special preprocessing + // for v8i32 requires umin, which appears in SSE41. + if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) && + !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) && + !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) && + !(Subtarget.hasAVX512() && Subtarget.hasBWI() && + (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 || + VT == MVT::v8i64))) + return SDValue(); + + SDValue SubusLHS, SubusRHS; + // Try to find umax(a,b) - b or a - umin(a,b) patterns + // they may be converted to subus(a,b). + // TODO: Need to add IR cannonicialization for this code. + if (Op0.getOpcode() == ISD::UMAX) { + SubusRHS = Op1; + SDValue MaxLHS = Op0.getOperand(0); + SDValue MaxRHS = Op0.getOperand(1); + if (MaxLHS == Op1) + SubusLHS = MaxRHS; + else if (MaxRHS == Op1) + SubusLHS = MaxLHS; + else + return SDValue(); + } else if (Op1.getOpcode() == ISD::UMIN) { + SubusLHS = Op0; + SDValue MinLHS = Op1.getOperand(0); + SDValue MinRHS = Op1.getOperand(1); + if (MinLHS == Op0) + SubusRHS = MinRHS; + else if (MinRHS == Op0) + SubusRHS = MinLHS; + else + return SDValue(); + } else + return SDValue(); + + // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with + // special preprocessing in some cases. + if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64) + return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS); + + // Special preprocessing case can be only applied + // if the value was zero extended from 16 bit, + // so we require first 16 bits to be zeros for 32 bit + // values, or first 48 bits for 64 bit values. + KnownBits Known; + DAG.computeKnownBits(SubusLHS, Known); + unsigned NumZeros = Known.countMinLeadingZeros(); + if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16) + return SDValue(); + + EVT ExtType = SubusLHS.getValueType(); + EVT ShrinkedType; + if (VT == MVT::v8i32 || VT == MVT::v8i64) + ShrinkedType = MVT::v8i16; + else + ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16; + + // If SubusLHS is zeroextended - truncate SubusRHS to it's + // size SubusRHS = umin(0xFFF.., SubusRHS). 
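combineSubToSubus recognizes umax(a, b) - b and a - umin(a, b), both of which equal unsigned saturating subtraction, the per-element semantics of PSUBUS. A standalone scalar sketch of the equivalence (the usubsat helper is illustrative; not part of the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Unsigned saturating subtraction: the per-element semantics of PSUBUS.
static uint16_t usubsat(uint16_t a, uint16_t b) { return a > b ? a - b : 0; }

int main() {
  const uint16_t samples[] = {0, 1, 7, 255, 256, 65535};
  for (uint16_t a : samples)
    for (uint16_t b : samples) {
      // Both patterns recognized by the combine equal subus(a, b).
      assert(static_cast<uint16_t>(std::max(a, b) - b) == usubsat(a, b));
      assert(static_cast<uint16_t>(a - std::min(a, b)) == usubsat(a, b));
    }
  return 0;
}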
+ SDValue SaturationConst = + DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(), + ShrinkedType.getScalarSizeInBits()), + SDLoc(SubusLHS), ExtType); + SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS, + SaturationConst); + SDValue NewSubusLHS = + DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType); + SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType); + SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType, + NewSubusLHS, NewSubusRHS); + // Zero extend the result, it may be used somewhere as 32 bit, + // if not zext and following trunc will shrink. + return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType); +} + static SDValue combineSub(SDNode *N, SelectionDAG &DAG, const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); @@ -35459,6 +36989,10 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG, if (SDValue V = combineIncDecVector(N, DAG)) return V; + // Try to create PSUBUS if SUB's argument is max/min + if (SDValue V = combineSubToSubus(N, DAG, Subtarget)) + return V; + return combineAddOrSubToADCOrSBB(N, DAG); } @@ -35554,39 +37088,26 @@ static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG, return SDValue(); } -/// Canonicalize (LSUB p, 1) -> (LADD p, -1). -static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG, - const X86Subtarget &Subtarget) { - SDValue Chain = N->getOperand(0); - SDValue LHS = N->getOperand(1); - SDValue RHS = N->getOperand(2); - MVT VT = RHS.getSimpleValueType(); - SDLoc DL(N); - - auto *C = dyn_cast<ConstantSDNode>(RHS); - if (!C || C->getZExtValue() != 1) - return SDValue(); - - RHS = DAG.getConstant(-1, DL, VT); - MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand(); - return DAG.getMemIntrinsicNode(X86ISD::LADD, DL, - DAG.getVTList(MVT::i32, MVT::Other), - {Chain, LHS, RHS}, VT, MMO); -} - -// TEST (AND a, b) ,(AND a, b) -> TEST a, b -static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) { +static SDValue combineTestM(SDNode *N, SelectionDAG &DAG, + const X86Subtarget &Subtarget) { SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); - if (Op0 != Op1 || Op1->getOpcode() != ISD::AND) - return SDValue(); - - EVT VT = N->getValueType(0); + MVT VT = N->getSimpleValueType(0); SDLoc DL(N); - return DAG.getNode(X86ISD::TESTM, DL, VT, - Op0->getOperand(0), Op0->getOperand(1)); + // TEST (AND a, b) ,(AND a, b) -> TEST a, b + if (Op0 == Op1 && Op1->getOpcode() == ISD::AND) + return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0), + Op0->getOperand(1)); + + // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero) + // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero) + if (ISD::isBuildVectorAllZeros(Op0.getNode()) || + ISD::isBuildVectorAllZeros(Op1.getNode())) + return getZeroVector(VT, Subtarget, DAG, DL); + + return SDValue(); } static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG, @@ -35610,21 +37131,55 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, if (DCI.isBeforeLegalizeOps()) return SDValue(); + MVT OpVT = N->getSimpleValueType(0); + + // Early out for mask vectors. 
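A further sketch (again standalone and only illustrative) of the zero-extension preprocessing in combineSubToSubus above: when the high bits of the LHS are known to be zero, clamping the RHS with umin to the narrow type's maximum and doing the saturating subtract in the narrow type matches the wide saturating subtract, which is what lets the v8i32/v16i32/v8i64 cases still use a 16-bit (or 8-bit) PSUBUS.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    static uint32_t subus32(uint32_t a, uint32_t b) { return a > b ? a - b : 0; }
    static uint16_t subus16(uint16_t a, uint16_t b) {
      return a > b ? uint16_t(a - b) : uint16_t(0);
    }

    int main() {
      for (uint32_t a = 0; a <= 0xFFFF; a += 251) {          // LHS fits in 16 bits
        for (uint64_t b = 0; b <= 0xFFFFFFFF; b += 65521) {  // arbitrary 32-bit RHS
          uint32_t rhs = uint32_t(b);
          uint16_t clamped = uint16_t(std::min<uint32_t>(rhs, 0xFFFF)); // umin + trunc
          assert(subus32(a, rhs) == uint32_t(subus16(uint16_t(a), clamped)));
        }
      }
    }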
+ if (OpVT.getVectorElementType() == MVT::i1) + return SDValue(); + SDLoc dl(N); SDValue Vec = N->getOperand(0); SDValue SubVec = N->getOperand(1); - SDValue Idx = N->getOperand(2); - unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue(); - MVT OpVT = N->getSimpleValueType(0); + unsigned IdxVal = N->getConstantOperandVal(2); MVT SubVecVT = SubVec.getSimpleValueType(); + if (ISD::isBuildVectorAllZeros(Vec.getNode())) { + // Inserting zeros into zeros is a nop. + if (ISD::isBuildVectorAllZeros(SubVec.getNode())) + return Vec; + + // If we're inserting into a zero vector and then into a larger zero vector, + // just insert into the larger zero vector directly. + if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR && + ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) { + unsigned Idx2Val = SubVec.getConstantOperandVal(2); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, + SubVec.getOperand(1), + DAG.getIntPtrConstant(IdxVal + Idx2Val, dl)); + } + + // If we're inserting a bitcast into zeros, rewrite the insert and move the + // bitcast to the other side. This helps with detecting zero extending + // during isel. + // TODO: Is this useful for other indices than 0? + if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) { + MVT CastVT = SubVec.getOperand(0).getSimpleValueType(); + unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits(); + MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems); + SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT, + DAG.getBitcast(NewVT, Vec), + SubVec.getOperand(0), N->getOperand(2)); + return DAG.getBitcast(OpVT, Insert); + } + } + // If this is an insert of an extract, combine to a shuffle. Don't do this - // if the insert or extract can be represented with a subvector operation. + // if the insert or extract can be represented with a subregister operation. if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR && SubVec.getOperand(0).getSimpleValueType() == OpVT && (IdxVal != 0 || !Vec.isUndef())) { - int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue(); + int ExtIdxVal = SubVec.getConstantOperandVal(1); if (ExtIdxVal != 0) { int VecNumElts = OpVT.getVectorNumElements(); int SubVecNumElts = SubVecVT.getVectorNumElements(); @@ -35679,17 +37234,36 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, } // If lower/upper loads are the same and the only users of the load, then // lower to a VBROADCASTF128/VBROADCASTI128/etc. - if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) { + if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) && - SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) { + SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec); - } - } + // If this is subv_broadcast insert into both halves, use a larger // subv_broadcast. - if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) { + if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec.getOperand(0)); + + // If we're inserting all zeros into the upper half, change this to + // an insert into an all zeros vector. We will match this to a move + // with implicit upper bit zeroing during isel. 
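Toy model (standalone C++, not LLVM code) of the index folding used just above for chained inserts into zero vectors: inserting a subvector into a small zero vector at Idx2Val and then inserting that result into a larger zero vector at IdxVal is the same as inserting the innermost subvector into the large zero vector directly at IdxVal + Idx2Val.

    #include <cassert>
    #include <cstddef>
    #include <vector>

    static std::vector<int> insertSubvector(std::vector<int> Vec,
                                            const std::vector<int> &Sub, size_t Idx) {
      for (size_t i = 0; i < Sub.size(); ++i)
        Vec[Idx + i] = Sub[i];
      return Vec;
    }

    int main() {
      std::vector<int> Inner = {1, 2};
      std::vector<int> MidZero(4, 0), WideZero(8, 0);
      size_t Idx2Val = 2, IdxVal = 4;
      auto Nested = insertSubvector(WideZero,
                                    insertSubvector(MidZero, Inner, Idx2Val), IdxVal);
      auto Direct = insertSubvector(WideZero, Inner, IdxVal + Idx2Val);
      assert(Nested == Direct);
    }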
+ if (ISD::isBuildVectorAllZeros(SubVec.getNode())) + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, + getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2, + Vec.getOperand(2)); + + // If we are inserting into both halves of the vector, the starting + // vector should be undef. If it isn't, make it so. Only do this if the + // the early insert has no other uses. + // TODO: Should this be a generic DAG combine? + if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) { + Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT), + SubVec2, Vec.getOperand(2)); + DCI.AddToWorklist(Vec.getNode()); + return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec, + N->getOperand(2)); + } } } @@ -35697,6 +37271,32 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG, return SDValue(); } +static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const X86Subtarget &Subtarget) { + if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + MVT OpVT = N->getSimpleValueType(0); + SDValue InVec = N->getOperand(0); + unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue(); + + if (ISD::isBuildVectorAllZeros(InVec.getNode())) + return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N)); + + if (ISD::isBuildVectorAllOnes(InVec.getNode())) { + if (OpVT.getScalarType() == MVT::i1) + return DAG.getConstant(1, SDLoc(N), OpVT); + return getOnesVector(OpVT, DAG, SDLoc(N)); + } + + if (InVec.getOpcode() == ISD::BUILD_VECTOR) + return DAG.getBuildVector( + OpVT, SDLoc(N), + InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements())); + + return SDValue(); +} SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { @@ -35704,12 +37304,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, switch (N->getOpcode()) { default: break; case ISD::EXTRACT_VECTOR_ELT: - return combineExtractVectorElt(N, DAG, DCI, Subtarget); case X86ISD::PEXTRW: case X86ISD::PEXTRB: - return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget); + return combineExtractVectorElt(N, DAG, DCI, Subtarget); case ISD::INSERT_SUBVECTOR: return combineInsertSubvector(N, DAG, DCI, Subtarget); + case ISD::EXTRACT_SUBVECTOR: + return combineExtractSubvector(N, DAG, DCI, Subtarget); case ISD::VSELECT: case ISD::SELECT: case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget); @@ -35753,6 +37354,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::SETCC: return combineSetCC(N, DAG, Subtarget); case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget); case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget); + case X86ISD::PACKSS: + case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget); case X86ISD::VSHLI: case X86ISD::VSRAI: case X86ISD::VSRLI: @@ -35784,6 +37387,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::MOVDDUP: case X86ISD::MOVSS: case X86ISD::MOVSD: + case X86ISD::VBROADCAST: case X86ISD::VPPERM: case X86ISD::VPERMI: case X86ISD::VPERMV: @@ -35795,15 +37399,23 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::VPERM2X128: case X86ISD::VZEXT_MOVL: case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget); - case X86ISD::FMADD: case X86ISD::FMADD_RND: case X86ISD::FMADDS1_RND: case X86ISD::FMADDS3_RND: + case X86ISD::FMADDS1: + case X86ISD::FMADDS3: + case X86ISD::FMADD4S: case ISD::FMA: return combineFMA(N, DAG, Subtarget); + case X86ISD::FMADDSUB_RND: + case X86ISD::FMSUBADD_RND: + case 
X86ISD::FMADDSUB: + case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget); + case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI); + case X86ISD::MGATHER: + case X86ISD::MSCATTER: case ISD::MGATHER: - case ISD::MSCATTER: return combineGatherScatter(N, DAG); - case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget); - case X86ISD::TESTM: return combineTestM(N, DAG); + case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget); + case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget); case X86ISD::PCMPEQ: case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget); } @@ -35910,6 +37522,27 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const { return Promote; } +bool X86TargetLowering:: + isDesirableToCombineBuildVectorToShuffleTruncate( + ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const { + + assert(SrcVT.getVectorNumElements() == ShuffleMask.size() && + "Element count mismatch"); + assert( + Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) && + "Shuffle Mask expected to be legal"); + + // For 32-bit elements VPERMD is better than shuffle+truncate. + // TODO: After we improve lowerBuildVector, add execption for VPERMW. + if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2()) + return false; + + if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask)) + return false; + + return true; +} + //===----------------------------------------------------------------------===// // X86 Inline Assembly Support //===----------------------------------------------------------------------===// @@ -36041,8 +37674,8 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { case 'v': case 'Y': case 'l': - return C_RegisterClass; case 'k': // AVX512 masking registers. + return C_RegisterClass; case 'a': case 'b': case 'c': @@ -36074,8 +37707,15 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const { switch (Constraint[1]) { default: break; - case 'k': + case 'z': + case '0': return C_Register; + case 'i': + case 'm': + case 'k': + case 't': + case '2': + return C_RegisterClass; } } } @@ -36123,15 +37763,42 @@ TargetLowering::ConstraintWeight if (type->isX86_MMXTy() && Subtarget.hasMMX()) weight = CW_SpecificReg; break; - case 'Y': - // Other "Y<x>" (e.g. "Yk") constraints should be implemented below. - if (constraint[1] == 'k') { - // Support for 'Yk' (similarly to the 'k' variant below). - weight = CW_SpecificReg; + case 'Y': { + unsigned Size = StringRef(constraint).size(); + // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y' + char NextChar = Size == 2 ? constraint[1] : 'i'; + if (Size > 2) break; + switch (NextChar) { + default: + return CW_Invalid; + // XMM0 + case 'z': + case '0': + if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) + return CW_SpecificReg; + return CW_Invalid; + // Conditional OpMask regs (AVX512) + case 'k': + if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) + return CW_Register; + return CW_Invalid; + // Any MMX reg + case 'm': + if (type->isX86_MMXTy() && Subtarget.hasMMX()) + return weight; + return CW_Invalid; + // Any SSE reg when ISA >= SSE2, same as 'Y' + case 'i': + case 't': + case '2': + if (!Subtarget.hasSSE2()) + return CW_Invalid; + break; } - // Else fall through (handle "Y" constraint). + // Fall through (handle "Y" constraint). 
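A hypothetical user-level example (not from the patch; assumes a GCC/Clang-style toolchain that accepts the constraint, built with -msse4.1) of what the new 'Yz' handling is for: it pins an inline-asm operand to %xmm0, which the non-VEX BLENDVPS encoding reads implicitly as its per-element selector.

    #include <cassert>
    #include <xmmintrin.h>

    static __m128 blend_via_xmm0(__m128 a, __m128 b, __m128 mask) {
      // "Yz" forces `mask` into %xmm0, the implicit selector of non-VEX BLENDVPS.
      __asm__("blendvps %1, %2, %0" : "+x"(a) : "Yz"(mask), "x"(b));
      return a;
    }

    int main() {
      __m128 a = _mm_setr_ps(1, 2, 3, 4);
      __m128 b = _mm_setr_ps(5, 6, 7, 8);
      __m128 m = _mm_setr_ps(-0.0f, 0.0f, -0.0f, 0.0f); // sign bit set -> take b
      float r[4];
      _mm_storeu_ps(r, blend_via_xmm0(a, b, m));
      assert(r[0] == 5 && r[1] == 2 && r[2] == 7 && r[3] == 4);
    }

The other new variants follow the tables in the patch: 'Yk' selects an AVX-512 mask register, 'Ym' an MMX register, and 'Yi'/'Yt'/'Y2' behave like plain 'Y'.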
LLVM_FALLTHROUGH; + } case 'v': if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()) weight = CW_Register; @@ -36143,7 +37810,8 @@ TargetLowering::ConstraintWeight break; case 'k': // Enable conditional vector operations using %k<#> registers. - weight = CW_SpecificReg; + if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512()) + weight = CW_Register; break; case 'I': if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) { @@ -36545,6 +38213,17 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, switch (Constraint[1]) { default: break; + case 'i': + case 't': + case '2': + return getRegForInlineAsmConstraint(TRI, "Y", VT); + case 'm': + if (!Subtarget.hasMMX()) break; + return std::make_pair(0U, &X86::VR64RegClass); + case 'z': + case '0': + if (!Subtarget.hasSSE1()) break; + return std::make_pair(X86::XMM0, &X86::VR128RegClass); case 'k': // This register class doesn't allocate k0 for masked vector operation. if (Subtarget.hasAVX512()) { // Only supported in AVX512. @@ -36637,12 +38316,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, if (Size == 1) Size = 8; unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size); if (DestReg > 0) { - Res.first = DestReg; - Res.second = Size == 8 ? &X86::GR8RegClass - : Size == 16 ? &X86::GR16RegClass - : Size == 32 ? &X86::GR32RegClass - : &X86::GR64RegClass; - assert(Res.second->contains(Res.first) && "Register in register class"); + bool is64Bit = Subtarget.is64Bit(); + const TargetRegisterClass *RC = + Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass) + : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass) + : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass) + : &X86::GR64RegClass; + if (RC->contains(DestReg)) + Res = std::make_pair(DestReg, RC); } else { // No register found/type mismatch. Res.first = 0; @@ -36750,7 +38431,7 @@ void X86TargetLowering::insertCopiesSplitCSR( // fine for CXX_FAST_TLS since the C++-style TLS access functions should be // nounwind. If we want to generalize this later, we may need to emit // CFI pseudo-instructions. - assert(Entry->getParent()->getFunction()->hasFnAttribute( + assert(Entry->getParent()->getFunction().hasFnAttribute( Attribute::NoUnwind) && "Function should be nounwind in insertCopiesSplitCSR!"); Entry->addLiveIn(*I); @@ -36773,8 +38454,8 @@ bool X86TargetLowering::supportSwiftError() const { /// string if not applicable. StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const { // If the function specifically requests stack probes, emit them. - if (MF.getFunction()->hasFnAttribute("probe-stack")) - return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString(); + if (MF.getFunction().hasFnAttribute("probe-stack")) + return MF.getFunction().getFnAttribute("probe-stack").getValueAsString(); // Generally, if we aren't on Windows, the platform ABI does not include // support for stack probes, so don't emit them. diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index dbbc2bbba6a4..8464081b1b08 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -17,7 +17,7 @@ #include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" -#include "llvm/Target/TargetLowering.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/Target/TargetOptions.h" namespace llvm { @@ -214,7 +214,7 @@ namespace llvm { // FP vector get exponent. 
FGETEXP_RND, FGETEXPS_RND, // Extract Normalized Mantissas. - VGETMANT, VGETMANTS, + VGETMANT, VGETMANT_RND, VGETMANTS, VGETMANTS_RND, // FP Scale. SCALEF, SCALEFS, @@ -254,7 +254,9 @@ namespace llvm { /// Note that these typically require refinement /// in order to obtain suitable precision. FRSQRT, FRCP, - FRSQRTS, FRCPS, + + // AVX-512 reciprocal approximations with a little more precision. + RSQRT14, RSQRT14S, RCP14, RCP14S, // Thread Local Storage. TLSADDR, @@ -333,6 +335,9 @@ namespace llvm { // Vector integer comparisons, the result is in a mask vector. PCMPEQM, PCMPGTM, + // v8i16 Horizontal minimum and position. + PHMINPOS, + MULTISHIFT, /// Vector comparison generating mask bits for fp and @@ -346,9 +351,6 @@ namespace llvm { ADD, SUB, ADC, SBB, SMUL, INC, DEC, OR, XOR, AND, - // Bit field extract. - BEXTR, - // LOW, HI, FLAGS = umul LHS, RHS. UMUL, @@ -391,13 +393,17 @@ namespace llvm { PSHUFHW, PSHUFLW, SHUFP, + // VBMI2 Concat & Shift. + VSHLD, + VSHRD, + VSHLDV, + VSHRDV, //Shuffle Packed Values at 128-bit granularity. SHUF128, MOVDDUP, MOVSHDUP, MOVSLDUP, MOVLHPS, - MOVLHPD, MOVHLPS, MOVLPS, MOVLPD, @@ -428,11 +434,13 @@ namespace llvm { VFIXUPIMM, VFIXUPIMMS, // Range Restriction Calculation For Packed Pairs of Float32/64 values. - VRANGE, + VRANGE, VRANGE_RND, VRANGES, VRANGES_RND, // Reduce - Perform Reduction Transformation on scalar\packed FP. - VREDUCE, VREDUCES, + VREDUCE, VREDUCE_RND, VREDUCES, VREDUCES_RND, // RndScale - Round FP Values To Include A Given Number Of Fraction Bits. - VRNDSCALE, VRNDSCALES, + // Also used by the legacy (V)ROUND intrinsics where we mask out the + // scaling part of the immediate. + VRNDSCALE, VRNDSCALE_RND, VRNDSCALES, VRNDSCALES_RND, // Tests Types Of a FP Values for packed types. VFPCLASS, // Tests Types Of a FP Values for scalar types. @@ -445,14 +453,9 @@ namespace llvm { // Broadcast subvector to vector. SUBV_BROADCAST, - // Extract vector element. - VEXTRACT, - /// SSE4A Extraction and Insertion. EXTRQI, INSERTQI, - // XOP variable/immediate rotations. - VPROT, VPROTI, // XOP arithmetic/logical shifts. VPSHA, VPSHL, // XOP signed/unsigned integer comparisons. @@ -471,10 +474,20 @@ namespace llvm { // Multiply and Add Packed Integers. VPMADDUBSW, VPMADDWD, + + // AVX512IFMA multiply and add. + // NOTE: These are different than the instruction and perform + // op0 x op1 + op2. VPMADD52L, VPMADD52H, + // VNNI + VPDPBUSD, + VPDPBUSDS, + VPDPWSSD, + VPDPWSSDS, + // FMA nodes. - FMADD, + // We use the target independent ISD::FMA for the non-inverted case. FNMADD, FMSUB, FNMSUB, @@ -489,6 +502,15 @@ namespace llvm { FMADDSUB_RND, FMSUBADD_RND, + // FMA4 specific scalar intrinsics bits that zero the non-scalar bits. + FMADD4S, FNMADD4S, FMSUB4S, FNMSUB4S, + + // Scalar intrinsic FMA. + FMADDS1, FMADDS3, + FNMADDS1, FNMADDS3, + FMSUBS1, FMSUBS3, + FNMSUBS1, FNMSUBS3, + // Scalar intrinsic FMA with rounding mode. // Two versions, passthru bits on op1 or op3. FMADDS1_RND, FMADDS3_RND, @@ -500,6 +522,9 @@ namespace llvm { COMPRESS, EXPAND, + // Bits shuffle + VPSHUFBITQMB, + // Convert Unsigned/Integer to Floating-Point Value with rounding mode. SINT_TO_FP_RND, UINT_TO_FP_RND, SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND, @@ -557,7 +582,10 @@ namespace llvm { RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2, // Conversions between float and half-float. - CVTPS2PH, CVTPH2PS, + CVTPS2PH, CVTPH2PS, CVTPH2PS_RND, + + // Galois Field Arithmetic Instructions + GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB, // LWP insert record. 
LWPINS, @@ -571,7 +599,7 @@ namespace llvm { /// LOCK-prefixed arithmetic read-modify-write instructions. /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS) - LADD, LSUB, LOR, LXOR, LAND, + LADD, LSUB, LOR, LXOR, LAND, LINC, LDEC, // Load, scalar_to_vector, and zero extend. VZEXT_LOAD, @@ -617,8 +645,8 @@ namespace llvm { // Vector truncating masked store with unsigned/signed saturation VMTRUNCSTOREUS, VMTRUNCSTORES, - // X86 specific gather - MGATHER + // X86 specific gather and scatter + MGATHER, MSCATTER, // WARNING: Do not add anything in the end unless you want the node to // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all @@ -628,46 +656,6 @@ namespace llvm { /// Define some predicates that are used for node matching. namespace X86 { - /// Return true if the specified - /// EXTRACT_SUBVECTOR operand specifies a vector extract that is - /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions. - bool isVEXTRACT128Index(SDNode *N); - - /// Return true if the specified - /// INSERT_SUBVECTOR operand specifies a subvector insert that is - /// suitable for input to VINSERTF128, VINSERTI128 instructions. - bool isVINSERT128Index(SDNode *N); - - /// Return true if the specified - /// EXTRACT_SUBVECTOR operand specifies a vector extract that is - /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions. - bool isVEXTRACT256Index(SDNode *N); - - /// Return true if the specified - /// INSERT_SUBVECTOR operand specifies a subvector insert that is - /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions. - bool isVINSERT256Index(SDNode *N); - - /// Return the appropriate - /// immediate to extract the specified EXTRACT_SUBVECTOR index - /// with VEXTRACTF128, VEXTRACTI128 instructions. - unsigned getExtractVEXTRACT128Immediate(SDNode *N); - - /// Return the appropriate - /// immediate to insert at the specified INSERT_SUBVECTOR index - /// with VINSERTF128, VINSERT128 instructions. - unsigned getInsertVINSERT128Immediate(SDNode *N); - - /// Return the appropriate - /// immediate to extract the specified EXTRACT_SUBVECTOR index - /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions. - unsigned getExtractVEXTRACT256Immediate(SDNode *N); - - /// Return the appropriate - /// immediate to insert at the specified INSERT_SUBVECTOR index - /// with VINSERTF64x4, VINSERTI64x4 instructions. - unsigned getInsertVINSERT256Immediate(SDNode *N); - /// Returns true if Elt is a constant zero or floating point constant +0.0. bool isZeroNode(SDValue Elt); @@ -696,7 +684,7 @@ namespace llvm { void markLibCallAttributes(MachineFunction *MF, unsigned CC, ArgListTy &Args) const override; - MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override { + MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override { return MVT::i8; } @@ -767,18 +755,18 @@ namespace llvm { SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override; - // Return true if it is profitable to combine a BUILD_VECTOR to a TRUNCATE - // for given operand and result types. + // Return true if it is profitable to combine a BUILD_VECTOR with a + // stride-pattern to a shuffle and a truncate. 
// Example of such a combine: - // v4i32 build_vector((extract_elt V, 0), - // (extract_elt V, 2), - // (extract_elt V, 4), - // (extract_elt V, 6)) + // v4i32 build_vector((extract_elt V, 1), + // (extract_elt V, 3), + // (extract_elt V, 5), + // (extract_elt V, 7)) // --> - // v4i32 truncate (bitcast V to v4i64) - bool isDesirableToCombineBuildVectorToTruncate() const override { - return true; - } + // v4i32 truncate (bitcast (shuffle<1,u,3,u,4,u,5,u,6,u,7,u> V, u) to + // v4i64) + bool isDesirableToCombineBuildVectorToShuffleTruncate( + ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override; /// Return true if the target has native support for /// the specified value type and it is 'desirable' to use the type for the @@ -799,6 +787,11 @@ namespace llvm { /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + bool mergeStoresAfterLegalization() const override { return true; } + + bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT, + const SelectionDAG &DAG) const override; + bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; @@ -854,6 +847,8 @@ namespace llvm { const SelectionDAG &DAG, unsigned Depth) const override; + SDValue unwrapAddress(SDValue N) const override; + bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const override; @@ -903,7 +898,8 @@ namespace llvm { /// Return true if the addressing mode represented /// by AM is legal for this target, for a load/store of the specified type. bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, - Type *Ty, unsigned AS) const override; + Type *Ty, unsigned AS, + Instruction *I = nullptr) const override; /// Return true if the specified immediate is legal /// icmp immediate, that is the target has icmp instructions which can @@ -966,6 +962,7 @@ namespace llvm { /// true and stores the intrinsic information into the IntrinsicInfo that was /// passed to the function. bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I, + MachineFunction &MF, unsigned Intrinsic) const override; /// Returns true if the target can instruction select the @@ -977,8 +974,7 @@ namespace llvm { /// VECTOR_SHUFFLE operations, those with specific masks. By default, if a /// target supports the VECTOR_SHUFFLE node, all mask values are assumed to /// be legal. - bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask, - EVT VT) const override; + bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override; /// Similar to isShuffleMaskLegal. This is used by Targets can use this to /// indicate if there is a suitable VECTOR_SHUFFLE that can be used to @@ -1013,13 +1009,19 @@ namespace llvm { bool shouldConvertConstantLoadToIntImm(const APInt &Imm, Type *Ty) const override; - bool convertSelectOfConstantsToMath() const override { - return true; - } + bool convertSelectOfConstantsToMath(EVT VT) const override; /// Return true if EXTRACT_SUBVECTOR is cheap for this result type /// with this index. - bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override; + bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT, + unsigned Index) const override; + + bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem, + unsigned AddrSpace) const override { + // If we can replace more than 2 scalar stores, there will be a reduction + // in instructions even after we add a vector constant load. 
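A little-endian sketch (standalone C++, not part of the patch; x86 is little-endian, which is what makes the bitcast step legal) of the combine described in the comment above: collecting the odd elements of a v8i32 equals shuffling them into the even lanes, reinterpreting the result as v4i64, and truncating each lane back to i32.

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main() {
      uint32_t V[8] = {10, 11, 12, 13, 14, 15, 16, 17};

      // build_vector((extract_elt V,1), (extract_elt V,3), (,5), (,7))
      uint32_t BV[4] = {V[1], V[3], V[5], V[7]};

      // shuffle<1,u,3,u,5,u,7,u> (undef lanes written as 0 here), bitcast to
      // v4i64, then TRUNCATE each 64-bit lane to 32 bits.
      uint32_t Shuf[8] = {V[1], 0, V[3], 0, V[5], 0, V[7], 0};
      uint64_t As64[4];
      std::memcpy(As64, Shuf, sizeof(As64));
      for (int i = 0; i != 4; ++i)
        assert(BV[i] == uint32_t(As64[i])); // low half of each i64 on LE
    }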
+ return NumElem > 2; + } /// Intel processors have a unified instruction and data cache const char * getClearCacheBuiltinName() const override { @@ -1051,9 +1053,13 @@ namespace llvm { Value *getIRStackGuard(IRBuilder<> &IRB) const override; bool useLoadStackGuardNode() const override; + bool useStackGuardXorFP() const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; Value *getSSPStackGuardCheck(const Module &M) const override; + SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, + const SDLoc &DL) const override; + /// Return true if the target stores SafeStack pointer at a fixed offset in /// some non-standard address space, and populates the address space and @@ -1164,8 +1170,6 @@ namespace llvm { SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const; SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; - SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const; - SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const; unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr) const; @@ -1184,8 +1188,6 @@ namespace llvm { SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const; SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl, - SelectionDAG &DAG) const; SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const; SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const; @@ -1207,6 +1209,7 @@ namespace llvm { SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const; SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg, @@ -1222,8 +1225,8 @@ namespace llvm { const SDLoc &dl, SelectionDAG &DAG) const override; bool supportSplitCSR(MachineFunction *MF) const override { - return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && - MF->getFunction()->hasFnAttribute(Attribute::NoUnwind); + return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && + MF->getFunction().hasFnAttribute(Attribute::NoUnwind); } void initializeSplitCSR(MachineBasicBlock *Entry) const override; void insertCopiesSplitCSR( @@ -1268,6 +1271,10 @@ namespace llvm { EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr, MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1, + MachineInstr &MI2, + MachineBasicBlock *BB) const; + MachineBasicBlock *EmitLoweredSelect(MachineInstr &I, MachineBasicBlock *BB) const; @@ -1421,19 +1428,93 @@ namespace llvm { } }; - // X86 specific Gather node. - class X86MaskedGatherSDNode : public MaskedGatherScatterSDNode { + // X86 specific Gather/Scatter nodes. + // The class has the same order of operands as MaskedGatherScatterSDNode for + // convenience. 
+ class X86MaskedGatherScatterSDNode : public MemSDNode { public: - X86MaskedGatherSDNode(unsigned Order, - const DebugLoc &dl, SDVTList VTs, EVT MemVT, - MachineMemOperand *MMO) - : MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, MMO) - {} + X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order, + const DebugLoc &dl, SDVTList VTs, EVT MemVT, + MachineMemOperand *MMO) + : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {} + + const SDValue &getBasePtr() const { return getOperand(3); } + const SDValue &getIndex() const { return getOperand(4); } + const SDValue &getMask() const { return getOperand(2); } + const SDValue &getValue() const { return getOperand(1); } + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::MGATHER || + N->getOpcode() == X86ISD::MSCATTER; + } + }; + + class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode { + public: + X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, + EVT MemVT, MachineMemOperand *MMO) + : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, + MMO) {} + static bool classof(const SDNode *N) { return N->getOpcode() == X86ISD::MGATHER; } }; + class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode { + public: + X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs, + EVT MemVT, MachineMemOperand *MMO) + : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT, + MMO) {} + + static bool classof(const SDNode *N) { + return N->getOpcode() == X86ISD::MSCATTER; + } + }; + + /// Generate unpacklo/unpackhi shuffle mask. + template <typename T = int> + void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo, + bool Unary) { + assert(Mask.empty() && "Expected an empty shuffle mask vector"); + int NumElts = VT.getVectorNumElements(); + int NumEltsInLane = 128 / VT.getScalarSizeInBits(); + for (int i = 0; i < NumElts; ++i) { + unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane; + int Pos = (i % NumEltsInLane) / 2 + LaneStart; + Pos += (Unary ? 0 : NumElts * (i % 2)); + Pos += (Lo ? 0 : NumEltsInLane / 2); + Mask.push_back(Pos); + } + } + + /// Helper function to scale a shuffle or target shuffle mask, replacing each + /// mask index with the scaled sequential indices for an equivalent narrowed + /// mask. This is the reverse process to canWidenShuffleElements, but can + /// always succeed. + template <typename T> + void scaleShuffleMask(int Scale, ArrayRef<T> Mask, + SmallVectorImpl<T> &ScaledMask) { + assert(0 < Scale && "Unexpected scaling factor"); + int NumElts = Mask.size(); + ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1); + + for (int i = 0; i != NumElts; ++i) { + int M = Mask[i]; + + // Repeat sentinel values in every mask element. + if (M < 0) { + for (int s = 0; s != Scale; ++s) + ScaledMask[(Scale * i) + s] = M; + continue; + } + + // Scale mask element and increment across each mask element. 
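A standalone copy of the createUnpackShuffleMask logic added above (reworked to plain C++ so it can be run on its own), with the v8i16 unpack masks as a check: the two-input low unpack yields the PUNPCKLWD interleave pattern, and the unary form duplicates each low element in place.

    #include <cassert>
    #include <vector>

    static std::vector<int> createUnpackShuffleMask(int NumElts, int ScalarBits,
                                                    bool Lo, bool Unary) {
      std::vector<int> Mask;
      int NumEltsInLane = 128 / ScalarBits; // elements per 128-bit lane
      for (int i = 0; i < NumElts; ++i) {
        int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
        int Pos = (i % NumEltsInLane) / 2 + LaneStart;
        Pos += (Unary ? 0 : NumElts * (i % 2));
        Pos += (Lo ? 0 : NumEltsInLane / 2);
        Mask.push_back(Pos);
      }
      return Mask;
    }

    int main() {
      // v8i16, low half, two-input unpack -> PUNPCKLWD pattern.
      assert(createUnpackShuffleMask(8, 16, /*Lo=*/true, /*Unary=*/false) ==
             std::vector<int>({0, 8, 1, 9, 2, 10, 3, 11}));
      // v8i16, low half, unary unpack duplicates the low elements.
      assert(createUnpackShuffleMask(8, 16, /*Lo=*/true, /*Unary=*/true) ==
             std::vector<int>({0, 0, 1, 1, 2, 2, 3, 3}));
    }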
+ for (int s = 0; s != Scale; ++s) + ScaledMask[(Scale * i) + s] = (Scale * M) + s; + } + } } // end namespace llvm #endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td index 08b501ff20bf..2acd8d17beb2 100644 --- a/lib/Target/X86/X86Instr3DNow.td +++ b/lib/Target/X86/X86Instr3DNow.td @@ -12,94 +12,123 @@ // //===----------------------------------------------------------------------===// -class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat> - : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> { +let Sched = WriteFAdd in { +def I3DNOW_FALU_ITINS : OpndItins< + IIC_3DNOW_FALU_RR, IIC_3DNOW_FALU_RM +>; } -class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> +let Sched = WriteCvtF2I in { +def I3DNOW_FCVT_F2I_ITINS : OpndItins< + IIC_3DNOW_FCVT_F2I_RR, IIC_3DNOW_FCVT_F2I_RM +>; +} + +let Sched = WriteCvtI2F in { +def I3DNOW_FCVT_I2F_ITINS : OpndItins< + IIC_3DNOW_FCVT_I2F_RR, IIC_3DNOW_FCVT_I2F_RM +>; +} + +let Sched = WriteVecIMul in { +def I3DNOW_MISC_FUNC_ITINS : OpndItins< + IIC_3DNOW_MISC_FUNC_REG, IIC_3DNOW_MISC_FUNC_MEM +>; +} + +let Sched = WriteShuffle in { +def I3DNOW_PSHUF_ITINS : OpndItins< + IIC_MMX_PSHUF, IIC_MMX_PSHUF +>; +} + +class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat, + InstrItinClass itin> + : I<o, F, outs, ins, asm, pat, itin>, TB, Requires<[Has3DNow]> { +} + +class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat, + InstrItinClass itin> : I3DNow<o, F, (outs VR64:$dst), ins, - !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>, + !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat, itin>, Has3DNow0F0FOpcode { // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. let isAsmParserOnly = 1; let Constraints = "$src1 = $dst"; } -class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat> +class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat, + InstrItinClass itin> : I3DNow<o, F, (outs VR64:$dst), ins, - !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>, + !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat, itin>, Has3DNow0F0FOpcode { // FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet. 
let isAsmParserOnly = 1; } -multiclass I3DNow_binop_rm<bits<8> opc, string Mn> { - def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>; - def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>; -} - -multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, bit Commutable = 0, - string Ver = ""> { +multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, OpndItins itins, + bit Commutable = 0, string Ver = ""> { let isCommutable = Commutable in def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, [(set VR64:$dst, (!cast<Intrinsic>( - !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>; + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))], + itins.rr>, Sched<[itins.Sched]>; def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, [(set VR64:$dst, (!cast<Intrinsic>( !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, - (bitconvert (load_mmx addr:$src2))))]>; -} - -multiclass I3DNow_conv_rm<bits<8> opc, string Mn> { - def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>; - def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>; + (bitconvert (load_mmx addr:$src2))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } -multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> { +multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, OpndItins itins, + string Ver = ""> { def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn, [(set VR64:$dst, (!cast<Intrinsic>( - !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>; + !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))], itins.rr>, + Sched<[itins.Sched]>; def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn, [(set VR64:$dst, (!cast<Intrinsic>( !strconcat("int_x86_3dnow", Ver, "_", Mn)) - (bitconvert (load_mmx addr:$src))))]>; + (bitconvert (load_mmx addr:$src))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } -defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", 1>; -defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">; -defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">; -defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", 1>; -defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", 1>; -defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">; -defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">; -defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">; -defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">; -defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", 1>; -defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">; -defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">; -defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">; -defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">; -defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">; -defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", 1>; -defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", 1>; -defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">; -defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", 1>; - +defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", I3DNOW_MISC_FUNC_ITINS, 1>; +defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id", I3DNOW_FCVT_F2I_ITINS>; +defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc", I3DNOW_FALU_ITINS>; +defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", I3DNOW_FALU_ITINS, 1>; +defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", I3DNOW_FALU_ITINS, 1>; +defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge", I3DNOW_FALU_ITINS>; +defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt", I3DNOW_FALU_ITINS>; +defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax", 
I3DNOW_FALU_ITINS>; +defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin", I3DNOW_FALU_ITINS>; +defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", I3DNOW_FALU_ITINS, 1>; +defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp", I3DNOW_FALU_ITINS>; +defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1", I3DNOW_FALU_ITINS>; +defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2", I3DNOW_FALU_ITINS>; +defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1", I3DNOW_FALU_ITINS>; +defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt", I3DNOW_FALU_ITINS>; +defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", I3DNOW_FALU_ITINS, 1>; +defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", I3DNOW_FALU_ITINS, 1>; +defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", I3DNOW_FCVT_I2F_ITINS>; +defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", I3DNOW_MISC_FUNC_ITINS, 1>; def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms", - [(int_x86_mmx_femms)]>; + [(int_x86_mmx_femms)], IIC_MMX_EMMS>; +let SchedRW = [WriteLoad] in { def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr), "prefetch\t$addr", - [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>; - + [(prefetch addr:$addr, (i32 0), imm, (i32 1))], + IIC_SSE_PREFETCH>; def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr", - [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB, - Requires<[HasPrefetchW]>; + [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))], + IIC_SSE_PREFETCH>, TB, Requires<[HasPrefetchW]>; +} // "3DNowA" instructions -defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">; -defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">; -defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", 0, "a">; -defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", 0, "a">; -defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">; +defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", I3DNOW_FCVT_F2I_ITINS, "a">; +defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", I3DNOW_FCVT_I2F_ITINS, "a">; +defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", I3DNOW_FALU_ITINS, 0, "a">; +defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", I3DNOW_FALU_ITINS, 0, "a">; +defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", I3DNOW_PSHUF_ITINS, "a">; diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td index 0ae960e7d566..2a2286e42405 100644 --- a/lib/Target/X86/X86InstrAVX512.td +++ b/lib/Target/X86/X86InstrAVX512.td @@ -192,6 +192,7 @@ class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm, ValueType KVT = _vt; } +def v1i1_info : X86KVectorVTInfo<VK1, VK1WM, v1i1>; def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>; def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>; def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>; @@ -211,8 +212,8 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F, list<dag> Pattern, list<dag> MaskingPattern, list<dag> ZeroMaskingPattern, + InstrItinClass itin, string MaskingConstraint = "", - InstrItinClass itin = NoItinerary, bit IsCommutable = 0, bit IsKCommutable = 0> { let isCommutable = IsCommutable in @@ -251,9 +252,9 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskingRHS, + InstrItinClass itin, SDNode Select = vselect, string MaskingConstraint = "", - InstrItinClass itin = NoItinerary, bit IsCommutable = 0, bit IsKCommutable = 0> : AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr, @@ -262,25 +263,30 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _, [(set _.RC:$dst, MaskingRHS)], [(set 
_.RC:$dst, (Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))], - MaskingConstraint, NoItinerary, IsCommutable, + itin, MaskingConstraint, IsCommutable, IsKCommutable>; -// Similar to AVX512_maskable_common, but with scalar types. -multiclass AVX512_maskable_fp_common<bits<8> O, Format F, X86VectorVTInfo _, - dag Outs, - dag Ins, dag MaskingIns, dag ZeroMaskingIns, - string OpcodeStr, - string AttSrcAsm, string IntelSrcAsm, - SDNode Select = vselect, - string MaskingConstraint = "", - InstrItinClass itin = NoItinerary, - bit IsCommutable = 0, - bit IsKCommutable = 0> : - AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr, - AttSrcAsm, IntelSrcAsm, - [], [], [], - MaskingConstraint, NoItinerary, IsCommutable, - IsKCommutable>; +// This multiclass generates the unconditional/non-masking, the masking and +// the zero-masking variant of the vector instruction. In the masking case, the +// perserved vector elements come from a new dummy input operand tied to $dst. +// This version uses a separate dag for non-masking and masking. +multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _, + dag Outs, dag Ins, string OpcodeStr, + string AttSrcAsm, string IntelSrcAsm, + dag RHS, dag MaskRHS, + InstrItinClass itin, + bit IsCommutable = 0, bit IsKCommutable = 0, + SDNode Select = vselect> : + AVX512_maskable_custom<O, F, Outs, Ins, + !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), + !con((ins _.KRCWM:$mask), Ins), + OpcodeStr, AttSrcAsm, IntelSrcAsm, + [(set _.RC:$dst, RHS)], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))], + [(set _.RC:$dst, + (Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))], + itin, "$src0 = $dst", IsCommutable, IsKCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the @@ -289,15 +295,15 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, - InstrItinClass itin = NoItinerary, + InstrItinClass itin, bit IsCommutable = 0, bit IsKCommutable = 0, SDNode Select = vselect> : AVX512_maskable_common<O, F, _, Outs, Ins, !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (Select _.KRCWM:$mask, RHS, _.RC:$src0), Select, - "$src0 = $dst", itin, IsCommutable, IsKCommutable>; + (Select _.KRCWM:$mask, RHS, _.RC:$src0), itin, + Select, "$src0 = $dst", IsCommutable, IsKCommutable>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the scalar instruction. 
@@ -305,14 +311,10 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, - InstrItinClass itin = NoItinerary, + InstrItinClass itin, bit IsCommutable = 0> : - AVX512_maskable_common<O, F, _, Outs, Ins, - !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), - !con((ins _.KRCWM:$mask), Ins), - OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (X86selects _.KRCWM:$mask, RHS, _.RC:$src0), - X86selects, "$src0 = $dst", itin, IsCommutable>; + AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm, + RHS, itin, IsCommutable, 0, X86selects>; // Similar to AVX512_maskable but in this case one of the source operands // ($src1) is already tied to $dst so we just use that for the preserved @@ -321,40 +323,42 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _, multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, bit IsCommutable = 0, - bit IsKCommutable = 0> : + dag RHS, InstrItinClass itin, + bit IsCommutable = 0, + bit IsKCommutable = 0, + SDNode Select = vselect, + bit MaskOnly = 0> : AVX512_maskable_common<O, F, _, Outs, !con((ins _.RC:$src1), NonTiedIns), !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), - OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (vselect _.KRCWM:$mask, RHS, _.RC:$src1), - vselect, "", NoItinerary, IsCommutable, IsKCommutable>; + OpcodeStr, AttSrcAsm, IntelSrcAsm, + !if(MaskOnly, (null_frag), RHS), + (Select _.KRCWM:$mask, RHS, _.RC:$src1), itin, + Select, "", IsCommutable, IsKCommutable>; multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag NonTiedIns, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, bit IsCommutable = 0, - bit IsKCommutable = 0> : - AVX512_maskable_common<O, F, _, Outs, - !con((ins _.RC:$src1), NonTiedIns), - !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), - !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns), - OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (X86selects _.KRCWM:$mask, RHS, _.RC:$src1), - X86selects, "", NoItinerary, IsCommutable, - IsKCommutable>; + dag RHS, InstrItinClass itin, + bit IsCommutable = 0, + bit IsKCommutable = 0, + bit MaskOnly = 0> : + AVX512_maskable_3src<O, F, _, Outs, NonTiedIns, OpcodeStr, AttSrcAsm, + IntelSrcAsm, RHS, itin, IsCommutable, IsKCommutable, + X86selects, MaskOnly>; multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - list<dag> Pattern> : + list<dag> Pattern, + InstrItinClass itin> : AVX512_maskable_custom<O, F, Outs, Ins, !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [], - "$src0 = $dst">; + itin, "$src0 = $dst">; // Instruction with mask that puts result in mask register, @@ -366,17 +370,18 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F, string AttSrcAsm, string IntelSrcAsm, list<dag> Pattern, list<dag> MaskingPattern, + InstrItinClass itin, bit IsCommutable = 0> { let isCommutable = IsCommutable in def NAME: AVX512<O, F, Outs, Ins, OpcodeStr#"\t{"#AttSrcAsm#", $dst|"# "$dst, "#IntelSrcAsm#"}", - Pattern, NoItinerary>; + Pattern, itin>; def NAME#k: AVX512<O, F, Outs, MaskingIns, OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"# "$dst {${mask}}, "#IntelSrcAsm#"}", - MaskingPattern, 
NoItinerary>, EVEX_K; + MaskingPattern, itin>, EVEX_K; } multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, @@ -385,27 +390,30 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskingRHS, + InstrItinClass itin, bit IsCommutable = 0> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr, AttSrcAsm, IntelSrcAsm, [(set _.KRC:$dst, RHS)], - [(set _.KRC:$dst, MaskingRHS)], IsCommutable>; + [(set _.KRC:$dst, MaskingRHS)], itin, IsCommutable>; multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, - dag RHS, bit IsCommutable = 0> : + dag RHS, InstrItinClass itin, + bit IsCommutable = 0> : AVX512_maskable_common_cmp<O, F, _, Outs, Ins, !con((ins _.KRCWM:$mask), Ins), OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS, - (and _.KRCWM:$mask, RHS), IsCommutable>; + (and _.KRCWM:$mask, RHS), itin, IsCommutable>; multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, - string AttSrcAsm, string IntelSrcAsm> : + string AttSrcAsm, string IntelSrcAsm, + InstrItinClass itin> : AVX512_maskable_custom_cmp<O, F, Outs, Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr, - AttSrcAsm, IntelSrcAsm, [],[]>; + AttSrcAsm, IntelSrcAsm, [],[], itin>; // This multiclass generates the unconditional/non-masking, the masking and // the zero-masking variant of the vector instruction. In the masking case, the @@ -414,7 +422,7 @@ multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _, dag Outs, dag Ins, string OpcodeStr, string AttSrcAsm, string IntelSrcAsm, dag RHS, dag MaskedRHS, - InstrItinClass itin = NoItinerary, + InstrItinClass itin, bit IsCommutable = 0, SDNode Select = vselect> : AVX512_maskable_custom<O, F, Outs, Ins, !con((ins _.RC:$src0, _.KRCWM:$mask), Ins), @@ -426,41 +434,8 @@ multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _, [(set _.RC:$dst, (Select _.KRCWM:$mask, MaskedRHS, _.ImmAllZerosV))], - "$src0 = $dst", itin, IsCommutable>; - -// Bitcasts between 512-bit vector types. Return the original type since -// no instruction is needed for the conversion. 
-def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; -def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; -def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>; -def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>; -def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; -def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>; -def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; -def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>; -def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>; -def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>; -def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>; -def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>; -def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>; -def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; -def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; -def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; -def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; -def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>; -def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>; -def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; -def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>; -def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>; -def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>; -def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>; -def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; -def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; -def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>; -def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>; -def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>; -def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>; -def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>; + itin, "$src0 = $dst", IsCommutable>; + // Alias instruction that maps zero vector to pxor / xorp* for AVX-512. // This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then @@ -478,7 +453,7 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "", // Alias instructions that allow VPTERNLOG to be used with a mask to create // a mix of all ones and all zeros elements. This is done this way to force // the same register to be used as input for all three sources. 
-let isPseudo = 1, Predicates = [HasAVX512] in { +let isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteVecALU] in { def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst), (ins VK16WM:$mask), "", [(set VR512:$dst, (vselect (v16i1 VK16WM:$mask), @@ -512,28 +487,49 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, //===----------------------------------------------------------------------===// // AVX-512 - VECTOR INSERT // -multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To, - PatFrag vinsert_insert> { - let ExeDomain = To.ExeDomain in { - defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst), + +// Supports two different pattern operators for mask and unmasked ops. Allows +// null_frag to be passed for one. +multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From, + X86VectorVTInfo To, + SDPatternOperator vinsert_insert, + SDPatternOperator vinsert_for_mask, + OpndItins itins> { + let hasSideEffects = 0, ExeDomain = To.ExeDomain in { + defm rr : AVX512_maskable_split<Opcode, MRMSrcReg, To, (outs To.RC:$dst), (ins To.RC:$src1, From.RC:$src2, u8imm:$src3), "vinsert" # From.EltTypeName # "x" # From.NumElts, "$src3, $src2, $src1", "$src1, $src2, $src3", (vinsert_insert:$src3 (To.VT To.RC:$src1), (From.VT From.RC:$src2), - (iPTR imm))>, AVX512AIi8Base, EVEX_4V; - - defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst), + (iPTR imm)), + (vinsert_for_mask:$src3 (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm)), itins.rr>, + AVX512AIi8Base, EVEX_4V, Sched<[itins.Sched]>; + let mayLoad = 1 in + defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst), (ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3), "vinsert" # From.EltTypeName # "x" # From.NumElts, "$src3, $src2, $src1", "$src1, $src2, $src3", (vinsert_insert:$src3 (To.VT To.RC:$src1), (From.VT (bitconvert (From.LdFrag addr:$src2))), - (iPTR imm))>, AVX512AIi8Base, EVEX_4V, - EVEX_CD8<From.EltSize, From.CD8TupleForm>; + (iPTR imm)), + (vinsert_for_mask:$src3 (To.VT To.RC:$src1), + (From.VT (bitconvert (From.LdFrag addr:$src2))), + (iPTR imm)), itins.rm>, AVX512AIi8Base, EVEX_4V, + EVEX_CD8<From.EltSize, From.CD8TupleForm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } +// Passes the same pattern operator for masked and unmasked ops. 
+multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, + X86VectorVTInfo To, + SDPatternOperator vinsert_insert, + OpndItins itins> : + vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert, itins>; + multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From, X86VectorVTInfo To, PatFrag vinsert_insert, SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> { @@ -555,62 +551,78 @@ multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From, } multiclass vinsert_for_type<ValueType EltVT32, int Opcode128, - ValueType EltVT64, int Opcode256> { + ValueType EltVT64, int Opcode256, + OpndItins itins> { let Predicates = [HasVLX] in defm NAME # "32x4Z256" : vinsert_for_size<Opcode128, X86VectorVTInfo< 4, EltVT32, VR128X>, X86VectorVTInfo< 8, EltVT32, VR256X>, - vinsert128_insert>, EVEX_V256; + vinsert128_insert, itins>, EVEX_V256; defm NAME # "32x4Z" : vinsert_for_size<Opcode128, X86VectorVTInfo< 4, EltVT32, VR128X>, X86VectorVTInfo<16, EltVT32, VR512>, - vinsert128_insert>, EVEX_V512; + vinsert128_insert, itins>, EVEX_V512; defm NAME # "64x4Z" : vinsert_for_size<Opcode256, X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert256_insert>, VEX_W, EVEX_V512; + vinsert256_insert, itins>, VEX_W, EVEX_V512; + // Even with DQI we'd like to only use these instructions for masking. let Predicates = [HasVLX, HasDQI] in - defm NAME # "64x2Z256" : vinsert_for_size<Opcode128, + defm NAME # "64x2Z256" : vinsert_for_size_split<Opcode128, X86VectorVTInfo< 2, EltVT64, VR128X>, X86VectorVTInfo< 4, EltVT64, VR256X>, - vinsert128_insert>, VEX_W, EVEX_V256; + null_frag, vinsert128_insert, itins>, + VEX_W, EVEX_V256; + // Even with DQI we'd like to only use these instructions for masking. let Predicates = [HasDQI] in { - defm NAME # "64x2Z" : vinsert_for_size<Opcode128, + defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128, X86VectorVTInfo< 2, EltVT64, VR128X>, X86VectorVTInfo< 8, EltVT64, VR512>, - vinsert128_insert>, VEX_W, EVEX_V512; + null_frag, vinsert128_insert, itins>, + VEX_W, EVEX_V512; - defm NAME # "32x8Z" : vinsert_for_size<Opcode256, + defm NAME # "32x8Z" : vinsert_for_size_split<Opcode256, X86VectorVTInfo< 8, EltVT32, VR256X>, X86VectorVTInfo<16, EltVT32, VR512>, - vinsert256_insert>, EVEX_V512; + null_frag, vinsert256_insert, itins>, + EVEX_V512; } } -defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>; -defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>; +// FIXME: Is there a better scheduler itinerary for VINSERTF/VINSERTI? +let Sched = WriteFShuffle256 in +def AVX512_VINSERTF : OpndItins< + IIC_SSE_SHUFP, IIC_SSE_SHUFP +>; +let Sched = WriteShuffle256 in +def AVX512_VINSERTI : OpndItins< + IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI +>; + +defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a, AVX512_VINSERTF>; +defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a, AVX512_VINSERTI>; // Codegen pattern with the alternative types, -// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. +// Even with AVX512DQ we'll still use these for unmasked operations. 
defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>; defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info, - vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>; + vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info, - vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info, - vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>; + vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; // Codegen pattern with the alternative types insert VEC128 into VEC256 defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, @@ -628,48 +640,184 @@ defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info, defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info, vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>; + +multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, X86VectorVTInfo Cast, + PatFrag vinsert_insert, + SDNodeXForm INSERT_get_vinsert_imm, + list<Predicate> p> { +let Predicates = p in { + def : Pat<(Cast.VT + (vselect Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))), + Cast.RC:$src0)), + (!cast<Instruction>(InstrStr#"rrk") + Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins))>; + def : Pat<(Cast.VT + (vselect Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT + (bitconvert + (From.LdFrag addr:$src2))), + (iPTR imm))), + Cast.RC:$src0)), + (!cast<Instruction>(InstrStr#"rmk") + Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins))>; + + def : Pat<(Cast.VT + (vselect Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT From.RC:$src2), + (iPTR imm))), + Cast.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#"rrkz") + Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2, + (INSERT_get_vinsert_imm To.RC:$ins))>; + def : Pat<(Cast.VT + (vselect Cast.KRCWM:$mask, + (bitconvert + (vinsert_insert:$ins (To.VT To.RC:$src1), + (From.VT + (bitconvert + (From.LdFrag addr:$src2))), + (iPTR imm))), + Cast.ImmAllZerosV)), + (!cast<Instruction>(InstrStr#"rmkz") + Cast.KRCWM:$mask, To.RC:$src1, addr:$src2, + (INSERT_get_vinsert_imm To.RC:$ins))>; +} +} + +defm : vinsert_for_mask_cast<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info, + v8f32x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v4f32x_info, v8f32x_info, + v4f64x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>; + +defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info, + v8i32x_info, 
vinsert128_insert, + INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info, + v8i32x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info, + v8i32x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v4i32x_info, v8i32x_info, + v4i64x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v8i16x_info, v16i16x_info, + v4i64x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>; +defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v16i8x_info, v32i8x_info, + v4i64x_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>; + +defm : vinsert_for_mask_cast<"VINSERTF32x4Z", v2f64x_info, v8f64_info, + v16f32_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTF64x2Z", v4f32x_info, v16f32_info, + v8f64_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI]>; + +defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v2i64x_info, v8i64_info, + v16i32_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v8i16x_info, v32i16_info, + v16i32_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v16i8x_info, v64i8_info, + v16i32_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v4i32x_info, v16i32_info, + v8i64_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v8i16x_info, v32i16_info, + v8i64_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v16i8x_info, v64i8_info, + v8i64_info, vinsert128_insert, + INSERT_get_vinsert128_imm, [HasDQI]>; + +defm : vinsert_for_mask_cast<"VINSERTF32x8Z", v4f64x_info, v8f64_info, + v16f32_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTF64x4Z", v8f32x_info, v16f32_info, + v8f64_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasAVX512]>; + +defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v4i64x_info, v8i64_info, + v16i32_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v16i16x_info, v32i16_info, + v16i32_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v32i8x_info, v64i8_info, + v16i32_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasDQI]>; +defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v8i32x_info, v16i32_info, + v8i64_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v16i16x_info, v32i16_info, + v8i64_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasAVX512]>; +defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v32i8x_info, v64i8_info, + v8i64_info, vinsert256_insert, + INSERT_get_vinsert256_imm, [HasAVX512]>; + // vinsertps - insert f32 to XMM let ExeDomain = SSEPackedSingle in { def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>, - EVEX_4V; + [(set 
VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))], + IIC_SSE_INSERTPS_RR>, EVEX_4V, Sched<[WriteFShuffle]>; def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst), (ins VR128X:$src1, f32mem:$src2, u8imm:$src3), "vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set VR128X:$dst, (X86insertps VR128X:$src1, (v4f32 (scalar_to_vector (loadf32 addr:$src2))), - imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>; + imm:$src3))], IIC_SSE_INSERTPS_RM>, EVEX_4V, + EVEX_CD8<32, CD8VT1>, Sched<[WriteFShuffleLd, ReadAfterLd]>; } //===----------------------------------------------------------------------===// // AVX-512 VECTOR EXTRACT //--- -multiclass vextract_for_size<int Opcode, - X86VectorVTInfo From, X86VectorVTInfo To, - PatFrag vextract_extract, - SDNodeXForm EXTRACT_get_vextract_imm> { +// Supports two different pattern operators for mask and unmasked ops. Allows +// null_frag to be passed for one. +multiclass vextract_for_size_split<int Opcode, + X86VectorVTInfo From, X86VectorVTInfo To, + SDPatternOperator vextract_extract, + SDPatternOperator vextract_for_mask, + OpndItins itins> { let hasSideEffects = 0, ExeDomain = To.ExeDomain in { - // use AVX512_maskable_in_asm (AVX512_maskable can't be used due to - // vextract_extract), we interesting only in patterns without mask, - // intrinsics pattern match generated bellow. - defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst), + defm rr : AVX512_maskable_split<Opcode, MRMDestReg, To, (outs To.RC:$dst), (ins From.RC:$src1, u8imm:$idx), "vextract" # To.EltTypeName # "x" # To.NumElts, "$idx, $src1", "$src1, $idx", - [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1), - (iPTR imm)))]>, - AVX512AIi8Base, EVEX; + (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)), + (vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm)), + itins.rr>, AVX512AIi8Base, EVEX, Sched<[itins.Sched]>; + def mr : AVX512AIi8<Opcode, MRMDestMem, (outs), (ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx), "vextract" # To.EltTypeName # "x" # To.NumElts # "\t{$idx, $src1, $dst|$dst, $src1, $idx}", [(store (To.VT (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm))), - addr:$dst)]>, EVEX; + addr:$dst)], itins.rm>, EVEX, + Sched<[itins.Sched.Folded, ReadAfterLd]>; let mayStore = 1, hasSideEffects = 0 in def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs), @@ -678,28 +826,18 @@ multiclass vextract_for_size<int Opcode, "vextract" # To.EltTypeName # "x" # To.NumElts # "\t{$idx, $src1, $dst {${mask}}|" "$dst {${mask}}, $src1, $idx}", - []>, EVEX_K, EVEX; + [], itins.rm>, EVEX_K, EVEX, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } - - def : Pat<(To.VT (vselect To.KRCWM:$mask, - (vextract_extract:$ext (From.VT From.RC:$src1), - (iPTR imm)), - To.RC:$src0)), - (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # - From.ZSuffix # "rrk") - To.RC:$src0, To.KRCWM:$mask, From.RC:$src1, - (EXTRACT_get_vextract_imm To.RC:$ext))>; - - def : Pat<(To.VT (vselect To.KRCWM:$mask, - (vextract_extract:$ext (From.VT From.RC:$src1), - (iPTR imm)), - To.ImmAllZerosV)), - (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts # - From.ZSuffix # "rrkz") - To.KRCWM:$mask, From.RC:$src1, - (EXTRACT_get_vextract_imm To.RC:$ext))>; } +// Passes the same pattern operator for masked and unmasked ops. 
+multiclass vextract_for_size<int Opcode, X86VectorVTInfo From, + X86VectorVTInfo To, + SDPatternOperator vextract_extract, + OpndItins itins> : + vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract, itins>; + // Codegen pattern for the alternative types multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From, X86VectorVTInfo To, PatFrag vextract_extract, @@ -717,68 +855,79 @@ multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From, } multiclass vextract_for_type<ValueType EltVT32, int Opcode128, - ValueType EltVT64, int Opcode256> { - defm NAME # "32x4Z" : vextract_for_size<Opcode128, - X86VectorVTInfo<16, EltVT32, VR512>, - X86VectorVTInfo< 4, EltVT32, VR128X>, - vextract128_extract, - EXTRACT_get_vextract128_imm>, - EVEX_V512, EVEX_CD8<32, CD8VT4>; - defm NAME # "64x4Z" : vextract_for_size<Opcode256, - X86VectorVTInfo< 8, EltVT64, VR512>, - X86VectorVTInfo< 4, EltVT64, VR256X>, - vextract256_extract, - EXTRACT_get_vextract256_imm>, - VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; + ValueType EltVT64, int Opcode256, + OpndItins itins> { + let Predicates = [HasAVX512] in { + defm NAME # "32x4Z" : vextract_for_size<Opcode128, + X86VectorVTInfo<16, EltVT32, VR512>, + X86VectorVTInfo< 4, EltVT32, VR128X>, + vextract128_extract, itins>, + EVEX_V512, EVEX_CD8<32, CD8VT4>; + defm NAME # "64x4Z" : vextract_for_size<Opcode256, + X86VectorVTInfo< 8, EltVT64, VR512>, + X86VectorVTInfo< 4, EltVT64, VR256X>, + vextract256_extract, itins>, + VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>; + } let Predicates = [HasVLX] in defm NAME # "32x4Z256" : vextract_for_size<Opcode128, X86VectorVTInfo< 8, EltVT32, VR256X>, X86VectorVTInfo< 4, EltVT32, VR128X>, - vextract128_extract, - EXTRACT_get_vextract128_imm>, + vextract128_extract, itins>, EVEX_V256, EVEX_CD8<32, CD8VT4>; + + // Even with DQI we'd like to only use these instructions for masking. let Predicates = [HasVLX, HasDQI] in - defm NAME # "64x2Z256" : vextract_for_size<Opcode128, + defm NAME # "64x2Z256" : vextract_for_size_split<Opcode128, X86VectorVTInfo< 4, EltVT64, VR256X>, X86VectorVTInfo< 2, EltVT64, VR128X>, - vextract128_extract, - EXTRACT_get_vextract128_imm>, + null_frag, vextract128_extract, itins>, VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; + + // Even with DQI we'd like to only use these instructions for masking. let Predicates = [HasDQI] in { - defm NAME # "64x2Z" : vextract_for_size<Opcode128, + defm NAME # "64x2Z" : vextract_for_size_split<Opcode128, X86VectorVTInfo< 8, EltVT64, VR512>, X86VectorVTInfo< 2, EltVT64, VR128X>, - vextract128_extract, - EXTRACT_get_vextract128_imm>, + null_frag, vextract128_extract, itins>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; - defm NAME # "32x8Z" : vextract_for_size<Opcode256, + defm NAME # "32x8Z" : vextract_for_size_split<Opcode256, X86VectorVTInfo<16, EltVT32, VR512>, X86VectorVTInfo< 8, EltVT32, VR256X>, - vextract256_extract, - EXTRACT_get_vextract256_imm>, + null_frag, vextract256_extract, itins>, EVEX_V512, EVEX_CD8<32, CD8VT8>; } } -defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>; -defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>; +// FIXME: Is there a better scheduler itinerary for VEXTRACTF/VEXTRACTI? 
+let Sched = WriteFShuffle256 in +def AVX512_VEXTRACTF : OpndItins< + IIC_SSE_SHUFP, IIC_SSE_SHUFP +>; +let Sched = WriteShuffle256 in +def AVX512_VEXTRACTI : OpndItins< + IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI +>; + +defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b, AVX512_VEXTRACTF>; +defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b, AVX512_VEXTRACTI>; // extract_subvector codegen patterns with the alternative types. -// Only add this if 64x2 and its friends are not supported natively via AVX512DQ. +// Even with AVX512DQ we'll still use these for unmasked operations. defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info, - vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, - vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>; + vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, - vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>; + vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>; // Codegen pattern with the alternative types extract VEC128 from VEC256 defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info, @@ -797,80 +946,185 @@ defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>; -// A 128-bit subvector extract from the first 256-bit vector position -// is a subregister copy that needs no instruction. -def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), - (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>; -def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), - (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>; -def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), - (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>; -def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), - (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>; -def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))), - (v8i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_xmm))>; -def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))), - (v16i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_xmm))>; - -// A 256-bit subvector extract from the first 256-bit vector position -// is a subregister copy that needs no instruction. 
-def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))), - (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>; -def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))), - (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>; -def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))), - (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>; -def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))), - (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>; -def : Pat<(v16i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))), - (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm))>; -def : Pat<(v32i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))), - (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm))>; - -let AddedComplexity = 25 in { // to give priority over vinsertf128rm -// A 128-bit subvector insert to the first 512-bit vector position -// is a subregister copy that needs no instruction. -def : Pat<(v8i64 (insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0))), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; -def : Pat<(v8f64 (insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0))), - (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; -def : Pat<(v16i32 (insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0))), - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; -def : Pat<(v16f32 (insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0))), - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; -def : Pat<(v32i16 (insert_subvector undef, (v8i16 VR128X:$src), (iPTR 0))), - (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; -def : Pat<(v64i8 (insert_subvector undef, (v16i8 VR128X:$src), (iPTR 0))), - (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>; - -// A 256-bit subvector insert to the first 512-bit vector position -// is a subregister copy that needs no instruction. -def : Pat<(v8i64 (insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0))), - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -def : Pat<(v8f64 (insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0))), - (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -def : Pat<(v16i32 (insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0))), - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -def : Pat<(v16f32 (insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0))), - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -def : Pat<(v32i16 (insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0))), - (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -def : Pat<(v64i8 (insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0))), - (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>; -} + +// A 128-bit extract from bits [255:128] of a 512-bit vector should use a +// smaller extract to enable EVEX->VEX. 
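The EVEX->VEX remark is easier to see from the source side. A sketch of two equivalent ways to form that lane, assuming -mavx512f (plus AVX2 for the 256-bit extract); the patterns below rewrite the second shape into the first so the shorter VEX encoding can be used when no masking is involved:

#include <immintrin.h>

// Extract bits [255:128] (the second 128-bit lane) of a 512-bit vector.
__m128i upper_lane_of_low_ymm(__m512i v) {
  // Narrow to the low 256 bits (a plain subregister read), then take lane 1
  // of the ymm. This mirrors the EXTRACT_SUBREG + immediate-1 extract below.
  return _mm256_extracti128_si256(_mm512_castsi512_si256(v), 1);
}

__m128i upper_lane_direct(__m512i v) {
  // The direct 512-bit extract with immediate 1 computes the same lane but on
  // its own would keep the longer EVEX-encoded instruction.
  return _mm512_extracti32x4_epi32(v, 1);
}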
+let Predicates = [NoVLX] in { +def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))), + (v2i64 (VEXTRACTI128rr + (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 2))), + (v2f64 (VEXTRACTF128rr + (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 4))), + (v4i32 (VEXTRACTI128rr + (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 4))), + (v4f32 (VEXTRACTF128rr + (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))), + (v8i16 (VEXTRACTI128rr + (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))), + (v16i8 (VEXTRACTI128rr + (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)), + (iPTR 1)))>; +} + +// A 128-bit extract from bits [255:128] of a 512-bit vector should use a +// smaller extract to enable EVEX->VEX. +let Predicates = [HasVLX] in { +def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))), + (v2i64 (VEXTRACTI32x4Z256rr + (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 2))), + (v2f64 (VEXTRACTF32x4Z256rr + (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 4))), + (v4i32 (VEXTRACTI32x4Z256rr + (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 4))), + (v4f32 (VEXTRACTF32x4Z256rr + (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))), + (v8i16 (VEXTRACTI32x4Z256rr + (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)), + (iPTR 1)))>; +def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))), + (v16i8 (VEXTRACTI32x4Z256rr + (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)), + (iPTR 1)))>; +} + + +// Additional patterns for handling a bitcast between the vselect and the +// extract_subvector. 
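The vselect matched by the multiclass below reduces to ordinary merge- or zero-masking wrapped around the extracted lane; the bitconvert only re-types the payload, and the mask still applies per element of the mask's own type. A scalar model of the rrk/rrkz forms, with element and lane widths chosen arbitrarily for illustration:

#include <array>
#include <cstdint>

// dst = vselect(mask, extract(src, lane), fallback), modeled with 32-bit
// elements and a 128-bit (4-dword) lane.
std::array<uint32_t, 4> masked_extract_lane(const std::array<uint32_t, 16> &src,
                                            unsigned lane,      // 0..3
                                            uint8_t mask,       // one bit per dword
                                            const std::array<uint32_t, 4> &fallback,
                                            bool zero_masking) {
  std::array<uint32_t, 4> dst{};
  for (unsigned i = 0; i < 4; ++i) {
    uint32_t extracted = src[lane * 4 + i];
    if (mask & (1u << i))
      dst[i] = extracted;                       // element selected by the mask
    else
      dst[i] = zero_masking ? 0 : fallback[i];  // rrkz zeroes, rrk merges $src0
  }
  return dst;
}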
+multiclass vextract_for_mask_cast<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, X86VectorVTInfo Cast, + PatFrag vextract_extract, + SDNodeXForm EXTRACT_get_vextract_imm, + list<Predicate> p> { +let Predicates = p in { + def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, + (bitconvert + (To.VT (vextract_extract:$ext + (From.VT From.RC:$src), (iPTR imm)))), + To.RC:$src0)), + (Cast.VT (!cast<Instruction>(InstrStr#"rrk") + Cast.RC:$src0, Cast.KRCWM:$mask, From.RC:$src, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; + + def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, + (bitconvert + (To.VT (vextract_extract:$ext + (From.VT From.RC:$src), (iPTR imm)))), + Cast.ImmAllZerosV)), + (Cast.VT (!cast<Instruction>(InstrStr#"rrkz") + Cast.KRCWM:$mask, From.RC:$src, + (EXTRACT_get_vextract_imm To.RC:$ext)))>; +} +} + +defm : vextract_for_mask_cast<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info, + v4f32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTF64x2Z256", v8f32x_info, v4f32x_info, + v2f64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>; + +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v8i32x_info, v4i32x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v16i16x_info, v8i16x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v32i8x_info, v16i8x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>; + +defm : vextract_for_mask_cast<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info, + v4f32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTF64x2Z", v16f32_info, v4f32x_info, + v2f64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI]>; + +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info, + v4i32x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v16i32_info, v4i32x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v32i16_info, v8i16x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v64i8_info, v16i8x_info, + v2i64x_info, vextract128_extract, + EXTRACT_get_vextract128_imm, [HasDQI]>; + +defm : vextract_for_mask_cast<"VEXTRACTF32x8Z", v8f64_info, v4f64x_info, + v8f32x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTF64x4Z", 
v16f32_info, v8f32x_info, + v4f64x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasAVX512]>; + +defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v8i64_info, v4i64x_info, + v8i32x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v32i16_info, v16i16x_info, + v8i32x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v64i8_info, v32i8x_info, + v8i32x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasDQI]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info, + v4i64x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info, + v4i64x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasAVX512]>; +defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info, + v4i64x_info, vextract256_extract, + EXTRACT_get_vextract256_imm, [HasAVX512]>; // vextractps - extract 32 bits from XMM def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst), (ins VR128X:$src1, u8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>, - EVEX; + [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))], + IIC_SSE_EXTRACTPS_RR>, EVEX, VEX_WIG, Sched<[WriteFShuffle]>; def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs), (ins f32mem:$dst, VR128X:$src1, u8imm:$src2), "vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2), - addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>; + addr:$dst)], IIC_SSE_EXTRACTPS_RM>, + EVEX, VEX_WIG, EVEX_CD8<32, CD8VT1>, Sched<[WriteFShuffleLd]>; //===---------------------------------------------------------------------===// // AVX-512 BROADCAST @@ -894,66 +1148,108 @@ multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr, DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>; } -multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, - X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> { - let ExeDomain = DestInfo.ExeDomain in { - defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), +// Split version to allow mask and broadcast node to be different types. This +// helps support the 32x2 broadcasts. 
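For the 32x2 case the split matters because the value being replicated is a 64-bit pair of dwords while the write-mask is per 32-bit element, so MaskInfo and DestInfo deliberately use different types with a bitconvert between them. A scalar model of that behaviour, using a 256-bit destination for brevity:

#include <array>
#include <cstdint>

// VBROADCASTI32X2-style semantics: the {lo,hi} dword pair repeats across the
// destination, but masking and merging happen per dword.
std::array<uint32_t, 8> broadcast_i32x2(uint32_t lo, uint32_t hi, uint8_t mask,
                                        const std::array<uint32_t, 8> &passthru) {
  std::array<uint32_t, 8> dst{};
  for (unsigned i = 0; i < 8; ++i) {
    uint32_t replicated = (i % 2 == 0) ? lo : hi;            // the pair repeats
    dst[i] = (mask & (1u << i)) ? replicated : passthru[i];  // per-dword mask
  }
  return dst;
}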
+multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr, + SchedWrite SchedRR, SchedWrite SchedRM, + X86VectorVTInfo MaskInfo, + X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo, + SDPatternOperator UnmaskedOp = X86VBroadcast> { + let ExeDomain = DestInfo.ExeDomain, hasSideEffects = 0 in { + defm r : AVX512_maskable_split<opc, MRMSrcReg, MaskInfo, + (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src", - (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>, - T8PD, EVEX; - defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))), + (MaskInfo.VT + (bitconvert + (DestInfo.VT + (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))), + NoItinerary>, T8PD, EVEX, Sched<[SchedRR]>; + let mayLoad = 1 in + defm m : AVX512_maskable_split<opc, MRMSrcMem, MaskInfo, + (outs MaskInfo.RC:$dst), (ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src", - (DestInfo.VT (X86VBroadcast - (SrcInfo.ScalarLdFrag addr:$src)))>, - T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>; - } - - def : Pat<(DestInfo.VT (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src))))), - (!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>; - def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, - (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src)))), - DestInfo.RC:$src0)), + (MaskInfo.VT + (bitconvert + (DestInfo.VT (UnmaskedOp + (SrcInfo.ScalarLdFrag addr:$src))))), + (MaskInfo.VT + (bitconvert + (DestInfo.VT (X86VBroadcast + (SrcInfo.ScalarLdFrag addr:$src))))), + NoItinerary>, T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>, + Sched<[SchedRM]>; + } + + def : Pat<(MaskInfo.VT + (bitconvert + (DestInfo.VT (UnmaskedOp + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src))))))), + (!cast<Instruction>(NAME#MaskInfo.ZSuffix#m) addr:$src)>; + def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (bitconvert + (DestInfo.VT + (X86VBroadcast + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src)))))), + MaskInfo.RC:$src0)), (!cast<Instruction>(NAME#DestInfo.ZSuffix#mk) - DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>; - def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask, - (X86VBroadcast - (SrcInfo.VT (scalar_to_vector - (SrcInfo.ScalarLdFrag addr:$src)))), - DestInfo.ImmAllZerosV)), - (!cast<Instruction>(NAME#DestInfo.ZSuffix#mkz) - DestInfo.KRCWM:$mask, addr:$src)>; -} + MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>; + def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask, + (bitconvert + (DestInfo.VT + (X86VBroadcast + (SrcInfo.VT (scalar_to_vector + (SrcInfo.ScalarLdFrag addr:$src)))))), + MaskInfo.ImmAllZerosV)), + (!cast<Instruction>(NAME#MaskInfo.ZSuffix#mkz) + MaskInfo.KRCWM:$mask, addr:$src)>; +} + +// Helper class to force mask and broadcast result to same type. 
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr, + SchedWrite SchedRR, SchedWrite SchedRM, + X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo> : + avx512_broadcast_rm_split<opc, OpcodeStr, SchedRR, SchedRM, + DestInfo, DestInfo, SrcInfo>; multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in - defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256, + WriteFShuffle256Ld, _.info512, _.info128>, avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>, - EVEX_V512; + EVEX_V512; let Predicates = [HasVLX] in { - defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256, + WriteFShuffle256Ld, _.info256, _.info128>, avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>, - EVEX_V256; + EVEX_V256; } } multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in - defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256, + WriteFShuffle256Ld, _.info512, _.info128>, avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>, EVEX_V512; let Predicates = [HasVLX] in { - defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256, + WriteFShuffle256Ld, _.info256, _.info128>, avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>, EVEX_V256; - defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>, + defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256, + WriteFShuffle256Ld, _.info128, _.info128>, avx512_broadcast_scalar<opc, OpcodeStr, _.info128, _.info128>, EVEX_V128; } @@ -968,26 +1264,27 @@ def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src), def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src), (VBROADCASTSDZm addr:$src)>; -multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _, - SDPatternOperator OpNode, +multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR, + X86VectorVTInfo _, SDPatternOperator OpNode, RegisterClass SrcRC> { let ExeDomain = _.ExeDomain in defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins SrcRC:$src), "vpbroadcast"##_.Suffix, "$src", "$src", - (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX; + (_.VT (OpNode SrcRC:$src)), NoItinerary>, T8PD, EVEX, + Sched<[SchedRR]>; } -multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, +multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite SchedRR, X86VectorVTInfo _, SDPatternOperator OpNode, RegisterClass SrcRC, SubRegIndex Subreg> { - let ExeDomain = _.ExeDomain in + let hasSideEffects = 0, ExeDomain = _.ExeDomain in defm r : AVX512_maskable_custom<opc, MRMSrcReg, (outs _.RC:$dst), (ins GR32:$src), !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)), !con((ins _.KRCWM:$mask), (ins GR32:$src)), "vpbroadcast"##_.Suffix, "$src", "$src", [], [], [], - "$src0 = $dst">, T8PD, EVEX; + NoItinerary, "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>; def : Pat <(_.VT (OpNode SrcRC:$src)), (!cast<Instruction>(Name#r) @@ -1006,13 +1303,13 @@ multiclass avx512_int_broadcastbw_reg_vl<bits<8> opc, string Name, AVX512VLVectorVTInfo _, SDPatternOperator OpNode, RegisterClass SrcRC, SubRegIndex Subreg, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_int_broadcastbw_reg<opc, 
Name#Z, _.info512, OpNode, SrcRC, - Subreg>, EVEX_V512; + defm Z : avx512_int_broadcastbw_reg<opc, Name#Z, WriteShuffle256, _.info512, + OpNode, SrcRC, Subreg>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_int_broadcastbw_reg<opc, Name#Z256, _.info256, OpNode, - SrcRC, Subreg>, EVEX_V256; - defm Z128 : avx512_int_broadcastbw_reg<opc, Name#Z128, _.info128, OpNode, - SrcRC, Subreg>, EVEX_V128; + defm Z256 : avx512_int_broadcastbw_reg<opc, Name#Z256, WriteShuffle256, + _.info256, OpNode, SrcRC, Subreg>, EVEX_V256; + defm Z128 : avx512_int_broadcastbw_reg<opc, Name#Z128, WriteShuffle, + _.info128, OpNode, SrcRC, Subreg>, EVEX_V128; } } @@ -1020,10 +1317,13 @@ multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _, SDPatternOperator OpNode, RegisterClass SrcRC, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_int_broadcast_reg<opc, _.info512, OpNode, SrcRC>, EVEX_V512; + defm Z : avx512_int_broadcast_reg<opc, WriteShuffle256, _.info512, OpNode, + SrcRC>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_int_broadcast_reg<opc, _.info256, OpNode, SrcRC>, EVEX_V256; - defm Z128 : avx512_int_broadcast_reg<opc, _.info128, OpNode, SrcRC>, EVEX_V128; + defm Z256 : avx512_int_broadcast_reg<opc, WriteShuffle256, _.info256, OpNode, + SrcRC>, EVEX_V256; + defm Z128 : avx512_int_broadcast_reg<opc, WriteShuffle, _.info128, OpNode, + SrcRC>, EVEX_V128; } } @@ -1054,17 +1354,20 @@ multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo, multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd> { let Predicates = [prd] in { - defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>, + defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle256, + WriteShuffle256Ld, _.info512, _.info128>, avx512_int_broadcast_rm_lowering<_.info512, _.info256>, EVEX_V512; // Defined separately to avoid redefinition. defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>; } let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>, + defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle256, + WriteShuffle256Ld, _.info256, _.info128>, avx512_int_broadcast_rm_lowering<_.info256, _.info256>, EVEX_V256; - defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>, + defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle, + WriteShuffleLd, _.info128, _.info128>, EVEX_V128; } } @@ -1083,8 +1386,24 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr, defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", (_Dst.VT (X86SubVBroadcast - (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>, - AVX5128IBase, EVEX; + (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))), + NoItinerary>, AVX5128IBase, EVEX, + Sched<[WriteShuffleLd]>; +} + +// This should be used for the AVX512DQ broadcast instructions. It disables +// the unmasked patterns so that we only use the DQ instructions when masking +// is requested. 
+multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr, + X86VectorVTInfo _Dst, X86VectorVTInfo _Src> { + let hasSideEffects = 0, mayLoad = 1 in + defm rm : AVX512_maskable_split<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), + (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src", + (null_frag), + (_Dst.VT (X86SubVBroadcast + (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))), + NoItinerary>, AVX5128IBase, EVEX, + Sched<[WriteShuffleLd]>; } let Predicates = [HasAVX512] in { @@ -1093,12 +1412,14 @@ let Predicates = [HasAVX512] in { (VPBROADCASTQZm addr:$src)>; } -let Predicates = [HasVLX, HasBWI] in { +let Predicates = [HasVLX] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), (VPBROADCASTQZ128m addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), (VPBROADCASTQZ256m addr:$src)>; +} +let Predicates = [HasVLX, HasBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. // This means we'll encounter truncated i32 loads; match that here. def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), @@ -1131,6 +1452,10 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4", EVEX_V512, EVEX_CD8<64, CD8VT4>; let Predicates = [HasAVX512] in { +def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))), + (VBROADCASTF64X4rm addr:$src)>; +def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))), + (VBROADCASTI64X4rm addr:$src)>; def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))), (VBROADCASTI64X4rm addr:$src)>; def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))), @@ -1141,9 +1466,15 @@ def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))), def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))), (VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v4f64 VR256X:$src), 1)>; +def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), + (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v8f32 VR256X:$src), 1)>; def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))), (VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v4i64 VR256X:$src), 1)>; +def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), + (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), + (v8i32 VR256X:$src), 1)>; def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))), (VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v16i16 VR256X:$src), 1)>; @@ -1151,6 +1482,10 @@ def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))), (VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), (v32i8 VR256X:$src), 1)>; +def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))), + (VBROADCASTF32X4rm addr:$src)>; +def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTI32X4rm addr:$src)>; def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), (VBROADCASTI32X4rm addr:$src)>; def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), @@ -1165,6 +1500,10 @@ defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4", v8f32x_info, v4f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VT4>; +def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), + (VBROADCASTF32X4Z256rm addr:$src)>; +def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), + (VBROADCASTI32X4Z256rm addr:$src)>; 
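X86SubVBroadcast from a 128-bit memory operand just repeats the 16-byte chunk, so the element type is irrelevant; that is why the 32x4 instruction can cover the f64/i64 cases added above as well as the i16/i8 cases that follow when no masking is requested. A type-agnostic scalar model:

#include <array>
#include <cstdint>
#include <cstring>

// X86SubVBroadcast of a 128-bit load into a 256-bit result: the 16-byte chunk
// at 'p' is simply repeated, independent of how its elements are typed.
std::array<uint8_t, 32> subv_broadcast_128(const void *p) {
  std::array<uint8_t, 32> dst;
  std::memcpy(dst.data(), p, 16);       // low 128 bits
  std::memcpy(dst.data() + 16, p, 16);  // same chunk in the high 128 bits
  return dst;
}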
def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))), (VBROADCASTI32X4Z256rm addr:$src)>; def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), @@ -1172,9 +1511,15 @@ def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))), // Provide fallback in case the load node that is used in the patterns above // is used by additional users, which prevents the pattern selection. +def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))), + (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v2f64 VR128X:$src), 1)>; def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))), (VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (v4f32 VR128X:$src), 1)>; +def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))), + (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), + (v2i64 VR128X:$src), 1)>; def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))), (VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), (v4i32 VR128X:$src), 1)>; @@ -1187,92 +1532,41 @@ def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))), } let Predicates = [HasVLX, HasDQI] in { -defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", +defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", v4i64x_info, v2i64x_info>, VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; -defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2", +defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", v4f64x_info, v2f64x_info>, VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>; - -// Provide fallback in case the load node that is used in the patterns above -// is used by additional users, which prevents the pattern selection. -def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))), - (VINSERTF64x2Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (v2f64 VR128X:$src), 1)>; -def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))), - (VINSERTI64x2Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (v2i64 VR128X:$src), 1)>; -} - -let Predicates = [HasVLX, NoDQI] in { -def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))), - (VBROADCASTF32X4Z256rm addr:$src)>; -def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))), - (VBROADCASTI32X4Z256rm addr:$src)>; - -// Provide fallback in case the load node that is used in the patterns above -// is used by additional users, which prevents the pattern selection. -def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))), - (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (v2f64 VR128X:$src), 1)>; -def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))), - (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm), - (v2i64 VR128X:$src), 1)>; -} - -let Predicates = [HasAVX512, NoDQI] in { -def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))), - (VBROADCASTF32X4rm addr:$src)>; -def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))), - (VBROADCASTI32X4rm addr:$src)>; - -def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))), - (VBROADCASTF64X4rm addr:$src)>; -def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))), - (VBROADCASTI64X4rm addr:$src)>; - -// Provide fallback in case the load node that is used in the patterns above -// is used by additional users, which prevents the pattern selection. 
-def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), - (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v8f32 VR256X:$src), 1)>; -def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), - (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v8i32 VR256X:$src), 1)>; } let Predicates = [HasDQI] in { -defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2", +defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2", v8i64_info, v2i64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; -defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8", +defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8", v16i32_info, v8i32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT8>; -defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2", +defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2", v8f64_info, v2f64x_info>, VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>; -defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8", +defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8", v16f32_info, v8f32x_info>, EVEX_V512, EVEX_CD8<32, CD8VT8>; - -// Provide fallback in case the load node that is used in the patterns above -// is used by additional users, which prevents the pattern selection. -def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))), - (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v8f32 VR256X:$src), 1)>; -def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))), - (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm), - (v8i32 VR256X:$src), 1)>; } multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> { let Predicates = [HasDQI] in - defm Z : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info512, _Src.info128>, - EVEX_V512; + defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle256, + WriteShuffle256Ld, _Dst.info512, + _Src.info512, _Src.info128, null_frag>, + EVEX_V512; let Predicates = [HasDQI, HasVLX] in - defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info256, _Src.info128>, - EVEX_V256; + defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle256, + WriteShuffle256Ld, _Dst.info256, + _Src.info256, _Src.info128, null_frag>, + EVEX_V256; } multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, @@ -1280,8 +1574,10 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr, avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> { let Predicates = [HasDQI, HasVLX] in - defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info128, _Src.info128>, - EVEX_V128; + defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle, + WriteShuffleLd, _Dst.info128, + _Src.info128, _Src.info128, null_frag>, + EVEX_V128; } defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2", @@ -1313,7 +1609,8 @@ multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, RegisterClass KRC> { def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX; + [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))], + IIC_SSE_PSHUF_RI>, EVEX, Sched<[WriteShuffle]>; } multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr, @@ -1333,7 +1630,19 @@ defm VPBROADCASTMB2Q 
: avx512_mask_broadcast<0x2A, "vpbroadcastmb2q", //===----------------------------------------------------------------------===// // -- VPERMI2 - 3 source operands form -- -multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + +let Sched = WriteFShuffle256 in +def AVX512_PERM2_F : OpndItins< + IIC_SSE_SHUFP, IIC_SSE_SHUFP +>; + +let Sched = WriteShuffle256 in +def AVX512_PERM2_I : OpndItins< + IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI +>; + +multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, OpndItins itins, + X86VectorVTInfo _> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { // The index operand in the pattern should really be an integer type. However, // if we do that and it happens to come from a bitcast, then it becomes @@ -1343,18 +1652,19 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V, - AVX5128IBase; + (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), + itins.rr, 1>, EVEX_4V, AVX5128IBase, Sched<[itins.Sched]>; defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, - (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>, - EVEX_4V, AVX5128IBase; + (_.VT (bitconvert (_.LdFrag addr:$src3))))), itins.rm, 1>, + EVEX_4V, AVX5128IBase, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, + +multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, OpndItins itins, X86VectorVTInfo _> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -1363,66 +1673,68 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), - 1>, AVX5128IBase, EVEX_4V, EVEX_B; + itins.rm, 1>, AVX5128IBase, EVEX_4V, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } -multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, +multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, OpndItins itins, AVX512VLVectorVTInfo VTInfo> { - defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>, - avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + defm NAME: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info512>, + avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512; let Predicates = [HasVLX] in { - defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>, - avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128>, EVEX_V128; - defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>, - avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256>, EVEX_V256; + defm NAME#128: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info128>, + avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info256>, + avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256; } } multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr, - AVX512VLVectorVTInfo VTInfo, - Predicate Prd> { + OpndItins itins, + AVX512VLVectorVTInfo VTInfo, + Predicate Prd> { let Predicates = [Prd] in - defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + defm NAME: 
avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512; let Predicates = [Prd, HasVLX] in { - defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>, EVEX_V128; - defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>, EVEX_V256; + defm NAME#128: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128; + defm NAME#256: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256; } } -defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", +defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", AVX512_PERM2_I, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", +defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", AVX512_PERM2_I, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", +defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", AVX512_PERM2_I, avx512vl_i16_info, HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", +defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", AVX512_PERM2_I, avx512vl_i8_info, HasVBMI>, EVEX_CD8<8, CD8VF>; -defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", +defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", AVX512_PERM2_F, avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", +defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", AVX512_PERM2_F, avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>; // VPERMT2 -multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, +multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, OpndItins itins, X86VectorVTInfo _, X86VectorVTInfo IdxVT> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins IdxVT.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>, - EVEX_4V, AVX5128IBase; + (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), + itins.rr, 1>, EVEX_4V, AVX5128IBase, Sched<[itins.Sched]>; defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins IdxVT.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, - (bitconvert (_.LdFrag addr:$src3)))), 1>, - EVEX_4V, AVX5128IBase; + (bitconvert (_.LdFrag addr:$src3)))), itins.rm, 1>, + EVEX_4V, AVX5128IBase, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, +multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, OpndItins itins, X86VectorVTInfo _, X86VectorVTInfo IdxVT> { let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -1431,147 +1743,165 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, !strconcat("$src2, ${src3}", _.BroadcastStr ), (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))), - 1>, AVX5128IBase, EVEX_4V, EVEX_B; + itins.rm, 1>, AVX5128IBase, EVEX_4V, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } -multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, +multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, OpndItins itins, AVX512VLVectorVTInfo VTInfo, AVX512VLVectorVTInfo ShuffleMask> { - defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + defm NAME: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info512, ShuffleMask.info512>, - avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512, + 
avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info512, ShuffleMask.info512>, EVEX_V512; let Predicates = [HasVLX] in { - defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + defm NAME#128: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info128, ShuffleMask.info128>, - avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128, + avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info128, ShuffleMask.info128>, EVEX_V128; - defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + defm NAME#256: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info256, ShuffleMask.info256>, - avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256, + avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info256, ShuffleMask.info256>, EVEX_V256; } } -multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr, +multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr, OpndItins itins, AVX512VLVectorVTInfo VTInfo, AVX512VLVectorVTInfo Idx, Predicate Prd> { let Predicates = [Prd] in - defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512, + defm NAME: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info512, Idx.info512>, EVEX_V512; let Predicates = [Prd, HasVLX] in { - defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128, + defm NAME#128: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info128, Idx.info128>, EVEX_V128; - defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256, + defm NAME#256: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info256, Idx.info256>, EVEX_V256; } } -defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", +defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", AVX512_PERM2_I, avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", +defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", AVX512_PERM2_I, avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w", +defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w", AVX512_PERM2_I, avx512vl_i16_info, avx512vl_i16_info, HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b", +defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b", AVX512_PERM2_I, avx512vl_i8_info, avx512vl_i8_info, HasVBMI>, EVEX_CD8<8, CD8VF>; -defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", +defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", AVX512_PERM2_F, avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", +defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", AVX512_PERM2_F, avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; //===----------------------------------------------------------------------===// // AVX-512 - BLEND using mask // -multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { + +let Sched = WriteFVarBlend in +def AVX512_BLENDM : OpndItins< + IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM +>; + +let Sched = WriteVarBlend in +def AVX512_PBLENDM : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + +multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, OpndItins itins, + X86VectorVTInfo _> { let ExeDomain = _.ExeDomain, hasSideEffects = 0 in { def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), - []>, EVEX_4V; + [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>; def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, 
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - []>, EVEX_4V, EVEX_K; + [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>; def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), - []>, EVEX_4V, EVEX_KZ; + [], itins.rr>, EVEX_4V, EVEX_KZ, Sched<[itins.Sched]>; let mayLoad = 1 in { def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"), - []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + [], itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"), - []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>; + [], itins.rm>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"), - []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>; + [], itins.rm>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } } -multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { - +multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, OpndItins itins, + X86VectorVTInfo _> { let mayLoad = 1, hasSideEffects = 0 in { def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"), - []>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + [], itins.rm>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst|", "$dst, $src1, ${src2}", _.BroadcastStr, "}"), - []>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + [], itins.rm>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -multiclass blendmask_dq <bits<8> opc, string OpcodeStr, +multiclass blendmask_dq <bits<8> opc, string OpcodeStr, OpndItins itins, AVX512VLVectorVTInfo VTInfo> { - defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, - avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + defm Z : avx512_blendmask <opc, OpcodeStr, itins, VTInfo.info512>, + avx512_blendmask_rmb <opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512; let Predicates = [HasVLX] in { - defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>, - avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info256>, EVEX_V256; - defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>, - avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + defm Z256 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info256>, + avx512_blendmask_rmb<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info128>, + avx512_blendmask_rmb<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128; } } -multiclass blendmask_bw <bits<8> opc, 
string OpcodeStr, +multiclass blendmask_bw <bits<8> opc, string OpcodeStr, OpndItins itins, AVX512VLVectorVTInfo VTInfo> { let Predicates = [HasBWI] in - defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, EVEX_V512; + defm Z : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512; let Predicates = [HasBWI, HasVLX] in { - defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256; - defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128; + defm Z256 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256; + defm Z128 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128; } } -defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>; -defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W; -defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>; -defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W; -defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>; -defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; +defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", AVX512_BLENDM, avx512vl_f32_info>; +defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", AVX512_BLENDM, avx512vl_f64_info>, VEX_W; +defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", AVX512_PBLENDM, avx512vl_i32_info>; +defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", AVX512_PBLENDM, avx512vl_i64_info>, VEX_W; +defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", AVX512_PBLENDM, avx512vl_i8_info>; +defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", AVX512_PBLENDM, avx512vl_i16_info>, VEX_W; //===----------------------------------------------------------------------===// @@ -1580,8 +1910,8 @@ defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W; // avx512_cmp_scalar - AVX512 CMPSS and CMPSD -multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{ - +multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd, + OpndItins itins> { defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), @@ -1589,7 +1919,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc)>, EVEX_4V; + imm:$cc), itins.rr>, EVEX_4V, Sched<[itins.Sched]>; let mayLoad = 1 in defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), @@ -1597,7 +1927,8 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> "vcmp${cc}"#_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, - imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + imm:$cc), itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), @@ -1607,28 +1938,31 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> (OpNodeRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc, - (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rr>, + EVEX_4V, EVEX_B, Sched<[itins.Sched]>; // Accept explicit immediate argument form instead of comparison code. 
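// (The *_alt variants below carry no patterns; they exist only so hand-written
// assembly can spell the comparison with a raw immediate instead of a condition
// mnemonic, hence isAsmParserOnly = 1 and hasSideEffects = 0.)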
let isAsmParserOnly = 1, hasSideEffects = 0 in { defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, (outs VK1:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V; + "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rr>, EVEX_4V, + Sched<[itins.Sched]>; let mayLoad = 1 in defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rm>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">, - EVEX_4V, EVEX_B; + "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", itins.rr>, + EVEX_4V, EVEX_B, Sched<[itins.Sched]>; }// let isAsmParserOnly = 1, hasSideEffects = 0 let isCodeGenOnly = 1 in { @@ -1640,7 +1974,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> [(set _.KRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2, imm:$cc))], - IIC_SSE_ALU_F32S_RR>, EVEX_4V; + itins.rr>, EVEX_4V, Sched<[itins.Sched]>; def rm : AVX512Ii8<0xC2, MRMSrcMem, (outs _.KRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc), @@ -1649,33 +1983,34 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd> [(set _.KRC:$dst, (OpNode _.FRC:$src1, (_.ScalarLdFrag addr:$src2), imm:$cc))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } let Predicates = [HasAVX512] in { let ExeDomain = SSEPackedSingle in - defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>, - AVX512XSIi8Base; + defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd, + SSE_ALU_F32S>, AVX512XSIi8Base; let ExeDomain = SSEPackedDouble in - defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>, - AVX512XDIi8Base, VEX_W; + defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd, + SSE_ALU_F64S>, AVX512XDIi8Base, VEX_W; } multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, bit IsCommutable> { + OpndItins itins, X86VectorVTInfo _, bit IsCommutable> { let isCommutable = IsCommutable in def rr : AVX512BI<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))], - IIC_SSE_ALU_F32P_RR>, EVEX_4V; + itins.rr>, EVEX_4V, Sched<[itins.Sched]>; def rm : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2)))))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V; + itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; let isCommutable = IsCommutable in def rrk : AVX512BI<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), @@ -1683,7 +2018,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, "$dst {${mask}}, $src1, $src2}"), [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))], - IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + 
itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>; def rmk : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|", @@ -1692,19 +2027,19 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))))))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + itins.rm>, EVEX_4V, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, bit IsCommutable> : - avx512_icmp_packed<opc, OpcodeStr, OpNode, _, IsCommutable> { + OpndItins itins, X86VectorVTInfo _, bit IsCommutable> : + avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, _, IsCommutable> { def rmb : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), !strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst", "|$dst, $src1, ${src2}", _.BroadcastStr, "}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (X86VBroadcast (_.ScalarLdFrag addr:$src2))))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + itins.rm>, EVEX_4V, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmbk : AVX512BI<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2), @@ -1715,285 +2050,95 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + itins.rm>, EVEX_4V, EVEX_K, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo, Predicate prd, - bit IsCommutable = 0> { + OpndItins itins, AVX512VLVectorVTInfo VTInfo, + Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512, + defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info512, IsCommutable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256, + defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info256, IsCommutable>, EVEX_V256; - defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128, + defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info128, IsCommutable>, EVEX_V128; } } multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr, - SDNode OpNode, AVX512VLVectorVTInfo VTInfo, - Predicate prd, bit IsCommutable = 0> { + SDNode OpNode, OpndItins itins, + AVX512VLVectorVTInfo VTInfo, + Predicate prd, bit IsCommutable = 0> { let Predicates = [prd] in - defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, + defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info512, IsCommutable>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, + defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info256, IsCommutable>, EVEX_V256; - defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, + defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info128, IsCommutable>, EVEX_V128; } } +// FIXME: Is there a better scheduler itinerary for VPCMP? 
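// (SSE_ALU_F32P is borrowed as a stand-in itinerary here: the point is simply
// that the register forms get Sched<[itins.Sched]> and the folded-load forms get
// Sched<[itins.Sched.Folded, ReadAfterLd]>, not that float-ALU timing is exact
// for an integer compare-to-mask.)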
defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm, - avx512vl_i8_info, HasBWI, 1>, - EVEX_CD8<8, CD8VF>; + SSE_ALU_F32P, avx512vl_i8_info, HasBWI, 1>, + EVEX_CD8<8, CD8VF>, VEX_WIG; defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm, - avx512vl_i16_info, HasBWI, 1>, - EVEX_CD8<16, CD8VF>; + SSE_ALU_F32P, avx512vl_i16_info, HasBWI, 1>, + EVEX_CD8<16, CD8VF>, VEX_WIG; defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm, - avx512vl_i32_info, HasAVX512, 1>, + SSE_ALU_F32P, avx512vl_i32_info, HasAVX512, 1>, EVEX_CD8<32, CD8VF>; defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm, - avx512vl_i64_info, HasAVX512, 1>, + SSE_ALU_F32P, avx512vl_i64_info, HasAVX512, 1>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm, - avx512vl_i8_info, HasBWI>, - EVEX_CD8<8, CD8VF>; + SSE_ALU_F32P, avx512vl_i8_info, HasBWI>, + EVEX_CD8<8, CD8VF>, VEX_WIG; defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm, - avx512vl_i16_info, HasBWI>, - EVEX_CD8<16, CD8VF>; + SSE_ALU_F32P, avx512vl_i16_info, HasBWI>, + EVEX_CD8<16, CD8VF>, VEX_WIG; defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm, - avx512vl_i32_info, HasAVX512>, + SSE_ALU_F32P, avx512vl_i32_info, HasAVX512>, EVEX_CD8<32, CD8VF>; defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm, - avx512vl_i64_info, HasAVX512>, + SSE_ALU_F32P, avx512vl_i64_info, HasAVX512>, T8PD, VEX_W, EVEX_CD8<64, CD8VF>; - -multiclass avx512_icmp_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, - SDNode OpNode, string InstrStr, - list<Predicate> Preds> { -let Predicates = Preds in { - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rr) _.RC:$src1, _.RC:$src2), - NewInf.KRC)>; - - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2))))), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rm) _.RC:$src1, addr:$src2), - NewInf.KRC)>; - - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrk) _.KRCWM:$mask, - _.RC:$src1, _.RC:$src2), - NewInf.KRC)>; - - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert - (_.LdFrag addr:$src2))))))), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask, - _.RC:$src1, addr:$src2), - NewInf.KRC)>; -} -} - -multiclass avx512_icmp_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, - SDNode OpNode, string InstrStr, - list<Predicate> Preds> - : avx512_icmp_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { -let Predicates = Preds in { - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), - (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmb) _.RC:$src1, addr:$src2), - NewInf.KRC)>; - - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)))))), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbk) _.KRCWM:$mask, - _.RC:$src1, addr:$src2), - NewInf.KRC)>; 
-} -} - -// VPCMPEQB - i8 -defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpeqm, - "VPCMPEQBZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpeqm, - "VPCMPEQBZ128", [HasBWI, HasVLX]>; - -defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpeqm, - "VPCMPEQBZ256", [HasBWI, HasVLX]>; - -// VPCMPEQW - i16 -defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpeqm, - "VPCMPEQWZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpeqm, - "VPCMPEQWZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpeqm, - "VPCMPEQWZ128", [HasBWI, HasVLX]>; - -defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpeqm, - "VPCMPEQWZ256", [HasBWI, HasVLX]>; -defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpeqm, - "VPCMPEQWZ256", [HasBWI, HasVLX]>; - -defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpeqm, - "VPCMPEQWZ", [HasBWI]>; - -// VPCMPEQD - i32 -defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpeqm, - "VPCMPEQDZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpeqm, - "VPCMPEQDZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpeqm, - "VPCMPEQDZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpeqm, - "VPCMPEQDZ128", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpeqm, - "VPCMPEQDZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpeqm, - "VPCMPEQDZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpeqm, - "VPCMPEQDZ256", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpeqm, - "VPCMPEQDZ", [HasAVX512]>; -defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpeqm, - "VPCMPEQDZ", [HasAVX512]>; - -// VPCMPEQQ - i64 -defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpeqm, - "VPCMPEQQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpeqm, - "VPCMPEQQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpeqm, - "VPCMPEQQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpeqm, - "VPCMPEQQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpeqm, - "VPCMPEQQZ128", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpeqm, - "VPCMPEQQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpeqm, - "VPCMPEQQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpeqm, - "VPCMPEQQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpeqm, - "VPCMPEQQZ256", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm, - "VPCMPEQQZ", [HasAVX512]>; -defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpeqm, - "VPCMPEQQZ", [HasAVX512]>; -defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpeqm, - "VPCMPEQQZ", [HasAVX512]>; - -// VPCMPGTB - i8 -defm : avx512_icmp_packed_lowering<v16i8x_info, 
v32i1_info, X86pcmpgtm, - "VPCMPGTBZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpgtm, - "VPCMPGTBZ128", [HasBWI, HasVLX]>; - -defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpgtm, - "VPCMPGTBZ256", [HasBWI, HasVLX]>; - -// VPCMPGTW - i16 -defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpgtm, - "VPCMPGTWZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpgtm, - "VPCMPGTWZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpgtm, - "VPCMPGTWZ128", [HasBWI, HasVLX]>; - -defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpgtm, - "VPCMPGTWZ256", [HasBWI, HasVLX]>; -defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpgtm, - "VPCMPGTWZ256", [HasBWI, HasVLX]>; - -defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpgtm, - "VPCMPGTWZ", [HasBWI]>; - -// VPCMPGTD - i32 -defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpgtm, - "VPCMPGTDZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpgtm, - "VPCMPGTDZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpgtm, - "VPCMPGTDZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpgtm, - "VPCMPGTDZ128", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpgtm, - "VPCMPGTDZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpgtm, - "VPCMPGTDZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpgtm, - "VPCMPGTDZ256", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpgtm, - "VPCMPGTDZ", [HasAVX512]>; -defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpgtm, - "VPCMPGTDZ", [HasAVX512]>; - -// VPCMPGTQ - i64 -defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpgtm, - "VPCMPGTQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpgtm, - "VPCMPGTQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpgtm, - "VPCMPGTQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpgtm, - "VPCMPGTQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpgtm, - "VPCMPGTQZ128", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpgtm, - "VPCMPGTQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpgtm, - "VPCMPGTQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpgtm, - "VPCMPGTQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpgtm, - "VPCMPGTQZ256", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpgtm, - "VPCMPGTQZ", [HasAVX512]>; -defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpgtm, - "VPCMPGTQZ", [HasAVX512]>; -defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpgtm, - "VPCMPGTQZ", [HasAVX512]>; +// Transforms to swizzle an immediate to help matching memory operand in first +// operand. 
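// For example, a DAG node computing (load x) < reg can only use the reg-mem
// instruction form if the operands are swapped, so the immediate is rewritten
// from LT to NLE (x < y is the same predicate as y > x); codes that are
// symmetric under the swap (EQ, FALSE, NE, TRUE) pass through unchanged.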
+def CommutePCMPCC : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue() & 0x7; + switch (Imm) { + default: llvm_unreachable("Unreachable!"); + case 0x01: Imm = 0x06; break; // LT -> NLE + case 0x02: Imm = 0x05; break; // LE -> NLT + case 0x05: Imm = 0x02; break; // NLT -> LE + case 0x06: Imm = 0x01; break; // NLE -> LT + case 0x00: // EQ + case 0x03: // FALSE + case 0x04: // NE + case 0x07: // TRUE + break; + } + return getI8Imm(Imm, SDLoc(N)); +}]>; multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let isCommutable = 1 in def rri : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc), @@ -2001,7 +2146,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc))], - IIC_SSE_ALU_F32P_RR>, EVEX_4V; + itins.rr>, EVEX_4V, Sched<[itins.Sched]>; def rmi : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc), !strconcat("vpcmp${cc}", Suffix, @@ -2009,7 +2154,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V; + itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; let isCommutable = 1 in def rrik : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, @@ -2020,7 +2165,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, [(set _.KRC:$dst, (and _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc)))], - IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>; def rmik : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc), @@ -2031,7 +2176,8 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), imm:$cc)))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + itins.rm>, EVEX_4V, EVEX_K, + Sched<[itins.Sched.Folded, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. 
let isAsmParserOnly = 1, hasSideEffects = 0 in { @@ -2039,20 +2185,20 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", "$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RR>, EVEX_4V; + [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>; let mayLoad = 1 in def rmi_alt : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|", "$dst, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V; + [], itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; def rrik_alt : AVX512AIi8<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K; + [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>; let mayLoad = 1 in def rmik_alt : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2, @@ -2060,13 +2206,25 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode, !strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2, $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K; + [], itins.rm>, EVEX_4V, EVEX_K, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } + + def : Pat<(OpNode (bitconvert (_.LdFrag addr:$src2)), + (_.VT _.RC:$src1), imm:$cc), + (!cast<Instruction>(NAME#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, + (CommutePCMPCC imm:$cc))>; + + def : Pat<(and _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src2)), + (_.VT _.RC:$src1), imm:$cc)), + (!cast<Instruction>(NAME#_.ZSuffix#"rmik") _.KRCWM:$mask, + _.RC:$src1, addr:$src2, + (CommutePCMPCC imm:$cc))>; } multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, - X86VectorVTInfo _> : - avx512_icmp_cc<opc, Suffix, OpNode, _> { + OpndItins itins, X86VectorVTInfo _> : + avx512_icmp_cc<opc, Suffix, OpNode, itins, _> { def rmib : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, AVX512ICC:$cc), @@ -2076,7 +2234,8 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, [(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (X86VBroadcast (_.ScalarLdFrag addr:$src2)), imm:$cc))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + itins.rm>, EVEX_4V, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmibk : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2, AVX512ICC:$cc), @@ -2087,7 +2246,8 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (X86VBroadcast (_.ScalarLdFrag addr:$src2)), imm:$cc)))], - IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + itins.rm>, EVEX_4V, EVEX_K, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. 
let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in { @@ -2097,302 +2257,98 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode, !strconcat("vpcmp", Suffix, "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|", "$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B; + [], itins.rm>, EVEX_4V, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmibk_alt : AVX512AIi8<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), !strconcat("vpcmp", Suffix, "\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"), - [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B; + [], itins.rm>, EVEX_4V, EVEX_K, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } + + def : Pat<(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + (_.VT _.RC:$src1), imm:$cc), + (!cast<Instruction>(NAME#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2, + (CommutePCMPCC imm:$cc))>; + + def : Pat<(and _.KRCWM:$mask, (OpNode (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + (_.VT _.RC:$src1), imm:$cc)), + (!cast<Instruction>(NAME#_.ZSuffix#"rmibk") _.KRCWM:$mask, + _.RC:$src1, addr:$src2, + (CommutePCMPCC imm:$cc))>; } multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo, Predicate prd> { + OpndItins itins, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { let Predicates = [prd] in - defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512; + defm Z : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info512>, + EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256; - defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128; + defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info256>, + EVEX_V256; + defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info128>, + EVEX_V128; } } multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo, Predicate prd> { + OpndItins itins, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { let Predicates = [prd] in - defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>, + defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info512>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>, + defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info256>, EVEX_V256; - defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>, + defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info128>, EVEX_V128; } } -defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info, - HasBWI>, EVEX_CD8<8, CD8VF>; -defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info, - HasBWI>, EVEX_CD8<8, CD8VF>; - -defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info, - HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; -defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info, - HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>; - -defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info, - HasAVX512>, EVEX_CD8<32, CD8VF>; -defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info, - HasAVX512>, EVEX_CD8<32, CD8VF>; - -defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info, - HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; -defm VPCMPUQ : 
avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info, - HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>; - -multiclass avx512_icmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, - SDNode OpNode, string InstrStr, - list<Predicate> Preds> { -let Predicates = Preds in { - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc)), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, - _.RC:$src2, - imm:$cc), - NewInf.KRC)>; - - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2))), - imm:$cc)), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1, - addr:$src2, - imm:$cc), - NewInf.KRC)>; - - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and _.KRCWM:$mask, - (OpNode (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc))), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask, - _.RC:$src1, - _.RC:$src2, - imm:$cc), - NewInf.KRC)>; - - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert - (_.LdFrag addr:$src2))), - imm:$cc)))), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask, - _.RC:$src1, - addr:$src2, - imm:$cc), - NewInf.KRC)>; -} -} - -multiclass avx512_icmp_cc_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, - SDNode OpNode, string InstrStr, - list<Predicate> Preds> - : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> { -let Predicates = Preds in { - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (OpNode (_.VT _.RC:$src1), - (X86VBroadcast (_.ScalarLdFrag addr:$src2)), - imm:$cc)), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmib) _.RC:$src1, - addr:$src2, - imm:$cc), - NewInf.KRC)>; - - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (and (_.KVT _.KRCWM:$mask), - (_.KVT (OpNode (_.VT _.RC:$src1), - (X86VBroadcast - (_.ScalarLdFrag addr:$src2)), - imm:$cc)))), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmibk) _.KRCWM:$mask, - _.RC:$src1, - addr:$src2, - imm:$cc), - NewInf.KRC)>; -} -} - -// VPCMPB - i8 -defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpm, - "VPCMPBZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpm, - "VPCMPBZ128", [HasBWI, HasVLX]>; - -defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpm, - "VPCMPBZ256", [HasBWI, HasVLX]>; - -// VPCMPW - i16 -defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpm, - "VPCMPWZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpm, - "VPCMPWZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpm, - "VPCMPWZ128", [HasBWI, HasVLX]>; - -defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpm, - "VPCMPWZ256", [HasBWI, HasVLX]>; -defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpm, - "VPCMPWZ256", [HasBWI, HasVLX]>; - -defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpm, - "VPCMPWZ", [HasBWI]>; - -// VPCMPD - i32 -defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpm, - "VPCMPDZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpm, - "VPCMPDZ128", [HasAVX512, 
HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpm, - "VPCMPDZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpm, - "VPCMPDZ128", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpm, - "VPCMPDZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpm, - "VPCMPDZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpm, - "VPCMPDZ256", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpm, - "VPCMPDZ", [HasAVX512]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpm, - "VPCMPDZ", [HasAVX512]>; - -// VPCMPQ - i64 -defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpm, - "VPCMPQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpm, - "VPCMPQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpm, - "VPCMPQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpm, - "VPCMPQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpm, - "VPCMPQZ128", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpm, - "VPCMPQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpm, - "VPCMPQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpm, - "VPCMPQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpm, - "VPCMPQZ256", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpm, - "VPCMPQZ", [HasAVX512]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpm, - "VPCMPQZ", [HasAVX512]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpm, - "VPCMPQZ", [HasAVX512]>; - -// VPCMPUB - i8 -defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpmu, - "VPCMPUBZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpmu, - "VPCMPUBZ128", [HasBWI, HasVLX]>; - -defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpmu, - "VPCMPUBZ256", [HasBWI, HasVLX]>; - -// VPCMPUW - i16 -defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpmu, - "VPCMPUWZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpmu, - "VPCMPUWZ128", [HasBWI, HasVLX]>; -defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpmu, - "VPCMPUWZ128", [HasBWI, HasVLX]>; - -defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpmu, - "VPCMPUWZ256", [HasBWI, HasVLX]>; -defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpmu, - "VPCMPUWZ256", [HasBWI, HasVLX]>; - -defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpmu, - "VPCMPUWZ", [HasBWI]>; - -// VPCMPUD - i32 -defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpmu, - "VPCMPUDZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpmu, - "VPCMPUDZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpmu, - "VPCMPUDZ128", [HasAVX512, 
HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpmu, - "VPCMPUDZ128", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpmu, - "VPCMPUDZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpmu, - "VPCMPUDZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpmu, - "VPCMPUDZ256", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpmu, - "VPCMPUDZ", [HasAVX512]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpmu, - "VPCMPUDZ", [HasAVX512]>; - -// VPCMPUQ - i64 -defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpmu, - "VPCMPUQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpmu, - "VPCMPUQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpmu, - "VPCMPUQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpmu, - "VPCMPUQZ128", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpmu, - "VPCMPUQZ128", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpmu, - "VPCMPUQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpmu, - "VPCMPUQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpmu, - "VPCMPUQZ256", [HasAVX512, HasVLX]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpmu, - "VPCMPUQZ256", [HasAVX512, HasVLX]>; - -defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpmu, - "VPCMPUQZ", [HasAVX512]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpmu, - "VPCMPUQZ", [HasAVX512]>; -defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpmu, - "VPCMPUQZ", [HasAVX512]>; - -multiclass avx512_vcmp_common<X86VectorVTInfo _> { +// FIXME: Is there a better scheduler itinerary for VPCMP/VPCMPU? 
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, SSE_ALU_F32P, + avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>; +defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, SSE_ALU_F32P, + avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>; + +defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, SSE_ALU_F32P, + avx512vl_i16_info, HasBWI>, + VEX_W, EVEX_CD8<16, CD8VF>; +defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, SSE_ALU_F32P, + avx512vl_i16_info, HasBWI>, + VEX_W, EVEX_CD8<16, CD8VF>; + +defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, SSE_ALU_F32P, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; +defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, SSE_ALU_F32P, + avx512vl_i32_info, HasAVX512>, + EVEX_CD8<32, CD8VF>; + +defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, SSE_ALU_F32P, + avx512vl_i64_info, HasAVX512>, + VEX_W, EVEX_CD8<64, CD8VF>; +defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, SSE_ALU_F32P, + avx512vl_i64_info, HasAVX512>, + VEX_W, EVEX_CD8<64, CD8VF>; + +multiclass avx512_vcmp_common<OpndItins itins, X86VectorVTInfo _> { defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc), "vcmp${cc}"#_.Suffix, "$src2, $src1", "$src1, $src2", (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), - imm:$cc), 1>; + imm:$cc), itins.rr, 1>, + Sched<[itins.Sched]>; defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc), @@ -2400,7 +2356,8 @@ multiclass avx512_vcmp_common<X86VectorVTInfo _> { "$src2, $src1", "$src1, $src2", (X86cmpm (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), - imm:$cc)>; + imm:$cc), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), @@ -2410,33 +2367,63 @@ multiclass avx512_vcmp_common<X86VectorVTInfo _> { "$src1, ${src2}"##_.BroadcastStr, (X86cmpm (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - imm:$cc)>,EVEX_B; + imm:$cc), itins.rm>, + EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; // Accept explicit immediate argument form instead of comparison code. let isAsmParserOnly = 1, hasSideEffects = 0 in { defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">; + "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rr>, + Sched<[itins.Sched]>; let mayLoad = 1 in { defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc), "vcmp"#_.Suffix, - "$cc, $src2, $src1", "$src1, $src2, $cc">; + "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc), "vcmp"#_.Suffix, "$cc, ${src2}"##_.BroadcastStr##", $src1", - "$src1, ${src2}"##_.BroadcastStr##", $cc">,EVEX_B; + "$src1, ${src2}"##_.BroadcastStr##", $cc", itins.rm>, + EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; } - } + } + + // Patterns for selecting with loads in other operand. 
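// (These reuse imm:$cc unchanged, so CommutableCMPCC -- defined elsewhere --
// presumably only matches condition codes whose meaning survives an operand
// swap, e.g. EQ/NEQ/ORD/UNORD.)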
+ def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1), + CommutableCMPCC:$cc), + (!cast<Instruction>(NAME#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2, + imm:$cc)>; + + def : Pat<(and _.KRCWM:$mask, (X86cmpm (_.LdFrag addr:$src2), + (_.VT _.RC:$src1), + CommutableCMPCC:$cc)), + (!cast<Instruction>(NAME#_.ZSuffix#"rmik") _.KRCWM:$mask, + _.RC:$src1, addr:$src2, + imm:$cc)>; + + def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)), + (_.VT _.RC:$src1), CommutableCMPCC:$cc), + (!cast<Instruction>(NAME#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2, + imm:$cc)>; + + def : Pat<(and _.KRCWM:$mask, (X86cmpm (X86VBroadcast + (_.ScalarLdFrag addr:$src2)), + (_.VT _.RC:$src1), + CommutableCMPCC:$cc)), + (!cast<Instruction>(NAME#_.ZSuffix#"rmbik") _.KRCWM:$mask, + _.RC:$src1, addr:$src2, + imm:$cc)>; } -multiclass avx512_vcmp_sae<X86VectorVTInfo _> { +multiclass avx512_vcmp_sae<OpndItins itins, X86VectorVTInfo _> { // comparison code form (VCMP[EQ/LT/LE/...] defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _, (outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc), @@ -2445,7 +2432,8 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> { (X86cmpmRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), imm:$cc, - (i32 FROUND_NO_EXC))>, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rr>, + EVEX_B, Sched<[itins.Sched]>; let isAsmParserOnly = 1, hasSideEffects = 0 in { defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _, @@ -2453,163 +2441,78 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> { (ins _.RC:$src1, _.RC:$src2, u8imm:$cc), "vcmp"#_.Suffix, "$cc, {sae}, $src2, $src1", - "$src1, $src2, {sae}, $cc">, EVEX_B; + "$src1, $src2, {sae}, $cc", itins.rr>, + EVEX_B, Sched<[itins.Sched]>; } } -multiclass avx512_vcmp<AVX512VLVectorVTInfo _> { +multiclass avx512_vcmp<OpndItins itins, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcmp_common<_.info512>, - avx512_vcmp_sae<_.info512>, EVEX_V512; + defm Z : avx512_vcmp_common<itins, _.info512>, + avx512_vcmp_sae<itins, _.info512>, EVEX_V512; } let Predicates = [HasAVX512,HasVLX] in { - defm Z128 : avx512_vcmp_common<_.info128>, EVEX_V128; - defm Z256 : avx512_vcmp_common<_.info256>, EVEX_V256; + defm Z128 : avx512_vcmp_common<itins, _.info128>, EVEX_V128; + defm Z256 : avx512_vcmp_common<itins, _.info256>, EVEX_V256; } } -defm VCMPPD : avx512_vcmp<avx512vl_f64_info>, +defm VCMPPD : avx512_vcmp<SSE_ALU_F64P, avx512vl_f64_info>, AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; -defm VCMPPS : avx512_vcmp<avx512vl_f32_info>, +defm VCMPPS : avx512_vcmp<SSE_ALU_F32P, avx512vl_f32_info>, AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; -multiclass avx512_fcmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, - string InstrStr, list<Predicate> Preds> { -let Predicates = Preds in { - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc)), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1, - _.RC:$src2, - imm:$cc), - NewInf.KRC)>; - - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2))), - imm:$cc)), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1, - addr:$src2, - imm:$cc), - NewInf.KRC)>; - - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpm (_.VT _.RC:$src1), - (X86VBroadcast (_.ScalarLdFrag addr:$src2)), - imm:$cc)), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbi) _.RC:$src1, - 
addr:$src2, - imm:$cc), - NewInf.KRC)>; -} -} - -multiclass avx512_fcmp_cc_packed_sae_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf, - string InstrStr, list<Predicate> Preds> - : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> { - -let Predicates = Preds in - def : Pat<(insert_subvector (NewInf.KVT immAllZerosV), - (_.KVT (X86cmpmRnd (_.VT _.RC:$src1), - (_.VT _.RC:$src2), - imm:$cc, - (i32 FROUND_NO_EXC))), - (i64 0)), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1, - _.RC:$src2, - imm:$cc), - NewInf.KRC)>; -} - - -// VCMPPS - f32 -defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v8i1_info, "VCMPPSZ128", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v16i1_info, "VCMPPSZ128", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v32i1_info, "VCMPPSZ128", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v64i1_info, "VCMPPSZ128", - [HasAVX512, HasVLX]>; - -defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v16i1_info, "VCMPPSZ256", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v32i1_info, "VCMPPSZ256", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v64i1_info, "VCMPPSZ256", - [HasAVX512, HasVLX]>; - -defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v32i1_info, "VCMPPSZ", - [HasAVX512]>; -defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v64i1_info, "VCMPPSZ", - [HasAVX512]>; - -// VCMPPD - f64 -defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v4i1_info, "VCMPPDZ128", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v8i1_info, "VCMPPDZ128", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v16i1_info, "VCMPPDZ128", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v32i1_info, "VCMPPDZ128", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v64i1_info, "VCMPPDZ128", - [HasAVX512, HasVLX]>; - -defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v8i1_info, "VCMPPDZ256", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v16i1_info, "VCMPPDZ256", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v32i1_info, "VCMPPDZ256", - [HasAVX512, HasVLX]>; -defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v64i1_info, "VCMPPDZ256", - [HasAVX512, HasVLX]>; - -defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v16i1_info, "VCMPPDZ", - [HasAVX512]>; -defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v32i1_info, "VCMPPDZ", - [HasAVX512]>; -defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v64i1_info, "VCMPPDZ", - [HasAVX512]>; + +// Patterns to select fp compares with load as first operand. 
+let Predicates = [HasAVX512] in { + def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1, + CommutableCMPCC:$cc)), + (VCMPSDZrm FR64X:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1, + CommutableCMPCC:$cc)), + (VCMPSSZrm FR32X:$src1, addr:$src2, imm:$cc)>; +} // ---------------------------------------------------------------- // FPClass //handle fpclass instruction mask = op(reg_scalar,imm) // op(mem_scalar,imm) multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, Predicate prd> { - let Predicates = [prd] in { - def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst), + OpndItins itins, X86VectorVTInfo _, + Predicate prd> { + let Predicates = [prd], ExeDomain = _.ExeDomain in { + def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), - (i32 imm:$src2)))], NoItinerary>; + (i32 imm:$src2)))], itins.rr>, + Sched<[itins.Sched]>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2))))], NoItinerary>, EVEX_K; + (i32 imm:$src2))))], itins.rr>, + EVEX_K, Sched<[itins.Sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), - (ins _.MemOp:$src1, i32u8imm:$src2), + (ins _.IntScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix## "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst, - (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i32 imm:$src2)))], NoItinerary>; + (OpNode _.ScalarIntMemCPat:$src1, + (i32 imm:$src2)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), - (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), + (ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix## "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, - (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i32 imm:$src2))))], NoItinerary>, EVEX_K; + (OpNode _.ScalarIntMemCPat:$src1, + (i32 imm:$src2))))], itins.rm>, + EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } @@ -2617,33 +2520,39 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, // fpclass(reg_vec, mem_vec, imm) // fpclass(reg_vec, broadcast(eltVt), imm) multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, string mem, string broadcast>{ + OpndItins itins, X86VectorVTInfo _, + string mem, string broadcast>{ + let ExeDomain = _.ExeDomain in { def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1), - (i32 imm:$src2)))], NoItinerary>; + (i32 imm:$src2)))], itins.rr>, + Sched<[itins.Sched]>; def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2))))], NoItinerary>, EVEX_K; + (i32 imm:$src2))))], itins.rr>, + EVEX_K, Sched<[itins.Sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.MemOp:$src1, i32u8imm:$src2), 
OpcodeStr##_.Suffix##mem# "\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.KRC:$dst,(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i32 imm:$src2)))], NoItinerary>; + (i32 imm:$src2)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##mem# "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", [(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i32 imm:$src2))))], NoItinerary>, EVEX_K; + (i32 imm:$src2))))], itins.rm>, + EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## @@ -2652,7 +2561,8 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set _.KRC:$dst,(OpNode (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2)))], NoItinerary>,EVEX_B; + (i32 imm:$src2)))], itins.rm>, + EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst), (ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"## @@ -2661,35 +2571,42 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2))))], NoItinerary>, - EVEX_B, EVEX_K; + (i32 imm:$src2))))], itins.rm>, + EVEX_B, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } } -multiclass avx512_vector_fpclass_all<string OpcodeStr, - AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd, - string broadcast>{ +multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _, + bits<8> opc, SDNode OpNode, + OpndItins itins, Predicate prd, + string broadcast>{ let Predicates = [prd] in { - defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}", - broadcast>, EVEX_V512; + defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins, + _.info512, "{z}", broadcast>, EVEX_V512; } let Predicates = [prd, HasVLX] in { - defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}", - broadcast>, EVEX_V128; - defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}", - broadcast>, EVEX_V256; + defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins, + _.info128, "{x}", broadcast>, EVEX_V128; + defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins, + _.info256, "{y}", broadcast>, EVEX_V256; } } +// FIXME: Is there a better scheduler itinerary for VFPCLASS? 
multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec, bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{ defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec, - VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>; + VecOpNode, SSE_ALU_F32P, prd, "{l}">, + EVEX_CD8<32, CD8VF>; defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec, - VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W; + VecOpNode, SSE_ALU_F64P, prd, "{q}">, + EVEX_CD8<64, CD8VF> , VEX_W; defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, - f32x_info, prd>, EVEX_CD8<32, CD8VT1>; + SSE_ALU_F32S, f32x_info, prd>, + EVEX_CD8<32, CD8VT1>; defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode, - f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W; + SSE_ALU_F64S, f64x_info, prd>, + EVEX_CD8<64, CD8VT1>, VEX_W; } defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, @@ -2704,15 +2621,16 @@ defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass, multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk, string OpcodeStr, RegisterClass KRC, ValueType vvt, X86MemOperand x86memop> { - let hasSideEffects = 0 in + let hasSideEffects = 0, SchedRW = [WriteMove] in def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + IIC_SSE_MOVDQ>; def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set KRC:$dst, (vvt (load addr:$src)))]>; + [(set KRC:$dst, (vvt (load addr:$src)))], IIC_SSE_MOVDQ>; def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(store KRC:$src, addr:$dst)]>; + [(store KRC:$src, addr:$dst)], IIC_SSE_MOVDQ>; } multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, @@ -2720,9 +2638,11 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk, RegisterClass KRC, RegisterClass GRC> { let hasSideEffects = 0 in { def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src), - !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>; + !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], + IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>; } } @@ -2848,17 +2768,11 @@ let Predicates = [HasAVX512] in { def : Pat<(maskVT (scalar_to_vector GR32:$src)), (COPY_TO_REGCLASS GR32:$src, maskRC)>; - def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))), + def : Pat<(i32 (X86kextract maskRC:$src, (iPTR 0))), (COPY_TO_REGCLASS maskRC:$src, GR32)>; def : Pat<(maskVT (scalar_to_vector GR8:$src)), (COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>; - - def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))), - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>; - - def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))), - (COPY_TO_REGCLASS maskRC:$src, GR32)>; } defm : operation_gpr_mask_copy_lowering<VK1, v1i1>; @@ -2888,26 +2802,27 @@ let Predicates = [HasAVX512] in { // - KNOT multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr, RegisterClass KRC, SDPatternOperator OpNode, - Predicate prd> { + OpndItins itins, Predicate prd> { let Predicates = [prd] in def 
rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set KRC:$dst, (OpNode KRC:$src))]>; + [(set KRC:$dst, (OpNode KRC:$src))], itins.rr>, + Sched<[itins.Sched]>; } multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode> { + SDPatternOperator OpNode, OpndItins itins> { defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, - HasDQI>, VEX, PD; + itins, HasDQI>, VEX, PD; defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - HasAVX512>, VEX, PS; + itins, HasAVX512>, VEX, PS; defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, - HasBWI>, VEX, PD, VEX_W; + itins, HasBWI>, VEX, PD, VEX_W; defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, - HasBWI>, VEX, PS, VEX_W; + itins, HasBWI>, VEX, PS, VEX_W; } -defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>; +defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot, SSE_BIT_ITINS_P>; // KNL does not support KMOVB, 8-bit mask is promoted to 16-bit let Predicates = [HasAVX512, NoDQI] in @@ -2923,25 +2838,26 @@ def : Pat<(vnot VK2:$src), // - KAND, KANDN, KOR, KXNOR, KXOR multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr, RegisterClass KRC, SDPatternOperator OpNode, - Predicate prd, bit IsCommutable> { + OpndItins itins, Predicate prd, bit IsCommutable> { let Predicates = [prd], isCommutable = IsCommutable in def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>; + [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))], itins.rr>, + Sched<[itins.Sched]>; } multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, bit IsCommutable, - Predicate prdW = HasAVX512> { + SDPatternOperator OpNode, OpndItins itins, + bit IsCommutable, Predicate prdW = HasAVX512> { defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode, - HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; + itins, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD; defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode, - prdW, IsCommutable>, VEX_4V, VEX_L, PS; + itins, prdW, IsCommutable>, VEX_4V, VEX_L, PS; defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode, - HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; + itins, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD; defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode, - HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS; + itins, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS; } def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>; @@ -2950,12 +2866,12 @@ def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>; def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>; def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>; -defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>; -defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>; -defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, 1>; -defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>; -defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>; -defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>; +defm KAND : avx512_mask_binop_all<0x41, "kand", and, SSE_BIT_ITINS_P, 1>; +defm KOR : avx512_mask_binop_all<0x45, "kor", or, SSE_BIT_ITINS_P, 1>; +defm KXNOR : avx512_mask_binop_all<0x46, 
"kxnor", vxnor, SSE_BIT_ITINS_P, 1>; +defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SSE_BIT_ITINS_P, 1>; +defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SSE_BIT_ITINS_P, 0>; +defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, SSE_BIT_ITINS_P, 1, HasDQI>; multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode, Instruction Inst> { @@ -2990,13 +2906,13 @@ defm : avx512_binop_pat<xor, xor, KXORWrr>; // Mask unpacking multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, - RegisterClass KRCSrc, Predicate prd> { + RegisterClass KRCSrc, OpndItins itins, Predicate prd> { let Predicates = [prd] in { let hasSideEffects = 0 in def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2), - "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - VEX_4V, VEX_L; + "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + itins.rr>, VEX_4V, VEX_L, Sched<[itins.Sched]>; def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)), (!cast<Instruction>(NAME##rr) @@ -3005,61 +2921,63 @@ multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT, } } -defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD; -defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS; -defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W; +defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, SSE_UNPCK, HasAVX512>, PD; +defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, SSE_UNPCK, HasBWI>, PS; +defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, SSE_UNPCK, HasBWI>, PS, VEX_W; // Mask bit testing multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC, - SDNode OpNode, Predicate prd> { + SDNode OpNode, OpndItins itins, Predicate prd> { let Predicates = [prd], Defs = [EFLAGS] in def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), - [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>; + [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))], itins.rr>, + Sched<[itins.Sched]>; } multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode, - Predicate prdW = HasAVX512> { - defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>, + OpndItins itins, Predicate prdW = HasAVX512> { + defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, itins, HasDQI>, VEX, PD; - defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>, + defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, itins, prdW>, VEX, PS; - defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>, + defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, itins, HasBWI>, VEX, PS, VEX_W; - defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>, + defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, itins, HasBWI>, VEX, PD, VEX_W; } -defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>; -defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>; +defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest, SSE_PTEST>; +defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, SSE_PTEST, HasDQI>; // Mask shift multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { let Predicates = [HasAVX512] in def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm), !strconcat(OpcodeStr, "\t{$imm, $src, $dst|$dst, $src, 
$imm}"), - [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>; + [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))], + itins.rr>, Sched<[itins.Sched]>; } multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr, - SDNode OpNode> { - defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>, - VEX, TAPD, VEX_W; + SDNode OpNode, OpndItins itins> { + defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode, + itins>, VEX, TAPD, VEX_W; let Predicates = [HasDQI] in - defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode>, - VEX, TAPD; + defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode, + itins>, VEX, TAPD; let Predicates = [HasBWI] in { - defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>, - VEX, TAPD, VEX_W; - defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>, - VEX, TAPD; + defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode, + itins>, VEX, TAPD, VEX_W; + defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode, + itins>, VEX, TAPD; } } -defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>; -defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>; +defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, SSE_PSHUF>; +defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, SSE_PSHUF>; multiclass axv512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr> { def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), @@ -3067,23 +2985,14 @@ def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), - (i64 0)), - (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrr) - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), - (i8 8)), (i8 8))>; - -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, - (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), - (i64 0)), - (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrk) - (COPY_TO_REGCLASS VK8:$mask, VK16), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), - (i8 8)), (i8 8))>; +def : Pat<(v8i1 (and VK8:$mask, + (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))), + (COPY_TO_REGCLASS + (!cast<Instruction>(InstStr##Zrrk) + (COPY_TO_REGCLASS VK8:$mask, VK16), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + VK8)>; } multiclass axv512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr, @@ -3094,25 +3003,13 @@ def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2) (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), imm:$cc), VK8)>; -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)), - (i64 0)), - (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrri) - (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), - 
(i8 8)), (i8 8))>; - -def : Pat<(insert_subvector (v16i1 immAllZerosV), - (v8i1 (and VK8:$mask, - (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))), - (i64 0)), - (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrik) - (COPY_TO_REGCLASS VK8:$mask, VK16), - (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), - imm:$cc), - (i8 8)), (i8 8))>; +def : Pat<(v8i1 (and VK8:$mask, (OpNode (_.info256.VT VR256X:$src1), + (_.info256.VT VR256X:$src2), imm:$cc))), + (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik) + (COPY_TO_REGCLASS VK8:$mask, VK16), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), + (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)), + imm:$cc), VK8)>; } let Predicates = [HasAVX512, NoVLX] in { @@ -3127,7 +3024,8 @@ let Predicates = [HasAVX512, NoVLX] in { // Mask setting all 0s or 1s multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> { let Predicates = [HasAVX512] in - let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in + let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1, + SchedRW = [WriteZero] in def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "", [(set KRC:$dst, (VT Val))]>; } @@ -3189,21 +3087,48 @@ defm : operation_subvector_mask_lowering<VK16, v16i1, VK64, v64i1>; defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>; -def : Pat<(v2i1 (extract_subvector (v4i1 VK4:$src), (iPTR 2))), - (v2i1 (COPY_TO_REGCLASS - (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), (i8 2)), - VK2))>; -def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 4))), - (v4i1 (COPY_TO_REGCLASS - (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (i8 4)), - VK4))>; -def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))), - (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>; -def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))), - (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>; -def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))), - (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>; +multiclass vextract_for_mask_to_mask<string InstrStr, X86KVectorVTInfo From, + X86KVectorVTInfo To, Predicate prd> { +let Predicates = [prd] in + def : + Pat<(To.KVT(extract_subvector(From.KVT From.KRC:$src), (iPTR imm:$imm8))), + (To.KVT(COPY_TO_REGCLASS + (!cast<Instruction>(InstrStr#"ri") From.KVT:$src, + (i8 imm:$imm8)), To.KRC))>; +} + +multiclass vextract_for_mask_to_mask_legal_w<X86KVectorVTInfo From, + X86KVectorVTInfo To> { +def : + Pat<(To.KVT(extract_subvector(From.KVT From.KRC:$src), (iPTR imm:$imm8))), + (To.KVT(COPY_TO_REGCLASS + (KSHIFTRWri(COPY_TO_REGCLASS From.KRC:$src, VK16), + (i8 imm:$imm8)), To.KRC))>; +} + +defm : vextract_for_mask_to_mask_legal_w<v2i1_info, v1i1_info>; +defm : vextract_for_mask_to_mask_legal_w<v4i1_info, v1i1_info>; +defm : vextract_for_mask_to_mask_legal_w<v8i1_info, v1i1_info>; +defm : vextract_for_mask_to_mask_legal_w<v4i1_info, v2i1_info>; +defm : vextract_for_mask_to_mask_legal_w<v8i1_info, v2i1_info>; +defm : vextract_for_mask_to_mask_legal_w<v8i1_info, v4i1_info>; + +defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v1i1_info, HasAVX512>; +defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v1i1_info, HasBWI>; +defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v1i1_info, HasBWI>; +defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v2i1_info, 
HasAVX512>; +defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v2i1_info, HasBWI>; +defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v2i1_info, HasBWI>; +defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v4i1_info, HasAVX512>; +defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v4i1_info, HasBWI>; +defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v4i1_info, HasBWI>; +defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v8i1_info, HasAVX512>; +defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v8i1_info, HasBWI>; +defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v8i1_info, HasBWI>; +defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v16i1_info, HasBWI>; +defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v16i1_info, HasBWI>; +defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v32i1_info, HasBWI>; // Patterns for kmask shift multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> { @@ -3227,39 +3152,40 @@ defm : mask_shift_lowering<VK2, v2i1>, Requires<[HasAVX512]>; // -multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - PatFrag ld_frag, PatFrag mload, - SDPatternOperator SelectOprr = vselect> { +multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins, + X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload, + bit NoRMPattern = 0, + SDPatternOperator SelectOprr = vselect> { let hasSideEffects = 0 in { def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [], - _.ExeDomain>, EVEX; + _.ExeDomain, itins.rr>, EVEX, Sched<[WriteMove]>; def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|", "${dst} {${mask}} {z}, $src}"), [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask, (_.VT _.RC:$src), - _.ImmAllZerosV)))], _.ExeDomain>, - EVEX, EVEX_KZ; + _.ImmAllZerosV)))], _.ExeDomain, + itins.rr>, EVEX, EVEX_KZ, Sched<[WriteMove]>; - let canFoldAsLoad = 1, isReMaterializable = 1, - SchedRW = [WriteLoad] in + let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1 in def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))], - _.ExeDomain>, EVEX; + !if(NoRMPattern, [], + [(set _.RC:$dst, + (_.VT (bitconvert (ld_frag addr:$src))))]), + _.ExeDomain, itins.rm>, EVEX, Sched<[WriteLoad]>; let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in { - def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1), - !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", - "${dst} {${mask}}, $src1}"), - [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask, - (_.VT _.RC:$src1), - (_.VT _.RC:$src0))))], _.ExeDomain>, - EVEX, EVEX_K; - let SchedRW = [WriteLoad] in + def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1), + !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", + "${dst} {${mask}}, $src1}"), + [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask, + (_.VT _.RC:$src1), + (_.VT _.RC:$src0))))], _.ExeDomain, + itins.rr>, EVEX, EVEX_K, Sched<[WriteMove]>; def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1), !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|", @@ -3267,16 +3193,16 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, [(set _.RC:$dst, (_.VT (vselect 
_.KRCWM:$mask, (_.VT (bitconvert (ld_frag addr:$src1))), - (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K; + (_.VT _.RC:$src0))))], _.ExeDomain, itins.rm>, + EVEX, EVEX_K, Sched<[WriteLoad]>; } - let SchedRW = [WriteLoad] in def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.MemOp:$src), OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"# "${dst} {${mask}} {z}, $src}", [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask, (_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))], - _.ExeDomain>, EVEX, EVEX_KZ; + _.ExeDomain, itins.rm>, EVEX, EVEX_KZ, Sched<[WriteLoad]>; } def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)), (!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>; @@ -3293,59 +3219,72 @@ multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag, - masked_load_aligned512>, EVEX_V512; + defm Z : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info512, + _.info512.AlignedLdFrag, masked_load_aligned512>, + EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag, - masked_load_aligned256>, EVEX_V256; - defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag, - masked_load_aligned128>, EVEX_V128; + defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info256, + _.info256.AlignedLdFrag, masked_load_aligned256>, + EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info128, + _.info128.AlignedLdFrag, masked_load_aligned128>, + EVEX_V128; } } multiclass avx512_load_vl<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, + bit NoRMPattern = 0, SDPatternOperator SelectOprr = vselect> { let Predicates = [prd] in - defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag, - masked_load_unaligned, SelectOprr>, EVEX_V512; + defm Z : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info512, _.info512.LdFrag, + masked_load_unaligned, NoRMPattern, + SelectOprr>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag, - masked_load_unaligned, SelectOprr>, EVEX_V256; - defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag, - masked_load_unaligned, SelectOprr>, EVEX_V128; + defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info256, _.info256.LdFrag, + masked_load_unaligned, NoRMPattern, + SelectOprr>, EVEX_V256; + defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info128, _.info128.LdFrag, + masked_load_unaligned, NoRMPattern, + SelectOprr>, EVEX_V128; } } -multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - PatFrag st_frag, PatFrag mstore, string Name> { - +multiclass avx512_store<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins, + X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore, + string Name, bit NoMRPattern = 0> { let hasSideEffects = 0 in { def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr # ".s\t{$src, $dst|$dst, $src}", - [], _.ExeDomain>, EVEX, FoldGenData<Name#rr>; + [], _.ExeDomain, itins.rr>, EVEX, FoldGenData<Name#rr>, + Sched<[WriteMove]>; def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"# "${dst} {${mask}}, $src}", - [], _.ExeDomain>, EVEX, EVEX_K, FoldGenData<Name#rrk>; + [], _.ExeDomain, itins.rr>, EVEX, EVEX_K, + FoldGenData<Name#rrk>, Sched<[WriteMove]>; def rrkz_REV 
: AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.RC:$src), OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" # "${dst} {${mask}} {z}, $src}", - [], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData<Name#rrkz>; + [], _.ExeDomain, itins.rr>, EVEX, EVEX_KZ, + FoldGenData<Name#rrkz>, Sched<[WriteMove]>; } + let hasSideEffects = 0, mayStore = 1 in def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX; + !if(NoMRPattern, [], + [(st_frag (_.VT _.RC:$src), addr:$dst)]), + _.ExeDomain, itins.mr>, EVEX, Sched<[WriteStore]>; def mrk : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}", - [], _.ExeDomain>, EVEX, EVEX_K; + [], _.ExeDomain, itins.mr>, EVEX, EVEX_K, Sched<[WriteStore]>; def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)), (!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr, @@ -3355,16 +3294,18 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, multiclass avx512_store_vl< bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, - string Name> { + string Name, bit NoMRPattern = 0> { let Predicates = [prd] in - defm Z : avx512_store<opc, OpcodeStr, _.info512, store, - masked_store_unaligned, Name#Z>, EVEX_V512; + defm Z : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info512, store, + masked_store_unaligned, Name#Z, NoMRPattern>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store, - masked_store_unaligned, Name#Z256>, EVEX_V256; - defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store, - masked_store_unaligned, Name#Z128>, EVEX_V128; + defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info256, store, + masked_store_unaligned, Name#Z256, + NoMRPattern>, EVEX_V256; + defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info128, store, + masked_store_unaligned, Name#Z128, + NoMRPattern>, EVEX_V128; } } @@ -3372,13 +3313,13 @@ multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _, Predicate prd, string Name> { let Predicates = [prd] in - defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512, + defm Z : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info512, alignedstore, masked_store_aligned512, Name#Z>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256, + defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info256, alignedstore, masked_store_aligned256, Name#Z256>, EVEX_V256; - defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore, + defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info128, alignedstore, masked_store_aligned128, Name#Z128>, EVEX_V128; } } @@ -3396,13 +3337,13 @@ defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512, - null_frag>, + 0, null_frag>, avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512, "VMOVUPS">, PS, EVEX_CD8<32, CD8VF>; defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512, - null_frag>, + 0, null_frag>, avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512, "VMOVUPD">, PD, VEX_W, EVEX_CD8<64, CD8VF>; @@ -3419,24 +3360,24 @@ defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info, HasAVX512, "VMOVDQA64">, PD, VEX_W, 
EVEX_CD8<64, CD8VF>; -defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>, +defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, 1>, avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info, - HasBWI, "VMOVDQU8">, + HasBWI, "VMOVDQU8", 1>, XD, EVEX_CD8<8, CD8VF>; -defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>, +defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, 1>, avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info, - HasBWI, "VMOVDQU16">, + HasBWI, "VMOVDQU16", 1>, XD, VEX_W, EVEX_CD8<16, CD8VF>; defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512, - null_frag>, + 0, null_frag>, avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info, HasAVX512, "VMOVDQU32">, XS, EVEX_CD8<32, CD8VF>; defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, - null_frag>, + 0, null_frag>, avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info, HasAVX512, "VMOVDQU64">, XS, VEX_W, EVEX_CD8<64, CD8VF>; @@ -3447,24 +3388,24 @@ defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512, let isReMaterializable = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in { def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_RM>; def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_RM>; def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_RM>; def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_RM>; } -let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in { +let isPseudo = 1, SchedRW = [WriteStore], mayStore = 1, hasSideEffects = 0 in { def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_MR>; def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_MR>; def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_MR>; def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src), - "", []>; + "", [], IIC_SSE_MOVA_P_MR>; } def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)), @@ -3511,8 +3452,20 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1), sub_ymm)>; } -let Predicates = [HasVLX, NoBWI] in { - // 128-bit load/store without BWI. +let Predicates = [HasAVX512] in { + // 512-bit store. + def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst), + (VMOVDQA32Zmr addr:$dst, VR512:$src)>; + def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst), + (VMOVDQA32Zmr addr:$dst, VR512:$src)>; + def : Pat<(store (v32i16 VR512:$src), addr:$dst), + (VMOVDQU32Zmr addr:$dst, VR512:$src)>; + def : Pat<(store (v64i8 VR512:$src), addr:$dst), + (VMOVDQU32Zmr addr:$dst, VR512:$src)>; +} + +let Predicates = [HasVLX] in { + // 128-bit store. def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst), (VMOVDQA32Z128mr addr:$dst, VR128X:$src)>; def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst), @@ -3522,10 +3475,10 @@ let Predicates = [HasVLX, NoBWI] in { def : Pat<(store (v16i8 VR128X:$src), addr:$dst), (VMOVDQU32Z128mr addr:$dst, VR128X:$src)>; - // 256-bit load/store without BWI. - def : Pat<(alignedstore256 (v16i16 VR256X:$src), addr:$dst), + // 256-bit store. 
+ def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst), (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>; - def : Pat<(alignedstore256 (v32i8 VR256X:$src), addr:$dst), + def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst), (VMOVDQA32Z256mr addr:$dst, VR256X:$src)>; def : Pat<(store (v16i16 VR256X:$src), addr:$dst), (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>; @@ -3533,129 +3486,75 @@ let Predicates = [HasVLX, NoBWI] in { (VMOVDQU32Z256mr addr:$dst, VR256X:$src)>; } -let Predicates = [HasVLX] in { - // Special patterns for storing subvector extracts of lower 128-bits of 256. - // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - def : Pat<(alignedstore (v2f64 (extract_subvector - (v4f64 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4f32 (extract_subvector - (v8f32 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(alignedstore (v2i64 (extract_subvector - (v4i64 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4i32 (extract_subvector - (v8i32 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(alignedstore (v8i16 (extract_subvector - (v16i16 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(alignedstore (v16i8 (extract_subvector - (v32i8 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - - def : Pat<(store (v2f64 (extract_subvector - (v4f64 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(store (v4f32 (extract_subvector - (v8f32 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(store (v2i64 (extract_subvector - (v4i64 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(store (v4i32 (extract_subvector - (v8i32 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(store (v8i16 (extract_subvector - (v16i16 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - def : Pat<(store (v16i8 (extract_subvector - (v32i8 VR256X:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>; - - // Special patterns for storing subvector extracts of lower 128-bits of 512. 
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - def : Pat<(alignedstore (v2f64 (extract_subvector - (v8f64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4f32 (extract_subvector - (v16f32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(alignedstore (v2i64 (extract_subvector - (v8i64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4i32 (extract_subvector - (v16i32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(alignedstore (v8i16 (extract_subvector - (v32i16 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(alignedstore (v16i8 (extract_subvector - (v64i8 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - - def : Pat<(store (v2f64 (extract_subvector - (v8f64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(store (v4f32 (extract_subvector - (v16f32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(store (v2i64 (extract_subvector - (v8i64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(store (v4i32 (extract_subvector - (v16i32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(store (v8i16 (extract_subvector - (v32i16 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - def : Pat<(store (v16i8 (extract_subvector - (v64i8 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>; - - // Special patterns for storing subvector extracts of lower 256-bits of 512. 
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - def : Pat<(alignedstore256 (v4f64 (extract_subvector - (v8f64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVAPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore256 (v8f32 (extract_subvector - (v16f32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVAPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore256 (v4i64 (extract_subvector - (v8i64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore256 (v8i32 (extract_subvector - (v16i32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore256 (v16i16 (extract_subvector - (v32i16 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(alignedstore256 (v32i8 (extract_subvector - (v64i8 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQA32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - - def : Pat<(store (v4f64 (extract_subvector - (v8f64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVUPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(store (v8f32 (extract_subvector - (v16f32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVUPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(store (v4i64 (extract_subvector - (v8i64 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(store (v8i32 (extract_subvector - (v16i32 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(store (v16i16 (extract_subvector - (v32i16 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; - def : Pat<(store (v32i8 (extract_subvector - (v64i8 VR512:$src), (iPTR 0))), addr:$dst), - (VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>; +multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From, + X86VectorVTInfo To, X86VectorVTInfo Cast> { + def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, + (bitconvert + (To.VT (extract_subvector + (From.VT From.RC:$src), (iPTR 0)))), + To.RC:$src0)), + (Cast.VT (!cast<Instruction>(InstrStr#"rrk") + Cast.RC:$src0, Cast.KRCWM:$mask, + (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx)))>; + + def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask, + (bitconvert + (To.VT (extract_subvector + (From.VT From.RC:$src), (iPTR 0)))), + Cast.ImmAllZerosV)), + (Cast.VT (!cast<Instruction>(InstrStr#"rrkz") + Cast.KRCWM:$mask, + (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx)))>; } +let Predicates = [HasVLX] in { +// A masked extract from the first 128-bits of a 256-bit vector can be +// implemented with masked move. 
+defm : masked_move_for_extract<"VMOVDQA64Z128", v4i64x_info, v2i64x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v8i32x_info, v4i32x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v16i16x_info, v8i16x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v32i8x_info, v16i8x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v4i64x_info, v2i64x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v8i32x_info, v4i32x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v16i16x_info, v8i16x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v32i8x_info, v16i8x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVAPDZ128", v4f64x_info, v2f64x_info, v2f64x_info>;
+defm : masked_move_for_extract<"VMOVAPDZ128", v8f32x_info, v4f32x_info, v2f64x_info>;
+defm : masked_move_for_extract<"VMOVAPSZ128", v4f64x_info, v2f64x_info, v4f32x_info>;
+defm : masked_move_for_extract<"VMOVAPSZ128", v8f32x_info, v4f32x_info, v4f32x_info>;
+
+// A masked extract from the first 128-bits of a 512-bit vector can be
+// implemented with masked move.
+defm : masked_move_for_extract<"VMOVDQA64Z128", v8i64_info, v2i64x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v16i32_info, v4i32x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v32i16_info, v8i16x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v64i8_info, v16i8x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v8i64_info, v2i64x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v16i32_info, v4i32x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v32i16_info, v8i16x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v64i8_info, v16i8x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVAPDZ128", v8f64_info, v2f64x_info, v2f64x_info>;
+defm : masked_move_for_extract<"VMOVAPDZ128", v16f32_info, v4f32x_info, v2f64x_info>;
+defm : masked_move_for_extract<"VMOVAPSZ128", v8f64_info, v2f64x_info, v4f32x_info>;
+defm : masked_move_for_extract<"VMOVAPSZ128", v16f32_info, v4f32x_info, v4f32x_info>;
+
+// A masked extract from the first 256-bits of a 512-bit vector can be
+// implemented with masked move.
+defm : masked_move_for_extract<"VMOVDQA64Z256", v8i64_info, v4i64x_info, v4i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z256", v16i32_info, v8i32x_info, v4i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z256", v32i16_info, v16i16x_info, v4i64x_info>; +defm : masked_move_for_extract<"VMOVDQA64Z256", v64i8_info, v32i8x_info, v4i64x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z256", v8i64_info, v4i64x_info, v8i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z256", v16i32_info, v8i32x_info, v8i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z256", v32i16_info, v16i16x_info, v8i32x_info>; +defm : masked_move_for_extract<"VMOVDQA32Z256", v64i8_info, v32i8x_info, v8i32x_info>; +defm : masked_move_for_extract<"VMOVAPDZ256", v8f64_info, v4f64x_info, v4f64x_info>; +defm : masked_move_for_extract<"VMOVAPDZ256", v16f32_info, v8f32x_info, v4f64x_info>; +defm : masked_move_for_extract<"VMOVAPSZ256", v8f64_info, v4f64x_info, v8f32x_info>; +defm : masked_move_for_extract<"VMOVAPSZ256", v16f32_info, v8f32x_info, v8f32x_info>; +} + // Move Int Doubleword to Packed Double Int // let ExeDomain = SSEPackedInt in { @@ -3663,22 +3562,22 @@ def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>, - EVEX; + EVEX, Sched<[WriteMove]>; def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v4i32 (scalar_to_vector (loadi32 addr:$src))))], - IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteLoad]>; def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector GR64:$src)))], - IIC_SSE_MOVDQ>, EVEX, VEX_W; + IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i64mem:$src), - "vmovq\t{$src, $dst|$dst, $src}", []>, - EVEX, VEX_W, EVEX_CD8<64, CD8VT1>; + "vmovq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, + EVEX, VEX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteLoad]>; let isCodeGenOnly = 1 in { def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src), "vmovq\t{$src, $dst|$dst, $src}", @@ -3687,7 +3586,7 @@ def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src) def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>, - EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteLoad]>; def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (bitconvert FR64X:$src))], @@ -3706,12 +3605,12 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in { def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert GR32:$src))], - IIC_SSE_MOVDQ>, EVEX; + IIC_SSE_MOVDQ>, EVEX, Sched<[WriteMove]>; def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))], - IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, 
Sched<[WriteLoad]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 // Move doubleword from xmm register to r/m32 @@ -3721,13 +3620,13 @@ def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$s "vmovd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (extractelt (v4i32 VR128X:$src), (iPTR 0)))], IIC_SSE_MOVD_ToGP>, - EVEX; + EVEX, Sched<[WriteMove]>; def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(store (i32 (extractelt (v4i32 VR128X:$src), (iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>, - EVEX, EVEX_CD8<32, CD8VT1>; + EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteStore]>; } // ExeDomain = SSEPackedInt // Move quadword from xmm1 register to r/m64 @@ -3737,13 +3636,13 @@ def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (extractelt (v2i64 VR128X:$src), (iPTR 0)))], - IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, + IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Sched<[WriteMove]>, Requires<[HasAVX512, In64BitMode]>; let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src), "vmovq\t{$src, $dst|$dst, $src}", - [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, + [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Sched<[WriteStore]>, Requires<[HasAVX512, In64BitMode]>; def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), @@ -3757,8 +3656,8 @@ def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs), let hasSideEffects = 0 in def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst), (ins VR128X:$src), - "vmovq.s\t{$src, $dst|$dst, $src}",[]>, - EVEX, VEX_W; + "vmovq.s\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>, + EVEX, VEX_W, Sched<[WriteMove]>; } // ExeDomain = SSEPackedInt // Move Scalar Single to Double Int @@ -3768,12 +3667,12 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (bitconvert FR32X:$src))], - IIC_SSE_MOVD_ToGP>, EVEX; + IIC_SSE_MOVD_ToGP>, EVEX, Sched<[WriteMove]>; def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, FR32X:$src), "vmovd\t{$src, $dst|$dst, $src}", [(store (i32 (bitconvert FR32X:$src)), addr:$dst)], - IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>; + IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteStore]>; } // ExeDomain = SSEPackedInt, isCodeGenOnly = 1 // Move Quadword Int to Packed Quadword Int @@ -3784,7 +3683,7 @@ def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), "vmovq\t{$src, $dst|$dst, $src}", [(set VR128X:$dst, (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, - EVEX, VEX_W, EVEX_CD8<8, CD8VT8>; + EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteLoad]>; } // ExeDomain = SSEPackedInt //===----------------------------------------------------------------------===// @@ -3794,57 +3693,54 @@ def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst), multiclass avx512_move_scalar<string asm, SDNode OpNode, X86VectorVTInfo _> { def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src1, _.FRC:$src2), + (ins _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, - (scalar_to_vector _.FRC:$src2))))], - _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V; + [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))], + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, Sched<[WriteMove]>; def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), - (ins 
_.KRCWM:$mask, _.RC:$src1, _.FRC:$src2), + (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|", "$dst {${mask}} {z}, $src1, $src2}"), [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask, - (_.VT (OpNode _.RC:$src1, - (scalar_to_vector _.FRC:$src2))), + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), _.ImmAllZerosV)))], - _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ; + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ, Sched<[WriteMove]>; let Constraints = "$src0 = $dst" in def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst), - (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2), + (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2), !strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|", "$dst {${mask}}, $src1, $src2}"), [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask, - (_.VT (OpNode _.RC:$src1, - (scalar_to_vector _.FRC:$src2))), + (_.VT (OpNode _.RC:$src1, _.RC:$src2)), (_.VT _.RC:$src0))))], - _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K; + _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K, Sched<[WriteMove]>; let canFoldAsLoad = 1, isReMaterializable = 1 in def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))], - _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX; + _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, Sched<[WriteLoad]>; let mayLoad = 1, hasSideEffects = 0 in { let Constraints = "$src0 = $dst" in def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src), !strconcat(asm, "\t{$src, $dst {${mask}}|", "$dst {${mask}}, $src}"), - [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K; + [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K, Sched<[WriteLoad]>; def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst), (ins _.KRCWM:$mask, _.ScalarMemOp:$src), !strconcat(asm, "\t{$src, $dst {${mask}} {z}|", "$dst {${mask}} {z}, $src}"), - [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ; + [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ, Sched<[WriteLoad]>; } def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src), !strconcat(asm, "\t{$src, $dst|$dst, $src}"), [(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>, - EVEX; + EVEX, Sched<[WriteStore]>; let mayStore = 1, hasSideEffects = 0 in def mrk: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src), !strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"), - [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K; + [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K, Sched<[WriteStore]>; } defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>, @@ -3862,21 +3758,21 @@ def : Pat<(_.VT (OpNode _.RC:$src0, (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), (_.EltVT _.FRC:$src1), (_.EltVT _.FRC:$src2))))))), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk) - (COPY_TO_REGCLASS _.FRC:$src2, _.RC), - (COPY_TO_REGCLASS GR32:$mask, VK1WM), - (_.VT _.RC:$src0), _.FRC:$src1), - _.RC)>; + (!cast<Instruction>(InstrStr#rrk) + (COPY_TO_REGCLASS _.FRC:$src2, _.RC), + (COPY_TO_REGCLASS GR32:$mask, VK1WM), + (_.VT _.RC:$src0), + (COPY_TO_REGCLASS _.FRC:$src1, _.RC))>; def : Pat<(_.VT (OpNode _.RC:$src0, (_.VT (scalar_to_vector (_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))), (_.EltVT _.FRC:$src1), (_.EltVT ZeroFP))))))), - (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz) - (COPY_TO_REGCLASS GR32:$mask, VK1WM), - (_.VT _.RC:$src0), 
_.FRC:$src1), - _.RC)>; + (!cast<Instruction>(InstrStr#rrkz) + (COPY_TO_REGCLASS GR32:$mask, VK1WM), + (_.VT _.RC:$src0), + (COPY_TO_REGCLASS _.FRC:$src1, _.RC))>; } multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _, @@ -3982,13 +3878,33 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info, defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info, (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>; +def : Pat<(f32 (X86selects (scalar_to_vector (and GR8:$mask, (i8 1))), + (f32 FR32X:$src1), (f32 FR32X:$src2))), + (COPY_TO_REGCLASS + (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), + GR8:$mask, sub_8bit)), VK1WM), + (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src1, VR128X)), + FR32X)>; + def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))), (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X), - VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>; + VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>; + +def : Pat<(f64 (X86selects (scalar_to_vector (and GR8:$mask, (i8 1))), + (f64 FR64X:$src1), (f64 FR64X:$src2))), + (COPY_TO_REGCLASS + (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), + (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), + GR8:$mask, sub_8bit)), VK1WM), + (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), + FR64X)>; def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))), (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X), - VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>; + VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), + (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>; def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), (VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM), @@ -3996,63 +3912,60 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask), let hasSideEffects = 0 in { def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src1, FR32X:$src2), + (ins VR128X:$src1, VR128X:$src2), "vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], NoItinerary>, XS, EVEX_4V, VEX_LIG, - FoldGenData<"VMOVSSZrr">; + [], IIC_SSE_MOV_S_RR>, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrr">, Sched<[WriteMove]>; let Constraints = "$src0 = $dst" in def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask, - VR128X:$src1, FR32X:$src2), + VR128X:$src1, VR128X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", - [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG, - FoldGenData<"VMOVSSZrrk">; + [], IIC_SSE_MOV_S_RR>, EVEX_K, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrk">, Sched<[WriteMove]>; def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2), + (ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2), "vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, - FoldGenData<"VMOVSSZrrkz">; + [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XS, EVEX_4V, VEX_LIG, + FoldGenData<"VMOVSSZrrkz">, Sched<[WriteMove]>; def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), - (ins VR128X:$src1, FR64X:$src2), + (ins VR128X:$src1, VR128X:$src2), "vmovsd.s\t{$src2, $src1, 
$dst|$dst, $src1, $src2}", - [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W, - FoldGenData<"VMOVSDZrr">; + [], IIC_SSE_MOV_S_RR>, XD, EVEX_4V, VEX_LIG, VEX_W, + FoldGenData<"VMOVSDZrr">, Sched<[WriteMove]>; let Constraints = "$src0 = $dst" in def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask, - VR128X:$src1, FR64X:$src2), + VR128X:$src1, VR128X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}}|"# "$dst {${mask}}, $src1, $src2}", - [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG, - VEX_W, FoldGenData<"VMOVSDZrrk">; + [], IIC_SSE_MOV_S_RR>, EVEX_K, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrk">, Sched<[WriteMove]>; def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst), (ins f64x_info.KRCWM:$mask, VR128X:$src1, - FR64X:$src2), + VR128X:$src2), "vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"# "$dst {${mask}} {z}, $src1, $src2}", - [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, - VEX_W, FoldGenData<"VMOVSDZrrkz">; + [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XD, EVEX_4V, VEX_LIG, + VEX_W, FoldGenData<"VMOVSDZrrkz">, Sched<[WriteMove]>; } let Predicates = [HasAVX512] in { let AddedComplexity = 15 in { - // Move scalar to XMM zero-extended, zeroing a VR128X then do a - // MOVS{S,D} to the lower bits. - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))), - (VMOVSSZrr (v4f32 (AVX512_128_SET0)), FR32X:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))), - (VMOVSSZrr (v4f32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))), - (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>; + (VMOVSSZrr (v4i32 (AVX512_128_SET0)), VR128X:$src)>; def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))), - (VMOVSDZrr (v2f64 (AVX512_128_SET0)), FR64X:$src)>; + (VMOVSDZrr (v2f64 (AVX512_128_SET0)), + (COPY_TO_REGCLASS FR64X:$src, VR128))>; } // Move low f32 and clear high bits. 
@@ -4130,14 +4043,6 @@ let Predicates = [HasAVX512] in { def : Pat<(v8f64 (X86vzload addr:$src)), (SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>; } - def : Pat<(v8f32 (X86vzmovl (insert_subvector undef, - (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)), - FR32X:$src)), sub_xmm)>; - def : Pat<(v4f64 (X86vzmovl (insert_subvector undef, - (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))), - (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)), - FR64X:$src)), sub_xmm)>; def : Pat<(v4i64 (X86vzmovl (insert_subvector undef, (v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))), (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>; @@ -4166,50 +4071,23 @@ let Predicates = [HasAVX512] in { // Shuffle with VMOVSS def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)), - (VMOVSSZrr (v4i32 VR128X:$src1), - (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>; - def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)), - (VMOVSSZrr (v4f32 VR128X:$src1), - (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>; - - // 256-bit variants - def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)), - (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm), - (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)), - sub_xmm)>; - def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)), - (SUBREG_TO_REG (i32 0), - (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm), - (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)), - sub_xmm)>; + (VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>; + + def : Pat<(v4f32 (X86Movss VR128X:$src1, (scalar_to_vector FR32X:$src2))), + (VMOVSSZrr VR128X:$src1, + (COPY_TO_REGCLASS FR32X:$src2, VR128X))>; // Shuffle with VMOVSD def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)), - (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; - def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)), - (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + (VMOVSDZrr VR128X:$src1, VR128X:$src2)>; - // 256-bit variants - def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)), - (SUBREG_TO_REG (i32 0), - (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm), - (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)), - sub_xmm)>; - def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)), - (SUBREG_TO_REG (i32 0), - (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm), - (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)), - sub_xmm)>; + def : Pat<(v2f64 (X86Movsd VR128X:$src1, (scalar_to_vector FR64X:$src2))), + (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS FR64X:$src2, VR128X))>; def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)), - (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; - def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)), - (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + (VMOVSDZrr VR128X:$src1, VR128X:$src2)>; def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)), - (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; - def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)), - (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>; + (VMOVSDZrr VR128X:$src1, VR128X:$src2)>; } let AddedComplexity = 15 in @@ -4337,12 +4215,6 @@ let Predicates = [HasAVX512], AddedComplexity = 400 in { (VMOVNTDQAZrm addr:$src)>; def : Pat<(v8i64 (alignednontemporalload addr:$src)), (VMOVNTDQAZrm addr:$src)>; - def : Pat<(v16i32 (bitconvert (v8i64 (alignednontemporalload 
addr:$src)))), - (VMOVNTDQAZrm addr:$src)>; - def : Pat<(v32i16 (bitconvert (v8i64 (alignednontemporalload addr:$src)))), - (VMOVNTDQAZrm addr:$src)>; - def : Pat<(v64i8 (bitconvert (v8i64 (alignednontemporalload addr:$src)))), - (VMOVNTDQAZrm addr:$src)>; } let Predicates = [HasVLX], AddedComplexity = 400 in { @@ -4359,12 +4231,6 @@ let Predicates = [HasVLX], AddedComplexity = 400 in { (VMOVNTDQAZ256rm addr:$src)>; def : Pat<(v4i64 (alignednontemporalload addr:$src)), (VMOVNTDQAZ256rm addr:$src)>; - def : Pat<(v8i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), - (VMOVNTDQAZ256rm addr:$src)>; - def : Pat<(v16i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), - (VMOVNTDQAZ256rm addr:$src)>; - def : Pat<(v32i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), - (VMOVNTDQAZ256rm addr:$src)>; def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst), (VMOVNTDQZ128mr addr:$dst, VR128X:$src)>; @@ -4379,12 +4245,6 @@ let Predicates = [HasVLX], AddedComplexity = 400 in { (VMOVNTDQAZ128rm addr:$src)>; def : Pat<(v2i64 (alignednontemporalload addr:$src)), (VMOVNTDQAZ128rm addr:$src)>; - def : Pat<(v4i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), - (VMOVNTDQAZ128rm addr:$src)>; - def : Pat<(v8i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), - (VMOVNTDQAZ128rm addr:$src)>; - def : Pat<(v16i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))), - (VMOVNTDQAZ128rm addr:$src)>; } //===----------------------------------------------------------------------===// @@ -4397,16 +4257,16 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), - itins.rr, IsCommutable>, - AVX512BIBase, EVEX_4V; + itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V, + Sched<[itins.Sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))), - itins.rm>, - AVX512BIBase, EVEX_4V; + itins.rm>, AVX512BIBase, EVEX_4V, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -4420,8 +4280,8 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT (OpNode _.RC:$src1, (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), - itins.rm>, - AVX512BIBase, EVEX_4V, EVEX_B; + itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -4473,14 +4333,16 @@ multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins, Predicate prd, bit IsCommutable = 0> { defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info, - itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>; + itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>, + VEX_WIG; } multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins, Predicate prd, bit IsCommutable = 0> { defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info, - itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>; + itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>, + VEX_WIG; } multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, @@ -4524,14 +4386,14 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2))), 
itins.rr, IsCommutable>, - AVX512BIBase, EVEX_4V; + AVX512BIBase, EVEX_4V, Sched<[itins.Sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Src.LdFrag addr:$src2)))), - itins.rm>, - AVX512BIBase, EVEX_4V; + itins.rm>, AVX512BIBase, EVEX_4V, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2), @@ -4541,8 +4403,8 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Brdct.VT (X86VBroadcast (_Brdct.ScalarLdFrag addr:$src2)))))), - itins.rm>, - AVX512BIBase, EVEX_4V, EVEX_B; + itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add, @@ -4603,7 +4465,8 @@ defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_ X86multishift, HasVBMI, 0>, T8PD; multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _Src, X86VectorVTInfo _Dst> { + X86VectorVTInfo _Src, X86VectorVTInfo _Dst, + OpndItins itins> { defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr, @@ -4611,57 +4474,60 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"##_Src.BroadcastStr, (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert (_Src.VT (X86VBroadcast - (_Src.ScalarLdFrag addr:$src2))))))>, - EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>; + (_Src.ScalarLdFrag addr:$src2)))))), + itins.rm>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,X86VectorVTInfo _Src, - X86VectorVTInfo _Dst, bit IsCommutable = 0> { + X86VectorVTInfo _Dst, OpndItins itins, + bit IsCommutable = 0> { defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr, "$src2, $src1","$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (_Src.VT _Src.RC:$src2))), - NoItinerary, IsCommutable>, - EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V; + itins.rr, IsCommutable>, + EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[itins.Sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst), (ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), - (bitconvert (_Src.LdFrag addr:$src2))))>, - EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>; + (bitconvert (_Src.LdFrag addr:$src2)))), itins.rm>, + EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info, - v32i16_info>, + v32i16_info, SSE_PACK>, avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info, - v32i16_info>, EVEX_V512; + v32i16_info, SSE_PACK>, EVEX_V512; let Predicates = [HasBWI, HasVLX] in { defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info, - v16i16x_info>, + v16i16x_info, SSE_PACK>, avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info, - v16i16x_info>, EVEX_V256; + v16i16x_info, SSE_PACK>, EVEX_V256; defm NAME#Z128 : avx512_packs_rm<opc, 
OpcodeStr, OpNode, v4i32x_info, - v8i16x_info>, + v8i16x_info, SSE_PACK>, avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info, - v8i16x_info>, EVEX_V128; + v8i16x_info, SSE_PACK>, EVEX_V128; } } multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasBWI] in defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info, - v64i8_info>, EVEX_V512; + v64i8_info, SSE_PACK>, EVEX_V512, VEX_WIG; let Predicates = [HasBWI, HasVLX] in { defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info, - v32i8x_info>, EVEX_V256; + v32i8x_info, SSE_PACK>, EVEX_V256, VEX_WIG; defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info, - v16i8x_info>, EVEX_V128; + v16i8x_info, SSE_PACK>, EVEX_V128, VEX_WIG; } } @@ -4670,12 +4536,12 @@ multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr, AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> { let Predicates = [HasBWI] in defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512, - _Dst.info512, IsCommutable>, EVEX_V512; + _Dst.info512, SSE_PMADD, IsCommutable>, EVEX_V512; let Predicates = [HasBWI, HasVLX] in { defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256, - _Dst.info256, IsCommutable>, EVEX_V256; + _Dst.info256, SSE_PMADD, IsCommutable>, EVEX_V256; defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128, - _Dst.info128, IsCommutable>, EVEX_V128; + _Dst.info128, SSE_PMADD, IsCommutable>, EVEX_V128; } } @@ -4685,9 +4551,9 @@ defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512B defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase; defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw, - avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD; + avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD, VEX_WIG; defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd, - avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase; + avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase, VEX_WIG; defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax, SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD; @@ -4734,90 +4600,135 @@ let Predicates = [HasDQI, NoVLX] in { sub_xmm)>; } +// PMULLQ: Use 512bit version to implement 128/256 bit in case NoVLX. 
+let Predicates = [HasDQI, NoVLX] in { + def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), + (EXTRACT_SUBREG + (VPMULLQZrr + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))), + (EXTRACT_SUBREG + (VPMULLQZrr + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; +} + +multiclass avx512_min_max_lowering<Instruction Instr, SDNode OpNode> { + def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)), + (EXTRACT_SUBREG + (Instr + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)), + sub_ymm)>; + + def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)), + (EXTRACT_SUBREG + (Instr + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm), + (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)), + sub_xmm)>; +} + +let Predicates = [HasAVX512] in { + defm : avx512_min_max_lowering<VPMAXUQZrr, umax>; + defm : avx512_min_max_lowering<VPMINUQZrr, umin>; + defm : avx512_min_max_lowering<VPMAXSQZrr, smax>; + defm : avx512_min_max_lowering<VPMINSQZrr, smin>; +} + //===----------------------------------------------------------------------===// // AVX-512 Logical Instructions //===----------------------------------------------------------------------===// -multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, bit IsCommutable = 0> { +// OpNodeMsk is the OpNode to use when element size is important. OpNode will +// be set to null_frag for 32-bit elements. +multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode, + SDNode OpNodeMsk, OpndItins itins, X86VectorVTInfo _, + bit IsCommutable = 0> { + let hasSideEffects = 0 in defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)), (bitconvert (_.VT _.RC:$src2)))), - (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1, - _.RC:$src2)))), - IIC_SSE_BIT_P_RR, IsCommutable>, - AVX512BIBase, EVEX_4V; + (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1, + _.RC:$src2)))), + itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V, + Sched<[itins.Sched]>; + let hasSideEffects = 0, mayLoad = 1 in defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)), (bitconvert (_.LdFrag addr:$src2)))), - (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1, + (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))))), - IIC_SSE_BIT_P_RM>, - AVX512BIBase, EVEX_4V; + itins.rm>, AVX512BIBase, EVEX_4V, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } -multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, bit IsCommutable = 0> : - avx512_logic_rm<opc, OpcodeStr, OpNode, _, IsCommutable> { +// OpNodeMsk is the OpNode to use where element size is important. So use +// for all of the broadcast patterns. 
+multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode, + SDNode OpNodeMsk, OpndItins itins, X86VectorVTInfo _, + bit IsCommutable = 0> : + avx512_logic_rm<opc, OpcodeStr, OpNode, OpNodeMsk, itins, _, + IsCommutable> { defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, - (_.i64VT (OpNode _.RC:$src1, + (_.i64VT (OpNodeMsk _.RC:$src1, (bitconvert (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))), - (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1, + (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1, (bitconvert (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))))))), - IIC_SSE_BIT_P_RM>, - AVX512BIBase, EVEX_4V, EVEX_B; + itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } -multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, +multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, + SDPatternOperator OpNode, + SDNode OpNodeMsk, OpndItins itins, AVX512VLVectorVTInfo VTInfo, bit IsCommutable = 0> { let Predicates = [HasAVX512] in - defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512, - IsCommutable>, EVEX_V512; + defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins, + VTInfo.info512, IsCommutable>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256, - IsCommutable>, EVEX_V256; - defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128, - IsCommutable>, EVEX_V128; + defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins, + VTInfo.info256, IsCommutable>, EVEX_V256; + defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins, + VTInfo.info128, IsCommutable>, EVEX_V128; } } -multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode, - bit IsCommutable = 0> { - defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info, - IsCommutable>, EVEX_CD8<32, CD8VF>; -} - -multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode, - bit IsCommutable = 0> { - defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info, - IsCommutable>, - VEX_W, EVEX_CD8<64, CD8VF>; -} - multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, - SDNode OpNode, bit IsCommutable = 0> { - defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, IsCommutable>; - defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, IsCommutable>; + SDNode OpNode, OpndItins itins, + bit IsCommutable = 0> { + defm Q : avx512_logic_rmb_vl<opc_q, OpcodeStr#"q", OpNode, OpNode, itins, + avx512vl_i64_info, IsCommutable>, + VEX_W, EVEX_CD8<64, CD8VF>; + defm D : avx512_logic_rmb_vl<opc_d, OpcodeStr#"d", null_frag, OpNode, itins, + avx512vl_i32_info, IsCommutable>, + EVEX_CD8<32, CD8VF>; } -defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, 1>; -defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, 1>; -defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, 1>; -defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp>; +defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, SSE_BIT_ITINS_P, 1>; +defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, SSE_BIT_ITINS_P, 1>; +defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, SSE_BIT_ITINS_P, 1>; +defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, SSE_BIT_ITINS_P>; 
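Per the comments in the hunk above, the reworked logical multiclasses pass null_frag as the unmasked OpNode for the 32-bit element forms and keep OpNodeMsk for the masked and broadcast patterns, since element width only matters once a write-mask or embedded broadcast is involved. A short intrinsic-level sketch of that distinction, assuming standard AVX-512 intrinsics (illustration only, not part of the patch):

#include <immintrin.h>

// Unmasked: vpandd and vpandq compute identical bits, so one element-size-
// agnostic selection pattern is enough.
static __m512i and_unmasked(__m512i a, __m512i b) {
  return _mm512_and_epi32(a, b);                // same bits as _mm512_and_epi64
}

// Masked: the write-mask merges per 32-bit lane, so the d-form pattern must
// stay element-size aware (this is what OpNodeMsk preserves).
static __m512i and_masked(__m512i src, __mmask16 k, __m512i a, __m512i b) {
  return _mm512_mask_and_epi32(src, k, a, b);
}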
//===----------------------------------------------------------------------===// // AVX-512 FP arithmetic @@ -4831,7 +4742,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, "$src2, $src1", "$src1, $src2", (_.VT (VecNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT))), - itins.rr>; + itins.rr>, Sched<[itins.Sched]>; defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, @@ -4839,20 +4750,21 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (_.VT (VecNode _.RC:$src1, _.ScalarIntMemCPat:$src2, (i32 FROUND_CURRENT))), - itins.rm>; + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; let isCodeGenOnly = 1, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], - itins.rr> { + itins.rr>, Sched<[itins.Sched]> { let isCommutable = IsCommutable; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, - (_.ScalarLdFrag addr:$src2)))], itins.rm>; + (_.ScalarLdFrag addr:$src2)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } } @@ -4860,12 +4772,12 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode VecNode, OpndItins itins, bit IsCommutable = 0> { let ExeDomain = _.ExeDomain in - defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$rc)), itins.rr, IsCommutable>, - EVEX_B, EVEX_RC; + EVEX_B, EVEX_RC, Sched<[itins.Sched]>; } multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, SDNode OpNode, SDNode VecNode, SDNode SaeNode, @@ -4875,35 +4787,37 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (VecNode _.RC:$src1, _.RC:$src2)), - itins.rr>; + itins.rr>, Sched<[itins.Sched]>; defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (VecNode _.RC:$src1, _.ScalarIntMemCPat:$src2)), - itins.rm>; + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; let isCodeGenOnly = 1, Predicates = [HasAVX512] in { def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], - itins.rr> { + itins.rr>, Sched<[itins.Sched]> { let isCommutable = IsCommutable; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, - (_.ScalarLdFrag addr:$src2)))], itins.rm>; + (_.ScalarLdFrag addr:$src2)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } - defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, 
"{sae}, $src2, $src1", "$src1, $src2, {sae}", (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_NO_EXC))>, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rr>, EVEX_B, + Sched<[itins.Sched]>; } } @@ -4950,14 +4864,15 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr, (ins _.FRC:$src1, _.FRC:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))], - itins.rr> { + itins.rr>, Sched<[itins.Sched]> { let isCommutable = 1; } def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set _.FRC:$dst, (OpNode _.FRC:$src1, - (_.ScalarLdFrag addr:$src2)))], itins.rm>; + (_.ScalarLdFrag addr:$src2)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc, @@ -4984,43 +4899,43 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr, - IsCommutable>, EVEX_4V; + IsCommutable>, EVEX_4V, Sched<[itins.Sched]>; let mayLoad = 1 in { defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode _.RC:$src1, (_.LdFrag addr:$src2)), itins.rm>, - EVEX_4V; + EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (OpNode _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2)))), - itins.rm>, EVEX_4V, EVEX_B; + itins.rm>, EVEX_4V, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } } multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in - defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix, "$rc, $src2, $src1", "$src1, $src2, $rc", - (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>, - EVEX_4V, EVEX_B, EVEX_RC; + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc))), itins.rr>, + EVEX_4V, EVEX_B, EVEX_RC, Sched<[itins.Sched]>; } - multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in - defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "{sae}, $src2, $src1", "$src1, $src2, {sae}", - (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>, - EVEX_4V, EVEX_B; + (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC))), itins.rr>, + EVEX_4V, EVEX_B, Sched<[itins.Sched]>; } multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode, @@ -5052,36 +4967,38 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator Op } } -multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { - defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>, +multiclass avx512_fp_binop_p_round<bits<8> opc, string 
OpcodeStr, SDNode OpNodeRnd, + SizeItins itins> { + defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, itins.s, v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>, + defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, itins.d, v8f64_info>, EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; } -multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> { - defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>, +multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd, + SizeItins itins> { + defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, itins.s, v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>, + defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, itins.d, v8f64_info>, EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>; } defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512, SSE_ALU_ITINS_P, 1>, - avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>; + avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SSE_ALU_ITINS_P>; defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512, SSE_MUL_ITINS_P, 1>, - avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>; + avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SSE_MUL_ITINS_P>; defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, SSE_ALU_ITINS_P>, - avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>; + avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SSE_ALU_ITINS_P>; defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, SSE_DIV_ITINS_P>, - avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>; + avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SSE_DIV_ITINS_P>; defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512, SSE_ALU_ITINS_P, 0>, - avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>; + avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SSE_ALU_ITINS_P>; defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512, SSE_ALU_ITINS_P, 0>, - avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>; + avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SSE_ALU_ITINS_P>; let isCodeGenOnly = 1 in { defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512, SSE_ALU_ITINS_P, 1>; @@ -5202,65 +5119,69 @@ let Predicates = [HasVLX,HasDQI] in { } multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, EVEX_4V; + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT))), + itins.rr>, EVEX_4V, Sched<[itins.Sched]>; defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V; + (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT)), + itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (OpNode _.RC:$src1, (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>, - EVEX_4V, EVEX_B; + (_.ScalarLdFrag 
addr:$src2))), + (i32 FROUND_CURRENT)), itins.rm>, + EVEX_4V, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>; + (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT))), itins.rr>, + Sched<[itins.Sched]>; defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix, + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", - (OpNode _.RC:$src1, - (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), - (i32 FROUND_CURRENT))>; + (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2, + (i32 FROUND_CURRENT)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> { - defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>, - avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>, + defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v16f32_info>, + avx512_fp_round_packed<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>, - avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>, + defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v8f64_info>, + avx512_fp_round_packed<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; - defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f32x_info>, + defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, SSE_ALU_F32S, f32x_info>, avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, SSE_ALU_ITINS_S.s>, EVEX_4V,EVEX_CD8<32, CD8VT1>; - defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f64x_info>, + defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, SSE_ALU_F64S, f64x_info>, avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, SSE_ALU_ITINS_S.d>, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; // Define only if AVX512VL feature is present. 
let Predicates = [HasVLX] in { - defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>, + defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v4f32x_info>, EVEX_V128, EVEX_CD8<32, CD8VF>; - defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f32x_info>, + defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v8f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VF>; - defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v2f64x_info>, + defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v2f64x_info>, EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; - defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f64x_info>, + defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v4f64x_info>, EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; } } @@ -5271,31 +5192,35 @@ defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs //===----------------------------------------------------------------------===// multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in { let isCommutable = 1 in defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, - EVEX_4V; + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)), itins.rr>, + EVEX_4V, Sched<[itins.Sched]>; defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), - (_.VT (bitconvert (_.LdFrag addr:$src2))))>, - EVEX_4V, - EVEX_CD8<_.EltSize, CD8VF>; + (_.VT (bitconvert (_.LdFrag addr:$src2)))), itins.rm>, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + } } multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { + let ExeDomain = _.ExeDomain in defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr, (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast - (_.ScalarLdFrag addr:$src2))))>, - EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + (_.ScalarLdFrag addr:$src2)))), + itins.rm>, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } // Use 512bit version to implement 128/256 bit in case NoVLX. 
@@ -5312,16 +5237,17 @@ multiclass avx512_vptest_lowering<SDNode OpNode, X86VectorVTInfo ExtendInfo, } multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo _, string Suffix> { + OpndItins itins, AVX512VLVectorVTInfo _, + string Suffix> { let Predicates = [HasAVX512] in - defm Z : avx512_vptest<opc, OpcodeStr, OpNode, _.info512>, - avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + defm Z : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info512>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, _.info256>, - avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; - defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, _.info128>, - avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info256>, + avx512_vptest_mb<opc, OpcodeStr, OpNode,itins, _.info256>, EVEX_V256; + defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info128>, + avx512_vptest_mb<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128; } let Predicates = [HasAVX512, NoVLX] in { defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>; @@ -5329,30 +5255,31 @@ multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, } } -multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, +multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, itins, avx512vl_i32_info, "D">; - defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, + defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, itins, avx512vl_i64_info, "Q">, VEX_W; } multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { let Predicates = [HasBWI] in { - defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, v32i16_info>, + defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v32i16_info>, EVEX_V512, VEX_W; - defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, v64i8_info>, + defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v64i8_info>, EVEX_V512; } let Predicates = [HasVLX, HasBWI] in { - defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, v16i16x_info>, + defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v16i16x_info>, EVEX_V256, VEX_W; - defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, v8i16x_info>, + defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v8i16x_info>, EVEX_V128, VEX_W; - defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, v32i8x_info>, + defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v32i8x_info>, EVEX_V256; - defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, v16i8x_info>, + defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v16i8x_info>, EVEX_V128; } @@ -5362,151 +5289,165 @@ multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr, defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">; defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">; } - } multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr, - SDNode OpNode> : - avx512_vptest_wb <opc_wb, OpcodeStr, OpNode>, - avx512_vptest_dq<opc_dq, OpcodeStr, OpNode>; + SDNode OpNode, OpndItins itins> : + avx512_vptest_wb 
<opc_wb, OpcodeStr, OpNode, itins>, + avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, itins>; -defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm>, T8PD; -defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8XS; +defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm, + SSE_BIT_ITINS_P>, T8PD; +defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm, + SSE_BIT_ITINS_P>, T8XS; //===----------------------------------------------------------------------===// // AVX-512 Shift instructions //===----------------------------------------------------------------------===// multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + string OpcodeStr, SDNode OpNode, OpndItins itins, + X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (i8 imm:$src2))), - SSE_INTSHIFT_ITINS_P.rr>; + itins.rr>, Sched<[itins.Sched]>; defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), (i8 imm:$src2))), - SSE_INTSHIFT_ITINS_P.rm>; + itins.rm>, Sched<[itins.Sched.Folded]>; } } multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM, - string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> { + string OpcodeStr, SDNode OpNode, OpndItins itins, + X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr, "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", (_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))), - SSE_INTSHIFT_ITINS_P.rm>, EVEX_B; + itins.rm>, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> { + OpndItins itins, ValueType SrcVT, PatFrag bc_frag, + X86VectorVTInfo _> { // src2 is always 128-bit let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, VR128X:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))), - SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V; + itins.rr>, AVX512BIBase, EVEX_4V, Sched<[itins.Sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, i128mem:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))), - SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase, - EVEX_4V; + itins.rm>, AVX512BIBase, + EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, - ValueType SrcVT, PatFrag bc_frag, - AVX512VLVectorVTInfo VTInfo, Predicate prd> { + OpndItins itins, ValueType SrcVT, PatFrag bc_frag, + AVX512VLVectorVTInfo VTInfo, Predicate prd> { let Predicates = [prd] in - defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag, VTInfo.info512>, EVEX_V512, EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + defm Z256 : avx512_shift_rrm<opc, 
OpcodeStr, OpNode, itins, SrcVT, bc_frag, VTInfo.info256>, EVEX_V256, EVEX_CD8<VTInfo.info256.EltSize, CD8VH>; - defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag, + defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag, VTInfo.info128>, EVEX_V128, EVEX_CD8<VTInfo.info128.EltSize, CD8VF>; } } multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw, - string OpcodeStr, SDNode OpNode> { - defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32, - avx512vl_i32_info, HasAVX512>; - defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64, - avx512vl_i64_info, HasAVX512>, VEX_W; - defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16, - avx512vl_i16_info, HasBWI>; + string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, itins, v4i32, + bc_v4i32, avx512vl_i32_info, HasAVX512>; + defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, itins, v2i64, + bc_v2i64, avx512vl_i64_info, HasAVX512>, VEX_W; + defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, itins, v8i16, + bc_v2i64, avx512vl_i16_info, HasBWI>; } multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo> { + string OpcodeStr, SDNode OpNode, + OpndItins itins, AVX512VLVectorVTInfo VTInfo> { let Predicates = [HasAVX512] in - defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, itins, VTInfo.info512>, - avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins, VTInfo.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, + defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, itins, VTInfo.info256>, - avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins, VTInfo.info256>, EVEX_V256; defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, - VTInfo.info128>, - avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, + itins, VTInfo.info128>, + avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins, VTInfo.info128>, EVEX_V128; } } multiclass avx512_shift_rmi_w<bits<8> opcw, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode> { + string OpcodeStr, SDNode OpNode, + OpndItins itins> { let Predicates = [HasBWI] in defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, - v32i16_info>, EVEX_V512; + itins, v32i16_info>, EVEX_V512, VEX_WIG; let Predicates = [HasVLX, HasBWI] in { defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, - v16i16x_info>, EVEX_V256; + itins, v16i16x_info>, EVEX_V256, VEX_WIG; defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode, - v8i16x_info>, EVEX_V128; + itins, v8i16x_info>, EVEX_V128, VEX_WIG; } } multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq, Format ImmFormR, Format ImmFormM, - string OpcodeStr, SDNode OpNode> { + string OpcodeStr, SDNode OpNode, OpndItins itins> { defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode, - avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; + itins, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode, - avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; + itins, 
avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; } -defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>, - avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>, AVX512BIi8Base, EVEX_4V; +defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli, + SSE_INTSHIFT_P>, + avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli, + SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V; -defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>, - avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>, AVX512BIi8Base, EVEX_4V; +defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli, + SSE_INTSHIFT_P>, + avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli, + SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V; -defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>, - avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V; +defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai, + SSE_INTSHIFT_P>, + avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai, + SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V; -defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri>, AVX512BIi8Base, EVEX_4V; -defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli>, AVX512BIi8Base, EVEX_4V; +defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri, + SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V; +defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli, + SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V; -defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>; -defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>; -defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>; +defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, SSE_INTSHIFT_P>; +defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra, SSE_INTSHIFT_P>; +defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl, SSE_INTSHIFT_P>; // Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX. 
let Predicates = [HasAVX512, NoVLX] in { @@ -5539,25 +5480,27 @@ let Predicates = [HasAVX512, NoVLX] in { // Variable Bit Shifts //===-------------------------------------------------------------------===// multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))), - SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V; + itins.rr>, AVX5128IBase, EVEX_4V, + Sched<[itins.Sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2))))), - SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V, - EVEX_CD8<_.EltSize, CD8VF>; + itins.rm>, AVX5128IBase, EVEX_4V, + EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, @@ -5565,29 +5508,30 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src2))))), - SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + itins.rm>, AVX5128IBase, EVEX_B, + EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo _> { + OpndItins itins, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in - defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, - avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>, - avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; - defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>, - avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; + defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256; + defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info128>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128; } } multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr, - SDNode OpNode> { - defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, + SDNode OpNode, OpndItins itins> { + defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, itins, avx512vl_i32_info>; - defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, + defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, itins, avx512vl_i64_info>, VEX_W; } @@ -5613,30 +5557,30 @@ multiclass avx512_var_shift_lowering<AVX512VLVectorVTInfo _, string OpcodeStr, } } multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { let Predicates = [HasBWI] in - 
defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>, + defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v32i16_info>, EVEX_V512, VEX_W; let Predicates = [HasVLX, HasBWI] in { - defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>, + defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v16i16x_info>, EVEX_V256, VEX_W; - defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>, + defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v8i16x_info>, EVEX_V128, VEX_W; } } -defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>, - avx512_var_shift_w<0x12, "vpsllvw", shl>; +defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SSE_INTSHIFT_P>, + avx512_var_shift_w<0x12, "vpsllvw", shl, SSE_INTSHIFT_P>; -defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>, - avx512_var_shift_w<0x11, "vpsravw", sra>; +defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SSE_INTSHIFT_P>, + avx512_var_shift_w<0x11, "vpsravw", sra, SSE_INTSHIFT_P>; -defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>, - avx512_var_shift_w<0x10, "vpsrlvw", srl>; +defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SSE_INTSHIFT_P>, + avx512_var_shift_w<0x10, "vpsrlvw", srl, SSE_INTSHIFT_P>; -defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>; -defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>; +defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SSE_INTSHIFT_P>; +defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SSE_INTSHIFT_P>; defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>; defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>; @@ -5714,26 +5658,26 @@ let Predicates = [HasAVX512, NoVLX] in { (EXTRACT_SUBREG (v8i64 (VPROLVQZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))), sub_xmm)>; def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), (EXTRACT_SUBREG (v8i64 (VPROLVQZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), sub_ymm)>; def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), (EXTRACT_SUBREG (v16i32 (VPROLVDZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))), sub_xmm)>; def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (EXTRACT_SUBREG (v16i32 (VPROLVDZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), sub_ymm)>; def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))), @@ -5765,26 +5709,26 @@ let Predicates = [HasAVX512, NoVLX] in { (EXTRACT_SUBREG (v8i64 (VPRORVQZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))), sub_xmm)>; def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))), (EXTRACT_SUBREG (v8i64 (VPRORVQZrr (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), 
VR256X:$src2, sub_ymm)))), sub_ymm)>; def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))), (EXTRACT_SUBREG (v16i32 (VPRORVDZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)), - (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))), sub_xmm)>; def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))), (EXTRACT_SUBREG (v16i32 (VPRORVDZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)), - (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), + (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))), sub_ymm)>; def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))), @@ -5814,84 +5758,86 @@ let Predicates = [HasAVX512, NoVLX] in { // 1-src variable permutation VPERMW/D/Q //===-------------------------------------------------------------------===// multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo _> { + OpndItins itins, AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in - defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, - avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in - defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>, - avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>, + avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256; } multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo> { + OpndItins itins, AVX512VLVectorVTInfo VTInfo> { let Predicates = [HasAVX512] in defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, - VTInfo.info512>, + itins, VTInfo.info512>, avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, - VTInfo.info512>, EVEX_V512; + itins, VTInfo.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, - VTInfo.info256>, + itins, VTInfo.info256>, avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, - VTInfo.info256>, EVEX_V256; + itins, VTInfo.info256>, EVEX_V256; } multiclass avx512_vperm_bw<bits<8> opc, string OpcodeStr, Predicate prd, SDNode OpNode, - AVX512VLVectorVTInfo _> { + OpndItins itins, AVX512VLVectorVTInfo _> { let Predicates = [prd] in - defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>, + defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512 ; let Predicates = [HasVLX, prd] in { - defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>, + defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256 ; - defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>, + defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128 ; } } defm VPERMW : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv, - avx512vl_i16_info>, VEX_W; + AVX2_PERMV_I, avx512vl_i16_info>, VEX_W; defm VPERMB : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv, - avx512vl_i8_info>; + AVX2_PERMV_I, avx512vl_i8_info>; defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv, - avx512vl_i32_info>; + AVX2_PERMV_I, avx512vl_i32_info>; defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv, - 
avx512vl_i64_info>, VEX_W; + AVX2_PERMV_I, avx512vl_i64_info>, VEX_W; defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv, - avx512vl_f32_info>; + AVX2_PERMV_F, avx512vl_f32_info>; defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv, - avx512vl_f64_info>, VEX_W; + AVX2_PERMV_F, avx512vl_f64_info>, VEX_W; defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq", - X86VPermi, avx512vl_i64_info>, + X86VPermi, AVX2_PERMV_I, avx512vl_i64_info>, EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd", - X86VPermi, avx512vl_f64_info>, + X86VPermi, AVX2_PERMV_F, avx512vl_f64_info>, EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W; //===----------------------------------------------------------------------===// // AVX-512 - VPERMIL //===----------------------------------------------------------------------===// -multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, X86VectorVTInfo Ctrl> { +multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, + OpndItins itins, X86VectorVTInfo _, + X86VectorVTInfo Ctrl> { defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, - (Ctrl.VT Ctrl.RC:$src2)))>, - T8PD, EVEX_4V; + (Ctrl.VT Ctrl.RC:$src2))), itins.rr>, + T8PD, EVEX_4V, Sched<[itins.Sched]>; defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode _.RC:$src1, - (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>, - T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2))))), + itins.rm>, T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, "${src2}"##_.BroadcastStr##", $src1", @@ -5899,30 +5845,31 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode, (_.VT (OpNode _.RC:$src1, (Ctrl.VT (X86VBroadcast - (Ctrl.ScalarLdFrag addr:$src2)))))>, - T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + (Ctrl.ScalarLdFrag addr:$src2))))), + itins.rm>, T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar, - AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ + OpndItins itins, AVX512VLVectorVTInfo _, + AVX512VLVectorVTInfo Ctrl> { let Predicates = [HasAVX512] in { - defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512, - Ctrl.info512>, EVEX_V512; + defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins, + _.info512, Ctrl.info512>, EVEX_V512; } let Predicates = [HasAVX512, HasVLX] in { - defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128, - Ctrl.info128>, EVEX_V128; - defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256, - Ctrl.info256>, EVEX_V256; + defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins, + _.info128, Ctrl.info128>, EVEX_V128; + defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins, + _.info256, Ctrl.info256>, EVEX_V256; } } multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar, AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{ - - defm NAME: avx512_permil_vec_common<OpcodeStr, 
OpcVar, _, Ctrl>; + defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, AVX_VPERMILV, _, Ctrl>; defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr, - X86VPermilpi, _>, + X86VPermilpi, AVX_VPERMILV, _>, EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>; } @@ -5932,29 +5879,31 @@ defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info, let ExeDomain = SSEPackedDouble in defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info, avx512vl_i64_info>, VEX_W; + //===----------------------------------------------------------------------===// // AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW //===----------------------------------------------------------------------===// defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd", - X86PShufd, avx512vl_i32_info>, + X86PShufd, SSE_PSHUF, avx512vl_i32_info>, EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>; defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw", - X86PShufhw>, EVEX, AVX512XSIi8Base; + X86PShufhw, SSE_PSHUF>, EVEX, AVX512XSIi8Base; defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw", - X86PShuflw>, EVEX, AVX512XDIi8Base; + X86PShuflw, SSE_PSHUF>, EVEX, AVX512XDIi8Base; -multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> { +multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { let Predicates = [HasBWI] in - defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512; + defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v64i8_info>, EVEX_V512; let Predicates = [HasVLX, HasBWI] in { - defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, v32i8x_info>, EVEX_V256; - defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, v16i8x_info>, EVEX_V128; + defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v32i8x_info>, EVEX_V256; + defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v16i8x_info>, EVEX_V128; } } -defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>; +defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb, SSE_PSHUFB>, VEX_WIG; //===----------------------------------------------------------------------===// // Move Low to High and High to Low packed FP Instructions @@ -5970,18 +5919,6 @@ def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst), [(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))], IIC_SSE_MOV_LH>, EVEX_4V; -let Predicates = [HasAVX512] in { - // MOVLHPS patterns - def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)), - (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>; - def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)), - (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>; - - // MOVHLPS patterns - def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)), - (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>; -} - //===----------------------------------------------------------------------===// // VMOVHPS/PD VMOVLPS Instructions // All patterns was taken from SSS implementation. 
@@ -6002,7 +5939,7 @@ multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps, v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; -defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd, +defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl, v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W; defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps, v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS; @@ -6015,25 +5952,18 @@ let Predicates = [HasAVX512] in { (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128X:$src1, - (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), (VMOVHPSZ128rm VR128X:$src1, addr:$src2)>; // VMOVHPD patterns def : Pat<(v2f64 (X86Unpckl VR128X:$src1, - (scalar_to_vector (loadf64 addr:$src2)))), - (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; - def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))), (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>; // VMOVLPS patterns def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))), (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; - def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))), - (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>; // VMOVLPD patterns def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))), (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; - def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))), - (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; def : Pat<(v2f64 (X86Movsd VR128X:$src1, (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>; @@ -6079,16 +6009,10 @@ let Predicates = [HasAVX512] in { def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)), addr:$src1), (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; - def : Pat<(store (v4i32 (X86Movlps - (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1), - (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>; // VMOVLPD patterns def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)), addr:$src1), (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; - def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)), - addr:$src1), - (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>; } //===----------------------------------------------------------------------===// // FMA - Fused Multiply Operations @@ -6096,45 +6020,38 @@ let Predicates = [HasAVX512] in { multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>, - AVX512FMA3Base; + (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), NoItinerary, 1, 1>, + AVX512FMA3Base, Sched<[WriteFMA]>; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>, - AVX512FMA3Base; + (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), + NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs 
_.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), (OpNode _.RC:$src2, - _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>, - AVX512FMA3Base, EVEX_B; + _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), + NoItinerary, 1, 0>, AVX512FMA3Base, EVEX_B, + Sched<[WriteFMALd, ReadAfterLd]>; } - - // Additional pattern for folding broadcast nodes in other orders. - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode _.RC:$src1, _.RC:$src2, - (X86VBroadcast (_.ScalarLdFrag addr:$src3))), - _.RC:$src1)), - (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1, - _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; } multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", - (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), + NoItinerary, 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>; } multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -6171,18 +6088,18 @@ defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubR multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>, - AVX512FMA3Base; + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), NoItinerary, 1, 1, + vselect, 1>, AVX512FMA3Base, Sched<[WriteFMA]>; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>, - AVX512FMA3Base; + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), + NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), @@ -6190,36 +6107,20 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src2, ${src3}"##_.BroadcastStr, (_.VT (OpNode _.RC:$src2, (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B; + _.RC:$src1)), NoItinerary, 1, 0>, AVX512FMA3Base, EVEX_B, + Sched<[WriteFMALd, ReadAfterLd]>; } - - // Additional patterns for folding broadcast nodes in other orders. 
- def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1)), - (!cast<Instruction>(NAME#Suff#_.ZSuffix#mb) _.RC:$src1, - _.RC:$src2, addr:$src3)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1), - _.RC:$src1)), - (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1, - _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; - def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src2, _.RC:$src1), - _.ImmAllZerosV)), - (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbkz) _.RC:$src1, - _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; } multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", - (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), + NoItinerary, 1, 1, vselect, 1>, + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>; } multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -6255,45 +6156,42 @@ defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubR multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>, - AVX512FMA3Base; + (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), NoItinerary, + 1, 1, vselect, 1>, AVX512FMA3Base, Sched<[WriteFMA]>; + // Pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src3), _.RC:$src2)), 1, 0>, - AVX512FMA3Base; + (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), + NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>; + // Pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, "${src3}"##_.BroadcastStr##", $src2", "$src2, ${src3}"##_.BroadcastStr, - (_.VT (OpNode _.RC:$src1, - (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B; + (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), + _.RC:$src1, _.RC:$src2)), NoItinerary, 1, 0>, + AVX512FMA3Base, EVEX_B, Sched<[WriteFMALd, ReadAfterLd]>; } - - // Additional patterns for folding broadcast nodes in other orders. 
- def : Pat<(_.VT (vselect _.KRCWM:$mask, - (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), - _.RC:$src1, _.RC:$src2), - _.RC:$src1)), - (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1, - _.KRCWM:$mask, _.RC:$src2, addr:$src3)>; } multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode, X86VectorVTInfo _, string Suff> { - let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in + let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", - (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), 1, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), + NoItinerary, 1, 1, vselect, 1>, + AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>; } multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -6328,167 +6226,179 @@ defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddR defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>; // Scalar FMA -let Constraints = "$src1 = $dst" in { multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb, - dag RHS_r, dag RHS_m > { + dag RHS_r, dag RHS_m, bit MaskOnlyReg> { +let Constraints = "$src1 = $dst", hasSideEffects = 0 in { defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, - "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base; + "$src3, $src2", "$src2, $src3", RHS_VEC_r, NoItinerary, 1, 1>, + AVX512FMA3Base, Sched<[WriteFMA]>; defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr, - "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base; + "$src3, $src2", "$src2, $src3", RHS_VEC_m, NoItinerary, 1, 1>, + AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>; defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc), - OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb, 1, 1>, - AVX512FMA3Base, EVEX_B, EVEX_RC; + OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb, + NoItinerary, 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, + Sched<[WriteFMA]>; let isCodeGenOnly = 1, isCommutable = 1 in { - def r : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst), + def r : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [RHS_r]>; - def m : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst), + !if(MaskOnlyReg, [], [RHS_r])>, Sched<[WriteFMA]>; + def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [RHS_m]>; + [RHS_m]>, Sched<[WriteFMALd, ReadAfterLd]>; }// isCodeGenOnly = 1 -} }// Constraints = "$src1 = $dst" +} multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132, - string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1, - SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> { + string OpcodeStr, SDNode OpNode, SDNode OpNodes1, + SDNode OpNodeRnds1, SDNode OpNodes3, + SDNode OpNodeRnds3, X86VectorVTInfo _, + string SUFF> { let ExeDomain = _.ExeDomain in { - 
defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ , + defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _, // Operands for intrinsic are in 123 order to preserve passthu // semantics. - (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))), - (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, - _.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))), + (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2, _.RC:$src3)), + (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2, + _.ScalarIntMemCPat:$src3)), (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 imm:$rc))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, _.FRC:$src3))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1, - (_.ScalarLdFrag addr:$src3))))>; + (_.ScalarLdFrag addr:$src3)))), 0>; - defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ , - (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))), - (_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3, - _.RC:$src1, (i32 FROUND_CURRENT))), + defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _, + (_.VT (OpNodes3 _.RC:$src2, _.RC:$src3, _.RC:$src1)), + (_.VT (OpNodes3 _.RC:$src2, _.ScalarIntMemCPat:$src3, + _.RC:$src1)), (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3, _.FRC:$src1))), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, - (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>; - - defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ , - (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))), - (_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3, - _.RC:$src2, (i32 FROUND_CURRENT))), - (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, - (i32 imm:$rc))), + (_.ScalarLdFrag addr:$src3), _.FRC:$src1))), 1>; + + // One pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
+ defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _, + (null_frag), + (_.VT (OpNodes1 _.RC:$src1, _.ScalarIntMemCPat:$src3, + _.RC:$src2)), + (null_frag), (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3, _.FRC:$src2))), - (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, - (_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>; + (set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3), + _.FRC:$src1, _.FRC:$src2))), 1>; } } multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132, - string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1, + string OpcodeStr, SDNode OpNode, SDNode OpNodes1, + SDNode OpNodeRnds1, SDNode OpNodes3, SDNode OpNodeRnds3> { let Predicates = [HasAVX512] in { defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, - OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">, + OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3, + f32x_info, "SS">, EVEX_CD8<32, CD8VT1>, VEX_LIG; defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode, - OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">, + OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3, + f64x_info, "SD">, EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W; } } -defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1, - X86FmaddRnds3>; -defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1, - X86FmsubRnds3>; -defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, - X86FnmaddRnds1, X86FnmaddRnds3>; -defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, - X86FnmsubRnds1, X86FnmsubRnds3>; +defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86Fmadds1, + X86FmaddRnds1, X86Fmadds3, X86FmaddRnds3>; +defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86Fmsubs1, + X86FmsubRnds1, X86Fmsubs3, X86FmsubRnds3>; +defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1, + X86FnmaddRnds1, X86Fnmadds3, X86FnmaddRnds3>; +defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1, + X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>; //===----------------------------------------------------------------------===// // AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA //===----------------------------------------------------------------------===// let Constraints = "$src1 = $dst" in { multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { + // NOTE: The SDNode have the multiply operands first with the add last. + // This enables commuted load patterns to be autogenerated by tablegen. 
let ExeDomain = _.ExeDomain in { defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>, - AVX512FMA3Base; + (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), itins.rr, 1, 1>, + AVX512FMA3Base, Sched<[itins.Sched]>; defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3), OpcodeStr, "$src3, $src2", "$src2, $src3", - (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>, - AVX512FMA3Base; + (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), + itins.rm>, AVX512FMA3Base, Sched<[itins.Sched.Folded, ReadAfterLd]>; defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"), !strconcat("$src2, ${src3}", _.BroadcastStr ), - (OpNode _.RC:$src1, - _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>, - AVX512FMA3Base, EVEX_B; + (OpNode _.RC:$src2, + (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))), + _.RC:$src1), itins.rm>, + AVX512FMA3Base, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } } // Constraints = "$src1 = $dst" multiclass avx512_pmadd52_common<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo _> { + OpndItins itins, AVX512VLVectorVTInfo _> { let Predicates = [HasIFMA] in { - defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info512>, + defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>; } let Predicates = [HasVLX, HasIFMA] in { - defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info256>, + defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>; - defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info128>, + defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>; } } defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l, - avx512vl_i64_info>, VEX_W; + SSE_PMADD, avx512vl_i64_info>, VEX_W; defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h, - avx512vl_i64_info>, VEX_W; + SSE_PMADD, avx512vl_i64_info>, VEX_W; //===----------------------------------------------------------------------===// // AVX-512 Scalar convert from sign integer to float/double //===----------------------------------------------------------------------===// -multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, - X86VectorVTInfo DstVT, X86MemOperand x86memop, - PatFrag ld_frag, string asm> { +multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, OpndItins itins, + RegisterClass SrcRC, X86VectorVTInfo DstVT, + X86MemOperand x86memop, PatFrag ld_frag, string asm> { let hasSideEffects = 0 in { def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst), (ins DstVT.FRC:$src1, SrcRC:$src), - !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, - EVEX_4V; + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [], + itins.rr>, EVEX_4V, Sched<[itins.Sched]>; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst), (ins DstVT.FRC:$src1, x86memop:$src), - !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, - EVEX_4V; + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [], + itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; } // hasSideEffects = 0 let isCodeGenOnly = 1 in 
{ def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), @@ -6497,7 +6407,8 @@ multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, [(set DstVT.RC:$dst, (OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2, - (i32 FROUND_CURRENT)))]>, EVEX_4V; + (i32 FROUND_CURRENT)))], itins.rr>, + EVEX_4V, Sched<[itins.Sched]>; def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins DstVT.RC:$src1, x86memop:$src2), @@ -6505,12 +6416,13 @@ multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, [(set DstVT.RC:$dst, (OpNode (DstVT.VT DstVT.RC:$src1), (ld_frag addr:$src2), - (i32 FROUND_CURRENT)))]>, EVEX_4V; + (i32 FROUND_CURRENT)))], itins.rm>, + EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>; }//isCodeGenOnly = 1 } -multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, - X86VectorVTInfo DstVT, string asm> { +multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, OpndItins itins, + RegisterClass SrcRC, X86VectorVTInfo DstVT, string asm> { def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc), !strconcat(asm, @@ -6518,28 +6430,29 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, [(set DstVT.RC:$dst, (OpNode (DstVT.VT DstVT.RC:$src1), SrcRC:$src2, - (i32 imm:$rc)))]>, EVEX_4V, EVEX_B, EVEX_RC; + (i32 imm:$rc)))], itins.rr>, + EVEX_4V, EVEX_B, EVEX_RC, Sched<[itins.Sched]>; } -multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, RegisterClass SrcRC, - X86VectorVTInfo DstVT, X86MemOperand x86memop, - PatFrag ld_frag, string asm> { - defm NAME : avx512_vcvtsi_round<opc, OpNode, SrcRC, DstVT, asm>, - avx512_vcvtsi<opc, OpNode, SrcRC, DstVT, x86memop, ld_frag, asm>, - VEX_LIG; +multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, OpndItins itins, + RegisterClass SrcRC, X86VectorVTInfo DstVT, + X86MemOperand x86memop, PatFrag ld_frag, string asm> { + defm NAME : avx512_vcvtsi_round<opc, OpNode, itins, SrcRC, DstVT, asm>, + avx512_vcvtsi<opc, OpNode, itins, SrcRC, DstVT, x86memop, + ld_frag, asm>, VEX_LIG; } let Predicates = [HasAVX512] in { -defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32, +defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SS, GR32, v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64, +defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SS, GR64, v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">, XS, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32, +defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SD, GR32, v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">, XD, EVEX_CD8<32, CD8VT1>; -defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64, +defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SD, GR64, v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; @@ -6566,16 +6479,16 @@ def : Pat<(f64 (sint_to_fp GR32:$src)), def : Pat<(f64 (sint_to_fp GR64:$src)), (VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>; -defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32, +defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SS, GR32, v4f32x_info, i32mem, loadi32, "cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, +defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, 
SSE_CVT_SI2SS, GR64, v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">, XS, VEX_W, EVEX_CD8<64, CD8VT1>; -defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info, +defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, SSE_CVT_SI2SD, GR32, v2f64x_info, i32mem, loadi32, "cvtusi2sd{l}">, XD, VEX_LIG, EVEX_CD8<32, CD8VT1>; -defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64, +defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SD, GR64, v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; @@ -6606,71 +6519,74 @@ def : Pat<(f64 (uint_to_fp GR64:$src)), //===----------------------------------------------------------------------===// // AVX-512 Scalar convert from float/double to integer //===----------------------------------------------------------------------===// -multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT , - X86VectorVTInfo DstVT, SDNode OpNode, string asm> { + +multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT, + X86VectorVTInfo DstVT, SDNode OpNode, + OpndItins itins, string asm> { let Predicates = [HasAVX512] in { - def rr : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src), + def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))]>, - EVEX, VEX_LIG; - def rb : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc), - !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), - [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>, - EVEX, VEX_LIG, EVEX_B, EVEX_RC; - def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src), + [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))], + itins.rr>, EVEX, VEX_LIG, Sched<[itins.Sched]>; + def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc), + !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"), + [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))], + itins.rr>, EVEX, VEX_LIG, EVEX_B, EVEX_RC, + Sched<[itins.Sched]>; + def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.ScalarIntMemCPat:$src), - (i32 FROUND_CURRENT)))]>, - EVEX, VEX_LIG; + (i32 FROUND_CURRENT)))], itins.rm>, + EVEX, VEX_LIG, Sched<[itins.Sched.Folded, ReadAfterLd]>; } // Predicates = [HasAVX512] } // Convert float/double to signed/unsigned int 32/64 defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info, - X86cvts2si, "cvtss2si">, + X86cvts2si, SSE_CVT_SS2SI_32, "cvtss2si">, XS, EVEX_CD8<32, CD8VT1>; defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info, - X86cvts2si, "cvtss2si">, + X86cvts2si, SSE_CVT_SS2SI_64, "cvtss2si">, XS, VEX_W, EVEX_CD8<32, CD8VT1>; defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info, - X86cvts2usi, "cvtss2usi">, + X86cvts2usi, SSE_CVT_SS2SI_32, "cvtss2usi">, XS, EVEX_CD8<32, CD8VT1>; defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info, - X86cvts2usi, "cvtss2usi">, XS, VEX_W, - EVEX_CD8<32, CD8VT1>; + X86cvts2usi, SSE_CVT_SS2SI_64, "cvtss2usi">, + XS, VEX_W, EVEX_CD8<32, CD8VT1>; defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info, - X86cvts2si, "cvtsd2si">, + X86cvts2si, SSE_CVT_SD2SI, "cvtsd2si">, XD, EVEX_CD8<64, CD8VT1>; defm VCVTSD2SI64Z: 
avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info, - X86cvts2si, "cvtsd2si">, + X86cvts2si, SSE_CVT_SD2SI, "cvtsd2si">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info, - X86cvts2usi, "cvtsd2usi">, + X86cvts2usi, SSE_CVT_SD2SI, "cvtsd2usi">, XD, EVEX_CD8<64, CD8VT1>; defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, - X86cvts2usi, "cvtsd2usi">, XD, VEX_W, - EVEX_CD8<64, CD8VT1>; + X86cvts2usi, SSE_CVT_SD2SI, "cvtsd2usi">, + XD, VEX_W, EVEX_CD8<64, CD8VT1>; // The SSE version of these instructions are disabled for AVX512. // Therefore, the SSE intrinsics are mapped to the AVX512 instructions. let Predicates = [HasAVX512] in { def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))), - (VCVTSS2SIZrr VR128X:$src)>; + (VCVTSS2SIZrr_Int VR128X:$src)>; def : Pat<(i32 (int_x86_sse_cvtss2si sse_load_f32:$src)), - (VCVTSS2SIZrm sse_load_f32:$src)>; + (VCVTSS2SIZrm_Int sse_load_f32:$src)>; def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))), - (VCVTSS2SI64Zrr VR128X:$src)>; + (VCVTSS2SI64Zrr_Int VR128X:$src)>; def : Pat<(i64 (int_x86_sse_cvtss2si64 sse_load_f32:$src)), - (VCVTSS2SI64Zrm sse_load_f32:$src)>; + (VCVTSS2SI64Zrm_Int sse_load_f32:$src)>; def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))), - (VCVTSD2SIZrr VR128X:$src)>; + (VCVTSD2SIZrr_Int VR128X:$src)>; def : Pat<(i32 (int_x86_sse2_cvtsd2si sse_load_f64:$src)), - (VCVTSD2SIZrm sse_load_f64:$src)>; + (VCVTSD2SIZrm_Int sse_load_f64:$src)>; def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))), - (VCVTSD2SI64Zrr VR128X:$src)>; + (VCVTSD2SI64Zrr_Int VR128X:$src)>; def : Pat<(i64 (int_x86_sse2_cvtsd2si64 sse_load_f64:$src)), - (VCVTSD2SI64Zrm sse_load_f64:$src)>; + (VCVTSD2SI64Zrm_Int sse_load_f64:$src)>; } // HasAVX512 let Predicates = [HasAVX512] in { @@ -6723,24 +6639,25 @@ def : Pat<(v2f64 (X86Movsd // Convert float/double to signed/unsigned int 32/64 with truncation multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC, X86VectorVTInfo _DstRC, SDNode OpNode, - SDNode OpNodeRnd, string aliasStr>{ + SDNode OpNodeRnd, OpndItins itins, string aliasStr>{ let Predicates = [HasAVX512] in { def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX; + [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))], itins.rr>, + EVEX, Sched<[itins.Sched]>; let hasSideEffects = 0 in - def rb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), + def rrb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src), !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), - []>, EVEX, EVEX_B; + [], itins.rr>, EVEX, EVEX_B, Sched<[itins.Sched]>; def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>, - EVEX; + [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))], + itins.rm>, EVEX, Sched<[itins.Sched.Folded, ReadAfterLd]>; def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "rr") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>; def : InstAlias<asm # aliasStr # "\t\t{{sae}, $src, $dst|$dst, $src, {sae}}", - (!cast<Instruction>(NAME # "rb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>; + (!cast<Instruction>(NAME # "rrb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>; def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}", 
(!cast<Instruction>(NAME # "rm") _DstRC.RC:$dst, _SrcRC.ScalarMemOp:$src), 0>; @@ -6749,47 +6666,48 @@ let Predicates = [HasAVX512] in { def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src), - (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG; - def rb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), + (i32 FROUND_CURRENT)))], itins.rr>, + EVEX, VEX_LIG, Sched<[itins.Sched]>; + def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src), !strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"), [(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src), - (i32 FROUND_NO_EXC)))]>, - EVEX,VEX_LIG , EVEX_B; + (i32 FROUND_NO_EXC)))], itins.rr>, + EVEX,VEX_LIG , EVEX_B, Sched<[itins.Sched]>; let mayLoad = 1, hasSideEffects = 0 in def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.IntScalarMemOp:$src), !strconcat(asm,"\t{$src, $dst|$dst, $src}"), - []>, EVEX, VEX_LIG; - + [], itins.rm>, EVEX, VEX_LIG, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } // isCodeGenOnly = 1 } //HasAVX512 } defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info, - fp_to_sint, X86cvtts2IntRnd, "{l}">, + fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SS2SI_32, "{l}">, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info, - fp_to_sint, X86cvtts2IntRnd, "{q}">, + fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SS2SI_64, "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info, - fp_to_sint, X86cvtts2IntRnd, "{l}">, + fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SD2SI, "{l}">, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info, - fp_to_sint, X86cvtts2IntRnd, "{q}">, + fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SD2SI, "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info, - fp_to_uint, X86cvtts2UIntRnd, "{l}">, + fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SS2SI_32, "{l}">, XS, EVEX_CD8<32, CD8VT1>; defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info, - fp_to_uint, X86cvtts2UIntRnd, "{q}">, + fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SS2SI_64, "{q}">, XS,VEX_W, EVEX_CD8<32, CD8VT1>; defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info, - fp_to_uint, X86cvtts2UIntRnd, "{l}">, + fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SD2SI, "{l}">, XD, EVEX_CD8<64, CD8VT1>; defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info, - fp_to_uint, X86cvtts2UIntRnd, "{q}">, + fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SD2SI, "{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>; let Predicates = [HasAVX512] in { def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))), @@ -6809,88 +6727,95 @@ let Predicates = [HasAVX512] in { def : Pat<(i64 (int_x86_sse2_cvttsd2si64 sse_load_f64:$src)), (VCVTTSD2SI64Zrm_Int sdmem:$src)>; } // HasAVX512 + //===----------------------------------------------------------------------===// // AVX-512 Convert form float to double and back //===----------------------------------------------------------------------===// + multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNode> { + X86VectorVTInfo _Src, SDNode OpNode, OpndItins itins> { defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins 
_.RC:$src1, _Src.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), (_Src.VT _Src.RC:$src2), - (i32 FROUND_CURRENT)))>, - EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; + (i32 FROUND_CURRENT))), itins.rr>, + EVEX_4V, VEX_LIG, Sched<[itins.Sched]>; defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (_.VT (OpNode (_.VT _.RC:$src1), (_Src.VT _Src.ScalarIntMemCPat:$src2), - (i32 FROUND_CURRENT)))>, - EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; + (i32 FROUND_CURRENT))), itins.rm>, + EVEX_4V, VEX_LIG, + Sched<[itins.Sched.Folded, ReadAfterLd]>; let isCodeGenOnly = 1, hasSideEffects = 0 in { def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _Src.FRC:$src2), - OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>; + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + itins.rr>, EVEX_4V, VEX_LIG, Sched<[itins.Sched]>; let mayLoad = 1 in def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _Src.ScalarMemOp:$src2), - OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>; + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], + itins.rm>, EVEX_4V, VEX_LIG, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } // Scalar Coversion with SAE - suppress all exceptions multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNodeRnd> { + X86VectorVTInfo _Src, SDNode OpNodeRnd, OpndItins itins> { defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr, "{sae}, $src2, $src1", "$src1, $src2, {sae}", (_.VT (OpNodeRnd (_.VT _.RC:$src1), (_Src.VT _Src.RC:$src2), - (i32 FROUND_NO_EXC)))>, - EVEX_4V, VEX_LIG, EVEX_B; + (i32 FROUND_NO_EXC))), itins.rr>, + EVEX_4V, VEX_LIG, EVEX_B, Sched<[itins.Sched]>; } // Scalar Conversion with rounding control (RC) multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNodeRnd> { + X86VectorVTInfo _Src, SDNode OpNodeRnd, OpndItins itins> { defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", (_.VT (OpNodeRnd (_.VT _.RC:$src1), - (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>, - EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>, + (_Src.VT _Src.RC:$src2), (i32 imm:$rc))), + itins.rr>, + EVEX_4V, VEX_LIG, Sched<[itins.Sched]>, EVEX_B, EVEX_RC; } multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr, - SDNode OpNodeRnd, X86VectorVTInfo _src, - X86VectorVTInfo _dst> { + SDNode OpNodeRnd, OpndItins itins, + X86VectorVTInfo _src, X86VectorVTInfo _dst> { let Predicates = [HasAVX512] in { - defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>, avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src, - OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, XD; + OpNodeRnd, itins>, VEX_W, EVEX_CD8<64, CD8VT1>, XD; } } multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr, - SDNode OpNodeRnd, X86VectorVTInfo _src, - X86VectorVTInfo _dst> { + SDNode OpNodeRnd, OpndItins itins, + X86VectorVTInfo _src, X86VectorVTInfo _dst> { let Predicates = [HasAVX512] in { - defm Z : avx512_cvt_fp_scalar<opc, 
OpcodeStr, _dst, _src, OpNodeRnd>, - avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>, + defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>, + avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>, EVEX_CD8<32, CD8VT1>, XS; } } defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss", - X86froundRnd, f64x_info, f32x_info>; + X86froundRnd, SSE_CVT_SD2SS, f64x_info, + f32x_info>, NotMemoryFoldable; defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", - X86fpextRnd,f32x_info, f64x_info >; + X86fpextRnd, SSE_CVT_SS2SD, f32x_info, + f64x_info>, NotMemoryFoldable; def : Pat<(f64 (fpextend FR32X:$src)), - (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>, + (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>, Requires<[HasAVX512]>; def : Pat<(f64 (fpextend (loadf32 addr:$src))), (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>, @@ -6905,7 +6830,7 @@ def : Pat<(f64 (extloadf32 addr:$src)), Requires<[HasAVX512, OptForSpeed]>; def : Pat<(f32 (fpround FR64X:$src)), - (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>, + (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>, Requires<[HasAVX512]>; def : Pat<(v4f32 (X86Movss @@ -6928,74 +6853,81 @@ def : Pat<(v2f64 (X86Movsd //===----------------------------------------------------------------------===// multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNode, + X86VectorVTInfo _Src, SDNode OpNode, OpndItins itins, string Broadcast = _.BroadcastStr, string Alias = "", X86MemOperand MemOp = _Src.MemOp> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src), OpcodeStr, "$src", "$src", - (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX; + (_.VT (OpNode (_Src.VT _Src.RC:$src))), itins.rr>, + EVEX, Sched<[itins.Sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins MemOp:$src), OpcodeStr#Alias, "$src", "$src", (_.VT (OpNode (_Src.VT - (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX; + (bitconvert (_Src.LdFrag addr:$src))))), itins.rm>, + EVEX, Sched<[itins.Sched.Folded]>; defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _Src.ScalarMemOp:$src), OpcodeStr, "${src}"##Broadcast, "${src}"##Broadcast, (_.VT (OpNode (_Src.VT (X86VBroadcast (_Src.ScalarLdFrag addr:$src))) - ))>, EVEX, EVEX_B; + )), itins.rm>, EVEX, EVEX_B, + Sched<[itins.Sched.Folded]>; } // Coversion with SAE - suppress all exceptions multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNodeRnd> { + X86VectorVTInfo _Src, SDNode OpNodeRnd, + OpndItins itins> { defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src), OpcodeStr, "{sae}, $src", "$src, {sae}", (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), - (i32 FROUND_NO_EXC)))>, - EVEX, EVEX_B; + (i32 FROUND_NO_EXC))), itins.rr>, + EVEX, EVEX_B, Sched<[itins.Sched]>; } // Conversion with rounding control (RC) multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86VectorVTInfo _Src, SDNode OpNodeRnd> { + X86VectorVTInfo _Src, SDNode OpNodeRnd, + OpndItins itins> { defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc", - (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>, - EVEX, EVEX_B, EVEX_RC; + (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc))), + itins.rr>, EVEX, EVEX_B, EVEX_RC, Sched<[itins.Sched]>; } // Extend Float to Double 
-multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> { +multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr, + OpndItins itins> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fpextend>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, + fpextend, itins>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info, - X86vfpextRnd>, EVEX_V512; + X86vfpextRnd, itins>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info, - X86vfpext, "{1to2}", "", f64mem>, EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend>, - EVEX_V256; + X86vfpext, itins, "{1to2}", "", f64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend, + itins>, EVEX_V256; } } // Truncate Double to Float -multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> { +multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, OpndItins itins> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround, itins>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info, - X86vfproundRnd>, EVEX_V512; + X86vfproundRnd, itins>, EVEX_V512; } let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info, - X86vfpround, "{1to2}", "{x}">, EVEX_V128; + X86vfpround, itins, "{1to2}", "{x}">, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround, - "{1to4}", "{y}">, EVEX_V256; + itins, "{1to4}", "{y}">, EVEX_V256; def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; @@ -7008,19 +6940,23 @@ multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> { } } -defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">, +defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SSE_CVT_PD2PS>, VEX_W, PD, EVEX_CD8<64, CD8VF>; -defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">, +defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SSE_CVT_PS2PD>, PS, EVEX_CD8<32, CD8VH>; def : Pat<(v8f64 (extloadv8f32 addr:$src)), (VCVTPS2PDZrm addr:$src)>; let Predicates = [HasVLX] in { - let AddedComplexity = 15 in - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128X:$src)))))), - (VCVTPD2PSZ128rr VR128X:$src)>; + let AddedComplexity = 15 in { + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128X:$src)))))), + (VCVTPD2PSZ128rr VR128X:$src)>; + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (loadv2f64 addr:$src)))))), + (VCVTPD2PSZ128rm addr:$src)>; + } def : Pat<(v2f64 (extloadv2f32 addr:$src)), (VCVTPS2PDZ128rm addr:$src)>; def : Pat<(v4f64 (extloadv4f32 addr:$src)), @@ -7029,75 +6965,80 @@ let Predicates = [HasVLX] in { // Convert Signed/Unsigned Doubleword to Double multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNode128> { + SDNode OpNode128, OpndItins itins> { // No rounding in this op let Predicates = [HasAVX512] in - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>, - EVEX_V512; + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode, + itins>, EVEX_V512; let Predicates = [HasVLX] in { defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info, - OpNode128, "{1to2}", "", i64mem>, EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, 
v4f64x_info, v4i32x_info, OpNode>, - EVEX_V256; + OpNode128, itins, "{1to2}", "", i64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode, + itins>, EVEX_V256; } } // Convert Signed/Unsigned Doubleword to Float multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNodeRnd> { + SDNode OpNodeRnd, OpndItins itins> { let Predicates = [HasAVX512] in - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode, + itins>, avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd, itins>, EVEX_V512; let Predicates = [HasVLX] in { - defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>, - EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>, - EVEX_V256; + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode, + itins>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode, + itins>, EVEX_V256; } } // Convert Float to Signed/Unsigned Doubleword with truncation -multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, OpndItins itins> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode, + itins>, avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd, itins>, EVEX_V512; } let Predicates = [HasVLX] in { - defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>, - EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>, - EVEX_V256; + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode, + itins>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode, + itins>, EVEX_V256; } } // Convert Float to Signed/Unsigned Doubleword -multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, OpndItins itins> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode, + itins>, avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd, itins>, EVEX_V512; } let Predicates = [HasVLX] in { - defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>, - EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>, - EVEX_V256; + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode, + itins>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode, + itins>, EVEX_V256; } } // Convert Double to Signed/Unsigned Doubleword with truncation multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNode128, SDNode OpNodeRnd> { + SDNode OpNode128, SDNode OpNodeRnd, + OpndItins itins> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode, + 
itins>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd, itins>, EVEX_V512; } let Predicates = [HasVLX] in { // we need "x"/"y" suffixes in order to distinguish between 128 and 256 @@ -7105,9 +7046,9 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, - OpNode128, "{1to2}", "{x}">, EVEX_V128; + OpNode128, itins, "{1to2}", "{x}">, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, - "{1to4}", "{y}">, EVEX_V256; + itins, "{1to4}", "{y}">, EVEX_V256; def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; @@ -7121,12 +7062,13 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, } // Convert Double to Signed/Unsigned Doubleword -multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, OpndItins itins> { let Predicates = [HasAVX512] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode, + itins>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd, itins>, EVEX_V512; } let Predicates = [HasVLX] in { // we need "x"/"y" suffixes in order to distinguish between 128 and 256 @@ -7134,9 +7076,9 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. 
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode, - "{1to2}", "{x}">, EVEX_V128; + itins, "{1to2}", "{x}">, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode, - "{1to4}", "{y}">, EVEX_V256; + itins, "{1to4}", "{y}">, EVEX_V256; def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; @@ -7150,96 +7092,102 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, } // Convert Double to Signed/Unsigned Quardword -multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, OpndItins itins> { let Predicates = [HasDQI] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode, + itins>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd,itins>, EVEX_V512; } let Predicates = [HasDQI, HasVLX] in { - defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>, - EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>, - EVEX_V256; + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode, + itins>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode, + itins>, EVEX_V256; } } // Convert Double to Signed/Unsigned Quardword with truncation -multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, OpndItins itins> { let Predicates = [HasDQI] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode, + itins>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd, itins>, EVEX_V512; } let Predicates = [HasDQI, HasVLX] in { - defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>, - EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>, - EVEX_V256; + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode, + itins>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode, + itins>, EVEX_V256; } } // Convert Signed/Unsigned Quardword to Double -multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, OpndItins itins> { let Predicates = [HasDQI] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode, + itins>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd, itins>, EVEX_V512; } let Predicates = [HasDQI, HasVLX] in { - defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>, - EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>, - EVEX_V256; + defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode, + itins>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode, + itins>, EVEX_V256; } } // Convert Float to Signed/Unsigned Quardword -multiclass 
avx512_cvtps2qq<bits<8> opc, string OpcodeStr, - SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, + SDNode OpNodeRnd, OpndItins itins> { let Predicates = [HasDQI] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, + itins>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd, itins>, EVEX_V512; } let Predicates = [HasDQI, HasVLX] in { // Explicitly specified broadcast string, since we take only 2 elements // from v4f32x_info source defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode, - "{1to2}", "", f64mem>, EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, - EVEX_V256; + itins, "{1to2}", "", f64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode, + itins>, EVEX_V256; } } // Convert Float to Signed/Unsigned Quardword with truncation multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNode128, SDNode OpNodeRnd> { + SDNode OpNode128, SDNode OpNodeRnd, OpndItins itins> { let Predicates = [HasDQI] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, + itins>, avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd, itins>, EVEX_V512; } let Predicates = [HasDQI, HasVLX] in { // Explicitly specified broadcast string, since we take only 2 elements // from v4f32x_info source defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode128, - "{1to2}", "", f64mem>, EVEX_V128; - defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>, - EVEX_V256; + itins, "{1to2}", "", f64mem>, EVEX_V128; + defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode, + itins>, EVEX_V256; } } // Convert Signed/Unsigned Quardword to Float multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, - SDNode OpNode128, SDNode OpNodeRnd> { + SDNode OpNode128, SDNode OpNodeRnd, OpndItins itins> { let Predicates = [HasDQI] in { - defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>, + defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode, + itins>, avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info, - OpNodeRnd>, EVEX_V512; + OpNodeRnd, itins>, EVEX_V512; } let Predicates = [HasDQI, HasVLX] in { // we need "x"/"y" suffixes in order to distinguish between 128 and 256 @@ -7247,9 +7195,9 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, // dest type - 'v4i32x_info'. We also specify the broadcast string explicitly // due to the same reason. 
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128, - "{1to2}", "{x}">, EVEX_V128; + itins, "{1to2}", "{x}">, EVEX_V128; defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode, - "{1to4}", "{y}">, EVEX_V256; + itins, "{1to4}", "{y}">, EVEX_V256; def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}", (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>; @@ -7262,89 +7210,100 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode, } } -defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP>, - XS, EVEX_CD8<32, CD8VH>; +defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP, + SSE_CVT_I2PD>, XS, EVEX_CD8<32, CD8VH>; defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp, - X86VSintToFpRnd>, + X86VSintToFpRnd, SSE_CVT_I2PS>, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint, - X86cvttp2siRnd>, + X86cvttp2siRnd, SSE_CVT_PS2I>, XS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttp2si, - X86cvttp2siRnd>, + X86cvttp2siRnd, SSE_CVT_PD2I>, PD, VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint, - X86cvttp2uiRnd>, PS, + X86cvttp2uiRnd, SSE_CVT_PS2I>, PS, EVEX_CD8<32, CD8VF>; defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint, - X86cvttp2ui, X86cvttp2uiRnd>, PS, VEX_W, - EVEX_CD8<64, CD8VF>; + X86cvttp2ui, X86cvttp2uiRnd, SSE_CVT_PD2I>, + PS, VEX_W, EVEX_CD8<64, CD8VF>; -defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86VUintToFP>, - XS, EVEX_CD8<32, CD8VH>; +defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, + X86VUintToFP, SSE_CVT_I2PD>, XS, + EVEX_CD8<32, CD8VH>; defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp, - X86VUintToFpRnd>, XD, + X86VUintToFpRnd, SSE_CVT_I2PS>, XD, EVEX_CD8<32, CD8VF>; defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int, - X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VF>; + X86cvtp2IntRnd, SSE_CVT_PS2I>, PD, + EVEX_CD8<32, CD8VF>; defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int, - X86cvtp2IntRnd>, XD, VEX_W, - EVEX_CD8<64, CD8VF>; + X86cvtp2IntRnd, SSE_CVT_PD2I>, XD, + VEX_W, EVEX_CD8<64, CD8VF>; defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt, - X86cvtp2UIntRnd>, + X86cvtp2UIntRnd, SSE_CVT_PS2I>, PS, EVEX_CD8<32, CD8VF>; + defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt, - X86cvtp2UIntRnd>, VEX_W, + X86cvtp2UIntRnd, SSE_CVT_PD2I>, VEX_W, PS, EVEX_CD8<64, CD8VF>; defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int, - X86cvtp2IntRnd>, VEX_W, + X86cvtp2IntRnd, SSE_CVT_PD2I>, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int, - X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VH>; + X86cvtp2IntRnd, SSE_CVT_PS2I>, PD, + EVEX_CD8<32, CD8VH>; defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt, - X86cvtp2UIntRnd>, VEX_W, + X86cvtp2UIntRnd, SSE_CVT_PD2I>, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, - X86cvtp2UIntRnd>, PD, EVEX_CD8<32, CD8VH>; + X86cvtp2UIntRnd, SSE_CVT_PS2I>, PD, + EVEX_CD8<32, CD8VH>; defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint, - X86cvttp2siRnd>, VEX_W, + X86cvttp2siRnd, SSE_CVT_PD2I>, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, X86cvttp2si, - X86cvttp2siRnd>, PD, EVEX_CD8<32, CD8VH>; 
+ X86cvttp2siRnd, SSE_CVT_PS2I>, PD, + EVEX_CD8<32, CD8VH>; defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint, - X86cvttp2uiRnd>, VEX_W, + X86cvttp2uiRnd, SSE_CVT_PD2I>, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, X86cvttp2ui, - X86cvttp2uiRnd>, PD, EVEX_CD8<32, CD8VH>; + X86cvttp2uiRnd, SSE_CVT_PS2I>, PD, + EVEX_CD8<32, CD8VH>; defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp, - X86VSintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; + X86VSintToFpRnd, SSE_CVT_I2PD>, VEX_W, XS, + EVEX_CD8<64, CD8VF>; defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp, - X86VUintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>; + X86VUintToFpRnd, SSE_CVT_I2PD>, VEX_W, XS, + EVEX_CD8<64, CD8VF>; defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP, - X86VSintToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>; + X86VSintToFpRnd, SSE_CVT_I2PS>, VEX_W, PS, + EVEX_CD8<64, CD8VF>; defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP, - X86VUintToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>; + X86VUintToFpRnd, SSE_CVT_I2PS>, VEX_W, XD, + EVEX_CD8<64, CD8VF>; let Predicates = [HasAVX512, NoVLX] in { def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))), @@ -7362,11 +7321,6 @@ def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))), (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)))), sub_xmm)>; -def : Pat<(v4i32 (X86cvttp2ui (v2f64 VR128X:$src))), - (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr - (v8f64 (INSERT_SUBREG (IMPLICIT_DEF), - VR128X:$src, sub_xmm)))), sub_xmm)>; - def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))), (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), @@ -7393,16 +7347,32 @@ let Predicates = [HasAVX512, HasVLX] in { def : Pat<(X86vzmovl (v2i64 (bitconvert (v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))), (VCVTPD2DQZ128rr VR128X:$src)>; - def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))))), + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))), + (VCVTPD2DQZ128rm addr:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))), (VCVTPD2UDQZ128rr VR128X:$src)>; def : Pat<(X86vzmovl (v2i64 (bitconvert (v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))), (VCVTTPD2DQZ128rr VR128X:$src)>; - def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert - (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))))), + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))), + (VCVTTPD2DQZ128rm addr:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))), (VCVTTPD2UDQZ128rr VR128X:$src)>; } + + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (VCVTDQ2PDZ128rm addr:$src)>; + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + (VCVTDQ2PDZ128rm addr:$src)>; + + def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), + (VCVTUDQ2PDZ128rm addr:$src)>; + def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + (VCVTUDQ2PDZ128rm addr:$src)>; } let Predicates = [HasAVX512] in { @@ -7488,76 +7458,113 @@ def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))), //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// 
-multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, - X86MemOperand x86memop, PatFrag ld_frag> { - defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), - "vcvtph2ps", "$src", "$src", - (X86cvtph2ps (_src.VT _src.RC:$src), - (i32 FROUND_CURRENT))>, T8PD; - defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src), - "vcvtph2ps", "$src", "$src", - (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))), - (i32 FROUND_CURRENT))>, T8PD; -} - -multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { - defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src), - "vcvtph2ps", "{sae}, $src", "$src, {sae}", - (X86cvtph2ps (_src.VT _src.RC:$src), - (i32 FROUND_NO_EXC))>, T8PD, EVEX_B; +multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src, + X86MemOperand x86memop, PatFrag ld_frag, + OpndItins itins> { + defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), + (ins _src.RC:$src), "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT _src.RC:$src)),itins.rr>, + T8PD, Sched<[itins.Sched]>; + defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), + (ins x86memop:$src), "vcvtph2ps", "$src", "$src", + (X86cvtph2ps (_src.VT + (bitconvert + (ld_frag addr:$src)))), itins.rm>, + T8PD, Sched<[itins.Sched.Folded]>; +} + +multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src, + OpndItins itins> { + defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst), + (ins _src.RC:$src), "vcvtph2ps", + "{sae}, $src", "$src, {sae}", + (X86cvtph2psRnd (_src.VT _src.RC:$src), + (i32 FROUND_NO_EXC)), itins.rr>, + T8PD, EVEX_B, Sched<[itins.Sched]>; } -let Predicates = [HasAVX512] in { - defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>, - avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>, +let Predicates = [HasAVX512] in + defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64, + SSE_CVT_PH2PS>, + avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, SSE_CVT_PH2PS>, EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; - let Predicates = [HasVLX] in { - defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem, - loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; - defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem, - loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; - } + +let Predicates = [HasVLX] in { + defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem, + loadv2i64, SSE_CVT_PH2PS>, EVEX, EVEX_V256, + EVEX_CD8<32, CD8VH>; + defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem, + loadv2i64, SSE_CVT_PH2PS>, EVEX, EVEX_V128, + EVEX_CD8<32, CD8VH>; + + // Pattern match vcvtph2ps of a scalar i64 load. 
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), + (VCVTPH2PSZ128rm addr:$src)>; + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), + (VCVTPH2PSZ128rm addr:$src)>; + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert + (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), + (VCVTPH2PSZ128rm addr:$src)>; } multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src, - X86MemOperand x86memop> { + X86MemOperand x86memop, OpndItins itins> { defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph", "$src2, $src1", "$src1, $src2", (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 imm:$src2)), - NoItinerary, 0, 0, X86select>, AVX512AIi8Base; - def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), - (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1), - (i32 imm:$src2))), - addr:$dst)]>; - let hasSideEffects = 0, mayStore = 1 in - def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), - (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), - "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", - []>, EVEX_K; + itins.rr, 0, 0>, AVX512AIi8Base, Sched<[itins.Sched]>; + let hasSideEffects = 0, mayStore = 1 in { + def mr : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", + [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs), + (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2), + "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}", + [], itins.rm>, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>; + } } -multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> { + +multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src, + OpndItins itins> { let hasSideEffects = 0 in - defm rb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest, + defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest, (outs _dest.RC:$dst), (ins _src.RC:$src1, i32u8imm:$src2), "vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2", - []>, EVEX_B, AVX512AIi8Base; + [], itins.rr>, EVEX_B, AVX512AIi8Base, Sched<[itins.Sched]>; } + let Predicates = [HasAVX512] in { - defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>, - avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>, - EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem, + SSE_CVT_PS2PH>, + avx512_cvtps2ph_sae<v16i16x_info, v16f32_info, + SSE_CVT_PS2PH>, EVEX, EVEX_V512, + EVEX_CD8<32, CD8VH>; let Predicates = [HasVLX] in { - defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>, - EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>; - defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>, - EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem, + SSE_CVT_PS2PH>, EVEX, EVEX_V256, + EVEX_CD8<32, CD8VH>; + defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem, + SSE_CVT_PS2PH>, EVEX, EVEX_V128, + EVEX_CD8<32, CD8VH>; } + + def : Pat<(store (f64 (extractelt + (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>; + def : Pat<(store (i64 (extractelt + (bc_v2i64 (v8i16 (X86cvtps2ph 
VR128X:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst), + (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>; + def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst), + (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>; } // Patterns for matching conversions from float to half-float and vice versa. @@ -7580,502 +7587,500 @@ let Predicates = [HasVLX] in { (VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >; } -// Patterns for matching float to half-float conversion when AVX512 is supported -// but F16C isn't. In that case we have to use 512-bit vectors. -let Predicates = [HasAVX512, NoVLX, NoF16C] in { - def : Pat<(fp_to_f16 FR32X:$src), - (i16 (EXTRACT_SUBREG - (VMOVPDI2DIZrr - (v8i16 (EXTRACT_SUBREG - (VCVTPS2PHZrr - (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), - (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), - sub_xmm), 4), sub_xmm))), sub_16bit))>; - - def : Pat<(f16_to_fp GR16:$src), - (f32 (COPY_TO_REGCLASS - (v4f32 (EXTRACT_SUBREG - (VCVTPH2PSZrr - (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), - (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)), - sub_xmm)), sub_xmm)), FR32X))>; - - def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))), - (f32 (COPY_TO_REGCLASS - (v4f32 (EXTRACT_SUBREG - (VCVTPH2PSZrr - (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), - (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), - sub_xmm), 4)), sub_xmm)), FR32X))>; -} - // Unordered/Ordered scalar fp compare with Sea and set EFLAGS multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _, - string OpcodeStr> { - def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), - !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), - [], IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128, - Sched<[WriteFAdd]>; + string OpcodeStr, OpndItins itins> { + let hasSideEffects = 0 in + def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2), + !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), + [], itins.rr>, EVEX, EVEX_B, VEX_LIG, EVEX_V128, + Sched<[itins.Sched]>; } let Defs = [EFLAGS], Predicates = [HasAVX512] in { - defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss">, + defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", SSE_COMIS>, AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; - defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd">, + defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", SSE_COMIS>, AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; - defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss">, + defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", SSE_COMIS>, AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>; - defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd">, + defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", SSE_COMIS>, AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>; } let Defs = [EFLAGS], Predicates = [HasAVX512] in { defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32, - "ucomiss">, PS, EVEX, VEX_LIG, + "ucomiss", SSE_COMIS>, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64, - "ucomisd">, PD, EVEX, + "ucomisd", SSE_COMIS>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; let Pattern = []<dag> in { defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32, - "comiss">, PS, EVEX, VEX_LIG, + 
"comiss", SSE_COMIS>, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64, - "comisd">, PD, EVEX, + "comisd", SSE_COMIS>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } let isCodeGenOnly = 1 in { defm Int_VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss">, PS, EVEX, VEX_LIG, + sse_load_f32, "ucomiss", SSE_COMIS>, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm Int_VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd">, PD, EVEX, + sse_load_f64, "ucomisd", SSE_COMIS>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; defm Int_VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem, - sse_load_f32, "comiss">, PS, EVEX, VEX_LIG, + sse_load_f32, "comiss", SSE_COMIS>, PS, EVEX, VEX_LIG, EVEX_CD8<32, CD8VT1>; defm Int_VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem, - sse_load_f64, "comisd">, PD, EVEX, + sse_load_f64, "comisd", SSE_COMIS>, PD, EVEX, VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>; } } /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V; + (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)), itins.rr>, + EVEX_4V, Sched<[itins.Sched]>; defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), - (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V; + _.ScalarIntMemCPat:$src2), itins.rm>, EVEX_4V, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>, - EVEX_CD8<32, CD8VT1>, T8PD; -defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>, - VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; -defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>, - EVEX_CD8<32, CD8VT1>, T8PD; -defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>, - VEX_W, EVEX_CD8<64, CD8VT1>, T8PD; +defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SSE_RCPS, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable; +defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, SSE_RCPS, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable; +defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, SSE_RSQRTSS, f32x_info>, + EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable; +defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, SSE_RSQRTSS, f64x_info>, + VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable; /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD; + (_.FloatVT (OpNode _.RC:$src)), itins.rr>, EVEX, T8PD, + Sched<[itins.Sched]>; defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), 
OpcodeStr, "$src", "$src", (OpNode (_.FloatVT - (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD; + (bitconvert (_.LdFrag addr:$src)))), itins.rm>, EVEX, T8PD, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.FloatVT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, - EVEX, T8PD, EVEX_B; + (X86VBroadcast (_.ScalarLdFrag addr:$src)))), itins.rm>, + EVEX, T8PD, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>, - EVEX_V512, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>, - EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; +multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode, + SizeItins itins> { + defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, itins.s, + v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>; + defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, itins.d, + v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; // Define only if AVX512VL feature is present. let Predicates = [HasVLX] in { defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), - OpNode, v4f32x_info>, + OpNode, itins.s, v4f32x_info>, EVEX_V128, EVEX_CD8<32, CD8VF>; defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), - OpNode, v8f32x_info>, + OpNode, itins.s, v8f32x_info>, EVEX_V256, EVEX_CD8<32, CD8VF>; defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), - OpNode, v2f64x_info>, + OpNode, itins.d, v2f64x_info>, EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>; defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), - OpNode, v4f64x_info>, + OpNode, itins.d, v4f64x_info>, EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>; } } -defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>; -defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>; +defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SSE_RSQRT_P>; +defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SSE_RCP_P>; /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { let ExeDomain = _.ExeDomain in { defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_CURRENT))>; + (i32 FROUND_CURRENT)), itins.rr>, + Sched<[itins.Sched]>; defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "{sae}, $src2, $src1", "$src1, $src2, {sae}", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_NO_EXC))>, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rm>, EVEX_B, + Sched<[itins.Sched]>; defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNode (_.VT _.RC:$src1), - (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), - (i32 FROUND_CURRENT))>; + (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2, + (i32 FROUND_CURRENT)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode 
OpNode> { - defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>, +multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode, + SizeItins itins> { + defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, itins.s>, EVEX_CD8<32, CD8VT1>; - defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>, + defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, itins.d>, EVEX_CD8<64, CD8VT1>, VEX_W; } let Predicates = [HasERI] in { - defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V; - defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V; + defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SSE_RCP_S>, + T8PD, EVEX_4V; + defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, SSE_RSQRT_S>, + T8PD, EVEX_4V; } -defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V; +defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds, SSE_ALU_ITINS_S>, + T8PD, EVEX_4V; /// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { let ExeDomain = _.ExeDomain in { defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>; + (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT)), + itins.rr>, Sched<[itins.Sched]>; defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), OpcodeStr, "$src", "$src", (OpNode (_.FloatVT (bitconvert (_.LdFrag addr:$src))), - (i32 FROUND_CURRENT))>; + (i32 FROUND_CURRENT)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, (OpNode (_.FloatVT (X86VBroadcast (_.ScalarLdFrag addr:$src))), - (i32 FROUND_CURRENT))>, EVEX_B; + (i32 FROUND_CURRENT)), itins.rm>, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { let ExeDomain = _.ExeDomain in defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "{sae}, $src", "$src, {sae}", - (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B; + (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)), + itins.rr>, EVEX_B, Sched<[itins.Sched]>; } -multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> { - defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>, - avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode>, +multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode, + SizeItins itins> { + defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, itins.s>, + avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode, itins.s>, T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>; - defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>, - avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode>, + defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, itins.d>, + avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode, itins.d>, T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>; } multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr, - SDNode OpNode> { + SDNode OpNode, SizeItins itins> { // Define only if AVX512VL feature is present. 
let Predicates = [HasVLX] in { - defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode>, + defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, itins.s>, EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>; - defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode>, + defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, itins.s>, EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>; - defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode>, + defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, itins.d>, EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; - defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode>, + defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, itins.d>, EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>; } } let Predicates = [HasERI] in { - defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX; - defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX; - defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX; + defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SSE_RSQRT_P>, EVEX; + defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SSE_RCP_P>, EVEX; + defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SSE_ALU_ITINS_P>, EVEX; } -defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>, - avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd> , EVEX; +defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SSE_ALU_ITINS_P>, + avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd, + SSE_ALU_ITINS_P>, EVEX; -multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, - SDNode OpNodeRnd, X86VectorVTInfo _>{ +multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, OpndItins itins, + X86VectorVTInfo _>{ let ExeDomain = _.ExeDomain in defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc", - (_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>, - EVEX, EVEX_B, EVEX_RC; + (_.VT (X86fsqrtRnd _.RC:$src, (i32 imm:$rc))), itins.rr>, + EVEX, EVEX_B, EVEX_RC, Sched<[itins.Sched]>; } -multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _>{ +multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, OpndItins itins, + X86VectorVTInfo _>{ let ExeDomain = _.ExeDomain in { defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (_.FloatVT (OpNode _.RC:$src))>, EVEX; + (_.FloatVT (fsqrt _.RC:$src)), itins.rr>, EVEX, + Sched<[itins.Sched]>; defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src), OpcodeStr, "$src", "$src", - (OpNode (_.FloatVT - (bitconvert (_.LdFrag addr:$src))))>, EVEX; - + (fsqrt (_.FloatVT + (bitconvert (_.LdFrag addr:$src)))), itins.rm>, EVEX, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr, - (OpNode (_.FloatVT - (X86VBroadcast (_.ScalarLdFrag addr:$src))))>, - EVEX, EVEX_B; + (fsqrt (_.FloatVT + (X86VBroadcast (_.ScalarLdFrag addr:$src)))), itins.rm>, + EVEX, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr, - SDNode OpNode> { - defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode, - v16f32_info>, +multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr> { + defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), 
SSE_SQRTPS, v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode, - v8f64_info>, + defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), SSE_SQRTPD, v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; // Define only if AVX512VL feature is present. let Predicates = [HasVLX] in { defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), - OpNode, v4f32x_info>, + SSE_SQRTPS, v4f32x_info>, EVEX_V128, PS, EVEX_CD8<32, CD8VF>; defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), - OpNode, v8f32x_info>, + SSE_SQRTPS, v8f32x_info>, EVEX_V256, PS, EVEX_CD8<32, CD8VF>; defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), - OpNode, v2f64x_info>, + SSE_SQRTPD, v2f64x_info>, EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>; defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), - OpNode, v4f64x_info>, + SSE_SQRTPD, v4f64x_info>, EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>; } } -multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr, - SDNode OpNodeRnd> { - defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), OpNodeRnd, +multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr> { + defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), SSE_SQRTPS, v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>; - defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), OpNodeRnd, + defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), SSE_SQRTPD, v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>; } -multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _, - string SUFF, SDNode OpNode, SDNode OpNodeRnd> { +multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, OpndItins itins, + X86VectorVTInfo _, string SUFF, Intrinsic Intr> { let ExeDomain = _.ExeDomain in { defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNodeRnd (_.VT _.RC:$src1), + (X86fsqrtRnds (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 FROUND_CURRENT))>; + (i32 FROUND_CURRENT)), itins.rr>, + Sched<[itins.Sched]>; defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr, + (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr, "$src2, $src1", "$src1, $src2", - (OpNodeRnd (_.VT _.RC:$src1), - (_.VT (scalar_to_vector - (_.ScalarLdFrag addr:$src2))), - (i32 FROUND_CURRENT))>; - + (X86fsqrtRnds (_.VT _.RC:$src1), + _.ScalarIntMemCPat:$src2, + (i32 FROUND_CURRENT)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr, "$rc, $src2, $src1", "$src1, $src2, $rc", - (OpNodeRnd (_.VT _.RC:$src1), + (X86fsqrtRnds (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$rc))>, - EVEX_B, EVEX_RC; + (i32 imm:$rc)), itins.rr>, + EVEX_B, EVEX_RC, Sched<[itins.Sched]>; let isCodeGenOnly = 1, hasSideEffects = 0 in { def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), (ins _.FRC:$src1, _.FRC:$src2), - OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; - + OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], itins.rr>, + Sched<[itins.Sched]>; let mayLoad = 1 in def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), (ins _.FRC:$src1, _.ScalarMemOp:$src2), - OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>; + OpcodeStr#"\t{$src2, $src1, $dst|$dst, 
$src1, $src2}", [], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } - def : Pat<(_.EltVT (OpNode _.FRC:$src)), +let Predicates = [HasAVX512] in { + def : Pat<(_.EltVT (fsqrt _.FRC:$src)), (!cast<Instruction>(NAME#SUFF#Zr) (_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>; - def : Pat<(_.EltVT (OpNode (load addr:$src))), + def : Pat<(Intr VR128X:$src), + (!cast<Instruction>(NAME#SUFF#Zr_Int) VR128X:$src, + VR128X:$src)>; +} + +let Predicates = [HasAVX512, OptForSize] in { + def : Pat<(_.EltVT (fsqrt (load addr:$src))), (!cast<Instruction>(NAME#SUFF#Zm) - (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>; + (_.EltVT (IMPLICIT_DEF)), addr:$src)>; + + def : Pat<(Intr _.ScalarIntMemCPat:$src2), + (!cast<Instruction>(NAME#SUFF#Zm_Int) + (_.VT (IMPLICIT_DEF)), addr:$src2)>; +} + } multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> { - defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt, - X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS; - defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt, - X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W; + defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", SSE_SQRTPS, f32x_info, "SS", + int_x86_sse_sqrt_ss>, + EVEX_CD8<32, CD8VT1>, EVEX_4V, XS, NotMemoryFoldable; + defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", SSE_SQRTPD, f64x_info, "SD", + int_x86_sse2_sqrt_sd>, + EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W, + NotMemoryFoldable; } -defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>, - avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>; +defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt">, + avx512_sqrt_packed_all_round<0x51, "vsqrt">; defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG; -let Predicates = [HasAVX512] in { - def : Pat<(f32 (X86frsqrt FR32X:$src)), - (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>; - def : Pat<(f32 (X86frsqrt (load addr:$src))), - (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, - Requires<[OptForSize]>; - def : Pat<(f32 (X86frcp FR32X:$src)), - (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>; - def : Pat<(f32 (X86frcp (load addr:$src))), - (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>, - Requires<[OptForSize]>; -} - -multiclass -avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> { - +multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { - defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3), (i32 FROUND_CURRENT)))>; + (i32 imm:$src3))), itins.rr>, + Sched<[itins.Sched]>; - defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), + defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3", - (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B; + (_.VT (X86RndScalesRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2), + (i32 imm:$src3), (i32 FROUND_NO_EXC))), itins.rr>, EVEX_B, + 
Sched<[itins.Sched]>; - defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), - (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), + defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), + (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", - (_.VT (X86RndScales (_.VT _.RC:$src1), - (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), - (i32 imm:$src3), (i32 FROUND_CURRENT)))>; + (_.VT (X86RndScales _.RC:$src1, + _.ScalarIntMemCPat:$src2, (i32 imm:$src3))), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + + let isCodeGenOnly = 1, hasSideEffects = 0 in { + def r : I<opc, MRMSrcReg, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.FRC:$src2, i32u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [], itins.rr>, Sched<[itins.Sched]>; + + let mayLoad = 1 in + def m : I<opc, MRMSrcMem, (outs _.FRC:$dst), + (ins _.FRC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), + OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } + } + let Predicates = [HasAVX512] in { - def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS - (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x9))), _.FRC)>; - def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS - (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xa))), _.FRC)>; - def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS - (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xb))), _.FRC)>; - def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS - (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>; - def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS - (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)), - (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>; - - def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS - (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x9))), _.FRC)>; - def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS - (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0xa))), _.FRC)>; - def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS - (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0xb))), _.FRC)>; - def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS - (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0x4))), _.FRC)>; - def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS - (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)), - addr:$src, (i32 0xc))), _.FRC)>; - } -} - -defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>; - -defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>; + def : Pat<(ffloor _.FRC:$src), + (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + _.FRC:$src, (i32 0x9)))>; + def : Pat<(fceil _.FRC:$src), + (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + _.FRC:$src, (i32 0xa)))>; + def : Pat<(ftrunc _.FRC:$src), + (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + _.FRC:$src, (i32 0xb)))>; + def : 
Pat<(frint _.FRC:$src), + (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + _.FRC:$src, (i32 0x4)))>; + def : Pat<(fnearbyint _.FRC:$src), + (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)), + _.FRC:$src, (i32 0xc)))>; + } + + let Predicates = [HasAVX512, OptForSize] in { + def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), + (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + addr:$src, (i32 0x9)))>; + def : Pat<(fceil (_.ScalarLdFrag addr:$src)), + (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + addr:$src, (i32 0xa)))>; + def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), + (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + addr:$src, (i32 0xb)))>; + def : Pat<(frint (_.ScalarLdFrag addr:$src)), + (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + addr:$src, (i32 0x4)))>; + def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), + (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)), + addr:$src, (i32 0xc)))>; + } +} + +defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", SSE_ALU_F32S, + f32x_info>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>; + +defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", SSE_ALU_F64S, + f64x_info>, VEX_W, AVX512AIi8Base, EVEX_4V, + EVEX_CD8<64, CD8VT1>; //------------------------------------------------- // Integer truncate and extend operations //------------------------------------------------- +let Sched = WriteShuffle256 in +def AVX512_EXTEND : OpndItins< + IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI +>; + +let Sched = WriteShuffle256 in +def AVX512_TRUNCATE : OpndItins< + IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI +>; + multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo, - X86MemOperand x86memop> { + OpndItins itins, X86VectorVTInfo SrcInfo, + X86VectorVTInfo DestInfo, X86MemOperand x86memop> { let ExeDomain = DestInfo.ExeDomain in defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1", - (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>, - EVEX, T8XS; - - // for intrinsic patter match - def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, - (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), - undef)), - (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , - SrcInfo.RC:$src1)>; - - def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, - (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), - DestInfo.ImmAllZerosV)), - (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask , - SrcInfo.RC:$src1)>; - - def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask, - (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), - DestInfo.RC:$src0)), - (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0, - DestInfo.KRCWM:$mask , - SrcInfo.RC:$src1)>; + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))), + itins.rr>, EVEX, T8XS, Sched<[itins.Sched]>; let mayStore = 1, mayLoad = 1, hasSideEffects = 0, ExeDomain = DestInfo.ExeDomain in { def mr : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, SrcInfo.RC:$src), OpcodeStr # "\t{$src, $dst|$dst, $src}", - []>, EVEX; + [], itins.rm>, EVEX, Sched<[itins.Sched.Folded]>; def mrk : AVX512XS8I<opc, MRMDestMem, (outs), (ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src), OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", - []>, EVEX, EVEX_K; + [], itins.rm>, EVEX, EVEX_K, Sched<[itins.Sched.Folded]>; }//mayStore 
= 1, mayLoad = 1, hasSideEffects = 0 } @@ -8094,112 +8099,118 @@ multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo, } multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, + OpndItins itins, AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128, X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ, X86MemOperand x86memopZ128, X86MemOperand x86memopZ256, X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag, Predicate prd = HasAVX512>{ let Predicates = [HasVLX, prd] in { - defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128, - DestInfoZ128, x86memopZ128>, + defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, itins, + VTSrcInfo.info128, DestInfoZ128, x86memopZ128>, avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128, truncFrag, mtruncFrag>, EVEX_V128; - defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256, - DestInfoZ256, x86memopZ256>, + defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, itins, + VTSrcInfo.info256, DestInfoZ256, x86memopZ256>, avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256, truncFrag, mtruncFrag>, EVEX_V256; } let Predicates = [prd] in - defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512, - DestInfoZ, x86memopZ>, + defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, itins, + VTSrcInfo.info512, DestInfoZ, x86memopZ>, avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ, truncFrag, mtruncFrag>, EVEX_V512; } multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode, - PatFrag StoreNode, PatFrag MaskedStoreNode> { - defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + OpndItins itins, PatFrag StoreNode, + PatFrag MaskedStoreNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info, v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem, StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VO>; } multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode, - PatFrag StoreNode, PatFrag MaskedStoreNode> { - defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + OpndItins itins, PatFrag StoreNode, + PatFrag MaskedStoreNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info, v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem, StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VQ>; } multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode, - PatFrag StoreNode, PatFrag MaskedStoreNode> { - defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info, + OpndItins itins, PatFrag StoreNode, + PatFrag MaskedStoreNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info, v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem, StoreNode, MaskedStoreNode>, EVEX_CD8<32, CD8VH>; } multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode, - PatFrag StoreNode, PatFrag MaskedStoreNode> { - defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + OpndItins itins, PatFrag StoreNode, + PatFrag MaskedStoreNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i32_info, v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem, StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VQ>; } multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode, - PatFrag StoreNode, PatFrag MaskedStoreNode> { - defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info, + OpndItins itins, PatFrag 
StoreNode, + PatFrag MaskedStoreNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i32_info, v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem, StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VH>; } multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode, - PatFrag StoreNode, PatFrag MaskedStoreNode> { - defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info, + OpndItins itins, PatFrag StoreNode, + PatFrag MaskedStoreNode> { + defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i16_info, v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem, StoreNode, MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>; } -defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc, +defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc, AVX512_TRUNCATE, truncstorevi8, masked_truncstorevi8>; -defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, +defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, AVX512_TRUNCATE, truncstore_s_vi8, masked_truncstore_s_vi8>; -defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus, +defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus, AVX512_TRUNCATE, truncstore_us_vi8, masked_truncstore_us_vi8>; -defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc, +defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc, AVX512_TRUNCATE, truncstorevi16, masked_truncstorevi16>; -defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, +defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, AVX512_TRUNCATE, truncstore_s_vi16, masked_truncstore_s_vi16>; -defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, +defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, AVX512_TRUNCATE, truncstore_us_vi16, masked_truncstore_us_vi16>; -defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc, +defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc, AVX512_TRUNCATE, truncstorevi32, masked_truncstorevi32>; -defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, +defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, AVX512_TRUNCATE, truncstore_s_vi32, masked_truncstore_s_vi32>; -defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, +defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, AVX512_TRUNCATE, truncstore_us_vi32, masked_truncstore_us_vi32>; -defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc, +defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc, AVX512_TRUNCATE, truncstorevi8, masked_truncstorevi8>; -defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, +defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, AVX512_TRUNCATE, truncstore_s_vi8, masked_truncstore_s_vi8>; -defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, +defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, AVX512_TRUNCATE, truncstore_us_vi8, masked_truncstore_us_vi8>; -defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc, +defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc, AVX512_TRUNCATE, truncstorevi16, masked_truncstorevi16>; -defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, +defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, AVX512_TRUNCATE, truncstore_s_vi16, masked_truncstore_s_vi16>; -defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, +defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, AVX512_TRUNCATE, truncstore_us_vi16, masked_truncstore_us_vi16>; -defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc, 
+defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc, AVX512_TRUNCATE, truncstorevi8, masked_truncstorevi8>; -defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, +defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, AVX512_TRUNCATE, truncstore_s_vi8, masked_truncstore_s_vi8>; -defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, +defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, AVX512_TRUNCATE, truncstore_us_vi8, masked_truncstore_us_vi8>; let Predicates = [HasAVX512, NoVLX] in { @@ -8219,191 +8230,151 @@ def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))), VR256X:$src, sub_ymm))), sub_xmm))>; } -multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, +multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, OpndItins itins, X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo, X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{ let ExeDomain = DestInfo.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src", - (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>, - EVEX; + (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src))), itins.rr>, + EVEX, Sched<[itins.Sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), (ins x86memop:$src), OpcodeStr ,"$src", "$src", - (DestInfo.VT (LdFrag addr:$src))>, - EVEX; + (DestInfo.VT (LdFrag addr:$src)), itins.rm>, + EVEX, Sched<[itins.Sched.Folded]>; } } multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasBWI] in { - defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info, + defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v8i16x_info, v16i8x_info, i64mem, LdFrag, InVecNode>, - EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128; + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info, + defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v16i16x_info, v16i8x_info, i128mem, LdFrag, OpNode>, - EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256; + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasBWI] in { - defm Z : avx512_extend_common<opc, OpcodeStr, v32i16_info, + defm Z : avx512_extend_common<opc, OpcodeStr, itins, v32i16_info, v32i8x_info, i256mem, LdFrag, OpNode>, - EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512; + EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, VEX_WIG; } } multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info, + defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v4i32x_info, v16i8x_info, i32mem, LdFrag, InVecNode>, - EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128; + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info, + defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v8i32x_info, v16i8x_info, i64mem, LdFrag, OpNode>, - EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256; + EVEX_CD8<8, CD8VQ>, 
T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info, + defm Z : avx512_extend_common<opc, OpcodeStr, itins, v16i32_info, v16i8x_info, i128mem, LdFrag, OpNode>, - EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512; + EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, VEX_WIG; } } multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info, v16i8x_info, i16mem, LdFrag, InVecNode>, - EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128; + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info, v16i8x_info, i32mem, LdFrag, OpNode>, - EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256; + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info, v16i8x_info, i64mem, LdFrag, OpNode>, - EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512; + EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG; } } multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info, + defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v4i32x_info, v8i16x_info, i64mem, LdFrag, InVecNode>, - EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128; + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info, + defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v8i32x_info, v8i16x_info, i128mem, LdFrag, OpNode>, - EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256; + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info, + defm Z : avx512_extend_common<opc, OpcodeStr, itins, v16i32_info, v16i16x_info, i256mem, LdFrag, OpNode>, - EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512; + EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, VEX_WIG; } } multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info, v8i16x_info, i32mem, LdFrag, InVecNode>, - EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128; + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG; - defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info, v8i16x_info, i64mem, LdFrag, OpNode>, - EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256; + 
EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG; } let Predicates = [HasAVX512] in { - defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info, v8i16x_info, i128mem, LdFrag, OpNode>, - EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512; + EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512, VEX_WIG; } } multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr, - SDPatternOperator OpNode, SDPatternOperator InVecNode, - string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { + SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy, + OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> { let Predicates = [HasVLX, HasAVX512] in { - defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info, + defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info, v4i32x_info, i64mem, LdFrag, InVecNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128; - defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info, + defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info, v4i32x_info, i128mem, LdFrag, OpNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256; } let Predicates = [HasAVX512] in { - defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info, + defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info, v8i32x_info, i256mem, LdFrag, OpNode>, EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512; } } -defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z">; -defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z">; -defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z">; -defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z">; -defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z">; -defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z">; +defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", AVX512_EXTEND>; +defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", AVX512_EXTEND>; +defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", AVX512_EXTEND>; +defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", AVX512_EXTEND>; +defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", AVX512_EXTEND>; +defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", AVX512_EXTEND>; -defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s">; -defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s">; -defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s">; -defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s">; -defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">; -defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">; +defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", AVX512_EXTEND>; +defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", AVX512_EXTEND>; +defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", AVX512_EXTEND>; +defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", AVX512_EXTEND>; +defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", AVX512_EXTEND>; +defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", AVX512_EXTEND>; -// EXTLOAD patterns, implemented 
using vpmovz -multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To, - X86VectorVTInfo From, PatFrag LdFrag> { - def : Pat<(To.VT (LdFrag addr:$src)), - (!cast<Instruction>("VPMOVZX"#InstrStr#"rm") addr:$src)>; - def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src), To.RC:$src0)), - (!cast<Instruction>("VPMOVZX"#InstrStr#"rmk") To.RC:$src0, - To.KRC:$mask, addr:$src)>; - def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src), - To.ImmAllZerosV)), - (!cast<Instruction>("VPMOVZX"#InstrStr#"rmkz") To.KRC:$mask, - addr:$src)>; -} - -let Predicates = [HasVLX, HasBWI] in { - defm : avx512_ext_lowering<"BWZ128", v8i16x_info, v16i8x_info, extloadvi8>; - defm : avx512_ext_lowering<"BWZ256", v16i16x_info, v16i8x_info, extloadvi8>; -} -let Predicates = [HasBWI] in { - defm : avx512_ext_lowering<"BWZ", v32i16_info, v32i8x_info, extloadvi8>; -} -let Predicates = [HasVLX, HasAVX512] in { - defm : avx512_ext_lowering<"BDZ128", v4i32x_info, v16i8x_info, extloadvi8>; - defm : avx512_ext_lowering<"BDZ256", v8i32x_info, v16i8x_info, extloadvi8>; - defm : avx512_ext_lowering<"BQZ128", v2i64x_info, v16i8x_info, extloadvi8>; - defm : avx512_ext_lowering<"BQZ256", v4i64x_info, v16i8x_info, extloadvi8>; - defm : avx512_ext_lowering<"WDZ128", v4i32x_info, v8i16x_info, extloadvi16>; - defm : avx512_ext_lowering<"WDZ256", v8i32x_info, v8i16x_info, extloadvi16>; - defm : avx512_ext_lowering<"WQZ128", v2i64x_info, v8i16x_info, extloadvi16>; - defm : avx512_ext_lowering<"WQZ256", v4i64x_info, v8i16x_info, extloadvi16>; - defm : avx512_ext_lowering<"DQZ128", v2i64x_info, v4i32x_info, extloadvi32>; - defm : avx512_ext_lowering<"DQZ256", v4i64x_info, v4i32x_info, extloadvi32>; -} -let Predicates = [HasAVX512] in { - defm : avx512_ext_lowering<"BDZ", v16i32_info, v16i8x_info, extloadvi8>; - defm : avx512_ext_lowering<"BQZ", v8i64_info, v16i8x_info, extloadvi8>; - defm : avx512_ext_lowering<"WDZ", v16i32_info, v16i16x_info, extloadvi16>; - defm : avx512_ext_lowering<"WQZ", v8i64_info, v8i16x_info, extloadvi16>; - defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>; -} multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp, SDNode InVecOp, PatFrag ExtLoad16> { @@ -8552,18 +8523,20 @@ defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec, loadi16_anyext>; //===----------------------------------------------------------------------===// // GATHER - SCATTER Operations +// FIXME: Improve scheduling of gather/scatter instructions. 
multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _, - X86MemOperand memop, PatFrag GatherNode> { + X86MemOperand memop, PatFrag GatherNode, + RegisterClass MaskRC = _.KRCWM> { let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb", ExeDomain = _.ExeDomain in - def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb), - (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2), + def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, MaskRC:$mask_wb), + (ins _.RC:$src1, MaskRC:$mask, memop:$src2), !strconcat(OpcodeStr#_.Suffix, "\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"), - [(set _.RC:$dst, _.KRCWM:$mask_wb, - (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask, + [(set _.RC:$dst, MaskRC:$mask_wb, + (GatherNode (_.VT _.RC:$src1), MaskRC:$mask, vectoraddr:$src2))]>, EVEX, EVEX_K, - EVEX_CD8<_.EltSize, CD8VT1>; + EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>; } multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc, @@ -8598,7 +8571,8 @@ let Predicates = [HasVLX] in { defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128, vx128xmem, mgatherv4i32>, EVEX_V128; defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128, - vx64xmem, X86mgatherv2i64>, EVEX_V128; + vx64xmem, mgatherv2i64, VK2WM>, + EVEX_V128; } } @@ -8620,7 +8594,8 @@ let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"), [(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src), _.KRCWM:$mask, vectoraddr:$dst))]>, - EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[WriteStore]>; } multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc, @@ -8671,7 +8646,7 @@ multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeSt let Predicates = [HasPFI], hasSideEffects = 1 in def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src), !strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"), - []>, EVEX, EVEX_K; + [], IIC_SSE_PREFETCH>, EVEX, EVEX_K, Sched<[WriteLoad]>; } defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps", @@ -8722,20 +8697,11 @@ defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd", VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>; -// Helper fragments to match sext vXi1 to vXiY. -def v64i1sextv64i8 : PatLeaf<(v64i8 - (X86vsext - (v64i1 (X86pcmpgtm - (bc_v64i8 (v16i32 immAllZerosV)), - VR512:$src))))>; -def v32i1sextv32i16 : PatLeaf<(v32i16 (X86vsrai VR512:$src, (i8 15)))>; -def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>; -def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>; - multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > { def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src), !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"), - [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX; + [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))], + IIC_SSE_MOV_S_RR>, EVEX, Sched<[WriteMove]>; } // Use 512bit version to implement 128/256 bit in case NoVLX. 
@@ -8773,7 +8739,8 @@ defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > { def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX; + [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))], + IIC_SSE_MOV_S_RR>, EVEX, Sched<[WriteMove]>; } // Use 512bit version to implement 128/256 bit in case NoVLX. @@ -8819,27 +8786,39 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m", // AVX-512 - COMPRESS and EXPAND // +// FIXME: Is there a better scheduler itinerary for VPCOMPRESS/VPEXPAND? +let Sched = WriteShuffle256 in { +def AVX512_COMPRESS : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; +def AVX512_EXPAND : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; +} + multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _, - string OpcodeStr> { + string OpcodeStr, OpndItins itins> { defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst), (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", - (_.VT (X86compress _.RC:$src1))>, AVX5128IBase; + (_.VT (X86compress _.RC:$src1)), itins.rr>, AVX5128IBase, + Sched<[itins.Sched]>; let mayStore = 1, hasSideEffects = 0 in def mr : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src), OpcodeStr # "\t{$src, $dst|$dst, $src}", - []>, EVEX_CD8<_.EltSize, CD8VT1>; + []>, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[itins.Sched.Folded]>; def mrk : AVX5128I<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src), OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}", []>, - EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>; + EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[itins.Sched.Folded]>; } multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > { - def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask, (_.VT _.RC:$src)), (!cast<Instruction>(NAME#_.ZSuffix##mrk) @@ -8847,39 +8826,44 @@ multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > { } multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr, - AVX512VLVectorVTInfo VTInfo> { - defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr>, + OpndItins itins, + AVX512VLVectorVTInfo VTInfo, + Predicate Pred = HasAVX512> { + let Predicates = [Pred] in + defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr, itins>, compress_by_vec_width_lowering<VTInfo.info512>, EVEX_V512; - let Predicates = [HasVLX] in { - defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr>, + let Predicates = [Pred, HasVLX] in { + defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr, itins>, compress_by_vec_width_lowering<VTInfo.info256>, EVEX_V256; - defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr>, + defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr, itins>, compress_by_vec_width_lowering<VTInfo.info128>, EVEX_V128; } } -defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>, - EVEX; -defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>, - EVEX, VEX_W; -defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>, - EVEX; -defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>, - EVEX, VEX_W; +defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", AVX512_COMPRESS, + avx512vl_i32_info>, EVEX; +defm 
VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", AVX512_COMPRESS, + avx512vl_i64_info>, EVEX, VEX_W; +defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", AVX512_COMPRESS, + avx512vl_f32_info>, EVEX; +defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", AVX512_COMPRESS, + avx512vl_f64_info>, EVEX, VEX_W; // expand multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _, - string OpcodeStr> { + string OpcodeStr, OpndItins itins> { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", - (_.VT (X86expand _.RC:$src1))>, AVX5128IBase; + (_.VT (X86expand _.RC:$src1)), itins.rr>, AVX5128IBase, + Sched<[itins.Sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", (_.VT (X86expand (_.VT (bitconvert - (_.LdFrag addr:$src1)))))>, - AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>; + (_.LdFrag addr:$src1))))), itins.rm>, + AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > { @@ -8895,59 +8879,62 @@ multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > { } multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr, - AVX512VLVectorVTInfo VTInfo> { - defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>, + OpndItins itins, + AVX512VLVectorVTInfo VTInfo, + Predicate Pred = HasAVX512> { + let Predicates = [Pred] in + defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr, itins>, expand_by_vec_width_lowering<VTInfo.info512>, EVEX_V512; - let Predicates = [HasVLX] in { - defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>, + let Predicates = [Pred, HasVLX] in { + defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr, itins>, expand_by_vec_width_lowering<VTInfo.info256>, EVEX_V256; - defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>, + defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr, itins>, expand_by_vec_width_lowering<VTInfo.info128>, EVEX_V128; } } -defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>, - EVEX; -defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>, - EVEX, VEX_W; -defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>, - EVEX; -defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>, - EVEX, VEX_W; +defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", AVX512_EXPAND, + avx512vl_i32_info>, EVEX; +defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", AVX512_EXPAND, + avx512vl_i64_info>, EVEX, VEX_W; +defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", AVX512_EXPAND, + avx512vl_f32_info>, EVEX; +defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", AVX512_EXPAND, + avx512vl_f64_info>, EVEX, VEX_W; //handle instruction reg_vec1 = op(reg_vec,imm) // op(mem_vec,imm) // op(broadcast(eltVt),imm) //all instruction created with FROUND_CURRENT multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode (_.VT _.RC:$src1), - (i32 imm:$src2), - (i32 FROUND_CURRENT))>; + (i32 imm:$src2)), itins.rr>, Sched<[itins.Sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1, 
i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2", (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i32 imm:$src2), - (i32 FROUND_CURRENT))>; + (i32 imm:$src2)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1, i32u8imm:$src2), OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2", (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))), - (i32 imm:$src2), - (i32 FROUND_CURRENT))>, EVEX_B; + (i32 imm:$src2)), itins.rm>, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _>{ + SDNode OpNode, OpndItins itins, + X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, i32u8imm:$src2), @@ -8955,21 +8942,24 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, "$src1, {sae}, $src2", (OpNode (_.VT _.RC:$src1), (i32 imm:$src2), - (i32 FROUND_NO_EXC))>, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rr>, + EVEX_B, Sched<[itins.Sched]>; } multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, - AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, + SDNode OpNodeRnd, OpndItins itins, Predicate prd>{ let Predicates = [prd] in { - defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>, - avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>, - EVEX_V512; + defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins, + _.info512>, + avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, + itins, _.info512>, EVEX_V512; } let Predicates = [prd, HasVLX] in { - defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>, - EVEX_V128; - defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>, - EVEX_V256; + defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins, + _.info128>, EVEX_V128; + defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins, + _.info256>, EVEX_V256; } } @@ -8978,51 +8968,54 @@ multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr, // op(reg_vec2,broadcast(eltVt),imm) //all instruction created with FROUND_CURRENT multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ + OpndItins itins, X86VectorVTInfo _>{ let ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3), - (i32 FROUND_CURRENT))>; + (i32 imm:$src3)), itins.rr>, + Sched<[itins.Sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (bitconvert (_.LdFrag addr:$src2))), - (i32 imm:$src3), - (i32 FROUND_CURRENT))>; + (i32 imm:$src3)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1", "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), 
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i32 imm:$src3), - (i32 FROUND_CURRENT))>, EVEX_B; + (i32 imm:$src3)), itins.rm>, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) // op(reg_vec2,mem_vec,imm) multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{ + OpndItins itins, X86VectorVTInfo DestInfo, + X86VectorVTInfo SrcInfo>{ let ExeDomain = DestInfo.ExeDomain in { defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), (SrcInfo.VT SrcInfo.RC:$src2), - (i8 imm:$src3)))>; + (i8 imm:$src3))), itins.rr>, + Sched<[itins.Sched]>; defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst), (ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1), (SrcInfo.VT (bitconvert (SrcInfo.LdFrag addr:$src2))), - (i8 imm:$src3)))>; + (i8 imm:$src3))), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } @@ -9030,8 +9023,8 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, // op(reg_vec2,mem_vec,imm) // op(reg_vec2,broadcast(eltVt),imm) multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>: - avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{ + OpndItins itins, X86VectorVTInfo _>: + avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, itins, _, _>{ let ExeDomain = _.ExeDomain in defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), @@ -9040,36 +9033,37 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode, "$src1, ${src2}"##_.BroadcastStr##", $src3", (OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))), - (i8 imm:$src3))>, EVEX_B; + (i8 imm:$src3)), itins.rm>, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm) // op(reg_vec2,mem_scalar,imm) -//all instruction created with FROUND_CURRENT multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), - (i32 imm:$src3), - (i32 FROUND_CURRENT))>; + (i32 imm:$src3)), itins.rr>, + Sched<[itins.Sched]>; defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3), OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3", (OpNode (_.VT _.RC:$src1), (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))), - (i32 imm:$src3), - (i32 FROUND_CURRENT))>; + (i32 imm:$src3)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } //handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _>{ + SDNode OpNode, OpndItins itins, + X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), @@ -9078,11 +9072,13 @@ multiclass 
avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), - (i32 FROUND_NO_EXC))>, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rr>, + EVEX_B, Sched<[itins.Sched]>; } + //handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae} -multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _> { +multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), @@ -9091,113 +9087,114 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (i32 imm:$src3), - (i32 FROUND_NO_EXC))>, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rr>, + EVEX_B, Sched<[itins.Sched]>; } multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr, - AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, + SDNode OpNodeRnd, OpndItins itins, Predicate prd>{ let Predicates = [prd] in { - defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>, - avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>, + defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info512>, + avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, itins, _.info512>, EVEX_V512; } let Predicates = [prd, HasVLX] in { - defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>, + defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128; - defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>, + defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256; } } multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr, - AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{ - let Predicates = [HasBWI] in { - defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512, + OpndItins itins, AVX512VLVectorVTInfo DestInfo, + AVX512VLVectorVTInfo SrcInfo, Predicate Pred = HasBWI> { + let Predicates = [Pred] in { + defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info512, SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V; } - let Predicates = [HasBWI, HasVLX] in { - defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128, + let Predicates = [Pred, HasVLX] in { + defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info128, SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V; - defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256, + defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info256, SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V; } } multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _, - bits<8> opc, SDNode OpNode>{ - let Predicates = [HasAVX512] in { - defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; + bits<8> opc, SDNode OpNode, OpndItins itins, + Predicate Pred = HasAVX512> { + let Predicates = [Pred] in { + defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512; } - let Predicates = [HasAVX512, HasVLX] in { - defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128; - defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; + let Predicates = [Pred, HasVLX] in { + defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, 
OpNode, itins, _.info128>, EVEX_V128; + defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256; } } multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr, - X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{ + X86VectorVTInfo _, bits<8> opc, SDNode OpNode, + SDNode OpNodeRnd, OpndItins itins, Predicate prd>{ let Predicates = [prd] in { - defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, _>, - avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNode, _>; + defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, itins, _>, + avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeRnd, itins, _>; } } multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr, - bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{ + bits<8> opcPs, bits<8> opcPd, SDNode OpNode, + SDNode OpNodeRnd, SizeItins itins, Predicate prd>{ defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info, - opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>; + opcPs, OpNode, OpNodeRnd, itins.s, prd>, + EVEX_CD8<32, CD8VF>; defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info, - opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W; + opcPd, OpNode, OpNodeRnd, itins.d, prd>, + EVEX_CD8<64, CD8VF>, VEX_W; } - defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56, - X86VReduce, HasDQI>, AVX512AIi8Base, EVEX; + X86VReduce, X86VReduceRnd, SSE_ALU_ITINS_P, HasDQI>, + AVX512AIi8Base, EVEX; defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09, - X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX; + X86VRndScale, X86VRndScaleRnd, SSE_ALU_ITINS_P, HasAVX512>, + AVX512AIi8Base, EVEX; defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26, - X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX; - + X86VGetMant, X86VGetMantRnd, SSE_ALU_ITINS_P, HasAVX512>, + AVX512AIi8Base, EVEX; defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info, - 0x50, X86VRange, HasDQI>, + 0x50, X86VRange, X86VRangeRnd, + SSE_ALU_F64P, HasDQI>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info, - 0x50, X86VRange, HasDQI>, + 0x50, X86VRange, X86VRangeRnd, + SSE_ALU_F32P, HasDQI>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; -defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", f64x_info, - 0x51, X86VRange, HasDQI>, +defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", + f64x_info, 0x51, X86Ranges, X86RangesRnd, SSE_ALU_F64S, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info, - 0x51, X86VRange, HasDQI>, + 0x51, X86Ranges, X86RangesRnd, SSE_ALU_F32S, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info, - 0x57, X86Reduces, HasDQI>, + 0x57, X86Reduces, X86ReducesRnd, SSE_ALU_F64S, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info, - 0x57, X86Reduces, HasDQI>, + 0x57, X86Reduces, X86ReducesRnd, SSE_ALU_F32S, HasDQI>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info, - 0x27, X86GetMants, HasAVX512>, + 0x27, X86GetMants, X86GetMantsRnd, SSE_ALU_F64S, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; defm VGETMANTSS: 
avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info, - 0x27, X86GetMants, HasAVX512>, + 0x27, X86GetMants, X86GetMantsRnd, SSE_ALU_F32S, HasAVX512>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; -multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _, - bits<8> opc, SDNode OpNode = X86Shuf128>{ - let Predicates = [HasAVX512] in { - defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512; - - } - let Predicates = [HasAVX512, HasVLX] in { - defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256; - } -} let Predicates = [HasAVX512] in { def : Pat<(v16f32 (ffloor VR512:$src)), (VRNDSCALEPSZrri VR512:$src, (i32 0x9))>; @@ -9222,14 +9219,71 @@ def : Pat<(v8f64 (ftrunc VR512:$src)), (VRNDSCALEPDZrri VR512:$src, (i32 0xB))>; } -defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; -defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2",avx512vl_f64_info, 0x23>, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; -defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; -defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +let Predicates = [HasVLX] in { +def : Pat<(v4f32 (ffloor VR128X:$src)), + (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x9))>; +def : Pat<(v4f32 (fnearbyint VR128X:$src)), + (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xC))>; +def : Pat<(v4f32 (fceil VR128X:$src)), + (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xA))>; +def : Pat<(v4f32 (frint VR128X:$src)), + (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x4))>; +def : Pat<(v4f32 (ftrunc VR128X:$src)), + (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xB))>; + +def : Pat<(v2f64 (ffloor VR128X:$src)), + (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x9))>; +def : Pat<(v2f64 (fnearbyint VR128X:$src)), + (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xC))>; +def : Pat<(v2f64 (fceil VR128X:$src)), + (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xA))>; +def : Pat<(v2f64 (frint VR128X:$src)), + (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x4))>; +def : Pat<(v2f64 (ftrunc VR128X:$src)), + (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xB))>; + +def : Pat<(v8f32 (ffloor VR256X:$src)), + (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x9))>; +def : Pat<(v8f32 (fnearbyint VR256X:$src)), + (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xC))>; +def : Pat<(v8f32 (fceil VR256X:$src)), + (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xA))>; +def : Pat<(v8f32 (frint VR256X:$src)), + (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x4))>; +def : Pat<(v8f32 (ftrunc VR256X:$src)), + (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xB))>; + +def : Pat<(v4f64 (ffloor VR256X:$src)), + (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x9))>; +def : Pat<(v4f64 (fnearbyint VR256X:$src)), + (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xC))>; +def : Pat<(v4f64 (fceil VR256X:$src)), + (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xA))>; +def : Pat<(v4f64 (frint VR256X:$src)), + (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x4))>; +def : Pat<(v4f64 (ftrunc VR256X:$src)), + (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xB))>; +} + +multiclass avx512_shuff_packed_128<string OpcodeStr, OpndItins itins, + AVX512VLVectorVTInfo _, bits<8> opc>{ + let Predicates = [HasAVX512] in { + defm Z : avx512_3Op_imm8<opc, OpcodeStr, X86Shuf128, itins, _.info512>, EVEX_V512; + + } + let Predicates = [HasAVX512, HasVLX] in { + defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, X86Shuf128, itins, _.info256>, EVEX_V256; + } +} + +defm 
VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", SSE_SHUFP, + avx512vl_f32_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; +defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", SSE_SHUFP, + avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", SSE_SHUFP, + avx512vl_i32_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; +defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", SSE_SHUFP, + avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; let Predicates = [HasAVX512] in { // Provide fallback in case the load node that is used in the broadcast @@ -9264,120 +9318,230 @@ def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))), 0)>; } -multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> { - defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>, +multiclass avx512_valign<string OpcodeStr, OpndItins itins, + AVX512VLVectorVTInfo VTInfo_I> { + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign, itins>, AVX512AIi8Base, EVEX_4V; } -defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>, +defm VALIGND: avx512_valign<"valignd", SSE_PALIGN, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; -defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>, +defm VALIGNQ: avx512_valign<"valignq", SSE_PALIGN, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W; -multiclass avx512_vpalignr_lowering<X86VectorVTInfo _ , list<Predicate> p>{ - let Predicates = p in - def NAME#_.VTName#rri: - Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))), - (!cast<Instruction>(NAME#_.ZSuffix#rri) - _.RC:$src1, _.RC:$src2, imm:$imm)>; +defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr", SSE_PALIGN, + avx512vl_i8_info, avx512vl_i8_info>, + EVEX_CD8<8, CD8VF>; + +// Fragments to help convert valignq into masked valignd. Or valignq/valignd +// into vpalignr. 
+def ValignqImm32XForm : SDNodeXForm<imm, [{ + return getI8Imm(N->getZExtValue() * 2, SDLoc(N)); +}]>; +def ValignqImm8XForm : SDNodeXForm<imm, [{ + return getI8Imm(N->getZExtValue() * 8, SDLoc(N)); +}]>; +def ValigndImm8XForm : SDNodeXForm<imm, [{ + return getI8Imm(N->getZExtValue() * 4, SDLoc(N)); +}]>; + +multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode, + X86VectorVTInfo From, X86VectorVTInfo To, + SDNodeXForm ImmXForm> { + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, From.RC:$src2, + imm:$src3))), + To.RC:$src0)), + (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask, + To.RC:$src1, To.RC:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, From.RC:$src2, + imm:$src3))), + To.ImmAllZerosV)), + (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask, + To.RC:$src1, To.RC:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert (To.LdFrag addr:$src2)), + imm:$src3))), + To.RC:$src0)), + (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert (To.LdFrag addr:$src2)), + imm:$src3))), + To.ImmAllZerosV)), + (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; +} + +multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode, + X86VectorVTInfo From, + X86VectorVTInfo To, + SDNodeXForm ImmXForm> : + avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> { + def : Pat<(From.VT (OpNode From.RC:$src1, + (bitconvert (To.VT (X86VBroadcast + (To.ScalarLdFrag addr:$src2)))), + imm:$src3)), + (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert + (To.VT (X86VBroadcast + (To.ScalarLdFrag addr:$src2)))), + imm:$src3))), + To.RC:$src0)), + (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; + + def : Pat<(To.VT (vselect To.KRCWM:$mask, + (bitconvert + (From.VT (OpNode From.RC:$src1, + (bitconvert + (To.VT (X86VBroadcast + (To.ScalarLdFrag addr:$src2)))), + imm:$src3))), + To.ImmAllZerosV)), + (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask, + To.RC:$src1, addr:$src2, + (ImmXForm imm:$src3))>; } -multiclass avx512_vpalignr_lowering_common<AVX512VLVectorVTInfo _>: - avx512_vpalignr_lowering<_.info512, [HasBWI]>, - avx512_vpalignr_lowering<_.info128, [HasBWI, HasVLX]>, - avx512_vpalignr_lowering<_.info256, [HasBWI, HasVLX]>; +let Predicates = [HasAVX512] in { + // For 512-bit we lower to the widest element type we can. So we only need + // to handle converting valignq to valignd. 
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info, + v16i32_info, ValignqImm32XForm>; +} -defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" , - avx512vl_i8_info, avx512vl_i8_info>, - avx512_vpalignr_lowering_common<avx512vl_i16_info>, - avx512_vpalignr_lowering_common<avx512vl_i32_info>, - avx512_vpalignr_lowering_common<avx512vl_f32_info>, - avx512_vpalignr_lowering_common<avx512vl_i64_info>, - avx512_vpalignr_lowering_common<avx512vl_f64_info>, - EVEX_CD8<8, CD8VF>; +let Predicates = [HasVLX] in { + // For 128-bit we lower to the widest element type we can. So we only need + // to handle converting valignq to valignd. + defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info, + v4i32x_info, ValignqImm32XForm>; + // For 256-bit we lower to the widest element type we can. So we only need + // to handle converting valignq to valignd. + defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info, + v8i32x_info, ValignqImm32XForm>; +} + +let Predicates = [HasVLX, HasBWI] in { + // We can turn 128 and 256 bit VALIGND/VALIGNQ into VPALIGNR. + defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info, + v16i8x_info, ValignqImm8XForm>; + defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info, + v16i8x_info, ValigndImm8XForm>; +} -defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" , - avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>; +defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw", + SSE_INTMUL_ITINS_P, avx512vl_i16_info, avx512vl_i8_info>, + EVEX_CD8<8, CD8VF>; multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src1), OpcodeStr, "$src1", "$src1", - (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase; + (_.VT (OpNode _.RC:$src1)), itins.rr>, EVEX, AVX5128IBase, + Sched<[itins.Sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1", - (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>, - EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>; + (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1)))), itins.rm>, + EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded]>; } } multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> : - avx512_unary_rm<opc, OpcodeStr, OpNode, _> { + OpndItins itins, X86VectorVTInfo _> : + avx512_unary_rm<opc, OpcodeStr, OpNode, itins, _> { defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src1), OpcodeStr, "${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr, (_.VT (OpNode (X86VBroadcast - (_.ScalarLdFrag addr:$src1))))>, - EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>; + (_.ScalarLdFrag addr:$src1)))), itins.rm>, + EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded]>; } multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo, Predicate prd> { + OpndItins itins, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { let Predicates = [prd] in - defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info512>, + EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_unary_rm<opc, 
OpcodeStr, OpNode, VTInfo.info256>, + defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info256>, EVEX_V256; - defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info128>, + defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info128>, EVEX_V128; } } multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo, Predicate prd> { + OpndItins itins, AVX512VLVectorVTInfo VTInfo, + Predicate prd> { let Predicates = [prd] in - defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>, + defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info512>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>, + defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info256>, EVEX_V256; - defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>, + defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info128>, EVEX_V128; } } multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr, - SDNode OpNode, Predicate prd> { - defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info, - prd>, VEX_W; - defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info, - prd>; + SDNode OpNode, OpndItins itins, Predicate prd> { + defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, itins, + avx512vl_i64_info, prd>, VEX_W; + defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, itins, + avx512vl_i32_info, prd>; } multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr, - SDNode OpNode, Predicate prd> { - defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>; - defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>; + SDNode OpNode, OpndItins itins, Predicate prd> { + defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, itins, + avx512vl_i16_info, prd>, VEX_WIG; + defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, itins, + avx512vl_i8_info, prd>, VEX_WIG; } multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w, bits<8> opc_d, bits<8> opc_q, - string OpcodeStr, SDNode OpNode> { - defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, + string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, itins, HasAVX512>, - avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, + avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, itins, HasBWI>; } -defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs>; +defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs, SSE_PABS>; // VPABS: Use 512bit version to implement 128/256 bit in case NoVLX. let Predicates = [HasAVX512, NoVLX] in { @@ -9393,137 +9557,111 @@ let Predicates = [HasAVX512, NoVLX] in { sub_xmm)>; } -multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{ +// Use 512bit version to implement 128/256 bit. 
+multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode, + AVX512VLVectorVTInfo _, Predicate prd> { + let Predicates = [prd, NoVLX] in { + def : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)), + (EXTRACT_SUBREG + (!cast<Instruction>(InstrStr # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info256.RC:$src1, + _.info256.SubRegIdx)), + _.info256.SubRegIdx)>; - defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>; + def : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)), + (EXTRACT_SUBREG + (!cast<Instruction>(InstrStr # "Zrr") + (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), + _.info128.RC:$src1, + _.info128.SubRegIdx)), + _.info128.SubRegIdx)>; + } } -defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>; -defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>; +// FIXME: Is there a better scheduler itinerary for VPLZCNT? +defm VPLZCNT : avx512_unary_rm_vl_dq<0x44, 0x44, "vplzcnt", ctlz, + SSE_INTALU_ITINS_P, HasCDI>; -// VPLZCNT: Use 512bit version to implement 128/256 bit in case NoVLX. -let Predicates = [HasCDI, NoVLX] in { - def : Pat<(v4i64 (ctlz VR256X:$src)), - (EXTRACT_SUBREG - (VPLZCNTQZrr - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)), - sub_ymm)>; - def : Pat<(v2i64 (ctlz VR128X:$src)), - (EXTRACT_SUBREG - (VPLZCNTQZrr - (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)), - sub_xmm)>; +// FIXME: Is there a better scheduler itinerary for VPCONFLICT? +defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, + SSE_INTALU_ITINS_P, HasCDI>; - def : Pat<(v8i32 (ctlz VR256X:$src)), - (EXTRACT_SUBREG - (VPLZCNTDZrr - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)), - sub_ymm)>; - def : Pat<(v4i32 (ctlz VR128X:$src)), - (EXTRACT_SUBREG - (VPLZCNTDZrr - (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)), - sub_xmm)>; -} +// VPLZCNT: Use 512bit version to implement 128/256 bit in case NoVLX. +defm : avx512_unary_lowering<"VPLZCNTQ", ctlz, avx512vl_i64_info, HasCDI>; +defm : avx512_unary_lowering<"VPLZCNTD", ctlz, avx512vl_i32_info, HasCDI>; //===---------------------------------------------------------------------===// // Counts number of ones - VPOPCNTD and VPOPCNTQ //===---------------------------------------------------------------------===// -multiclass avx512_unary_rmb_popcnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo VTInfo> { - let Predicates = [HasVPOPCNTDQ] in - defm Z : avx512_unary_rmb<opc, OpcodeStr, ctpop, VTInfo>, EVEX_V512; -} +// FIXME: Is there a better scheduler itinerary for VPOPCNTD/VPOPCNTQ? +defm VPOPCNT : avx512_unary_rm_vl_dq<0x55, 0x55, "vpopcnt", ctpop, + SSE_INTALU_ITINS_P, HasVPOPCNTDQ>; -// Use 512bit version to implement 128/256 bit. 
-multiclass avx512_unary_lowering<SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> { - let Predicates = [prd] in { - def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)), - (EXTRACT_SUBREG - (!cast<Instruction>(NAME # "Zrr") - (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), - _.info256.RC:$src1, - _.info256.SubRegIdx)), - _.info256.SubRegIdx)>; - - def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)), - (EXTRACT_SUBREG - (!cast<Instruction>(NAME # "Zrr") - (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)), - _.info128.RC:$src1, - _.info128.SubRegIdx)), - _.info128.SubRegIdx)>; - } -} - -defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", v16i32_info>, - avx512_unary_lowering<ctpop, avx512vl_i32_info, HasVPOPCNTDQ>; -defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", v8i64_info>, - avx512_unary_lowering<ctpop, avx512vl_i64_info, HasVPOPCNTDQ>, VEX_W; +defm : avx512_unary_lowering<"VPOPCNTQ", ctpop, avx512vl_i64_info, HasVPOPCNTDQ>; +defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ>; //===---------------------------------------------------------------------===// // Replicate Single FP - MOVSHDUP and MOVSLDUP //===---------------------------------------------------------------------===// -multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{ - defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info, - HasAVX512>, XS; +multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, itins, + avx512vl_f32_info, HasAVX512>, XS; } -defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>; -defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>; +defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup, SSE_MOVDDUP>; +defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup, SSE_MOVDDUP>; //===----------------------------------------------------------------------===// // AVX-512 - MOVDDUP //===----------------------------------------------------------------------===// multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _> { + OpndItins itins, X86VectorVTInfo _> { let ExeDomain = _.ExeDomain in { defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src), OpcodeStr, "$src", "$src", - (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX; + (_.VT (OpNode (_.VT _.RC:$src))), itins.rr>, EVEX, + Sched<[itins.Sched]>; defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src", (_.VT (OpNode (_.VT (scalar_to_vector - (_.ScalarLdFrag addr:$src)))))>, - EVEX, EVEX_CD8<_.EltSize, CD8VH>; + (_.ScalarLdFrag addr:$src))))), + itins.rm>, EVEX, EVEX_CD8<_.EltSize, CD8VH>, + Sched<[itins.Sched.Folded]>; } } multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode, - AVX512VLVectorVTInfo VTInfo> { + OpndItins itins, AVX512VLVectorVTInfo VTInfo> { - defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512; + defm Z : avx512_unary_rm<opc, OpcodeStr, X86Movddup, itins, VTInfo.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>, + defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, itins, VTInfo.info256>, EVEX_V256; - defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>, - EVEX_V128; + defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, itins, 
VTInfo.info128>, + EVEX_V128; } } -multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{ - defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, +multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode, + OpndItins itins> { + defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, itins, avx512vl_f64_info>, XD, VEX_W; } -defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>; +defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SSE_MOVDDUP>; let Predicates = [HasVLX] in { -def : Pat<(X86Movddup (loadv2f64 addr:$src)), - (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), (VMOVDDUPZ128rm addr:$src)>; def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>; - -def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)), - (v2f64 VR128X:$src0)), - (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; -def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)), - (bitconvert (v4i32 immAllZerosV))), - (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; +def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), + (VMOVDDUPZ128rm addr:$src)>; def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)), (v2f64 VR128X:$src0)), @@ -9539,6 +9677,13 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src) def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))), (bitconvert (v4i32 immAllZerosV))), (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; + +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))), + (v2f64 VR128X:$src0)), + (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>; +def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))), + (bitconvert (v4i32 immAllZerosV))), + (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>; } //===----------------------------------------------------------------------===// @@ -9576,10 +9721,9 @@ multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode, def mr : AVX512Ii8<opc, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1), - imm:$src2)))), - addr:$dst)]>, - EVEX, EVEX_CD8<_.EltSize, CD8VT1>; + [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), imm:$src2))), + addr:$dst)]>, + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteShuffleLd]>; } multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { @@ -9589,7 +9733,7 @@ multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> { OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, (X86pextrb (_.VT _.RC:$src1), imm:$src2))]>, - EVEX, TAPD; + EVEX, TAPD, Sched<[WriteShuffle]>; defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD; } @@ -9601,14 +9745,15 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> { (ins _.RC:$src1, u8imm:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GR32orGR64:$dst, - (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>, - EVEX, PD; + (X86pextrw (_.VT _.RC:$src1), imm:$src2))], + IIC_SSE_PEXTRW>, EVEX, PD, Sched<[WriteShuffle]>; let hasSideEffects = 0 in def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst), (ins _.RC:$src1, u8imm:$src2), - OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>, - EVEX, TAPD, FoldGenData<NAME#rr>; + OpcodeStr#".s\t{$src2, 
$src1, $dst|$dst, $src1, $src2}", [], + IIC_SSE_PEXTRW>, EVEX, TAPD, FoldGenData<NAME#rr>, + Sched<[WriteShuffle]>; defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD; } @@ -9622,19 +9767,20 @@ multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _, OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set GRC:$dst, (extractelt (_.VT _.RC:$src1), imm:$src2))]>, - EVEX, TAPD; + EVEX, TAPD, Sched<[WriteShuffle]>; def mr : AVX512Ii8<0x16, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2), OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(store (extractelt (_.VT _.RC:$src1), imm:$src2),addr:$dst)]>, - EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD; + EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD, + Sched<[WriteShuffleLd]>; } } -defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>; -defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>; +defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>, VEX_WIG; +defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>, VEX_WIG; defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>; defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W; @@ -9645,7 +9791,7 @@ multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode, OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>, - EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>; + EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteShuffleLd, ReadAfterLd]>; } multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -9655,7 +9801,8 @@ multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3), OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, - (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V; + (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V, + Sched<[WriteShuffle]>; defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>; } @@ -9669,7 +9816,7 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", [(set _.RC:$dst, (_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>, - EVEX_4V, TAPD; + EVEX_4V, TAPD, Sched<[WriteShuffle]>; defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _, _.ScalarLdFrag>, TAPD; @@ -9677,92 +9824,109 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr, } defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info, - extloadi8>, TAPD; + extloadi8>, TAPD, VEX_WIG; defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info, - extloadi16>, PD; + extloadi16>, PD, VEX_WIG; defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>; defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W; + //===----------------------------------------------------------------------===// // VSHUFPS - VSHUFPD Operations //===----------------------------------------------------------------------===// + multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I, AVX512VLVectorVTInfo VTInfo_FP>{ - defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>, - EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, - AVX512AIi8Base, EVEX_4V; + defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp, + SSE_SHUFP>, EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>, 
+ AVX512AIi8Base, EVEX_4V; } defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS; defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W; + //===----------------------------------------------------------------------===// // AVX-512 - Byte shift Left/Right //===----------------------------------------------------------------------===// +let Sched = WriteVecShift in +def AVX512_BYTESHIFT : OpndItins< + IIC_SSE_INTSHDQ_P_RI, IIC_SSE_INTSHDQ_P_RI +>; + multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr, - Format MRMm, string OpcodeStr, X86VectorVTInfo _>{ + Format MRMm, string OpcodeStr, + OpndItins itins, X86VectorVTInfo _>{ def rr : AVX512<opc, MRMr, (outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>; + [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))], + itins.rr>, Sched<[itins.Sched]>; def rm : AVX512<opc, MRMm, (outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _.RC:$dst,(_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))), - (i8 imm:$src2))))]>; + (i8 imm:$src2))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr, - Format MRMm, string OpcodeStr, Predicate prd>{ + Format MRMm, string OpcodeStr, + OpndItins itins, Predicate prd>{ let Predicates = [prd] in - defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, - OpcodeStr, v64i8_info>, EVEX_V512; + defm Z : avx512_shift_packed<opc, OpNode, MRMr, MRMm, + OpcodeStr, itins, v64i8_info>, EVEX_V512; let Predicates = [prd, HasVLX] in { defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, - OpcodeStr, v32i8x_info>, EVEX_V256; + OpcodeStr, itins, v32i8x_info>, EVEX_V256; defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm, - OpcodeStr, v16i8x_info>, EVEX_V128; + OpcodeStr, itins, v16i8x_info>, EVEX_V128; } } defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq", - HasBWI>, AVX512PDIi8Base, EVEX_4V; + AVX512_BYTESHIFT, HasBWI>, AVX512PDIi8Base, + EVEX_4V, VEX_WIG; defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq", - HasBWI>, AVX512PDIi8Base, EVEX_4V; + AVX512_BYTESHIFT, HasBWI>, AVX512PDIi8Base, + EVEX_4V, VEX_WIG; multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode, - string OpcodeStr, X86VectorVTInfo _dst, - X86VectorVTInfo _src>{ + string OpcodeStr, OpndItins itins, + X86VectorVTInfo _dst, X86VectorVTInfo _src> { def rr : AVX512BI<opc, MRMSrcReg, (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _dst.RC:$dst,(_dst.VT (OpNode (_src.VT _src.RC:$src1), - (_src.VT _src.RC:$src2))))]>; + (_src.VT _src.RC:$src2))))], itins.rr>, + Sched<[itins.Sched]>; def rm : AVX512BI<opc, MRMSrcMem, (outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set _dst.RC:$dst,(_dst.VT (OpNode (_src.VT _src.RC:$src1), (_src.VT (bitconvert - (_src.LdFrag addr:$src2))))))]>; + (_src.LdFrag addr:$src2))))))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode, - string OpcodeStr, Predicate prd> { + string OpcodeStr, OpndItins itins, + Predicate prd> { let Predicates = [prd] in - defm Z512 : 
avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info, - v64i8_info>, EVEX_V512; + defm Z : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v8i64_info, + v64i8_info>, EVEX_V512; let Predicates = [prd, HasVLX] in { - defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info, + defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v4i64x_info, v32i8x_info>, EVEX_V256; - defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info, + defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v2i64x_info, v16i8x_info>, EVEX_V128; } } defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw", - HasBWI>, EVEX_4V; + SSE_MPSADBW_ITINS, HasBWI>, EVEX_4V, VEX_WIG; // Transforms to swizzle an immediate to enable better matching when // memory operand isn't in the right place. @@ -9827,7 +9991,7 @@ def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{ }]>; multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ + OpndItins itins, X86VectorVTInfo _>{ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, u8imm:$src4), @@ -9835,15 +9999,17 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT _.RC:$src3), - (i8 imm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V; + (i8 imm:$src4)), itins.rr, 1, 1>, + AVX512AIi8Base, EVEX_4V, Sched<[itins.Sched]>; defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4), OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4", (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT (bitconvert (_.LdFrag addr:$src3))), - (i8 imm:$src4)), 1, 0>, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + (i8 imm:$src4)), itins.rm, 1, 0>, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4), OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2", @@ -9851,8 +10017,9 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), - (i8 imm:$src4)), 1, 0>, EVEX_B, - AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>; + (i8 imm:$src4)), itins.rm, 1, 0>, EVEX_B, + AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; }// Constraints = "$src1 = $dst" // Additional patterns for matching passthru operand in other positions. 
@@ -9968,47 +10135,50 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), _.RC:$src2, _.RC:$src1, (i8 imm:$src4)), _.RC:$src1)), - (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, _.RC:$src1, (X86VBroadcast (_.ScalarLdFrag addr:$src3)), (i8 imm:$src4)), _.RC:$src1)), - (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode _.RC:$src2, (X86VBroadcast (_.ScalarLdFrag addr:$src3)), _.RC:$src1, (i8 imm:$src4)), _.RC:$src1)), - (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>; def : Pat<(_.VT (vselect _.KRCWM:$mask, (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)), _.RC:$src1, _.RC:$src2, (i8 imm:$src4)), _.RC:$src1)), - (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask, + (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask, _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>; } -multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{ +multiclass avx512_common_ternlog<string OpcodeStr, OpndItins itins, + AVX512VLVectorVTInfo _> { let Predicates = [HasAVX512] in - defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512; + defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info512>, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128; - defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256; + defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info128>, EVEX_V128; + defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info256>, EVEX_V256; } } -defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>; -defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W; +defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SSE_INTALU_ITINS_P, + avx512vl_i32_info>; +defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SSE_INTALU_ITINS_P, + avx512vl_i64_info>, VEX_W; //===----------------------------------------------------------------------===// // AVX-512 - FixupImm //===----------------------------------------------------------------------===// multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _>{ + OpndItins itins, X86VectorVTInfo _>{ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), @@ -10017,7 +10187,7 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_.IntVT _.RC:$src3), (i32 imm:$src4), - (i32 FROUND_CURRENT))>; + (i32 FROUND_CURRENT)), itins.rr>, Sched<[itins.Sched]>; defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", @@ -10025,7 +10195,8 @@ multiclass 
avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_.IntVT (bitconvert (_.LdFrag addr:$src3))), (i32 imm:$src4), - (i32 FROUND_CURRENT))>; + (i32 FROUND_CURRENT)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2", @@ -10034,12 +10205,14 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_.IntVT (X86VBroadcast(_.ScalarLdFrag addr:$src3))), (i32 imm:$src4), - (i32 FROUND_CURRENT))>, EVEX_B; + (i32 FROUND_CURRENT)), itins.rm>, + EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; } // Constraints = "$src1 = $dst" } multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr, - SDNode OpNode, X86VectorVTInfo _>{ + SDNode OpNode, OpndItins itins, + X86VectorVTInfo _>{ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), @@ -10049,12 +10222,14 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in { (_.VT _.RC:$src2), (_.IntVT _.RC:$src3), (i32 imm:$src4), - (i32 FROUND_NO_EXC))>, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rr>, + EVEX_B, Sched<[itins.Sched]>; } } multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, - X86VectorVTInfo _, X86VectorVTInfo _src3VT> { + OpndItins itins, X86VectorVTInfo _, + X86VectorVTInfo _src3VT> { let Constraints = "$src1 = $dst" , Predicates = [HasAVX512], ExeDomain = _.ExeDomain in { defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), @@ -10064,8 +10239,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), (i32 imm:$src4), - (i32 FROUND_CURRENT))>; - + (i32 FROUND_CURRENT)), itins.rr>, Sched<[itins.Sched]>; defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst), (ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2", @@ -10074,7 +10248,8 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, (_.VT _.RC:$src2), (_src3VT.VT _src3VT.RC:$src3), (i32 imm:$src4), - (i32 FROUND_NO_EXC))>, EVEX_B; + (i32 FROUND_NO_EXC)), itins.rm>, + EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>; defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst), (ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4), OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4", @@ -10083,32 +10258,34 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode, (_src3VT.VT (scalar_to_vector (_src3VT.ScalarLdFrag addr:$src3))), (i32 imm:$src4), - (i32 FROUND_CURRENT))>; + (i32 FROUND_CURRENT)), itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -multiclass avx512_fixupimm_packed_all<AVX512VLVectorVTInfo _Vec>{ +multiclass avx512_fixupimm_packed_all<OpndItins itins, AVX512VLVectorVTInfo _Vec> { let Predicates = [HasAVX512] in - defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>, - avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>, - AVX512AIi8Base, EVEX_4V, EVEX_V512; + defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins, + _Vec.info512>, + avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, itins, + _Vec.info512>, AVX512AIi8Base, 
EVEX_4V, EVEX_V512; let Predicates = [HasAVX512, HasVLX] in { - defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info128>, - AVX512AIi8Base, EVEX_4V, EVEX_V128; - defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info256>, - AVX512AIi8Base, EVEX_4V, EVEX_V256; + defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins, + _Vec.info128>, AVX512AIi8Base, EVEX_4V, EVEX_V128; + defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins, + _Vec.info256>, AVX512AIi8Base, EVEX_4V, EVEX_V256; } } defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, - f32x_info, v4i32x_info>, + SSE_ALU_F32S, f32x_info, v4i32x_info>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>; defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar, - f64x_info, v2i64x_info>, + SSE_ALU_F64S, f64x_info, v2i64x_info>, AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W; -defm VFIXUPIMMPS : avx512_fixupimm_packed_all<avx512vl_f32_info>, +defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SSE_ALU_F32P, avx512vl_f32_info>, EVEX_CD8<32, CD8VF>; -defm VFIXUPIMMPD : avx512_fixupimm_packed_all<avx512vl_f64_info>, +defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SSE_ALU_F64P, avx512vl_f64_info>, EVEX_CD8<64, CD8VF>, VEX_W; @@ -10164,23 +10341,11 @@ multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32X:$src, VR128X))>; - // extracted scalar math op with insert via blend - def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector - (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))), - FR32X:$src))), (i8 1))), - (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, - (COPY_TO_REGCLASS FR32X:$src, VR128X))>; - // vector math op with insert via movss def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst), (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))), (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>; - // vector math op with insert via blend - def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst), - (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)), (i8 1))), - (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>; - // extracted masked scalar math op with insert via movss def : Pat<(X86Movss (v4f32 VR128X:$src1), (scalar_to_vector @@ -10208,23 +10373,11 @@ multiclass AVX512_scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64X:$src, VR128X))>; - // extracted scalar math op with insert via blend - def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector - (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))), - FR64X:$src))), (i8 1))), - (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, - (COPY_TO_REGCLASS FR64X:$src, VR128X))>; - // vector math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst), (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))), (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>; - // vector math op with insert via blend - def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst), - (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)), (i8 1))), - (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>; - // extracted masked scalar math op with insert via movss def : Pat<(X86Movsd (v2f64 VR128X:$src1), (scalar_to_vector @@ -10242,3 +10395,292 @@ defm : AVX512_scalar_math_f64_patterns<fadd, "ADD">; defm : AVX512_scalar_math_f64_patterns<fsub, "SUB">; defm : 
AVX512_scalar_math_f64_patterns<fmul, "MUL">; defm : AVX512_scalar_math_f64_patterns<fdiv, "DIV">; + +//===----------------------------------------------------------------------===// +// AES instructions +//===----------------------------------------------------------------------===// + +multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> { + let Predicates = [HasVLX, HasVAES] in { + defm Z128 : AESI_binop_rm_int<Op, OpStr, + !cast<Intrinsic>(IntPrefix), + loadv2i64, 0, VR128X, i128mem>, + EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V128, VEX_WIG; + defm Z256 : AESI_binop_rm_int<Op, OpStr, + !cast<Intrinsic>(IntPrefix##"_256"), + loadv4i64, 0, VR256X, i256mem>, + EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V256, VEX_WIG; + } + let Predicates = [HasAVX512, HasVAES] in + defm Z : AESI_binop_rm_int<Op, OpStr, + !cast<Intrinsic>(IntPrefix##"_512"), + loadv8i64, 0, VR512, i512mem>, + EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_WIG; +} + +defm VAESENC : avx512_vaes<0xDC, "vaesenc", "int_x86_aesni_aesenc">; +defm VAESENCLAST : avx512_vaes<0xDD, "vaesenclast", "int_x86_aesni_aesenclast">; +defm VAESDEC : avx512_vaes<0xDE, "vaesdec", "int_x86_aesni_aesdec">; +defm VAESDECLAST : avx512_vaes<0xDF, "vaesdeclast", "int_x86_aesni_aesdeclast">; + +//===----------------------------------------------------------------------===// +// PCLMUL instructions - Carry less multiplication +//===----------------------------------------------------------------------===// + +let Predicates = [HasAVX512, HasVPCLMULQDQ] in +defm VPCLMULQDQZ : vpclmulqdq<VR512, i512mem, loadv8i64, int_x86_pclmulqdq_512>, + EVEX_4V, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_WIG; + +let Predicates = [HasVLX, HasVPCLMULQDQ] in { +defm VPCLMULQDQZ128 : vpclmulqdq<VR128X, i128mem, loadv2i64, int_x86_pclmulqdq>, + EVEX_4V, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_WIG; + +defm VPCLMULQDQZ256: vpclmulqdq<VR256X, i256mem, loadv4i64, + int_x86_pclmulqdq_256>, EVEX_4V, EVEX_V256, + EVEX_CD8<64, CD8VF>, VEX_WIG; +} + +// Aliases +defm : vpclmulqdq_aliases<"VPCLMULQDQZ", VR512, i512mem>; +defm : vpclmulqdq_aliases<"VPCLMULQDQZ128", VR128X, i128mem>; +defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>; + +//===----------------------------------------------------------------------===// +// VBMI2 +//===----------------------------------------------------------------------===// + +multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode, + OpndItins itins, X86VectorVTInfo VTI> { + let Constraints = "$src1 = $dst", + ExeDomain = VTI.ExeDomain in { + defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, + "$src3, $src2", "$src2, $src3", + (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3)), + itins.rr>, AVX512FMA3Base, Sched<[itins.Sched]>; + defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, + "$src3, $src2", "$src2, $src3", + (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, + (VTI.VT (bitconvert (VTI.LdFrag addr:$src3))))), + itins.rm>, AVX512FMA3Base, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + } +} + +multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode, + OpndItins itins, X86VectorVTInfo VTI> + : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI> { + let Constraints = "$src1 = $dst", + ExeDomain = VTI.ExeDomain in + defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr, + "${src3}"##VTI.BroadcastStr##", $src2", + "$src2, 
${src3}"##VTI.BroadcastStr, + (OpNode VTI.RC:$src1, VTI.RC:$src2, + (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3)))), + itins.rm>, AVX512FMA3Base, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode, + OpndItins itins, AVX512VLVectorVTInfo VTI> { + let Predicates = [HasVBMI2] in + defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info512>, EVEX_V512; + let Predicates = [HasVBMI2, HasVLX] in { + defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info256>, EVEX_V256; + defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info128>, EVEX_V128; + } +} + +multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode, + OpndItins itins, AVX512VLVectorVTInfo VTI> { + let Predicates = [HasVBMI2] in + defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info512>, EVEX_V512; + let Predicates = [HasVBMI2, HasVLX] in { + defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info256>, EVEX_V256; + defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info128>, EVEX_V128; + } +} +multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix, + SDNode OpNode, OpndItins itins> { + defm W : VBMI2_shift_var_rm_common<wOp, Prefix##"w", OpNode, itins, + avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>; + defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix##"d", OpNode, itins, + avx512vl_i32_info>, EVEX_CD8<32, CD8VF>; + defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix##"q", OpNode, itins, + avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>; +} + +multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix, + SDNode OpNode, OpndItins itins> { + defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix##"w", itins, + avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>, + VEX_W, EVEX_CD8<16, CD8VF>; + defm D : avx512_common_3Op_imm8<Prefix##"d", avx512vl_i32_info, dqOp, + OpNode, itins, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>; + defm Q : avx512_common_3Op_imm8<Prefix##"q", avx512vl_i64_info, dqOp, OpNode, + itins, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W; +} + +// Concat & Shift +defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SSE_INTMUL_ITINS_P>; +defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SSE_INTMUL_ITINS_P>; +defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SSE_INTMUL_ITINS_P>; +defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SSE_INTMUL_ITINS_P>; + +// Compress +defm VPCOMPRESSB : compress_by_elt_width<0x63, "vpcompressb", AVX512_COMPRESS, + avx512vl_i8_info, HasVBMI2>, EVEX; +defm VPCOMPRESSW : compress_by_elt_width <0x63, "vpcompressw", AVX512_COMPRESS, + avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W; +// Expand +defm VPEXPANDB : expand_by_elt_width <0x62, "vpexpandb", AVX512_EXPAND, + avx512vl_i8_info, HasVBMI2>, EVEX; +defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", AVX512_EXPAND, + avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W; + +//===----------------------------------------------------------------------===// +// VNNI +//===----------------------------------------------------------------------===// + +let Constraints = "$src1 = $dst" in +multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode, + OpndItins itins, X86VectorVTInfo VTI> { + defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.RC:$src3), OpStr, + "$src3, $src2", "$src2, $src3", + (VTI.VT (OpNode VTI.RC:$src1, + VTI.RC:$src2, VTI.RC:$src3)), + 
itins.rr>, EVEX_4V, T8PD, Sched<[itins.Sched]>; + defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr, + "$src3, $src2", "$src2, $src3", + (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, + (VTI.VT (bitconvert + (VTI.LdFrag addr:$src3))))), + itins.rm>, EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD, + Sched<[itins.Sched.Folded, ReadAfterLd]>; + defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), + OpStr, "${src3}"##VTI.BroadcastStr##", $src2", + "$src2, ${src3}"##VTI.BroadcastStr, + (OpNode VTI.RC:$src1, VTI.RC:$src2, + (VTI.VT (X86VBroadcast + (VTI.ScalarLdFrag addr:$src3)))), + itins.rm>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B, + T8PD, Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode, OpndItins itins> { + let Predicates = [HasVNNI] in + defm Z : VNNI_rmb<Op, OpStr, OpNode, itins, v16i32_info>, EVEX_V512; + let Predicates = [HasVNNI, HasVLX] in { + defm Z256 : VNNI_rmb<Op, OpStr, OpNode, itins, v8i32x_info>, EVEX_V256; + defm Z128 : VNNI_rmb<Op, OpStr, OpNode, itins, v4i32x_info>, EVEX_V128; + } +} + +// FIXME: Is there a better scheduler itinerary for VPDP? +defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SSE_PMADD>; +defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SSE_PMADD>; +defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SSE_PMADD>; +defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SSE_PMADD>; + +//===----------------------------------------------------------------------===// +// Bit Algorithms +//===----------------------------------------------------------------------===// + +// FIXME: Is there a better scheduler itinerary for VPOPCNTB/VPOPCNTW? +defm VPOPCNTB : avx512_unary_rm_vl<0x54, "vpopcntb", ctpop, SSE_INTALU_ITINS_P, + avx512vl_i8_info, HasBITALG>; +defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SSE_INTALU_ITINS_P, + avx512vl_i16_info, HasBITALG>, VEX_W; + +defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>; +defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>; + +multiclass VPSHUFBITQMB_rm<OpndItins itins, X86VectorVTInfo VTI> { + defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst), + (ins VTI.RC:$src1, VTI.RC:$src2), + "vpshufbitqmb", + "$src2, $src1", "$src1, $src2", + (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1), + (VTI.VT VTI.RC:$src2)), itins.rr>, EVEX_4V, T8PD, + Sched<[itins.Sched]>; + defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst), + (ins VTI.RC:$src1, VTI.MemOp:$src2), + "vpshufbitqmb", + "$src2, $src1", "$src1, $src2", + (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1), + (VTI.VT (bitconvert (VTI.LdFrag addr:$src2)))), + itins.rm>, EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +multiclass VPSHUFBITQMB_common<OpndItins itins, AVX512VLVectorVTInfo VTI> { + let Predicates = [HasBITALG] in + defm Z : VPSHUFBITQMB_rm<itins, VTI.info512>, EVEX_V512; + let Predicates = [HasBITALG, HasVLX] in { + defm Z256 : VPSHUFBITQMB_rm<itins, VTI.info256>, EVEX_V256; + defm Z128 : VPSHUFBITQMB_rm<itins, VTI.info128>, EVEX_V128; + } +} + +// FIXME: Is there a better scheduler itinerary for VPSHUFBITQMB? 
+defm VPSHUFBITQMB : VPSHUFBITQMB_common<SSE_INTMUL_ITINS_P, avx512vl_i8_info>; + +//===----------------------------------------------------------------------===// +// GFNI +//===----------------------------------------------------------------------===// + +multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode> { + let Predicates = [HasGFNI, HasAVX512, HasBWI] in + defm Z : avx512_binop_rm<Op, OpStr, OpNode, v64i8_info, + SSE_INTALU_ITINS_P, 1>, EVEX_V512; + let Predicates = [HasGFNI, HasVLX, HasBWI] in { + defm Z256 : avx512_binop_rm<Op, OpStr, OpNode, v32i8x_info, + SSE_INTALU_ITINS_P, 1>, EVEX_V256; + defm Z128 : avx512_binop_rm<Op, OpStr, OpNode, v16i8x_info, + SSE_INTALU_ITINS_P, 1>, EVEX_V128; + } +} + +defm GF2P8MULB : GF2P8MULB_avx512_common<0xCF, "vgf2p8mulb", X86GF2P8mulb>, + EVEX_CD8<8, CD8VF>, T8PD; + +multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode, + OpndItins itins, X86VectorVTInfo VTI, + X86VectorVTInfo BcstVTI> + : avx512_3Op_rm_imm8<Op, OpStr, OpNode, itins, VTI, VTI> { + let ExeDomain = VTI.ExeDomain in + defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst), + (ins VTI.RC:$src1, VTI.ScalarMemOp:$src2, u8imm:$src3), + OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1", + "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3", + (OpNode (VTI.VT VTI.RC:$src1), + (bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))), + (i8 imm:$src3)), itins.rm>, EVEX_B, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + +multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode, + OpndItins itins> { + let Predicates = [HasGFNI, HasAVX512, HasBWI] in + defm Z : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v64i8_info, + v8i64_info>, EVEX_V512; + let Predicates = [HasGFNI, HasVLX, HasBWI] in { + defm Z256 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v32i8x_info, + v4i64x_info>, EVEX_V256; + defm Z128 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v16i8x_info, + v2i64x_info>, EVEX_V128; + } +} + +defm GF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb", + X86GF2P8affineinvqb, SSE_INTMUL_ITINS_P>, + EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base; +defm GF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb", + X86GF2P8affineqb, SSE_INTMUL_ITINS_P>, + EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base; + diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td index e38bbc9b3d36..d09deb5b7584 100644 --- a/lib/Target/X86/X86InstrArithmetic.td +++ b/lib/Target/X86/X86InstrArithmetic.td @@ -104,7 +104,8 @@ def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src), // RAX,RDX = RAX*[mem64] let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src), - "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>; + "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>, + Requires<[In64BitMode]>; } let hasSideEffects = 0 in { @@ -143,7 +144,8 @@ def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src), // RAX,RDX = RAX*[mem64] let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src), - "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>; + "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>, + Requires<[In64BitMode]>; } } // hasSideEffects @@ -326,7 +328,7 @@ def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src), let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src), 
"div{q}\t$src", [], IIC_DIV64>, - SchedLoadReg<WriteIDivLd>; + SchedLoadReg<WriteIDivLd>, Requires<[In64BitMode]>; } // Signed division/remainder. @@ -362,7 +364,7 @@ def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src), let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src), "idiv{q}\t$src", [], IIC_IDIV64>, - SchedLoadReg<WriteIDivLd>; + SchedLoadReg<WriteIDivLd>, Requires<[In64BitMode]>; } } // hasSideEffects = 0 @@ -407,7 +409,8 @@ def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst", [(store (ineg (loadi64 addr:$dst)), addr:$dst), - (implicit EFLAGS)], IIC_UNARY_MEM>; + (implicit EFLAGS)], IIC_UNARY_MEM>, + Requires<[In64BitMode]>; } // SchedRW } // Defs = [EFLAGS] @@ -444,7 +447,8 @@ def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst), [(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>, OpSize32; def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst", - [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>; + [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>, + Requires<[In64BitMode]>; } // SchedRW } // CodeSize @@ -482,6 +486,7 @@ def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), } // Constraints = "$src1 = $dst", SchedRW let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { +let Predicates = [UseIncDec] in { def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst", [(store (add (loadi8 addr:$dst), 1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>; @@ -491,9 +496,12 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst", [(store (add (loadi32 addr:$dst), 1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; +} // Predicates +let Predicates = [UseIncDec, In64BitMode] in { def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst", [(store (add (loadi64 addr:$dst), 1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>; +} // Predicates } // CodeSize = 2, SchedRW let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in { @@ -529,6 +537,7 @@ def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1), let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { +let Predicates = [UseIncDec] in { def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst", [(store (add (loadi8 addr:$dst), -1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>; @@ -538,9 +547,12 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in { def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst", [(store (add (loadi32 addr:$dst), -1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32; +} // Predicates +let Predicates = [UseIncDec, In64BitMode] in { def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst", [(store (add (loadi64 addr:$dst), -1), addr:$dst), (implicit EFLAGS)], IIC_UNARY_MEM>; +} // Predicates } // CodeSize = 2, SchedRW } // Defs = [EFLAGS] @@ -652,9 +664,8 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins, // BinOpRR - Instructions like "add reg, reg, reg". 
class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - dag outlist, list<dag> pattern, InstrItinClass itin, - Format f = MRMDestReg> - : ITy<opcode, f, typeinfo, outlist, + dag outlist, list<dag> pattern, InstrItinClass itin> + : ITy<opcode, MRMDestReg, typeinfo, outlist, (ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2), mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, Sched<[WriteALU]>; @@ -662,11 +673,11 @@ class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, // BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has // just a EFLAGS as a result. class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDPatternOperator opnode, Format f = MRMDestReg> + SDPatternOperator opnode> : BinOpRR<opcode, mnemonic, typeinfo, (outs), [(set EFLAGS, (opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))], - IIC_BIN_NONMEM, f>; + IIC_BIN_NONMEM>; // BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has // both a regclass and EFLAGS as a result. @@ -725,16 +736,9 @@ class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>, Sched<[WriteALULd, ReadAfterLd]>; -// BinOpRM_R - Instructions like "add reg, reg, [mem]". -class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> - : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst), - [(set typeinfo.RegClass:$dst, - (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; - // BinOpRM_F - Instructions like "cmp reg, [mem]". class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDPatternOperator opnode> + SDNode opnode> : BinOpRM<opcode, mnemonic, typeinfo, (outs), [(set EFLAGS, (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>; @@ -844,7 +848,7 @@ class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, // BinOpMR_F - Instructions like "cmp [mem], reg". class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo, - SDNode opnode> + SDPatternOperator opnode> : BinOpMR<opcode, mnemonic, typeinfo, [(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>; @@ -1000,11 +1004,13 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, // first so that they are slightly preferred to the mi forms. def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>; def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>; def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>; // These are for the disassembler since 0x82 opcode behaves like 0x80, but @@ -1083,11 +1089,13 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, // first so that they are slightly preferred to the mi forms. 
def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>; def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>; def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>; // These are for the disassembler since 0x82 opcode behaves like 0x80, but @@ -1162,11 +1170,13 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4, // first so that they are slightly preferred to the mi forms. def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>; def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>; def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>; def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>; + let Predicates = [In64BitMode] in def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>; // These are for the disassembler since 0x82 opcode behaves like 0x80, but @@ -1231,19 +1241,21 @@ let isCompare = 1 in { def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat>; } // isCommutable - def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>; - def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>; - def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>; - def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>; + def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , X86testpat>; + def TEST16mr : BinOpMR_F<0x84, "test", Xi16, X86testpat>; + def TEST32mr : BinOpMR_F<0x84, "test", Xi32, X86testpat>; + def TEST64mr : BinOpMR_F<0x84, "test", Xi64, X86testpat>; def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>; def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>; def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>; + let Predicates = [In64BitMode] in def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>; def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>; def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>; def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>; + let Predicates = [In64BitMode] in def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>; // When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td index b85abfb9ca7f..8dd5e1c0626b 100644 --- a/lib/Target/X86/X86InstrCMovSetCC.td +++ b/lib/Target/X86/X86InstrCMovSetCC.td @@ -113,6 +113,6 @@ defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than // SALC is an undocumented instruction. Information for this instruction can be found // here http://www.rcollins.org/secrets/opcodes/SALC.html // Set AL if carry. 
-let Uses = [EFLAGS], Defs = [AL] in { - def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", []>, Requires<[Not64BitMode]>; +let Uses = [EFLAGS], Defs = [AL], SchedRW = [WriteALU] in { + def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", [], IIC_AHF>, Requires<[Not64BitMode]>; } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index d003d027ddb9..06600a4ef286 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -32,9 +32,10 @@ def GetLo8XForm : SDNodeXForm<imm, [{ // PIC base construction. This expands to code that looks like this: // call $next_inst // popl %destreg" -let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in +let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP], + SchedRW = [WriteJump] in def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label), - "", []>; + "", [], IIC_CALL_RI>; // ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into @@ -42,16 +43,15 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in // pointer before prolog-epilog rewriting occurs. // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. -let Defs = [ESP, EFLAGS], Uses = [ESP] in { +let Defs = [ESP, EFLAGS, SSP], Uses = [ESP, SSP], SchedRW = [WriteALU] in { def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3), - "#ADJCALLSTACKDOWN", - []>, - Requires<[NotLP64]>; + "#ADJCALLSTACKDOWN", [], IIC_ALU_NONMEM>, + Requires<[NotLP64]>; def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP", - [(X86callseq_end timm:$amt1, timm:$amt2)]>, - Requires<[NotLP64]>; + [(X86callseq_end timm:$amt1, timm:$amt2)], + IIC_ALU_NONMEM>, Requires<[NotLP64]>; } def : Pat<(X86callseq_start timm:$amt1, timm:$amt2), (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>; @@ -62,20 +62,20 @@ def : Pat<(X86callseq_start timm:$amt1, timm:$amt2), // pointer before prolog-epilog rewriting occurs. // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. -let Defs = [RSP, EFLAGS], Uses = [RSP] in { +let Defs = [RSP, EFLAGS, SSP], Uses = [RSP, SSP], SchedRW = [WriteALU] in { def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3), "#ADJCALLSTACKDOWN", - []>, - Requires<[IsLP64]>; + [], IIC_ALU_NONMEM>, Requires<[IsLP64]>; def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP", - [(X86callseq_end timm:$amt1, timm:$amt2)]>, - Requires<[IsLP64]>; + [(X86callseq_end timm:$amt1, timm:$amt2)], + IIC_ALU_NONMEM>, Requires<[IsLP64]>; } def : Pat<(X86callseq_start timm:$amt1, timm:$amt2), (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>; +let SchedRW = [WriteSystem] in { // x86-64 va_start lowering magic. let usesCustomInserter = 1, Defs = [EFLAGS] in { @@ -141,7 +141,19 @@ def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size), "# dynamic stack allocation", [(X86WinAlloca GR64:$size)]>, Requires<[In64BitMode]>; +} // SchedRW +// These instructions XOR the frame pointer into a GPR. They are used in some +// stack protection schemes. These are post-RA pseudos because we only know the +// frame register after register allocation. 
+let Constraints = "$src = $dst", isPseudo = 1, Defs = [EFLAGS] in { + def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src), + "xorl\t$$FP, $src", [], IIC_BIN_NONMEM>, + Requires<[NotLP64]>, Sched<[WriteALU]>; + def XOR64_FP : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src), + "xorq\t$$FP $src", [], IIC_BIN_NONMEM>, + Requires<[In64BitMode]>, Sched<[WriteALU]>; +} //===----------------------------------------------------------------------===// // EH Pseudo Instructions @@ -207,17 +219,17 @@ let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1, Requires<[In64BitMode]>; } } -} // SchedRW let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in { def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst), "#EH_SjLj_Setup\t$dst", []>; } +} // SchedRW //===----------------------------------------------------------------------===// // Pseudo instructions used by unwind info. // -let isPseudo = 1 in { +let isPseudo = 1, SchedRW = [WriteSystem] in { def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg), "#SEH_PushReg $reg", []>; def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst), @@ -243,15 +255,15 @@ let isPseudo = 1 in { // This is lowered into a RET instruction by MCInstLower. We need // this so that we don't have to have a MachineBasicBlock which ends // with a RET and also has successors. -let isPseudo = 1 in { +let isPseudo = 1, SchedRW = [WriteJumpLd] in { def MORESTACK_RET: I<0, Pseudo, (outs), (ins), - "", []>; + "", [], IIC_RET>; // This instruction is lowered to a RET followed by a MOV. The two // instructions are not generated on a higher level since then the // verifier sees a MachineBasicBlock ending with a non-terminator. def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins), - "", []>; + "", [], IIC_RET>; } //===----------------------------------------------------------------------===// @@ -273,39 +285,42 @@ def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>; def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>; } -let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode], +let Predicates = [OptForSize, Not64BitMode], AddedComplexity = 10 in { + let SchedRW = [WriteALU] in { // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC, // which only require 3 bytes compared to MOV32ri which requires 5. let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in { def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", - [(set GR32:$dst, 1)]>; + [(set GR32:$dst, 1)], IIC_ALU_NONMEM>; def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "", - [(set GR32:$dst, -1)]>; + [(set GR32:$dst, -1)], IIC_ALU_NONMEM>; } + } // SchedRW // MOV16ri is 4 bytes, so the instructions above are smaller. def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>; def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>; } -let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5 in { +let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5, + SchedRW = [WriteALU] in { // AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1. -// FIXME: Add itinerary class and Schedule. 
def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "", - [(set GR32:$dst, i32immSExt8:$src)]>, - Requires<[OptForMinSize, NotWin64WithoutFP]>; + [(set GR32:$dst, i32immSExt8:$src)], IIC_ALU_NONMEM>, + Requires<[OptForMinSize, NotWin64WithoutFP]>; def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "", - [(set GR64:$dst, i64immSExt8:$src)]>, - Requires<[OptForMinSize, NotWin64WithoutFP]>; + [(set GR64:$dst, i64immSExt8:$src)], IIC_ALU_NONMEM>, + Requires<[OptForMinSize, NotWin64WithoutFP]>; } // Materialize i64 constant where top 32-bits are zero. This could theoretically // use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however // that would make it more difficult to rematerialize. let isReMaterializable = 1, isAsCheapAsAMove = 1, - isPseudo = 1, hasSideEffects = 0 in -def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>; + isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteALU] in +def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", [], + IIC_ALU_NONMEM>; // This 64-bit pseudo-move can be used for both a 64-bit constant that is // actually the zero-extension of a 32-bit constant and for labels in the @@ -448,6 +463,7 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in { //===----------------------------------------------------------------------===// // Thread Local Storage Instructions // +let SchedRW = [WriteSystem] in { // ELF TLS Support // All calls clobber the non-callee saved registers. ESP is marked as @@ -458,7 +474,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - usesCustomInserter = 1, Uses = [ESP] in { + usesCustomInserter = 1, Uses = [ESP, SSP] in { def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLS_addr32", [(X86tlsaddr tls32addr:$sym)]>, @@ -478,7 +494,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11, MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7, XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS], - usesCustomInserter = 1, Uses = [RSP] in { + usesCustomInserter = 1, Uses = [RSP, SSP] in { def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLS_addr64", [(X86tlsaddr tls64addr:$sym)]>, @@ -494,7 +510,7 @@ def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym), // address of the variable is in %eax. %ecx is trashed during the function // call. All other registers are preserved. let Defs = [EAX, ECX, EFLAGS], - Uses = [ESP], + Uses = [ESP, SSP], usesCustomInserter = 1 in def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), "# TLSCall_32", @@ -507,13 +523,13 @@ def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym), // On return the address of the variable is in %rax. All other // registers are preserved. let Defs = [RAX, EFLAGS], - Uses = [RSP], + Uses = [RSP, SSP], usesCustomInserter = 1 in def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym), "# TLSCall_64", [(X86TLSCall addr:$sym)]>, Requires<[In64BitMode]>; - +} // SchedRW //===----------------------------------------------------------------------===// // Conditional Move Pseudo Instructions @@ -528,7 +544,7 @@ multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> { EFLAGS)))]>; } -let usesCustomInserter = 1, Uses = [EFLAGS] in { +let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in { // X86 doesn't have 8-bit conditional moves. 
Use a customInserter to // emit control flow. An alternative to this is to mark i8 SELECT as Promote, // however that requires promoting the operands, and can induce additional @@ -566,7 +582,7 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in { defm _V16I1 : CMOVrr_PSEUDO<VK16, v16i1>; defm _V32I1 : CMOVrr_PSEUDO<VK32, v32i1>; defm _V64I1 : CMOVrr_PSEUDO<VK64, v64i1>; -} // usesCustomInserter = 1, Uses = [EFLAGS] +} // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] //===----------------------------------------------------------------------===// // Normal-Instructions-With-Lock-Prefix Pseudo Instructions @@ -593,7 +609,7 @@ def Int_MemBarrier : I<0, Pseudo, (outs), (ins), // ImmOpc8 corresponds to the mi8 version of the instruction // ImmMod corresponds to the instruction format of the mi and mi8 versions multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8, - Format ImmMod, SDPatternOperator Op, string mnemonic> { + Format ImmMod, SDNode Op, string mnemonic> { let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, SchedRW = [WriteALULd, WriteRMW] in { @@ -696,30 +712,52 @@ defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, X86lock_and, "and">; defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">; multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form, - int Increment, string mnemonic> { + string frag, string mnemonic> { let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1, - SchedRW = [WriteALULd, WriteRMW], Predicates = [NotSlowIncDec] in { + SchedRW = [WriteALULd, WriteRMW] in { def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst), !strconcat(mnemonic, "{b}\t$dst"), - [(set EFLAGS, (X86lock_add addr:$dst, (i8 Increment)))], + [(set EFLAGS, (!cast<PatFrag>(frag # "_8") addr:$dst))], IIC_UNARY_MEM>, LOCK; def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst), !strconcat(mnemonic, "{w}\t$dst"), - [(set EFLAGS, (X86lock_add addr:$dst, (i16 Increment)))], + [(set EFLAGS, (!cast<PatFrag>(frag # "_16") addr:$dst))], IIC_UNARY_MEM>, OpSize16, LOCK; def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst), !strconcat(mnemonic, "{l}\t$dst"), - [(set EFLAGS, (X86lock_add addr:$dst, (i32 Increment)))], + [(set EFLAGS, (!cast<PatFrag>(frag # "_32") addr:$dst))], IIC_UNARY_MEM>, OpSize32, LOCK; def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst), !strconcat(mnemonic, "{q}\t$dst"), - [(set EFLAGS, (X86lock_add addr:$dst, (i64 Increment)))], + [(set EFLAGS, (!cast<PatFrag>(frag # "_64") addr:$dst))], IIC_UNARY_MEM>, LOCK; } } -defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, 1, "inc">; -defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, -1, "dec">; +multiclass unary_atomic_intrin<SDNode atomic_op> { + def _8 : PatFrag<(ops node:$ptr), + (atomic_op node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8; + }]>; + def _16 : PatFrag<(ops node:$ptr), + (atomic_op node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16; + }]>; + def _32 : PatFrag<(ops node:$ptr), + (atomic_op node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32; + }]>; + def _64 : PatFrag<(ops node:$ptr), + (atomic_op node:$ptr), [{ + return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64; + }]>; +} + +defm X86lock_inc : unary_atomic_intrin<X86lock_inc>; +defm X86lock_dec : unary_atomic_intrin<X86lock_dec>; + +defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "X86lock_inc", "inc">; +defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "X86lock_dec", "dec">; // 
Atomic compare and swap. multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic, @@ -767,7 +805,7 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", // register and the register allocator will ignore any use/def of // it. In other words, the register will not fix the clobbering of // RBX that will happen when setting the arguments for the instrucion. -// +// // Unlike the actual related instuction, we mark that this one // defines EBX (instead of using EBX). // The rationale is that we will define RBX during the expansion of @@ -895,7 +933,7 @@ multiclass RELEASE_BINOP_MI<SDNode op> { [(atomic_store_64 addr:$dst, (op (atomic_load_64 addr:$dst), GR64:$src))]>; } -let Defs = [EFLAGS] in { +let Defs = [EFLAGS], SchedRW = [WriteMicrocoded] in { defm RELEASE_ADD : RELEASE_BINOP_MI<add>; defm RELEASE_AND : RELEASE_BINOP_MI<and>; defm RELEASE_OR : RELEASE_BINOP_MI<or>; @@ -908,20 +946,20 @@ let Defs = [EFLAGS] in { // FIXME: imm version. // FIXME: Version that doesn't clobber $src, using AVX's VADDSS. // FIXME: This could also handle SIMD operations with *ps and *pd instructions. -let usesCustomInserter = 1 in { +let usesCustomInserter = 1, SchedRW = [WriteMicrocoded] in { multiclass RELEASE_FP_BINOP_MI<SDNode op> { def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src), "#BINOP "#NAME#"32mr PSEUDO!", [(atomic_store_32 addr:$dst, - (i32 (bitconvert (op + (i32 (bitconvert (op (f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))), - FR32:$src))))]>, Requires<[HasSSE1]>; + FR32:$src))))]>, Requires<[HasSSE1]>; def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src), "#BINOP "#NAME#"64mr PSEUDO!", [(atomic_store_64 addr:$dst, - (i64 (bitconvert (op + (i64 (bitconvert (op (f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))), - FR64:$src))))]>, Requires<[HasSSE2]>; + FR64:$src))))]>, Requires<[HasSSE2]>; } defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>; // FIXME: Add fsub, fmul, fdiv, ... @@ -942,17 +980,17 @@ multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> { [(atomic_store_64 addr:$dst, dag64)]>; } -let Defs = [EFLAGS] in { +let Defs = [EFLAGS], Predicates = [UseIncDec], SchedRW = [WriteMicrocoded] in { defm RELEASE_INC : RELEASE_UNOP< (add (atomic_load_8 addr:$dst), (i8 1)), (add (atomic_load_16 addr:$dst), (i16 1)), (add (atomic_load_32 addr:$dst), (i32 1)), - (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>; + (add (atomic_load_64 addr:$dst), (i64 1))>; defm RELEASE_DEC : RELEASE_UNOP< (add (atomic_load_8 addr:$dst), (i8 -1)), (add (atomic_load_16 addr:$dst), (i16 -1)), (add (atomic_load_32 addr:$dst), (i32 -1)), - (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>; + (add (atomic_load_64 addr:$dst), (i64 -1))>; } /* TODO: These don't work because the type inference of TableGen fails. 
@@ -972,18 +1010,19 @@ defm RELEASE_NOT : RELEASE_UNOP< (not (atomic_load_64 addr:$dst))>; */ +let SchedRW = [WriteMicrocoded] in { def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src), - "#RELEASE_MOV8mi PSEUDO!", - [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; + "#RELEASE_MOV8mi PSEUDO!", + [(atomic_store_8 addr:$dst, (i8 imm:$src))]>; def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src), - "#RELEASE_MOV16mi PSEUDO!", - [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; + "#RELEASE_MOV16mi PSEUDO!", + [(atomic_store_16 addr:$dst, (i16 imm:$src))]>; def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src), - "#RELEASE_MOV32mi PSEUDO!", - [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; + "#RELEASE_MOV32mi PSEUDO!", + [(atomic_store_32 addr:$dst, (i32 imm:$src))]>; def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src), - "#RELEASE_MOV64mi32 PSEUDO!", - [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; + "#RELEASE_MOV64mi32 PSEUDO!", + [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>; def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src), "#RELEASE_MOV8mr PSEUDO!", @@ -1010,6 +1049,7 @@ def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src), def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src), "#ACQUIRE_MOV64rm PSEUDO!", [(set GR64:$dst, (atomic_load_64 addr:$src))]>; +} // SchedRW //===----------------------------------------------------------------------===// // DAG Pattern Matching Rules @@ -1239,18 +1279,20 @@ def : Pat<(i64 (anyext GR8 :$src)), def : Pat<(i64 (anyext GR16:$src)), (SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>; def : Pat<(i64 (anyext GR32:$src)), - (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>; + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, sub_32bit)>; // Any instruction that defines a 32-bit result leaves the high half of the // register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may // be copying from a truncate. Any other 32-bit operation will zero-extend -// up to 64 bits. +// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper +// 32 bits, they're probably just qualifying a CopyFromReg. 
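def32 below relies on the architectural rule that any x86-64 instruction writing a 32-bit register also clears bits 63:32 of the containing 64-bit register. A small sketch of that rule, assuming an x86-64 host and GCC/Clang extended inline asm (illustration only):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Out = 0;
  // Load RAX with a value whose upper half is non-zero, then do a 32-bit ALU
  // op on EAX: the 32-bit write implicitly zeroes bits 63:32 of RAX.
  asm("movq %1, %%rax\n\t"
      "addl $1, %%eax\n\t"
      "movq %%rax, %0"
      : "=r"(Out)
      : "r"(0xdeadbeef00000041ULL)
      : "rax", "cc");
  assert(Out == 0x42);
  return 0;
}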
def def32 : PatLeaf<(i32 GR32:$src), [{ return N->getOpcode() != ISD::TRUNCATE && N->getOpcode() != TargetOpcode::EXTRACT_SUBREG && N->getOpcode() != ISD::CopyFromReg && - N->getOpcode() != ISD::AssertSext; + N->getOpcode() != ISD::AssertSext && + N->getOpcode() != ISD::AssertZext; }]>; // In the case of a 32-bit def that is known to implicitly zero-extend, @@ -1397,16 +1439,11 @@ def : Pat<(and GR32:$src1, 0xffff), (MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>; // r & (2^8-1) ==> movz def : Pat<(and GR32:$src1, 0xff), - (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1, - GR32_ABCD)), - sub_8bit))>, - Requires<[Not64BitMode]>; + (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>; // r & (2^8-1) ==> movz def : Pat<(and GR16:$src1, 0xff), - (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG - (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)), - sub_16bit)>, - Requires<[Not64BitMode]>; + (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)), + sub_16bit)>; // r & (2^32-1) ==> movz def : Pat<(and GR64:$src, 0x00000000FFFFFFFF), @@ -1423,15 +1460,6 @@ def : Pat<(and GR64:$src, 0xff), (SUBREG_TO_REG (i64 0), (MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))), sub_32bit)>; -// r & (2^8-1) ==> movz -def : Pat<(and GR32:$src1, 0xff), - (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>, - Requires<[In64BitMode]>; -// r & (2^8-1) ==> movz -def : Pat<(and GR16:$src1, 0xff), - (EXTRACT_SUBREG (MOVZX32rr8 (i8 - (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>, - Requires<[In64BitMode]>; } // AddedComplexity = 1 @@ -1439,16 +1467,11 @@ def : Pat<(and GR16:$src1, 0xff), def : Pat<(sext_inreg GR32:$src, i16), (MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>; def : Pat<(sext_inreg GR32:$src, i8), - (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, - GR32_ABCD)), - sub_8bit))>, - Requires<[Not64BitMode]>; + (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>; def : Pat<(sext_inreg GR16:$src, i8), - (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG - (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))), - sub_16bit)>, - Requires<[Not64BitMode]>; + (EXTRACT_SUBREG (MOVSX32rr8 (EXTRACT_SUBREG GR16:$src, sub_8bit)), + sub_16bit)>; def : Pat<(sext_inreg GR64:$src, i32), (MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>; @@ -1456,13 +1479,6 @@ def : Pat<(sext_inreg GR64:$src, i16), (MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>; def : Pat<(sext_inreg GR64:$src, i8), (MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>; -def : Pat<(sext_inreg GR32:$src, i8), - (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>, - Requires<[In64BitMode]>; -def : Pat<(sext_inreg GR16:$src, i8), - (EXTRACT_SUBREG (MOVSX32rr8 - (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>, - Requires<[In64BitMode]>; // sext, sext_load, zext, zext_load def: Pat<(i16 (sext GR8:$src)), @@ -1500,44 +1516,26 @@ def : Pat<(i8 (trunc GR16:$src)), // h-register tricks def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))), - (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), - sub_8bit_hi)>, + (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>, Requires<[Not64BitMode]>; def : Pat<(i8 (trunc (srl_su (i32 (anyext GR16:$src)), (i8 8)))), - (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), - sub_8bit_hi)>, + (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>, Requires<[Not64BitMode]>; def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))), - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), - sub_8bit_hi)>, + (EXTRACT_SUBREG GR32:$src, sub_8bit_hi)>, Requires<[Not64BitMode]>; 
def : Pat<(srl GR16:$src, (i8 8)), (EXTRACT_SUBREG - (MOVZX32rr8 - (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), - sub_8bit_hi)), - sub_16bit)>, - Requires<[Not64BitMode]>; + (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)), + sub_16bit)>; def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), - (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, - GR16_ABCD)), - sub_8bit_hi))>, - Requires<[Not64BitMode]>; + (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>; def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), - (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, - GR16_ABCD)), - sub_8bit_hi))>, - Requires<[Not64BitMode]>; + (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>; def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), - (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, - GR32_ABCD)), - sub_8bit_hi))>, - Requires<[Not64BitMode]>; + (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>; def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), - (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, - GR32_ABCD)), - sub_8bit_hi))>, - Requires<[Not64BitMode]>; + (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>; // h-register tricks. // For now, be conservative on x86-64 and use an h-register extract only if the @@ -1551,68 +1549,35 @@ def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)), (SUBREG_TO_REG (i64 0), (MOVZX32_NOREXrr8 - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), - sub_8bit_hi)), + (EXTRACT_SUBREG GR64:$src, sub_8bit_hi)), sub_32bit)>; -def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)), - (MOVZX32_NOREXrr8 - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), - sub_8bit_hi))>, - Requires<[In64BitMode]>; -def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)), - (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, - GR32_ABCD)), - sub_8bit_hi))>, - Requires<[In64BitMode]>; -def : Pat<(srl GR16:$src, (i8 8)), - (EXTRACT_SUBREG - (MOVZX32_NOREXrr8 - (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), - sub_8bit_hi)), - sub_16bit)>, - Requires<[In64BitMode]>; -def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))), - (MOVZX32_NOREXrr8 - (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), - sub_8bit_hi))>, - Requires<[In64BitMode]>; -def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))), - (MOVZX32_NOREXrr8 - (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), - sub_8bit_hi))>, - Requires<[In64BitMode]>; def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))), (SUBREG_TO_REG (i64 0), (MOVZX32_NOREXrr8 - (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), - sub_8bit_hi)), + (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)), sub_32bit)>; def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))), (SUBREG_TO_REG (i64 0), (MOVZX32_NOREXrr8 - (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), - sub_8bit_hi)), + (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)), sub_32bit)>; // h-register extract and store. 
def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst), (MOV8mr_NOREX addr:$dst, - (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)), - sub_8bit_hi))>; + (EXTRACT_SUBREG GR64:$src, sub_8bit_hi))>; def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst), (MOV8mr_NOREX addr:$dst, - (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)), - sub_8bit_hi))>, + (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>, Requires<[In64BitMode]>; def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst), (MOV8mr_NOREX addr:$dst, - (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), - sub_8bit_hi))>, + (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>, Requires<[In64BitMode]>; @@ -1627,7 +1592,13 @@ def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>; def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>; def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>; -// Helper imms that check if a mask doesn't change significant shift bits. +// Helper imms to check if a mask doesn't change significant shift/rotate bits. +def immShift8 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 3; +}]>; +def immShift16 : ImmLeaf<i8, [{ + return countTrailingOnes<uint64_t>(Imm) >= 4; +}]>; def immShift32 : ImmLeaf<i8, [{ return countTrailingOnes<uint64_t>(Imm) >= 5; }]>; @@ -1654,15 +1625,45 @@ multiclass MaskedShiftAmountPats<SDNode frag, string name> { // (shift x (and y, 63)) ==> (shift x, y) def : Pat<(frag GR64:$src1, (and CL, immShift64)), (!cast<Instruction>(name # "64rCL") GR64:$src1)>; - def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst), + def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst), (!cast<Instruction>(name # "64mCL") addr:$dst)>; } defm : MaskedShiftAmountPats<shl, "SHL">; defm : MaskedShiftAmountPats<srl, "SHR">; defm : MaskedShiftAmountPats<sra, "SAR">; -defm : MaskedShiftAmountPats<rotl, "ROL">; -defm : MaskedShiftAmountPats<rotr, "ROR">; + +// ROL/ROR instructions allow a stronger mask optimization than shift for 8- and +// 16-bit. We can remove a mask of any (bitwidth - 1) on the rotation amount +// because over-rotating produces the same result. This is noted in the Intel +// docs with: "tempCOUNT <- (COUNT & COUNTMASK) MOD SIZE". Masking the rotation +// amount could affect EFLAGS results, but that does not matter because we are +// not tracking flags for these nodes. 
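The masking claim is easy to verify exhaustively for the 8-bit case. A standalone sketch: rotating by the raw count and by the count masked with bitwidth - 1 always produces the same byte, which is what allows the and to be folded away below.

#include <cassert>
#include <cstdint>

// Reference rotate-left on 8 bits, applied one step at a time, so the count is
// never reduced except by the rotation itself wrapping around.
uint8_t rotl8(uint8_t V, unsigned Amt) {
  for (unsigned I = 0; I != Amt; ++I)
    V = uint8_t((V << 1) | (V >> 7));
  return V;
}

int main() {
  for (unsigned V = 0; V != 256; ++V)
    for (unsigned Amt = 0; Amt != 64; ++Amt)
      // Masking the count with 7 (bitwidth - 1) never changes the result, so
      // (and CL, immShift8) can be dropped when selecting ROL8rCL.
      assert(rotl8(uint8_t(V), Amt) == rotl8(uint8_t(V), Amt & 7));
  return 0;
}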
+multiclass MaskedRotateAmountPats<SDNode frag, string name> { + // (rot x (and y, BitWidth - 1)) ==> (rot x, y) + def : Pat<(frag GR8:$src1, (and CL, immShift8)), + (!cast<Instruction>(name # "8rCL") GR8:$src1)>; + def : Pat<(frag GR16:$src1, (and CL, immShift16)), + (!cast<Instruction>(name # "16rCL") GR16:$src1)>; + def : Pat<(frag GR32:$src1, (and CL, immShift32)), + (!cast<Instruction>(name # "32rCL") GR32:$src1)>; + def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift8)), addr:$dst), + (!cast<Instruction>(name # "8mCL") addr:$dst)>; + def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift16)), addr:$dst), + (!cast<Instruction>(name # "16mCL") addr:$dst)>; + def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst), + (!cast<Instruction>(name # "32mCL") addr:$dst)>; + + // (rot x (and y, 63)) ==> (rot x, y) + def : Pat<(frag GR64:$src1, (and CL, immShift64)), + (!cast<Instruction>(name # "64rCL") GR64:$src1)>; + def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst), + (!cast<Instruction>(name # "64mCL") addr:$dst)>; +} + + +defm : MaskedRotateAmountPats<rotl, "ROL">; +defm : MaskedRotateAmountPats<rotr, "ROR">; // Double shift amount is implicitly masked. multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> { @@ -1680,6 +1681,66 @@ multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> { defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">; defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">; +let Predicates = [HasBMI2] in { + let AddedComplexity = 1 in { + def : Pat<(sra GR32:$src1, (and GR8:$src2, immShift32)), + (SARX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(sra GR64:$src1, (and GR8:$src2, immShift64)), + (SARX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(srl GR32:$src1, (and GR8:$src2, immShift32)), + (SHRX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(srl GR64:$src1, (and GR8:$src2, immShift64)), + (SHRX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(shl GR32:$src1, (and GR8:$src2, immShift32)), + (SHLX32rr GR32:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(shl GR64:$src1, (and GR8:$src2, immShift64)), + (SHLX64rr GR64:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + } + + let AddedComplexity = -20 in { + def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + (SARX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + (SARX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + (SHRX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + (SHRX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)), + (SHLX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)), + (SHLX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + } +} + // (anyext (setcc_carry)) -> (setcc_carry) def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, 
EFLAGS)))), (SETB_C16r)>; @@ -1821,7 +1882,7 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2), // Increment/Decrement reg. // Do not make INC/DEC if it is slow -let Predicates = [NotSlowIncDec] in { +let Predicates = [UseIncDec] in { def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>; def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>; def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>; diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td index 4ea223e82be9..5581fd462a1d 100644 --- a/lib/Target/X86/X86InstrControl.td +++ b/lib/Target/X86/X86InstrControl.td @@ -171,7 +171,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in { "ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16, Sched<[WriteJumpLd]>; def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst), - "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32, + "{l}jmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32, Sched<[WriteJumpLd]>; } @@ -191,7 +191,7 @@ let isCall = 1 in // a use to prevent stack-pointer assignments that appear immediately // before calls from potentially appearing dead. Uses for argument // registers are added manually. - let Uses = [ESP] in { + let Uses = [ESP, SSP] in { def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm, (outs), (ins i32imm_pcrel:$dst), "call{l}\t$dst", [], IIC_CALL_RI>, OpSize32, @@ -233,7 +233,7 @@ let isCall = 1 in "lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16, Sched<[WriteJumpLd]>; def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst), - "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32, + "{l}call{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32, Sched<[WriteJumpLd]>; } @@ -241,11 +241,11 @@ let isCall = 1 in // Tail call stuff. let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in - let Uses = [ESP] in { + let Uses = [ESP, SSP] in { def TCRETURNdi : PseudoI<(outs), - (ins i32imm_pcrel:$dst, i32imm:$offset), []>; + (ins i32imm_pcrel:$dst, i32imm:$offset), []>, NotMemoryFoldable; def TCRETURNri : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable; let mayLoad = 1 in def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset), []>; @@ -268,7 +268,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, // rather than barriers, and they use EFLAGS. let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in - let Uses = [ESP, EFLAGS] in { + let Uses = [ESP, EFLAGS, SSP] in { def TCRETURNdicc : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>; @@ -287,7 +287,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, // RSP is marked as a use to prevent stack-pointer assignments that appear // immediately before calls from potentially appearing dead. Uses for argument // registers are added manually. -let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in { +let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in { // NOTE: this pattern doesn't match "X86call imm", because we do not know // that the offset between an arbitrary immediate and the call will fit in // the 32-bit pcrel field that we have. 
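The reachability concern in the note is just signed 32-bit range of the PC-relative displacement: for a symbolic target the code model makes that assumption safe, while an arbitrary immediate gives no such guarantee. A minimal sketch of the range check (illustrative helper, not an LLVM API):

#include <cstdint>
#include <limits>

// A direct CALL rel32 encodes the displacement from the end of the call
// instruction as a signed 32-bit immediate, so an absolute target is only
// reachable this way if the difference fits in int32_t.
bool fitsInRel32(uint64_t TargetAddr, uint64_t EndOfCallAddr) {
  int64_t Disp = static_cast<int64_t>(TargetAddr - EndOfCallAddr);
  return Disp >= std::numeric_limits<int32_t>::min() &&
         Disp <= std::numeric_limits<int32_t>::max();
}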
@@ -309,16 +309,16 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in { } let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, - isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1, + isCodeGenOnly = 1, Uses = [RSP, SSP], usesCustomInserter = 1, SchedRW = [WriteJump] in { def TCRETURNdi64 : PseudoI<(outs), (ins i64i32imm_pcrel:$dst, i32imm:$offset), []>; def TCRETURNri64 : PseudoI<(outs), - (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>; + (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable; let mayLoad = 1 in def TCRETURNmi64 : PseudoI<(outs), - (ins i64mem_TC:$dst, i32imm:$offset), []>; + (ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable; def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst), "jmp\t$dst", [], IIC_JMP_REL>; @@ -345,7 +345,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1, // rather than barriers, and they use EFLAGS. let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1, isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in - let Uses = [RSP, EFLAGS] in { + let Uses = [RSP, EFLAGS, SSP] in { def TCRETURNdi64cc : PseudoI<(outs), (ins i64i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>; diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td index af43d9f53325..2a8ab0069b1e 100644 --- a/lib/Target/X86/X86InstrExtension.td +++ b/lib/Target/X86/X86InstrExtension.td @@ -12,32 +12,30 @@ //===----------------------------------------------------------------------===// let hasSideEffects = 0 in { - let Defs = [AX], Uses = [AL] in + let Defs = [AX], Uses = [AL] in // AX = signext(AL) def CBW : I<0x98, RawFrm, (outs), (ins), - "{cbtw|cbw}", [], IIC_CBW>, OpSize16; // AX = signext(AL) - let Defs = [EAX], Uses = [AX] in + "{cbtw|cbw}", [], IIC_CBW>, OpSize16, Sched<[WriteALU]>; + let Defs = [EAX], Uses = [AX] in // EAX = signext(AX) def CWDE : I<0x98, RawFrm, (outs), (ins), - "{cwtl|cwde}", [], IIC_CBW>, OpSize32; // EAX = signext(AX) + "{cwtl|cwde}", [], IIC_CBW>, OpSize32, Sched<[WriteALU]>; - let Defs = [AX,DX], Uses = [AX] in + let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX) def CWD : I<0x99, RawFrm, (outs), (ins), - "{cwtd|cwd}", [], IIC_CBW>, OpSize16; // DX:AX = signext(AX) - let Defs = [EAX,EDX], Uses = [EAX] in + "{cwtd|cwd}", [], IIC_CBW>, OpSize16, Sched<[WriteALU]>; + let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX) def CDQ : I<0x99, RawFrm, (outs), (ins), - "{cltd|cdq}", [], IIC_CBW>, OpSize32; // EDX:EAX = signext(EAX) + "{cltd|cdq}", [], IIC_CBW>, OpSize32, Sched<[WriteALU]>; - let Defs = [RAX], Uses = [EAX] in + let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX) def CDQE : RI<0x98, RawFrm, (outs), (ins), - "{cltq|cdqe}", [], IIC_CBW>; // RAX = signext(EAX) + "{cltq|cdqe}", [], IIC_CBW>, Sched<[WriteALU]>; - let Defs = [RAX,RDX], Uses = [RAX] in + let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX) def CQO : RI<0x99, RawFrm, (outs), (ins), - "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX) + "{cqto|cqo}", [], IIC_CBW>, Sched<[WriteALU]>; } - - // Sign/Zero extenders let hasSideEffects = 0 in { def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src), diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td index 3a3cdc9fa574..35fa45590fc6 100644 --- a/lib/Target/X86/X86InstrFMA.td +++ b/lib/Target/X86/X86InstrFMA.td @@ -15,8 +15,8 @@ // FMA3 - Intel 3 operand Fused Multiply-Add instructions //===----------------------------------------------------------------------===// -// 
For all FMA opcodes declared in fma3p_rm and fma3s_rm milticlasses defined -// below, both the register and memory variants are commutable. +// For all FMA opcodes declared in fma3p_rm_* and fma3s_rm_* multiclasses +// defined below, both the register and memory variants are commutable. // For the register form the commutable operands are 1, 2 and 3. // For the memory variant the folded operand must be in 3. Thus, // in that case, only the operands 1 and 2 can be swapped. @@ -34,56 +34,87 @@ // operands 1 and 3 (register forms only): *231* --> *213*; // operands 2 and 3 (register forms only): *231* --> *231*(no changes). -let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in -multiclass fma3p_rm<bits<8> opc, string OpcodeStr, - PatFrag MemFrag128, PatFrag MemFrag256, - ValueType OpVT128, ValueType OpVT256, - SDPatternOperator Op = null_frag> { - def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, VR128:$src3), +multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType VT, X86MemOperand x86memop, PatFrag MemFrag, + SDNode Op> { + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>, + Sched<[WriteFMA]>; + + let mayLoad = 1 in + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, (OpVT128 (Op VR128:$src2, - VR128:$src1, VR128:$src3)))]>; + [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, + (MemFrag addr:$src3))))]>, + Sched<[WriteFMALd, ReadAfterLd]>; +} + +multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType VT, X86MemOperand x86memop, PatFrag MemFrag, + SDNode Op> { + let hasSideEffects = 0 in + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>, Sched<[WriteFMA]>; let mayLoad = 1 in - def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, f128mem:$src3), + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1, - (MemFrag128 addr:$src3))))]>; + [(set RC:$dst, (VT (Op RC:$src2, (MemFrag addr:$src3), + RC:$src1)))]>, Sched<[WriteFMALd, ReadAfterLd]>; +} - def Yr : FMA3<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, VR256:$src3), +multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC, + ValueType VT, X86MemOperand x86memop, PatFrag MemFrag, + SDNode Op> { + let hasSideEffects = 0 in + def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1, - VR256:$src3)))]>, VEX_L; + []>, Sched<[WriteFMA]>; + // Pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
let mayLoad = 1 in - def Ym : FMA3<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, f256mem:$src3), + def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set VR256:$dst, - (OpVT256 (Op VR256:$src2, VR256:$src1, - (MemFrag256 addr:$src3))))]>, VEX_L; + [(set RC:$dst, (VT (Op (MemFrag addr:$src3), RC:$src1, + RC:$src2)))]>, Sched<[WriteFMALd, ReadAfterLd]>; } +let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, string OpcodeStr, string PackTy, string Suff, PatFrag MemFrag128, PatFrag MemFrag256, SDNode Op, ValueType OpTy128, ValueType OpTy256> { - defm NAME#213#Suff : fma3p_rm<opc213, - !strconcat(OpcodeStr, "213", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256, Op>; - defm NAME#132#Suff : fma3p_rm<opc132, - !strconcat(OpcodeStr, "132", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256>; - defm NAME#231#Suff : fma3p_rm<opc231, - !strconcat(OpcodeStr, "231", PackTy), - MemFrag128, MemFrag256, OpTy128, OpTy256>; + defm NAME#213#Suff : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy), + VR128, OpTy128, f128mem, MemFrag128, Op>; + defm NAME#231#Suff : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy), + VR128, OpTy128, f128mem, MemFrag128, Op>; + defm NAME#132#Suff : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy), + VR128, OpTy128, f128mem, MemFrag128, Op>; + + defm NAME#213#Suff#Y : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy), + VR256, OpTy256, f256mem, MemFrag256, Op>, + VEX_L; + defm NAME#231#Suff#Y : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy), + VR256, OpTy256, f256mem, MemFrag256, Op>, + VEX_L; + defm NAME#132#Suff#Y : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy), + VR256, OpTy256, f256mem, MemFrag256, Op>, + VEX_L; } // Fused Multiply-Add @@ -93,11 +124,9 @@ let ExeDomain = SSEPackedSingle in { defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS", loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32>; defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS", - loadv4f32, loadv8f32, X86Fmaddsub, - v4f32, v8f32>; + loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32>; defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS", - loadv4f32, loadv8f32, X86Fmsubadd, - v4f32, v8f32>; + loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32>; } let ExeDomain = SSEPackedDouble in { @@ -138,23 +167,79 @@ let ExeDomain = SSEPackedDouble in { // FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2; // Please see more detailed comment at the very beginning of the section // defining FMA3 opcodes above. 
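As a plain scalar model of the three suffixes (operand 1 is the tied destination, operand 3 is the source that may be folded from memory), roughly:

// The suffix names which sources are multiplied and which is added:
//   132: op1 * op3 + op2,  213: op2 * op1 + op3,  231: op2 * op3 + op1.
double fmadd132(double Op1, double Op2, double Op3) { return Op1 * Op3 + Op2; }
double fmadd213(double Op1, double Op2, double Op3) { return Op2 * Op1 + Op3; }
double fmadd231(double Op1, double Op2, double Op3) { return Op2 * Op3 + Op1; }

Since the memory operand is always the last source, the three forms let a folded load land either in the multiply (132, 231) or in the addend (213), which is why all three encodings are kept.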
-let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in -multiclass fma3s_rm<bits<8> opc, string OpcodeStr, - X86MemOperand x86memop, RegisterClass RC, - SDPatternOperator OpNode = null_frag> { - def r : FMA3<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, RC:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>; +multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode> { + def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>, + Sched<[WriteFMA]>; let mayLoad = 1 in - def m : FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src2, x86memop:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - [(set RC:$dst, - (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>; + def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>, + Sched<[WriteFMALd, ReadAfterLd]>; +} + +multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode> { + let hasSideEffects = 0 in + def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>, Sched<[WriteFMA]>; + + let mayLoad = 1 in + def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpNode RC:$src2, (load addr:$src3), RC:$src1))]>, + Sched<[WriteFMALd, ReadAfterLd]>; +} + +multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr, + X86MemOperand x86memop, RegisterClass RC, + SDPatternOperator OpNode> { + let hasSideEffects = 0 in + def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>, Sched<[WriteFMA]>; + + // Pattern is 312 order so that the load is in a different place from the + // 213 and 231 patterns this helps tablegen's duplicate pattern detection. 
+ let mayLoad = 1 in + def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, x86memop:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + [(set RC:$dst, + (OpNode (load addr:$src3), RC:$src1, RC:$src2))]>, + Sched<[WriteFMALd, ReadAfterLd]>; +} + +let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in +multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, + string OpStr, string PackTy, string Suff, + SDNode OpNode, RegisterClass RC, + X86MemOperand x86memop> { + defm NAME#213#Suff : fma3s_rm_213<opc213, !strconcat(OpStr, "213", PackTy), + x86memop, RC, OpNode>; + defm NAME#231#Suff : fma3s_rm_231<opc231, !strconcat(OpStr, "231", PackTy), + x86memop, RC, OpNode>; + defm NAME#132#Suff : fma3s_rm_132<opc132, !strconcat(OpStr, "132", PackTy), + x86memop, RC, OpNode>; } // These FMA*_Int instructions are defined specially for being used when @@ -174,32 +259,18 @@ let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1, hasSideEffects = 0 in multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr, Operand memopr, RegisterClass RC> { - def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst), - (ins RC:$src1, RC:$src2, RC:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; + def r_Int : FMA3S_Int<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, RC:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>, Sched<[WriteFMA]>; let mayLoad = 1 in - def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst), - (ins RC:$src1, RC:$src2, memopr:$src3), - !strconcat(OpcodeStr, - "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), - []>; -} - -multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpStr, string PackTy, string Suff, - SDNode OpNode, RegisterClass RC, - X86MemOperand x86memop> { - let Predicates = [HasFMA, NoAVX512] in { - defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy), - x86memop, RC>; - defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy), - x86memop, RC, OpNode>; - defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy), - x86memop, RC>; - } + def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, RC:$src2, memopr:$src3), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + []>, Sched<[WriteFMALd, ReadAfterLd]>; } // The FMA 213 form is created for lowering of scalar FMA intrinscis @@ -223,8 +294,7 @@ multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231, } multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, - string OpStr, Intrinsic IntF32, Intrinsic IntF64, - SDNode OpNode> { + string OpStr, SDNode OpNodeIntrin, SDNode OpNode> { let ExeDomain = SSEPackedSingle in defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode, FR32, f32mem>, @@ -242,26 +312,44 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231, // This is because src1 is tied to dest, and the scalar intrinsics // require the pass-through values to come from the first source // operand, not the second. 
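A rough model of the scalar-in-vector behavior being matched here, under the stated assumption that the untouched lanes are passed through from the first source operand (container type and helper name are illustrative):

#include <array>
#include <cassert>
#include <cmath>

using V4F = std::array<float, 4>;

// Only lane 0 is computed; lanes 1..3 come from the first source. That is why
// the form whose tied operand carries the pass-through value is the convenient
// one to select for the intrinsic.
V4F fmadd_ss(const V4F &A, const V4F &B, const V4F &C) {
  V4F R = A;                         // pass-through lanes from the first operand
  R[0] = std::fma(A[0], B[0], C[0]); // fused multiply-add on lane 0 only
  return R;
}

int main() {
  V4F A{1.0f, 10.0f, 20.0f, 30.0f}, B{2.0f, 0.0f, 0.0f, 0.0f},
      C{3.0f, 0.0f, 0.0f, 0.0f};
  V4F R = fmadd_ss(A, B, C);
  assert(R[0] == 5.0f && R[1] == 10.0f && R[2] == 20.0f && R[3] == 30.0f);
  return 0;
}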
- let Predicates = [HasFMA] in { - def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SSr_Int") - $src1, $src2, $src3), VR128)>; - - def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3), - (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SDr_Int") - $src1, $src2, $src3), VR128)>; + let Predicates = [HasFMA, NoAVX512] in { + def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)), + (!cast<Instruction>(NAME#"213SSr_Int") + VR128:$src1, VR128:$src2, VR128:$src3)>; + + def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)), + (!cast<Instruction>(NAME#"213SDr_Int") + VR128:$src1, VR128:$src2, VR128:$src3)>; + + def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2, + sse_load_f32:$src3)), + (!cast<Instruction>(NAME#"213SSm_Int") + VR128:$src1, VR128:$src2, sse_load_f32:$src3)>; + + def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2, + sse_load_f64:$src3)), + (!cast<Instruction>(NAME#"213SDm_Int") + VR128:$src1, VR128:$src2, sse_load_f64:$src3)>; + + def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, sse_load_f32:$src3, + VR128:$src2)), + (!cast<Instruction>(NAME#"132SSm_Int") + VR128:$src1, VR128:$src2, sse_load_f32:$src3)>; + + def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, sse_load_f64:$src3, + VR128:$src2)), + (!cast<Instruction>(NAME#"132SDm_Int") + VR128:$src1, VR128:$src2, sse_load_f64:$src3)>; } } -defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss, - int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG; -defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss, - int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG; +defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadds1, X86Fmadd>, VEX_LIG; +defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsubs1, X86Fmsub>, VEX_LIG; -defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss, - int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG; -defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss, - int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG; +defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadds1, X86Fnmadd>, + VEX_LIG; +defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub>, + VEX_LIG; //===----------------------------------------------------------------------===// @@ -273,60 +361,66 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop, ValueType OpVT, SDNode OpNode, PatFrag mem_frag> { let isCommutable = 1 in - def rr : FMA4<opc, MRMSrcRegOp4, (outs RC:$dst), + def rr : FMA4S<opc, MRMSrcRegOp4, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG; - def rm : FMA4<opc, MRMSrcMemOp4, (outs RC:$dst), + (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG, + Sched<[WriteFMA]>; + def rm : FMA4S<opc, MRMSrcMemOp4, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (OpNode RC:$src1, RC:$src2, - (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG; - def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst), + (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG, + Sched<[WriteFMALd, ReadAfterLd]>; + def mr : FMA4S<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, - (OpNode RC:$src1, (mem_frag addr:$src2), 
RC:$src3))]>, VEX_LIG; + (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG, + Sched<[WriteFMALd, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in - def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst), + def rr_REV : FMA4S<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - VEX_LIG, FoldGenData<NAME#rr>; + VEX_LIG, FoldGenData<NAME#rr>, Sched<[WriteFMA]>; } multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop, - ComplexPattern mem_cpat, Intrinsic Int> { + ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> { let isCodeGenOnly = 1 in { - def rr_Int : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst), + def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG; - def rm_Int : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst), + (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W, + VEX_LIG, Sched<[WriteFMA]>; + def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - [(set VR128:$dst, (Int VR128:$src1, VR128:$src2, - mem_cpat:$src3))]>, VEX_W, VEX_LIG; - def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst), + [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2, + mem_cpat:$src3)))]>, VEX_W, VEX_LIG, + Sched<[WriteFMALd, ReadAfterLd]>; + def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, memop:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG; + (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>, + VEX_LIG, Sched<[WriteFMALd, ReadAfterLd]>; let hasSideEffects = 0 in - def rr_Int_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), + def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, VEX_LIG, FoldGenData<NAME#rr_Int>; + []>, VEX_LIG, FoldGenData<NAME#rr_Int>, Sched<[WriteFMA]>; } // isCodeGenOnly = 1 } @@ -340,19 +434,21 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, - VEX_W; + VEX_W, Sched<[WriteFMA]>; def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, f128mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2, - (ld_frag128 addr:$src3)))]>, VEX_W; + (ld_frag128 addr:$src3)))]>, VEX_W, + Sched<[WriteFMALd, ReadAfterLd]>; def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>; + (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>, + Sched<[WriteFMALd, ReadAfterLd]>; let isCommutable = 1 in def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), @@ -360,50 
+456,52 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>, - VEX_W, VEX_L; + VEX_W, VEX_L, Sched<[WriteFMA]>; def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, f256mem:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2, - (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L; + (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L, + Sched<[WriteFMALd, ReadAfterLd]>; def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR256:$dst, (OpNode VR256:$src1, - (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L; + (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L, + Sched<[WriteFMALd, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in { def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - FoldGenData<NAME#rr>; + Sched<[WriteFMA]>, FoldGenData<NAME#rr>; def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, VR256:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, - VEX_L, FoldGenData<NAME#Yrr>; + VEX_L, Sched<[WriteFMA]>, FoldGenData<NAME#Yrr>; } // isCodeGenOnly = 1 } let ExeDomain = SSEPackedSingle in { // Scalar Instructions defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>, - fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32, - int_x86_fma_vfmadd_ss>; + fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32, + X86Fmadd4s>; defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>, - fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32, - int_x86_fma_vfmsub_ss>; + fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32, + X86Fmsub4s>; defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32, X86Fnmadd, loadf32>, - fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32, - int_x86_fma_vfnmadd_ss>; + fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32, + X86Fnmadd4s>; defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32, X86Fnmsub, loadf32>, - fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32, - int_x86_fma_vfnmsub_ss>; + fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32, + X86Fnmsub4s>; // Packed Instructions defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32, loadv4f32, loadv8f32>; @@ -422,19 +520,19 @@ let ExeDomain = SSEPackedSingle in { let ExeDomain = SSEPackedDouble in { // Scalar Instructions defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>, - fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfmadd_sd>; + fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64, + X86Fmadd4s>; defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>, - fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfmsub_sd>; + fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64, + X86Fmsub4s>; defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64, X86Fnmadd, loadf64>, - fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64, - int_x86_fma_vfnmadd_sd>; + fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64, + X86Fnmadd4s>; defm VFNMSUBSD4 : 
fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64, X86Fnmsub, loadf64>, - fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64, - int_x86_fma_vfnmsub_sd>; + fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64, + X86Fnmsub4s>; // Packed Instructions defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64, loadv2f64, loadv4f64>; diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td index 11b1d070ef2f..7e89a4111d86 100644 --- a/lib/Target/X86/X86InstrFPStack.td +++ b/lib/Target/X86/X86InstrFPStack.td @@ -57,24 +57,24 @@ def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore, // FPStack pattern fragments //===----------------------------------------------------------------------===// -def fpimm0 : PatLeaf<(fpimm), [{ - return N->isExactlyValue(+0.0); +def fpimm0 : FPImmLeaf<fAny, [{ + return Imm.isExactlyValue(+0.0); }]>; -def fpimmneg0 : PatLeaf<(fpimm), [{ - return N->isExactlyValue(-0.0); +def fpimmneg0 : FPImmLeaf<fAny, [{ + return Imm.isExactlyValue(-0.0); }]>; -def fpimm1 : PatLeaf<(fpimm), [{ - return N->isExactlyValue(+1.0); +def fpimm1 : FPImmLeaf<fAny, [{ + return Imm.isExactlyValue(+1.0); }]>; -def fpimmneg1 : PatLeaf<(fpimm), [{ - return N->isExactlyValue(-1.0); +def fpimmneg1 : FPImmLeaf<fAny, [{ + return Imm.isExactlyValue(-1.0); }]>; -// Some 'special' instructions -let usesCustomInserter = 1 in { // Expanded after instruction selection. +// Some 'special' instructions - expanded after instruction selection. +let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src), [(X86fp_to_i16mem RFP32:$src, addr:$dst)]>; def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src), @@ -118,10 +118,12 @@ let usesCustomInserter = 1 in { // Expanded after instruction selection. // f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1. // f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2. // f80 instructions cannot use SSE and use neither of these. -class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> : - FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>; -class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> : - FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>; +class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern, + InstrItinClass itin = NoItinerary> : + FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf32]>; +class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern, + InstrItinClass itin = NoItinerary> : + FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf64]>; // Factoring for arithmetic. multiclass FPBinary_rr<SDNode OpNode> { @@ -235,24 +237,29 @@ def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src), let Defs = [FPSW] in { // FPBinary_rr just defines pseudo-instructions, no need to set a scheduling // resources. +let hasNoSchedulingInfo = 1 in { defm ADD : FPBinary_rr<fadd>; defm SUB : FPBinary_rr<fsub>; defm MUL : FPBinary_rr<fmul>; defm DIV : FPBinary_rr<fdiv>; +} + // Sets the scheduling resources for the actual NAME#_F<size>m defintions. 
let SchedRW = [WriteFAddLd] in { defm ADD : FPBinary<fadd, MRM0m, "add">; defm SUB : FPBinary<fsub, MRM4m, "sub">; defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>; } + let SchedRW = [WriteFMulLd] in { defm MUL : FPBinary<fmul, MRM1m, "mul">; } + let SchedRW = [WriteFDivLd] in { defm DIV : FPBinary<fdiv, MRM6m, "div">; defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>; } -} +} // Defs = [FPSW] class FPST0rInst<Format fp, string asm> : FPI<0xD8, fp, (outs), (ins RST:$op), asm>; @@ -274,6 +281,8 @@ def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">; def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">; def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">; def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">; +def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">; +def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">; } // SchedRW let SchedRW = [WriteFMul] in { def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t$op">; @@ -289,84 +298,98 @@ def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">; def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">; } // SchedRW -def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">; -def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">; - // Unary operations. -multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> { +multiclass FPUnary<SDNode OpNode, Format fp, string asmstring, + InstrItinClass itin> { def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW, - [(set RFP32:$dst, (OpNode RFP32:$src))]>; + [(set RFP32:$dst, (OpNode RFP32:$src))], itin>; def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW, - [(set RFP64:$dst, (OpNode RFP64:$src))]>; + [(set RFP64:$dst, (OpNode RFP64:$src))], itin>; def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW, - [(set RFP80:$dst, (OpNode RFP80:$src))]>; -def _F : FPI<0xD9, fp, (outs), (ins), asmstring>; + [(set RFP80:$dst, (OpNode RFP80:$src))], itin>; +def _F : FPI<0xD9, fp, (outs), (ins), asmstring, itin>; } let Defs = [FPSW] in { -defm CHS : FPUnary<fneg, MRM_E0, "fchs">; -defm ABS : FPUnary<fabs, MRM_E1, "fabs">; -let SchedRW = [WriteFSqrt] in { -defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">; + +let SchedRW = [WriteVecLogic] in { +defm CHS : FPUnary<fneg, MRM_E0, "fchs", IIC_FSIGN>; +defm ABS : FPUnary<fabs, MRM_E1, "fabs", IIC_FSIGN>; +} + +let SchedRW = [WriteFSqrt] in +defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt", IIC_FSQRT>; + +let SchedRW = [WriteMicrocoded] in { +defm SIN : FPUnary<fsin, MRM_FE, "fsin", IIC_FSINCOS>; +defm COS : FPUnary<fcos, MRM_FF, "fcos", IIC_FSINCOS>; } -defm SIN : FPUnary<fsin, MRM_FE, "fsin">; -defm COS : FPUnary<fcos, MRM_FF, "fcos">; +let SchedRW = [WriteFAdd] in { let hasSideEffects = 0 in { def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>; def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>; def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>; -} -def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">; +} // hasSideEffects + +def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst", IIC_FCOMI>; +} // SchedRW } // Defs = [FPSW] // Versions of FP instructions that take a single memory operand. Added for the // disassembler; remove as they are included with patterns elsewhere. 
+let SchedRW = [WriteFAddLd] in { def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">; def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">; -def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">; -def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">; +def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">; +def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; + +def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; +def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">; def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">; +} // SchedRW -def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">; -def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">; +let SchedRW = [WriteMicrocoded] in { +def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">; +def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">; def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">; def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">; def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">; -def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">; -def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">; - def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">; def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">; +} // SchedRW // Floating point cmovs. -class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : - FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>; -class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> : - FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>; +class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern, + InstrItinClass itin> : + FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf32, HasCMov]>; +class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern, + InstrItinClass itin> : + FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf64, HasCMov]>; multiclass FPCMov<PatLeaf cc> { def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2), CondMovFP, [(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2, - cc, EFLAGS))]>; + cc, EFLAGS))], IIC_FCMOV>; def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2), CondMovFP, [(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2, - cc, EFLAGS))]>; + cc, EFLAGS))], IIC_FCMOV>; def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2), CondMovFP, [(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2, - cc, EFLAGS))]>, + cc, EFLAGS))], IIC_FCMOV>, Requires<[HasCMov]>; } let Defs = [FPSW] in { +let SchedRW = [WriteFAdd] in { let Uses = [EFLAGS], Constraints = "$src1 = $dst" in { defm CMOVB : FPCMov<X86_COND_B>; defm CMOVBE : FPCMov<X86_COND_BE>; @@ -381,24 +404,26 @@ defm CMOVNP : FPCMov<X86_COND_NP>; let Predicates = [HasCMov] in { // These are not factored because there's no clean way to pass DA/DB. 
def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op), - "fcmovb\t{$op, %st(0)|st(0), $op}">; + "fcmovb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op), - "fcmovbe\t{$op, %st(0)|st(0), $op}">; + "fcmovbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op), - "fcmove\t{$op, %st(0)|st(0), $op}">; + "fcmove\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op), - "fcmovu\t{$op, %st(0)|st(0), $op}">; + "fcmovu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op), - "fcmovnb\t{$op, %st(0)|st(0), $op}">; + "fcmovnb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op), - "fcmovnbe\t{$op, %st(0)|st(0), $op}">; + "fcmovnbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op), - "fcmovne\t{$op, %st(0)|st(0), $op}">; + "fcmovne\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op), - "fcmovnu\t{$op, %st(0)|st(0), $op}">; + "fcmovnu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>; } // Predicates = [HasCMov] +} // SchedRW // Floating point loads & stores. +let SchedRW = [WriteLoad] in { let canFoldAsLoad = 1 in { def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP32:$dst, (loadf32 addr:$src))]>; @@ -407,7 +432,7 @@ let isReMaterializable = 1 in [(set RFP64:$dst, (loadf64 addr:$src))]>; def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP, [(set RFP80:$dst, (loadf80 addr:$src))]>; -} +} // canFoldAsLoad def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP, [(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>; def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP, @@ -432,7 +457,9 @@ def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP, [(set RFP80:$dst, (X86fild addr:$src, i32))]>; def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP, [(set RFP80:$dst, (X86fild addr:$src, i64))]>; +} // SchedRW +let SchedRW = [WriteStore] in { def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP, [(store RFP32:$src, addr:$op)]>; def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, @@ -451,9 +478,11 @@ def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>; def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>; def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>; def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>; -} +} // mayStore + def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP, [(store RFP80:$src, addr:$op)]>; + let mayStore = 1, hasSideEffects = 0 in { def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>; def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>; @@ -464,7 +493,8 @@ def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>; def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>; def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>; def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>; -} +} // mayStore +} // SchedRW let mayLoad = 1, SchedRW = [WriteLoad] in { def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src", @@ -504,7 +534,7 @@ def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst", } // FISTTP requires 
SSE3 even though it's a FPStack op. -let Predicates = [HasSSE3] in { +let Predicates = [HasSSE3], SchedRW = [WriteStore] in { def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, [(X86fp_to_i16mem RFP32:$src, addr:$op)]>; def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, @@ -543,7 +573,7 @@ def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>; } // Floating point constant loads. -let isReMaterializable = 1 in { +let isReMaterializable = 1, SchedRW = [WriteZero] in { def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, [(set RFP32:$dst, fpimm0)]>; def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP, @@ -615,19 +645,19 @@ let Defs = [AX], Uses = [FPSW] in def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags (outs), (ins), "fnstsw\t{%ax|ax}", [(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>; - +let Defs = [FPSW] in def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world (outs), (ins i16mem:$dst), "fnstcw\t$dst", [(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>; } // SchedRW -let mayLoad = 1 in +let Defs = [FPSW], mayLoad = 1 in def FLDCW16m : I<0xD9, MRM5m, // X87 control world = [mem16] (outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>, Sched<[WriteLoad]>; // FPU control instructions let SchedRW = [WriteMicrocoded] in { -let Defs = [FPSW] in +let Defs = [FPSW] in { def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>; def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg), "ffree\t$reg", IIC_FFREE>; @@ -635,16 +665,16 @@ def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg), "ffreep\t$reg", IIC_FFREE>; // Clear exceptions - -let Defs = [FPSW] in def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>; +} // Defs = [FPSW] } // SchedRW // Operandless floating-point instructions for the disassembler. 
let SchedRW = [WriteMicrocoded] in { -def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>; - def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>; + +let Defs = [FPSW] in { +def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>; def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>; def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>; def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>; @@ -665,20 +695,20 @@ def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>; def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>; def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>; def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>; +} // Defs = [FPSW] -let Predicates = [HasFXSR] in { - def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB; - def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), - "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], - IIC_FXSAVE>, TB, Requires<[In64BitMode]>; - def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, - TB; - def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), - "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], - IIC_FXRSTOR>, TB, Requires<[In64BitMode]>; -} // Predicates = [FeatureFXSR] +def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB, + Requires<[HasFXSR]>; +def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst), + "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)], + IIC_FXSAVE>, TB, Requires<[HasFXSR, In64BitMode]>; +def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, + TB, Requires<[HasFXSR]>; +def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src), + "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)], + IIC_FXRSTOR>, TB, Requires<[HasFXSR, In64BitMode]>; } // SchedRW //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td index bfcbf71d252f..2a6ed02fadab 100644 --- a/lib/Target/X86/X86InstrFormats.td +++ b/lib/Target/X86/X86InstrFormats.td @@ -157,9 +157,10 @@ def EncEVEX : Encoding<3>; class OperandSize<bits<2> val> { bits<2> Value = val; } -def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix. -def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode. -def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode. +def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix. +def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode. +def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode. +def OpSizeIgnore : OperandSize<3>; // Takes 0x66 prefix, never emits. // Address size for encodings that change based on mode. class AddressSize<bits<2> val> { @@ -174,6 +175,7 @@ def AdSize64 : AddressSize<3>; // Encodes a 64-bit address. // emitter that various prefix bytes are required. 
class OpSize16 { OperandSize OpSize = OpSize16; } class OpSize32 { OperandSize OpSize = OpSize32; } +class OpSizeIgnore { OperandSize OpSize = OpSizeIgnore; } class AdSize16 { AddressSize AdSize = AdSize16; } class AdSize32 { AddressSize AdSize = AdSize32; } class AdSize64 { AddressSize AdSize = AdSize64; } @@ -231,6 +233,9 @@ class FoldGenData<string _RegisterForm> { string FoldGenRegForm = _RegisterForm; } +// Mark the instruction as "illegal to memory fold/unfold" +class NotMemoryFoldable { bit isMemoryFoldable = 0; } + class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, string AsmStr, InstrItinClass itin, @@ -314,6 +319,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins, // instruction to replace the current one in case it got picked during generation. string FoldGenRegForm = ?; + bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction? + // TSFlags layout should be kept in sync with X86BaseInfo.h. let TSFlags{6-0} = FormBits; let TSFlags{8-7} = OpSizeBits; @@ -822,7 +829,7 @@ class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm, class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>; -class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, +class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, T8PD, EVEX_4V, Requires<[HasAVX512]>; @@ -839,34 +846,44 @@ class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm, class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = IIC_AES> : I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD, - Requires<[HasAES]>; + Requires<[NoAVX, HasAES]>; class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pattern, InstrItinClass itin = NoItinerary> : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, - Requires<[HasAES]>; + Requires<[NoAVX, HasAES]>; // PCLMUL Instruction Templates class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = NoItinerary> - : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, - Requires<[HasPCLMUL]>; - -class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm, - list<dag>pattern, InstrItinClass itin = NoItinerary> - : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD, - VEX_4V, Requires<[HasAVX, HasPCLMUL]>; + : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD; // FMA3 Instruction Templates class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, InstrItinClass itin = NoItinerary> : I<o, F, outs, ins, asm, pattern, itin>, T8PD, - VEX_4V, FMASC, Requires<[HasFMA, NoVLX]>; + VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>; +class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8PD, + VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>; +class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : I<o, F, outs, ins, asm, pattern, itin>, T8PD, + VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>; // FMA4 Instruction Templates class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag>pattern, 
InstrItinClass itin = NoItinerary> : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD, + VEX_4V, FMASC, Requires<[HasFMA4, NoVLX]>; +class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD, + VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>; +class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm, + list<dag>pattern, InstrItinClass itin = NoItinerary> + : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD, VEX_4V, FMASC, Requires<[HasFMA4]>; // XOP 2, 3 and 4 Operand Instruction Template diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 8b5bbf24f6f6..ebbef00c01d9 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -56,8 +56,6 @@ def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp, def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>; def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>; def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>; -def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS", SDTFPBinOp>; -def X86frcp14s : SDNode<"X86ISD::FRCPS", SDTFPBinOp>; def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; @@ -146,8 +144,11 @@ def X86fpextRnd : SDNode<"X86ISD::VFPEXTS_RND", SDTCisSameSizeAs<0, 2>, SDTCisVT<3, i32>]>>; -def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>; -def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>; +def X86vshiftimm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVT<2, i8>, SDTCisInt<0>]>; + +def X86vshldq : SDNode<"X86ISD::VSHLDQ", X86vshiftimm>; +def X86vshrdq : SDNode<"X86ISD::VSRLDQ", X86vshiftimm>; def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>; def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>; def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>; @@ -164,15 +165,16 @@ def X86CmpMaskCC : SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>; def X86CmpMaskCCRound : SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>, - SDTCisVec<1>, SDTCisSameAs<2, 1>, + SDTCisVec<1>, SDTCisFP<1>, SDTCisSameAs<2, 1>, SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; def X86CmpMaskCCScalar : - SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>; + SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>]>; def X86CmpMaskCCScalarRound : - SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>, - SDTCisVT<4, i32>]>; + SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>, + SDTCisVT<3, i8>, SDTCisVT<4, i32>]>; def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>; def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>; @@ -180,23 +182,25 @@ def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>; def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>; def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>; -def X86vshl : SDNode<"X86ISD::VSHL", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisVec<2>]>>; -def X86vsrl : SDNode<"X86ISD::VSRL", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisVec<2>]>>; -def X86vsra : SDNode<"X86ISD::VSRA", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisVec<2>]>>; +def X86phminpos: SDNode<"X86ISD::PHMINPOS", + SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>; + +def X86vshiftuniform : SDTypeProfile<1, 2, 
[SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisVec<2>, SDTCisInt<0>, + SDTCisInt<1>]>; + +def X86vshl : SDNode<"X86ISD::VSHL", X86vshiftuniform>; +def X86vsrl : SDNode<"X86ISD::VSRL", X86vshiftuniform>; +def X86vsra : SDNode<"X86ISD::VSRA", X86vshiftuniform>; -def X86vsrav : SDNode<"X86ISD::VSRAV" , - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; +def X86vshiftvariable : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, SDTCisInt<0>]>; -def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>; -def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>; -def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>; +def X86vsrav : SDNode<"X86ISD::VSRAV", X86vshiftvariable>; + +def X86vshli : SDNode<"X86ISD::VSHLI", X86vshiftimm>; +def X86vsrli : SDNode<"X86ISD::VSRLI", X86vshiftimm>; +def X86vsrai : SDNode<"X86ISD::VSRAI", X86vshiftimm>; def X86kshiftl : SDNode<"X86ISD::KSHIFTL", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>, @@ -207,31 +211,20 @@ def X86kshiftr : SDNode<"X86ISD::KSHIFTR", SDTCisSameAs<0, 1>, SDTCisVT<2, i8>]>>; -def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>; -def X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>; - -def X86vprot : SDNode<"X86ISD::VPROT", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; -def X86vproti : SDNode<"X86ISD::VPROTI", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisVT<2, i8>]>>; +def X86vrotli : SDNode<"X86ISD::VROTLI", X86vshiftimm>; +def X86vrotri : SDNode<"X86ISD::VROTRI", X86vshiftimm>; -def X86vpshl : SDNode<"X86ISD::VPSHL", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; -def X86vpsha : SDNode<"X86ISD::VPSHA", - SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; +def X86vpshl : SDNode<"X86ISD::VPSHL", X86vshiftvariable>; +def X86vpsha : SDNode<"X86ISD::VPSHA", X86vshiftvariable>; def X86vpcom : SDNode<"X86ISD::VPCOM", SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, - SDTCisVT<3, i8>]>>; + SDTCisVT<3, i8>, SDTCisInt<0>]>>; def X86vpcomu : SDNode<"X86ISD::VPCOMU", SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, - SDTCisVT<3, i8>]>>; + SDTCisVT<3, i8>, SDTCisInt<0>]>>; def X86vpermil2 : SDNode<"X86ISD::VPERMIL2", SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, @@ -267,12 +260,6 @@ def X86testnm : SDNode<"X86ISD::TESTNM", SDTX86Testm, [SDNPCommutative]>; def X86movmsk : SDNode<"X86ISD::MOVMSK", SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>; -def X86select : SDNode<"X86ISD::SELECT", - SDTypeProfile<1, 3, [SDTCVecEltisVT<1, i1>, - SDTCisSameAs<0, 2>, - SDTCisSameAs<2, 3>, - SDTCisSameNumEltsAs<0, 1>]>>; - def X86selects : SDNode<"X86ISD::SELECTS", SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>, SDTCisSameAs<0, 2>, @@ -314,6 +301,10 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>; def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>; +def SDTFPBinOpImm: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i32>]>; def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>, @@ -326,6 +317,9 @@ def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>, SDTCisSameNumEltsAs<0, 3>, SDTCisVT<4, i32>, SDTCisVT<5, i32>]>; +def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>, + SDTCisSameAs<0,1>, 
+ SDTCisVT<2, i32>]>; def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisVT<2, i32>, @@ -352,9 +346,26 @@ def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>, SDTCisFP<0>, SDTCisVT<4, i32>]>; -def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>; +def X86PAlignr : SDNode<"X86ISD::PALIGNR", + SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i8>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisVT<3, i8>]>>; def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>; +def X86VShld : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>; +def X86VShrd : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>; +def X86VShldv : SDNode<"X86ISD::VSHLDV", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>>; +def X86VShrdv : SDNode<"X86ISD::VSHRDV", + SDTypeProfile<1, 3, [SDTCisVec<0>, + SDTCisSameAs<0,1>, + SDTCisSameAs<0,2>, + SDTCisSameAs<0,3>]>>; + def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>; def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>; @@ -431,10 +442,14 @@ def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>; def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImmRound>; def X86VFixupimmScalar : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImmRound>; -def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>; -def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>; -def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>; -def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>; +def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImm>; +def X86VRangeRnd : SDNode<"X86ISD::VRANGE_RND", SDTFPBinOpImmRound>; +def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImm>; +def X86VReduceRnd : SDNode<"X86ISD::VREDUCE_RND", SDTFPUnaryOpImmRound>; +def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImm>; +def X86VRndScaleRnd: SDNode<"X86ISD::VRNDSCALE_RND", SDTFPUnaryOpImmRound>; +def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImm>; +def X86VGetMantRnd : SDNode<"X86ISD::VGETMANT_RND", SDTFPUnaryOpImmRound>; def X86Vfpclass : SDNode<"X86ISD::VFPCLASS", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>, SDTCisFP<1>, @@ -450,9 +465,10 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST", def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>; def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>; -def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2, - [SDTCisVec<1>, - SDTCisPtrTy<2>]>, []>; +def X86kextract : SDNode<"ISD::EXTRACT_VECTOR_ELT", + SDTypeProfile<1, 2, [SDTCisVT<0, i32>, + SDTCVecEltisVT<1, i1>, + SDTCisPtrTy<2>]>>; def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>; @@ -477,19 +493,31 @@ def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>; def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>; def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>; -def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFPTernaryOp>; -def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp>; -def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp>; -def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp>; -def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp>; -def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp>; +def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>; +def 
X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp, [SDNPCommutative]>; + +def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>; + +// Scalar FMA4 intrinsics which zero the non-scalar bits. +def X86Fmadd4s : SDNode<"X86ISD::FMADD4S", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fnmadd4s : SDNode<"X86ISD::FNMADD4S", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fmsub4s : SDNode<"X86ISD::FMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fnmsub4s : SDNode<"X86ISD::FNMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>; -def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>; -def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>; -def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound>; -def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>; -def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>; -def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>; +// Scalar FMA intrinsics with passthru bits in operand 1. +def X86Fmadds1 : SDNode<"X86ISD::FMADDS1", SDTFPTernaryOp>; +def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>; +def X86Fmsubs1 : SDNode<"X86ISD::FMSUBS1", SDTFPTernaryOp>; +def X86Fnmsubs1 : SDNode<"X86ISD::FNMSUBS1", SDTFPTernaryOp>; // Scalar FMA intrinsics with passthru bits in operand 1. def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>; @@ -497,26 +525,49 @@ def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>; def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>; def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>; +def X86Fmadds3 : SDNode<"X86ISD::FMADDS3", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fnmadds3 : SDNode<"X86ISD::FNMADDS3", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fmsubs3 : SDNode<"X86ISD::FMSUBS3", SDTFPTernaryOp, [SDNPCommutative]>; +def X86Fnmsubs3 : SDNode<"X86ISD::FNMSUBS3", SDTFPTernaryOp, [SDNPCommutative]>; + // Scalar FMA intrinsics with passthru bits in operand 3. 
-def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound>; -def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>; -def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound>; -def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>; +def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound, [SDNPCommutative]>; +def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound, [SDNPCommutative]>; def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>, SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; -def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma>; -def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma>; +def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>; +def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>; + +def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>; +def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>; + +// VNNI +def SDTVnni : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>, + SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>; +def X86Vpdpbusd : SDNode<"X86ISD::VPDPBUSD", SDTVnni>; +def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>; +def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>; +def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>; def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>; def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>; def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>; +def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>; +def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>; def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>; def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>; -def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>; -def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImmRound>; -def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImmRound>; +def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>; +def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>; +def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>; +def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImm>; +def X86RangesRnd : SDNode<"X86ISD::VRANGES_RND", SDTFPBinOpImmRound>; +def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>; +def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>; +def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>; def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>, @@ -534,6 +585,13 @@ def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1, def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>; +// vpshufbitqmb +def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<1,2>, + SDTCVecEltisVT<0,i1>, + SDTCisSameNumEltsAs<0,1>]>>; + def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>, SDTCisSameAs<0,1>, SDTCisInt<2>, SDTCisVT<3, i32>]>; @@ -588,7 +646,12 @@ def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>; def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>; def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>; + def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", + SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>, + SDTCVecEltisVT<1, 
i16>]> >; + +def X86cvtph2psRnd : SDNode<"X86ISD::CVTPH2PS_RND", SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>, SDTCVecEltisVT<1, i16>, SDTCisVT<2, i32>]> >; @@ -610,6 +673,11 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND", def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>; +// galois field arithmetic +def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>; +def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>; +def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>; + //===----------------------------------------------------------------------===// // SSE Complex Patterns //===----------------------------------------------------------------------===// @@ -643,109 +711,82 @@ def sdmem : Operand<v2f64> { // Vector load wrappers to prevent folding of non-temporal aligned loads on // supporting targets. -def vec128load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return !Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal() || - cast<LoadSDNode>(N)->getAlignment() < 16; -}]>; -def vec256load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return !Subtarget->hasAVX2() || !cast<LoadSDNode>(N)->isNonTemporal() || - cast<LoadSDNode>(N)->getAlignment() < 32; -}]>; -def vec512load : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return !Subtarget->hasAVX512() || !cast<LoadSDNode>(N)->isNonTemporal() || - cast<LoadSDNode>(N)->getAlignment() < 64; +def vecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + return !useNonTemporalLoad(cast<LoadSDNode>(N)); }]>; // 128-bit load pattern fragments // NOTE: all 128-bit integer vector loads are promoted to v2i64 -def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vec128load node:$ptr))>; -def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vec128load node:$ptr))>; -def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vec128load node:$ptr))>; +def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vecload node:$ptr))>; +def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vecload node:$ptr))>; +def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vecload node:$ptr))>; // 256-bit load pattern fragments // NOTE: all 256-bit integer vector loads are promoted to v4i64 -def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vec256load node:$ptr))>; -def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vec256load node:$ptr))>; -def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vec256load node:$ptr))>; +def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vecload node:$ptr))>; +def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vecload node:$ptr))>; +def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vecload node:$ptr))>; // 512-bit load pattern fragments -def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vec512load node:$ptr))>; -def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vec512load node:$ptr))>; -def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vec512load node:$ptr))>; +def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vecload node:$ptr))>; +def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vecload node:$ptr))>; +def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vecload node:$ptr))>; // 128-/256-/512-bit extload pattern fragments def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>; def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>; def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>; -// Like 'store', but always requires 128-bit vector alignment. +// Like 'store', but always requires vector size alignment. 
def alignedstore : PatFrag<(ops node:$val, node:$ptr), (store node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAlignment() >= 16; -}]>; - -// Like 'store', but always requires 256-bit vector alignment. -def alignedstore256 : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAlignment() >= 32; -}]>; - -// Like 'store', but always requires 512-bit vector alignment. -def alignedstore512 : PatFrag<(ops node:$val, node:$ptr), - (store node:$val, node:$ptr), [{ - return cast<StoreSDNode>(N)->getAlignment() >= 64; + auto *St = cast<StoreSDNode>(N); + return St->getAlignment() >= St->getMemoryVT().getStoreSize(); }]>; // Like 'load', but always requires 128-bit vector alignment. -def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return cast<LoadSDNode>(N)->getAlignment() >= 16; -}]>; - -// Like 'load', but always requires 256-bit vector alignment. -def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return cast<LoadSDNode>(N)->getAlignment() >= 32; -}]>; - -// Like 'load', but always requires 512-bit vector alignment. -def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return cast<LoadSDNode>(N)->getAlignment() >= 64; +def alignedvecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{ + auto *Ld = cast<LoadSDNode>(N); + return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize() && + !useNonTemporalLoad(cast<LoadSDNode>(N)); }]>; // 128-bit aligned load pattern fragments // NOTE: all 128-bit integer vector loads are promoted to v2i64 def alignedloadv4f32 : PatFrag<(ops node:$ptr), - (v4f32 (alignedload node:$ptr))>; + (v4f32 (alignedvecload node:$ptr))>; def alignedloadv2f64 : PatFrag<(ops node:$ptr), - (v2f64 (alignedload node:$ptr))>; + (v2f64 (alignedvecload node:$ptr))>; def alignedloadv2i64 : PatFrag<(ops node:$ptr), - (v2i64 (alignedload node:$ptr))>; + (v2i64 (alignedvecload node:$ptr))>; // 256-bit aligned load pattern fragments // NOTE: all 256-bit integer vector loads are promoted to v4i64 def alignedloadv8f32 : PatFrag<(ops node:$ptr), - (v8f32 (alignedload256 node:$ptr))>; + (v8f32 (alignedvecload node:$ptr))>; def alignedloadv4f64 : PatFrag<(ops node:$ptr), - (v4f64 (alignedload256 node:$ptr))>; + (v4f64 (alignedvecload node:$ptr))>; def alignedloadv4i64 : PatFrag<(ops node:$ptr), - (v4i64 (alignedload256 node:$ptr))>; + (v4i64 (alignedvecload node:$ptr))>; // 512-bit aligned load pattern fragments def alignedloadv16f32 : PatFrag<(ops node:$ptr), - (v16f32 (alignedload512 node:$ptr))>; + (v16f32 (alignedvecload node:$ptr))>; def alignedloadv8f64 : PatFrag<(ops node:$ptr), - (v8f64 (alignedload512 node:$ptr))>; + (v8f64 (alignedvecload node:$ptr))>; def alignedloadv8i64 : PatFrag<(ops node:$ptr), - (v8i64 (alignedload512 node:$ptr))>; + (v8i64 (alignedvecload node:$ptr))>; -// Like 'vec128load', but uses special alignment checks suitable for use in +// Like 'vecload', but uses special alignment checks suitable for use in // memory operands in most SSE instructions, which are required to // be naturally aligned on some targets but not on others. If the subtarget // allows unaligned accesses, match any load, though this may require // setting a feature bit in the processor (on startup, for example). // Opteron 10h and later implement such a feature. 
-def memop : PatFrag<(ops node:$ptr), (vec128load node:$ptr), [{ +def memop : PatFrag<(ops node:$ptr), (vecload node:$ptr), [{ + auto *Ld = cast<LoadSDNode>(N); return Subtarget->hasSSEUnalignedMem() || - cast<LoadSDNode>(N)->getAlignment() >= 16; + Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize(); }]>; // 128-bit memop pattern fragments @@ -754,117 +795,87 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>; def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>; def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>; -// SSSE3 uses MMX registers for some instructions. They aren't aligned on a -// 16-byte boundary. -// FIXME: 8 byte alignment for mmx reads is not required -def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{ - return cast<LoadSDNode>(N)->getAlignment() >= 8; -}]>; - -def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>; - -def X86masked_gather : SDNode<"X86ISD::MGATHER", SDTMaskedGather, - [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; +def X86masked_gather : SDNode<"X86ISD::MGATHER", + SDTypeProfile<2, 3, [SDTCisVec<0>, + SDTCisVec<1>, SDTCisInt<1>, + SDTCisSameAs<0, 2>, + SDTCisSameAs<1, 3>, + SDTCisPtrTy<4>]>, + [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>; + +def X86masked_scatter : SDNode<"X86ISD::MSCATTER", + SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>, + SDTCisSameAs<0, 2>, + SDTCVecEltisVT<0, i1>, + SDTCisPtrTy<3>]>, + [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>; def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - return (Mgt->getIndex().getValueType() == MVT::v4i32 || - Mgt->getBasePtr().getValueType() == MVT::v4i32); - return false; + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v4i32; }]>; def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - return (Mgt->getIndex().getValueType() == MVT::v8i32 || - Mgt->getBasePtr().getValueType() == MVT::v8i32); - return false; + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v8i32; }]>; def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - return (Mgt->getIndex().getValueType() == MVT::v2i64 || - Mgt->getBasePtr().getValueType() == MVT::v2i64); - return false; -}]>; -def X86mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ - if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N)) - return (Mgt->getIndex().getValueType() == MVT::v2i64 || - Mgt->getBasePtr().getValueType() == MVT::v2i64) && - (Mgt->getMemoryVT() == MVT::v2i32 || - Mgt->getMemoryVT() == MVT::v2f32); - return false; + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v2i64; }]>; def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - return 
(Mgt->getIndex().getValueType() == MVT::v4i64 || - Mgt->getBasePtr().getValueType() == MVT::v4i64); - return false; + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v4i64; }]>; def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - return (Mgt->getIndex().getValueType() == MVT::v8i64 || - Mgt->getBasePtr().getValueType() == MVT::v8i64); - return false; + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v8i64; }]>; def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_gather node:$src1, node:$src2, node:$src3) , [{ - if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N)) - return (Mgt->getIndex().getValueType() == MVT::v16i32 || - Mgt->getBasePtr().getValueType() == MVT::v16i32); - return false; + (X86masked_gather node:$src1, node:$src2, node:$src3) , [{ + X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N); + return Mgt->getIndex().getValueType() == MVT::v16i32; }]>; def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - return (Sc->getIndex().getValueType() == MVT::v2i64 || - Sc->getBasePtr().getValueType() == MVT::v2i64); - return false; + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v2i64; }]>; def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - return (Sc->getIndex().getValueType() == MVT::v4i32 || - Sc->getBasePtr().getValueType() == MVT::v4i32); - return false; + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v4i32; }]>; def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - return (Sc->getIndex().getValueType() == MVT::v4i64 || - Sc->getBasePtr().getValueType() == MVT::v4i64); - return false; + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v4i64; }]>; def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - return (Sc->getIndex().getValueType() == MVT::v8i32 || - Sc->getBasePtr().getValueType() == MVT::v8i32); - return false; + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v8i32; }]>; def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - return (Sc->getIndex().getValueType() 
== MVT::v8i64 || - Sc->getBasePtr().getValueType() == MVT::v8i64); - return false; + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v8i64; }]>; def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3), - (masked_scatter node:$src1, node:$src2, node:$src3) , [{ - if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N)) - return (Sc->getIndex().getValueType() == MVT::v16i32 || - Sc->getBasePtr().getValueType() == MVT::v16i32); - return false; + (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{ + X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N); + return Sc->getIndex().getValueType() == MVT::v16i32; }]>; // 128-bit bitconvert pattern fragments @@ -927,53 +938,48 @@ def BYTE_imm : SDNodeXForm<imm, [{ // EXTRACT_get_vextract128_imm xform function: convert extract_subvector index // to VEXTRACTF128/VEXTRACTI128 imm. def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{ - return getI8Imm(X86::getExtractVEXTRACT128Immediate(N), SDLoc(N)); + return getExtractVEXTRACTImmediate(N, 128, SDLoc(N)); }]>; // INSERT_get_vinsert128_imm xform function: convert insert_subvector index to // VINSERTF128/VINSERTI128 imm. def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{ - return getI8Imm(X86::getInsertVINSERT128Immediate(N), SDLoc(N)); + return getInsertVINSERTImmediate(N, 128, SDLoc(N)); }]>; // EXTRACT_get_vextract256_imm xform function: convert extract_subvector index // to VEXTRACTF64x4 imm. def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{ - return getI8Imm(X86::getExtractVEXTRACT256Immediate(N), SDLoc(N)); + return getExtractVEXTRACTImmediate(N, 256, SDLoc(N)); }]>; // INSERT_get_vinsert256_imm xform function: convert insert_subvector index to // VINSERTF64x4 imm. 
def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{ - return getI8Imm(X86::getInsertVINSERT256Immediate(N), SDLoc(N)); + return getInsertVINSERTImmediate(N, 256, SDLoc(N)); }]>; def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index), (extract_subvector node:$bigvec, - node:$index), [{ - return X86::isVEXTRACT128Index(N); -}], EXTRACT_get_vextract128_imm>; + node:$index), [{}], + EXTRACT_get_vextract128_imm>; def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), (insert_subvector node:$bigvec, node:$smallvec, - node:$index), [{ - return X86::isVINSERT128Index(N); -}], INSERT_get_vinsert128_imm>; - + node:$index), [{}], + INSERT_get_vinsert128_imm>; def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index), (extract_subvector node:$bigvec, - node:$index), [{ - return X86::isVEXTRACT256Index(N); -}], EXTRACT_get_vextract256_imm>; + node:$index), [{}], + EXTRACT_get_vextract256_imm>; def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec, node:$index), (insert_subvector node:$bigvec, node:$smallvec, - node:$index), [{ - return X86::isVINSERT256Index(N); -}], INSERT_get_vinsert256_imm>; + node:$index), [{}], + INSERT_get_vinsert256_imm>; def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3), (masked_load node:$src1, node:$src2, node:$src3), [{ @@ -1136,8 +1142,3 @@ def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3) (X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{ return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32; }]>; - -def assertzext_i1 : - PatFrag<(ops node:$src), (assertzext node:$src), [{ - return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1; -}]>; diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 34d4816a2518..7ca1c58184f6 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -47,8 +47,9 @@ using namespace llvm; #include "X86GenInstrInfo.inc" static cl::opt<bool> -NoFusing("disable-spill-fusing", - cl::desc("Disable fusing of spill code into instructions")); + NoFusing("disable-spill-fusing", + cl::desc("Disable fusing of spill code into instructions"), + cl::Hidden); static cl::opt<bool> PrintFailedFusing("print-failed-fuse-candidates", cl::desc("Print instructions that the allocator wants to" @@ -122,12 +123,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) Subtarget(STI), RI(STI.getTargetTriple()) { static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = { + { X86::ADC16ri, X86::ADC16mi, 0 }, + { X86::ADC16ri8, X86::ADC16mi8, 0 }, + { X86::ADC16rr, X86::ADC16mr, 0 }, { X86::ADC32ri, X86::ADC32mi, 0 }, { X86::ADC32ri8, X86::ADC32mi8, 0 }, { X86::ADC32rr, X86::ADC32mr, 0 }, { X86::ADC64ri32, X86::ADC64mi32, 0 }, { X86::ADC64ri8, X86::ADC64mi8, 0 }, { X86::ADC64rr, X86::ADC64mr, 0 }, + { X86::ADC8ri, X86::ADC8mi, 0 }, + { X86::ADC8ri8, X86::ADC8mi8, 0 }, + { X86::ADC8rr, X86::ADC8mr, 0 }, { X86::ADD16ri, X86::ADD16mi, 0 }, { X86::ADD16ri8, X86::ADD16mi8, 0 }, { X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE }, @@ -147,6 +154,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::ADD64rr, X86::ADD64mr, 0 }, { X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE }, { X86::ADD8ri, X86::ADD8mi, 0 }, + { X86::ADD8ri8, X86::ADD8mi8, 0 }, { X86::ADD8rr, X86::ADD8mr, 0 }, { X86::AND16ri, X86::AND16mi, 0 }, { X86::AND16ri8, X86::AND16mi8, 0 }, @@ -158,7 +166,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::AND64ri8, X86::AND64mi8, 0 }, { X86::AND64rr, 
X86::AND64mr, 0 }, { X86::AND8ri, X86::AND8mi, 0 }, + { X86::AND8ri8, X86::AND8mi8, 0 }, { X86::AND8rr, X86::AND8mr, 0 }, + { X86::BTC16ri8, X86::BTC16mi8, 0 }, + { X86::BTC32ri8, X86::BTC32mi8, 0 }, + { X86::BTC64ri8, X86::BTC64mi8, 0 }, + { X86::BTR16ri8, X86::BTR16mi8, 0 }, + { X86::BTR32ri8, X86::BTR32mi8, 0 }, + { X86::BTR64ri8, X86::BTR64mi8, 0 }, + { X86::BTS16ri8, X86::BTS16mi8, 0 }, + { X86::BTS32ri8, X86::BTS32mi8, 0 }, + { X86::BTS64ri8, X86::BTS64mi8, 0 }, { X86::DEC16r, X86::DEC16m, 0 }, { X86::DEC32r, X86::DEC32m, 0 }, { X86::DEC64r, X86::DEC64m, 0 }, @@ -185,7 +203,32 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::OR64ri8, X86::OR64mi8, 0 }, { X86::OR64rr, X86::OR64mr, 0 }, { X86::OR8ri, X86::OR8mi, 0 }, + { X86::OR8ri8, X86::OR8mi8, 0 }, { X86::OR8rr, X86::OR8mr, 0 }, + { X86::RCL16r1, X86::RCL16m1, 0 }, + { X86::RCL16rCL, X86::RCL16mCL, 0 }, + { X86::RCL16ri, X86::RCL16mi, 0 }, + { X86::RCL32r1, X86::RCL32m1, 0 }, + { X86::RCL32rCL, X86::RCL32mCL, 0 }, + { X86::RCL32ri, X86::RCL32mi, 0 }, + { X86::RCL64r1, X86::RCL64m1, 0 }, + { X86::RCL64rCL, X86::RCL64mCL, 0 }, + { X86::RCL64ri, X86::RCL64mi, 0 }, + { X86::RCL8r1, X86::RCL8m1, 0 }, + { X86::RCL8rCL, X86::RCL8mCL, 0 }, + { X86::RCL8ri, X86::RCL8mi, 0 }, + { X86::RCR16r1, X86::RCR16m1, 0 }, + { X86::RCR16rCL, X86::RCR16mCL, 0 }, + { X86::RCR16ri, X86::RCR16mi, 0 }, + { X86::RCR32r1, X86::RCR32m1, 0 }, + { X86::RCR32rCL, X86::RCR32mCL, 0 }, + { X86::RCR32ri, X86::RCR32mi, 0 }, + { X86::RCR64r1, X86::RCR64m1, 0 }, + { X86::RCR64rCL, X86::RCR64mCL, 0 }, + { X86::RCR64ri, X86::RCR64mi, 0 }, + { X86::RCR8r1, X86::RCR8m1, 0 }, + { X86::RCR8rCL, X86::RCR8mCL, 0 }, + { X86::RCR8ri, X86::RCR8mi, 0 }, { X86::ROL16r1, X86::ROL16m1, 0 }, { X86::ROL16rCL, X86::ROL16mCL, 0 }, { X86::ROL16ri, X86::ROL16mi, 0 }, @@ -222,12 +265,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SAR8r1, X86::SAR8m1, 0 }, { X86::SAR8rCL, X86::SAR8mCL, 0 }, { X86::SAR8ri, X86::SAR8mi, 0 }, + { X86::SBB16ri, X86::SBB16mi, 0 }, + { X86::SBB16ri8, X86::SBB16mi8, 0 }, + { X86::SBB16rr, X86::SBB16mr, 0 }, { X86::SBB32ri, X86::SBB32mi, 0 }, { X86::SBB32ri8, X86::SBB32mi8, 0 }, { X86::SBB32rr, X86::SBB32mr, 0 }, { X86::SBB64ri32, X86::SBB64mi32, 0 }, { X86::SBB64ri8, X86::SBB64mi8, 0 }, { X86::SBB64rr, X86::SBB64mr, 0 }, + { X86::SBB8ri, X86::SBB8mi, 0 }, + { X86::SBB8ri8, X86::SBB8mi8, 0 }, + { X86::SBB8rr, X86::SBB8mr, 0 }, { X86::SHL16r1, X86::SHL16m1, 0 }, { X86::SHL16rCL, X86::SHL16mCL, 0 }, { X86::SHL16ri, X86::SHL16mi, 0 }, @@ -274,6 +323,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SUB64ri8, X86::SUB64mi8, 0 }, { X86::SUB64rr, X86::SUB64mr, 0 }, { X86::SUB8ri, X86::SUB8mi, 0 }, + { X86::SUB8ri8, X86::SUB8mi8, 0 }, { X86::SUB8rr, X86::SUB8mr, 0 }, { X86::XOR16ri, X86::XOR16mi, 0 }, { X86::XOR16ri8, X86::XOR16mi8, 0 }, @@ -285,6 +335,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::XOR64ri8, X86::XOR64mi8, 0 }, { X86::XOR64rr, X86::XOR64mr, 0 }, { X86::XOR8ri, X86::XOR8mi, 0 }, + { X86::XOR8ri8, X86::XOR8mi8, 0 }, { X86::XOR8rr, X86::XOR8mr, 0 } }; @@ -375,9 +426,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD }, { X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD }, { X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD }, + { X86::TEST16rr, X86::TEST16mr, TB_FOLDED_LOAD }, { X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD }, + { X86::TEST32rr, X86::TEST32mr, TB_FOLDED_LOAD }, { X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD }, + { X86::TEST64rr, X86::TEST64mr, 
TB_FOLDED_LOAD }, { X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD }, + { X86::TEST8rr, X86::TEST8mr, TB_FOLDED_LOAD }, // AVX 128-bit versions of foldable instructions { X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE }, @@ -504,14 +559,30 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::CMP32rr, X86::CMP32rm, 0 }, { X86::CMP64rr, X86::CMP64rm, 0 }, { X86::CMP8rr, X86::CMP8rm, 0 }, + { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE }, + { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, + { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, + { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, + { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE }, + { X86::CVTSD2SI64rr_Int, X86::CVTSD2SI64rm_Int, TB_NO_REVERSE }, + { X86::CVTSD2SIrr_Int, X86::CVTSD2SIrm_Int, TB_NO_REVERSE }, { X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 }, - { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 }, + { X86::CVTSI642SDrr, X86::CVTSI642SDrm, 0 }, { X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 }, - { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 }, + { X86::CVTSI642SSrr, X86::CVTSI642SSrm, 0 }, { X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 }, { X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 }, + { X86::CVTSS2SI64rr_Int, X86::CVTSS2SI64rm_Int, TB_NO_REVERSE }, + { X86::CVTSS2SIrr_Int, X86::CVTSS2SIrm_Int, TB_NO_REVERSE }, + { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, + { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, { X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 }, + { X86::CVTTSD2SI64rr_Int,X86::CVTTSD2SI64rm_Int, TB_NO_REVERSE }, { X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 }, + { X86::CVTTSD2SIrr_Int, X86::CVTTSD2SIrm_Int, TB_NO_REVERSE }, + { X86::CVTTSS2SI64rr_Int,X86::CVTTSS2SI64rm_Int, TB_NO_REVERSE }, + { X86::CVTTSS2SIrr_Int, X86::CVTTSS2SIrm_Int, TB_NO_REVERSE }, { X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 }, { X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 }, { X86::IMUL16rri, X86::IMUL16rmi, 0 }, @@ -522,22 +593,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::IMUL64rri8, X86::IMUL64rmi8, 0 }, { X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE }, { X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE }, - { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE }, - { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE }, - { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE }, - { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE }, - { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE }, - { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 }, - { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 }, - { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 }, - { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 }, - { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE }, - { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 }, - { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 }, - { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE }, - { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE }, - { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE }, - { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE }, { X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE }, { X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE }, { X86::MOV16rr, X86::MOV16rm, 0 }, @@ -608,10 +663,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE }, { X86::SQRTSSr, X86::SQRTSSm, 0 }, { X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE }, - { X86::TEST16rr, X86::TEST16rm, 0 }, - { X86::TEST32rr, X86::TEST32rm, 0 }, - { X86::TEST64rr, X86::TEST64rm, 0 }, - 
{ X86::TEST8rr, X86::TEST8rm, 0 }, // FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0 { X86::UCOMISDrr, X86::UCOMISDrm, 0 }, { X86::UCOMISSrr, X86::UCOMISSrm, 0 }, @@ -643,17 +694,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE }, { X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE }, { X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 }, - { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE }, + { X86::VCVTTSD2SI64rr_Int,X86::VCVTTSD2SI64rm_Int,TB_NO_REVERSE }, { X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 }, - { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE }, + { X86::VCVTTSD2SIrr_Int,X86::VCVTTSD2SIrm_Int, TB_NO_REVERSE }, { X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 }, - { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE }, + { X86::VCVTTSS2SI64rr_Int,X86::VCVTTSS2SI64rm_Int,TB_NO_REVERSE }, { X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 }, - { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE }, - { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE }, - { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE }, - { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE }, - { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE }, + { X86::VCVTTSS2SIrr_Int,X86::VCVTTSS2SIrm_Int, TB_NO_REVERSE }, + { X86::VCVTSD2SI64rr_Int, X86::VCVTSD2SI64rm_Int, TB_NO_REVERSE }, + { X86::VCVTSD2SIrr_Int, X86::VCVTSD2SIrm_Int, TB_NO_REVERSE }, + { X86::VCVTSS2SI64rr_Int, X86::VCVTSS2SI64rm_Int, TB_NO_REVERSE }, + { X86::VCVTSS2SIrr_Int, X86::VCVTSS2SIrm_Int, TB_NO_REVERSE }, { X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE }, { X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 }, { X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 }, @@ -714,12 +765,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VUCOMISSrr, X86::VUCOMISSrm, 0 }, // AVX 256-bit foldable instructions - { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE }, + { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 }, { X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 }, { X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 }, { X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 }, { X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 }, - { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE }, + { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 }, { X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 }, { X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 }, { X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 }, @@ -879,6 +930,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX-512 foldable instructions { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE }, { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE }, + { X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 }, + { X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrm, 0 }, + { X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrm, 0 }, { X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 }, { X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 }, { X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 }, @@ -923,14 +977,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 }, { X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 }, { X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 }, - { X86::VPSLLDQZ512rr, X86::VPSLLDQZ512rm, 0 }, + { X86::VPSLLDQZrr, X86::VPSLLDQZrm, 0 }, { X86::VPSLLDZri, X86::VPSLLDZmi, 0 }, { X86::VPSLLQZri, X86::VPSLLQZmi, 0 }, { X86::VPSLLWZri, X86::VPSLLWZmi, 0 }, { X86::VPSRADZri, X86::VPSRADZmi, 0 }, { X86::VPSRAQZri, X86::VPSRAQZmi, 0 }, { X86::VPSRAWZri, X86::VPSRAWZmi, 0 }, - { X86::VPSRLDQZ512rr, X86::VPSRLDQZ512rm, 0 }, + { X86::VPSRLDQZrr, X86::VPSRLDQZrm, 0 }, { X86::VPSRLDZri, 
X86::VPSRLDZmi, 0 }, { X86::VPSRLQZri, X86::VPSRLQZmi, 0 }, { X86::VPSRLWZri, X86::VPSRLWZmi, 0 }, @@ -938,6 +992,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX-512 foldable instructions (256-bit versions) { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE }, { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE }, + { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 }, + { X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rm, 0 }, + { X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rm, 0 }, { X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 }, { X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 }, { X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 }, @@ -989,6 +1046,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) // AVX-512 foldable instructions (128-bit versions) { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE }, + { X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE }, + { X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rm, 0 }, + { X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rm, TB_NO_REVERSE }, { X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 }, { X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 }, { X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 }, @@ -1135,9 +1195,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 }, { X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 }, { X86::CMPSDrr, X86::CMPSDrm, 0 }, + { X86::CMPSDrr_Int, X86::CMPSDrm_Int, TB_NO_REVERSE }, { X86::CMPSSrr, X86::CMPSSrm, 0 }, + { X86::CMPSSrr_Int, X86::CMPSSrm_Int, TB_NO_REVERSE }, { X86::CRC32r32r32, X86::CRC32r32m32, 0 }, { X86::CRC32r64r64, X86::CRC32r64m64, 0 }, + { X86::CVTSD2SSrr_Int, X86::CVTSD2SSrm_Int, TB_NO_REVERSE }, + { X86::CVTSS2SDrr_Int, X86::CVTSS2SDrm_Int, TB_NO_REVERSE }, { X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 }, { X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 }, { X86::DIVSDrr, X86::DIVSDrm, 0 }, @@ -1153,14 +1217,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::IMUL16rr, X86::IMUL16rm, 0 }, { X86::IMUL32rr, X86::IMUL32rm, 0 }, { X86::IMUL64rr, X86::IMUL64rm, 0 }, - { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE }, - { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE }, - { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE }, - { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 }, - { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 }, - { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 }, - { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 }, - { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE }, + { X86::CVTSI642SDrr_Int,X86::CVTSI642SDrm_Int, 0 }, + { X86::CVTSI2SDrr_Int, X86::CVTSI2SDrm_Int, 0 }, + { X86::CVTSI642SSrr_Int,X86::CVTSI642SSrm_Int, 0 }, + { X86::CVTSI2SSrr_Int, X86::CVTSI2SSrm_Int, 0 }, { X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 }, { X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 }, { X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 }, @@ -1405,14 +1465,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::PMULHRWrr, X86::PMULHRWrm, 0 }, // AVX 128-bit versions of foldable instructions - { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 }, - { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 }, + { X86::VCVTSI642SDrr, X86::VCVTSI642SDrm, 0 }, + { X86::VCVTSI642SDrr_Int, X86::VCVTSI642SDrm_Int, 0 }, { X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 }, - { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 }, - { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 }, - { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 }, + { X86::VCVTSI2SDrr_Int, X86::VCVTSI2SDrm_Int, 0 }, + { X86::VCVTSI642SSrr, 
X86::VCVTSI642SSrm, 0 }, + { X86::VCVTSI642SSrr_Int, X86::VCVTSI642SSrm_Int, 0 }, { X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 }, - { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 }, + { X86::VCVTSI2SSrr_Int, X86::VCVTSI2SSrm_Int, 0 }, { X86::VADDPDrr, X86::VADDPDrm, 0 }, { X86::VADDPSrr, X86::VADDPSrm, 0 }, { X86::VADDSDrr, X86::VADDSDrm, 0 }, @@ -1432,7 +1492,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VCMPPDrri, X86::VCMPPDrmi, 0 }, { X86::VCMPPSrri, X86::VCMPPSrmi, 0 }, { X86::VCMPSDrr, X86::VCMPSDrm, 0 }, + { X86::VCMPSDrr_Int, X86::VCMPSDrm_Int, TB_NO_REVERSE }, { X86::VCMPSSrr, X86::VCMPSSrm, 0 }, + { X86::VCMPSSrr_Int, X86::VCMPSSrm_Int, TB_NO_REVERSE }, { X86::VDIVPDrr, X86::VDIVPDrm, 0 }, { X86::VDIVPSrr, X86::VDIVPSrm, 0 }, { X86::VDIVSDrr, X86::VDIVSDrm, 0 }, @@ -1445,8 +1507,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VHADDPSrr, X86::VHADDPSrm, 0 }, { X86::VHSUBPDrr, X86::VHSUBPDrm, 0 }, { X86::VHSUBPSrr, X86::VHSUBPSrm, 0 }, - { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE }, - { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE }, { X86::VMAXCPDrr, X86::VMAXCPDrm, 0 }, { X86::VMAXCPSrr, X86::VMAXCPSrm, 0 }, { X86::VMAXCSDrr, X86::VMAXCSDrm, 0 }, @@ -1982,7 +2042,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 }, { X86::VPORDZrr, X86::VPORDZrm, 0 }, { X86::VPORQZrr, X86::VPORQZrm, 0 }, - { X86::VPSADBWZ512rr, X86::VPSADBWZ512rm, 0 }, + { X86::VPSADBWZrr, X86::VPSADBWZrm, 0 }, { X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 }, { X86::VPSLLDZrr, X86::VPSLLDZrm, 0 }, { X86::VPSLLQZrr, X86::VPSLLQZrm, 0 }, @@ -2528,6 +2588,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 }, { X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 }, { X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 }, + { X86::VPMADD52HUQZr, X86::VPMADD52HUQZm, 0 }, + { X86::VPMADD52LUQZr, X86::VPMADD52LUQZm, 0 }, { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 }, { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 }, @@ -2544,6 +2606,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 }, { X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 }, { X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 }, + { X86::VPMADD52HUQZ256r, X86::VPMADD52HUQZ256m, 0 }, + { X86::VPMADD52LUQZ256r, X86::VPMADD52LUQZ256m, 0 }, { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 }, { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 }, @@ -2560,6 +2624,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 }, { X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 }, { X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 }, + { X86::VPMADD52HUQZ128r, X86::VPMADD52HUQZ128m, 0 }, + { X86::VPMADD52LUQZ128r, X86::VPMADD52LUQZ128m, 0 }, { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 }, { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 }, @@ -3234,6 +3300,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 }, { X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 }, { X86::VPERMWZrrk, X86::VPERMWZrmk, 0 }, + { X86::VPMADD52HUQZrk, X86::VPMADD52HUQZmk, 0 }, + { X86::VPMADD52LUQZrk, X86::VPMADD52LUQZmk, 0 }, { X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 }, { X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 }, { X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 }, @@ -3376,6 +3444,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 }, { X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 }, { X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 }, + { X86::VPMADD52HUQZ256rk, 
X86::VPMADD52HUQZ256mk, 0 }, + { X86::VPMADD52LUQZ256rk, X86::VPMADD52LUQZ256mk, 0 }, { X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 }, { X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 }, { X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 }, @@ -3509,6 +3579,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 }, { X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 }, { X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 }, + { X86::VPMADD52HUQZ128rk, X86::VPMADD52HUQZ128mk, 0 }, + { X86::VPMADD52LUQZ128rk, X86::VPMADD52LUQZ128mk, 0 }, { X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 }, { X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 }, { X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 }, @@ -3597,6 +3669,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 }, { X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 }, { X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 }, + { X86::VPMADD52HUQZrkz, X86::VPMADD52HUQZmkz, 0 }, + { X86::VPMADD52LUQZrkz, X86::VPMADD52LUQZmkz, 0 }, { X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 }, { X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 }, @@ -3613,6 +3687,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 }, { X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 }, { X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 }, + { X86::VPMADD52HUQZ256rkz, X86::VPMADD52HUQZ256mkz, 0 }, + { X86::VPMADD52LUQZ256rkz, X86::VPMADD52LUQZ256mkz, 0 }, { X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 }, { X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 }, @@ -3629,6 +3705,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI) { X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 }, { X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 }, { X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 }, + { X86::VPMADD52HUQZ128rkz, X86::VPMADD52HUQZ128mkz, 0 }, + { X86::VPMADD52LUQZ128rkz, X86::VPMADD52LUQZ128mkz, 0 }, { X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 }, { X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 }, }; @@ -4391,7 +4469,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA( unsigned leaInReg2 = 0; MachineInstr *InsMI2 = nullptr; if (Src == Src2) { - // ADD16rr %reg1028<kill>, %reg1028 + // ADD16rr killed %reg1028, %reg1028 // just a single insert_subreg. addRegReg(MIB, leaInReg, true, leaInReg, false); } else { @@ -5119,7 +5197,6 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, WorkingMI.setDesc(get(Opc)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); - break; } case X86::BLENDPDrri: case X86::BLENDPSrri: @@ -5171,24 +5248,18 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break; } - // MOVSD/MOVSS's 2nd operand is a FR64/FR32 reg class - we need to copy - // this over to a VR128 class like the 1st operand to use a BLENDPD/BLENDPS. 
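The fold tables populated in this constructor pair a register-form opcode with its memory-form counterpart and a flags word (TB_ALIGN_16, TB_NO_REVERSE, TB_FOLDED_LOAD, and so on). As a rough, standalone sketch of how such a table can be consulted, not LLVM's actual lookup code, and with illustrative names and flag values:

    #include <cstdint>
    #include <optional>
    #include <unordered_map>

    // Illustrative flag bits, loosely modelled on the TB_* flags used above.
    enum : uint16_t {
      TB_ALIGN_16    = 1 << 0, // folded memory operand must be 16-byte aligned
      TB_NO_REVERSE  = 1 << 1, // reg->mem fold is legal, mem->reg unfold is not
      TB_FOLDED_LOAD = 1 << 2, // the folded operand is a load
    };

    struct FoldEntry {
      uint16_t MemOpcode;
      uint16_t Flags;
    };

    // One map per operand-index table (2-addr, operand 0, operand 1, ...),
    // keyed by the register-form opcode; mirrors the { RegOp, MemOp, Flags }
    // rows above.
    using FoldTable = std::unordered_map<uint16_t, FoldEntry>;

    std::optional<uint16_t> foldToMemoryForm(const FoldTable &Table,
                                             uint16_t RegOpcode,
                                             unsigned OperandAlign) {
      auto It = Table.find(RegOpcode);
      if (It == Table.end())
        return std::nullopt;                  // no memory form known
      if ((It->second.Flags & TB_ALIGN_16) && OperandAlign < 16)
        return std::nullopt;                  // alignment requirement not met
      return It->second.MemOpcode;            // e.g. ADD32rr -> ADD32mr
    }

A real lookup also has to pick the right table for the operand index being folded; the sketch collapses that into the caller's choice of FoldTable.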
- auto &MRI = MI.getParent()->getParent()->getRegInfo(); - auto VR128RC = MRI.getRegClass(MI.getOperand(1).getReg()); - unsigned VR128 = MRI.createVirtualRegister(VR128RC); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY), - VR128) - .addReg(MI.getOperand(2).getReg()); - auto &WorkingMI = cloneIfNew(MI); WorkingMI.setDesc(get(Opc)); - WorkingMI.getOperand(2).setReg(VR128); WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, OpIdx1, OpIdx2); } case X86::PCLMULQDQrr: - case X86::VPCLMULQDQrr:{ + case X86::VPCLMULQDQrr: + case X86::VPCLMULQDQYrr: + case X86::VPCLMULQDQZrr: + case X86::VPCLMULQDQZ128rr: + case X86::VPCLMULQDQZ256rr: { // SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0] // SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0] unsigned Imm = MI.getOperand(3).getImm(); @@ -5631,6 +5702,41 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1, case X86::VPTERNLOGQZ256rmbikz: case X86::VPTERNLOGQZrmbikz: return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2); + case X86::VPMADD52HUQZ128r: + case X86::VPMADD52HUQZ128rk: + case X86::VPMADD52HUQZ128rkz: + case X86::VPMADD52HUQZ256r: + case X86::VPMADD52HUQZ256rk: + case X86::VPMADD52HUQZ256rkz: + case X86::VPMADD52HUQZr: + case X86::VPMADD52HUQZrk: + case X86::VPMADD52HUQZrkz: + case X86::VPMADD52LUQZ128r: + case X86::VPMADD52LUQZ128rk: + case X86::VPMADD52LUQZ128rkz: + case X86::VPMADD52LUQZ256r: + case X86::VPMADD52LUQZ256rk: + case X86::VPMADD52LUQZ256rkz: + case X86::VPMADD52LUQZr: + case X86::VPMADD52LUQZrk: + case X86::VPMADD52LUQZrkz: { + unsigned CommutableOpIdx1 = 2; + unsigned CommutableOpIdx2 = 3; + if (Desc.TSFlags & X86II::EVEX_K) { + // Skip the mask register. + ++CommutableOpIdx1; + ++CommutableOpIdx2; + } + if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, + CommutableOpIdx1, CommutableOpIdx2)) + return false; + if (!MI.getOperand(SrcOpIdx1).isReg() || + !MI.getOperand(SrcOpIdx2).isReg()) + // No idea. 
+ return false; + return true; + } + default: const X86InstrFMA3Group *FMA3Group = X86InstrFMA3Info::getFMA3Group(MI.getOpcode()); @@ -7113,16 +7219,20 @@ inline static bool isDefConvertible(MachineInstr &MI) { case X86::OR8ri: case X86::OR64rr: case X86::OR32rr: case X86::OR16rr: case X86::OR8rr: case X86::OR64rm: case X86::OR32rm: case X86::OR16rm: case X86::OR8rm: + case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri: + case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8: + case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr: + case X86::ADC16rr: case X86::ADC8rr: case X86::ADC64rm: + case X86::ADC32rm: case X86::ADC16rm: case X86::ADC8rm: + case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri: + case X86::SBB32ri8: case X86::SBB16ri: case X86::SBB16ri8: + case X86::SBB8ri: case X86::SBB64rr: case X86::SBB32rr: + case X86::SBB16rr: case X86::SBB8rr: case X86::SBB64rm: + case X86::SBB32rm: case X86::SBB16rm: case X86::SBB8rm: case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r: case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1: case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1: case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1: - case X86::ADC32ri: case X86::ADC32ri8: - case X86::ADC32rr: case X86::ADC64ri32: - case X86::ADC64ri8: case X86::ADC64rr: - case X86::SBB32ri: case X86::SBB32ri8: - case X86::SBB32rr: case X86::SBB64ri32: - case X86::SBB64ri8: case X86::SBB64rr: case X86::ANDN32rr: case X86::ANDN32rm: case X86::ANDN64rr: case X86::ANDN64rm: case X86::BEXTR32rr: case X86::BEXTR64rr: @@ -7144,6 +7254,22 @@ inline static bool isDefConvertible(MachineInstr &MI) { case X86::TZCNT16rr: case X86::TZCNT16rm: case X86::TZCNT32rr: case X86::TZCNT32rm: case X86::TZCNT64rr: case X86::TZCNT64rm: + case X86::BEXTRI32ri: case X86::BEXTRI32mi: + case X86::BEXTRI64ri: case X86::BEXTRI64mi: + case X86::BLCFILL32rr: case X86::BLCFILL32rm: + case X86::BLCFILL64rr: case X86::BLCFILL64rm: + case X86::BLCI32rr: case X86::BLCI32rm: + case X86::BLCI64rr: case X86::BLCI64rm: + case X86::BLCIC32rr: case X86::BLCIC32rm: + case X86::BLCIC64rr: case X86::BLCIC64rm: + case X86::BLCMSK32rr: case X86::BLCMSK32rm: + case X86::BLCMSK64rr: case X86::BLCMSK64rm: + case X86::BLCS32rr: case X86::BLCS32rm: + case X86::BLCS64rr: case X86::BLCS64rm: + case X86::BLSFILL32rr: case X86::BLSFILL32rm: + case X86::BLSFILL64rr: case X86::BLSFILL64rm: + case X86::BLSIC32rr: case X86::BLSIC32rm: + case X86::BLSIC64rr: case X86::BLSIC64rm: return true; } } @@ -7349,6 +7475,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, } if (OldCC == X86::COND_INVALID) return false; } + X86::CondCode ReplacementCC = X86::COND_INVALID; if (IsCmpZero) { switch (OldCC) { default: break; @@ -7368,31 +7495,32 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg, default: return false; case X86::COND_E: + ReplacementCC = NewCC; break; case X86::COND_NE: - NewCC = GetOppositeBranchCondition(NewCC); + ReplacementCC = GetOppositeBranchCondition(NewCC); break; } } else if (IsSwapped) { // If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs // to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc. // We swap the condition code and synthesize the new opcode. 
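When the compare's operands appear in the opposite order from the SUB that now defines EFLAGS, the user's condition code must be swapped rather than inverted: r2 > r1 becomes r1 < r2, not r1 >= r2. A minimal standalone model of that mapping, with an illustrative enum standing in for X86::CondCode and getSwappedCondition:

    #include <optional>

    // Illustrative subset of the x86 condition codes.
    enum class Cond { E, NE, L, GE, LE, G, B, AE, BE, A };

    // Swapping the two compared values flips the direction of every ordered
    // comparison but leaves equality/inequality untouched.
    std::optional<Cond> swappedCondition(Cond C) {
      switch (C) {
      case Cond::E:  return Cond::E;
      case Cond::NE: return Cond::NE;
      case Cond::L:  return Cond::G;   // a <  b  is  b >  a
      case Cond::G:  return Cond::L;
      case Cond::LE: return Cond::GE;  // a <= b  is  b >= a
      case Cond::GE: return Cond::LE;
      case Cond::B:  return Cond::A;   // unsigned below/above
      case Cond::A:  return Cond::B;
      case Cond::BE: return Cond::AE;
      case Cond::AE: return Cond::BE;
      }
      return std::nullopt;             // unknown code: caller gives up, as above
    }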
- NewCC = getSwappedCondition(OldCC); - if (NewCC == X86::COND_INVALID) return false; + ReplacementCC = getSwappedCondition(OldCC); + if (ReplacementCC == X86::COND_INVALID) return false; } - if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) { + if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) { // Synthesize the new opcode. bool HasMemoryOperand = Instr.hasOneMemOperand(); unsigned NewOpc; if (Instr.isBranch()) - NewOpc = GetCondBranchFromCond(NewCC); + NewOpc = GetCondBranchFromCond(ReplacementCC); else if(OpcIsSET) - NewOpc = getSETFromCond(NewCC, HasMemoryOperand); + NewOpc = getSETFromCond(ReplacementCC, HasMemoryOperand); else { unsigned DstReg = Instr.getOperand(0).getReg(); const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg); - NewOpc = getCMovFromCond(NewCC, TRI->getRegSizeInBits(*DstRC)/8, + NewOpc = getCMovFromCond(ReplacementCC, TRI->getRegSizeInBits(*DstRC)/8, HasMemoryOperand); } @@ -7504,7 +7632,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI, /// This is used for mapping: /// %xmm4 = V_SET0 /// to: -/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef> +/// %xmm4 = PXORrr undef %xmm4, undef %xmm4 /// static bool Expand2AddrUndef(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) { @@ -7597,7 +7725,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB, bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI(); bool NeedsDwarfCFI = !IsWin64Prologue && - (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry()); + (MF.getMMI().hasDebugInfo() || MF.getFunction().needsUnwindTableEntry()); bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI; if (EmitCFI) { TFL->BuildCFI(MBB, I, DL, @@ -7633,6 +7761,18 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB, MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0); } +static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) { + MachineBasicBlock &MBB = *MIB->getParent(); + MachineFunction &MF = *MBB.getParent(); + const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>(); + const X86RegisterInfo *TRI = Subtarget.getRegisterInfo(); + unsigned XorOp = + MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr; + MIB->setDesc(TII.get(XorOp)); + MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef); + return true; +} + // This is used to handle spills for 128/256-bit registers when we have AVX512, // but not VLX. If it uses an extended register we need to use an instruction // that loads the lower 128/256-bit, but is available with only AVX512F. @@ -7705,9 +7845,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case X86::FsFLD0SS: case X86::FsFLD0SD: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr)); - case X86::AVX_SET0: + case X86::AVX_SET0: { assert(HasAVX && "AVX not supported"); - return Expand2AddrUndef(MIB, get(X86::VXORPSYrr)); + const TargetRegisterInfo *TRI = &getRegisterInfo(); + unsigned SrcReg = MIB->getOperand(0).getReg(); + unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); + MIB->getOperand(0).setReg(XReg); + Expand2AddrUndef(MIB, get(X86::VXORPSrr)); + MIB.addReg(SrcReg, RegState::ImplicitDefine); + return true; + } case X86::AVX512_128_SET0: case X86::AVX512_FsFLD0SS: case X86::AVX512_FsFLD0SD: { @@ -7718,24 +7865,26 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return Expand2AddrUndef(MIB, get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr)); // Extended register without VLX. Use a larger XOR. 
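The SET0 expansions above lean on the fact that a VEX/EVEX-encoded xor of the xmm sub-register already zeroes the upper bits of the ymm/zmm register, so the shortest encoding can be used; only registers xmm16 and above on a subtarget without AVX512VL force the full-width EVEX xor. A simplified, standalone sketch of that choice, with hypothetical names:

    // Which zeroing idiom to emit for a vector register, modelled on the
    // AVX512_256_SET0 / AVX512_512_SET0 expansion above.
    enum class ZeroIdiom {
      XorXmm128, // vxorps/vpxord %xmmN, %xmmN, %xmmN (upper bits zeroed for free)
      XorZmm512  // vpxord %zmmN, %zmmN, %zmmN (EVEX-only registers, no VLX)
    };

    ZeroIdiom pickZeroIdiom(bool HasVLX, unsigned RegEncoding) {
      // Registers 16..31 have no VEX encoding; without AVX512VL there is no
      // 128-bit EVEX xor that can name them, so fall back to the full width.
      if (HasVLX || RegEncoding < 16)
        return ZeroIdiom::XorXmm128;
      return ZeroIdiom::XorZmm512;
    }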
- SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass); + SrcReg = + TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass); MIB->getOperand(0).setReg(SrcReg); return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); } - case X86::AVX512_256_SET0: { + case X86::AVX512_256_SET0: + case X86::AVX512_512_SET0: { bool HasVLX = Subtarget.hasVLX(); unsigned SrcReg = MIB->getOperand(0).getReg(); const TargetRegisterInfo *TRI = &getRegisterInfo(); - if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) - return Expand2AddrUndef(MIB, - get(HasVLX ? X86::VPXORDZ256rr : X86::VXORPSYrr)); - // Extended register without VLX. Use a larger XOR. - SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass); - MIB->getOperand(0).setReg(SrcReg); + if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) { + unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm); + MIB->getOperand(0).setReg(XReg); + Expand2AddrUndef(MIB, + get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr)); + MIB.addReg(SrcReg, RegState::ImplicitDefine); + return true; + } return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); } - case X86::AVX512_512_SET0: - return Expand2AddrUndef(MIB, get(X86::VPXORDZrr)); case X86::V_SETALLONES: return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr)); case X86::AVX2_SETALLONES: @@ -7818,10 +7967,287 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const { case TargetOpcode::LOAD_STACK_GUARD: expandLoadStackGuard(MIB, *this); return true; + case X86::XOR64_FP: + case X86::XOR32_FP: + return expandXorFP(MIB, *this); } return false; } +/// Return true for all instructions that only update +/// the first 32 or 64-bits of the destination register and leave the rest +/// unmodified. This can be used to avoid folding loads if the instructions +/// only update part of the destination register, and the non-updated part is +/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these +/// instructions breaks the partial register dependency and it can improve +/// performance. e.g.: +/// +/// movss (%rdi), %xmm0 +/// cvtss2sd %xmm0, %xmm0 +/// +/// Instead of +/// cvtss2sd (%rdi), %xmm0 +/// +/// FIXME: This should be turned into a TSFlags. +/// +static bool hasPartialRegUpdate(unsigned Opcode) { + switch (Opcode) { + case X86::CVTSI2SSrr: + case X86::CVTSI2SSrm: + case X86::CVTSI642SSrr: + case X86::CVTSI642SSrm: + case X86::CVTSI2SDrr: + case X86::CVTSI2SDrm: + case X86::CVTSI642SDrr: + case X86::CVTSI642SDrm: + case X86::CVTSD2SSrr: + case X86::CVTSD2SSrm: + case X86::CVTSS2SDrr: + case X86::CVTSS2SDrm: + case X86::MOVHPDrm: + case X86::MOVHPSrm: + case X86::MOVLPDrm: + case X86::MOVLPSrm: + case X86::RCPSSr: + case X86::RCPSSm: + case X86::RCPSSr_Int: + case X86::RCPSSm_Int: + case X86::ROUNDSDr: + case X86::ROUNDSDm: + case X86::ROUNDSSr: + case X86::ROUNDSSm: + case X86::RSQRTSSr: + case X86::RSQRTSSm: + case X86::RSQRTSSr_Int: + case X86::RSQRTSSm_Int: + case X86::SQRTSSr: + case X86::SQRTSSm: + case X86::SQRTSSr_Int: + case X86::SQRTSSm_Int: + case X86::SQRTSDr: + case X86::SQRTSDm: + case X86::SQRTSDr_Int: + case X86::SQRTSDm_Int: + return true; + } + + return false; +} + +/// Inform the ExecutionDepsFix pass how many idle +/// instructions we would like before a partial register update. 
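The hook defined next, getPartialRegUpdateClearance, only reports a clearance when the destination is not genuinely read; the dependency-fixing pass then compares that clearance against how recently the register was last written and inserts a cheap xor when the last writer is too close. A standalone sketch of that decision, under assumed names rather than the pass's real interface:

    // Model of the clearance check performed by the dependency-fixing pass.
    // ClearanceWanted corresponds to the value returned by the hook below;
    // DefDistance is how many instructions ago the register was last written
    // on the current path.
    bool needDependencyBreak(unsigned ClearanceWanted, unsigned DefDistance,
                             bool InstrReadsDest) {
      if (InstrReadsDest)
        return false;          // the partial update is a real use, keep it
      if (ClearanceWanted == 0)
        return false;          // instruction fully rewrites its destination
      // A recent writer would turn the partial update into a false dependency
      // on a possibly long-latency result, so break it with e.g.
      // `xorps %xmm0, %xmm0` before the updating instruction.
      return DefDistance < ClearanceWanted;
    }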
+unsigned X86InstrInfo::getPartialRegUpdateClearance( + const MachineInstr &MI, unsigned OpNum, + const TargetRegisterInfo *TRI) const { + if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode())) + return 0; + + // If MI is marked as reading Reg, the partial register update is wanted. + const MachineOperand &MO = MI.getOperand(0); + unsigned Reg = MO.getReg(); + if (TargetRegisterInfo::isVirtualRegister(Reg)) { + if (MO.readsReg() || MI.readsVirtualRegister(Reg)) + return 0; + } else { + if (MI.readsRegister(Reg, TRI)) + return 0; + } + + // If any instructions in the clearance range are reading Reg, insert a + // dependency breaking instruction, which is inexpensive and is likely to + // be hidden in other instruction's cycles. + return PartialRegUpdateClearance; +} + +// Return true for any instruction the copies the high bits of the first source +// operand into the unused high bits of the destination operand. +static bool hasUndefRegUpdate(unsigned Opcode) { + switch (Opcode) { + case X86::VCVTSI2SSrr: + case X86::VCVTSI2SSrm: + case X86::VCVTSI2SSrr_Int: + case X86::VCVTSI2SSrm_Int: + case X86::VCVTSI642SSrr: + case X86::VCVTSI642SSrm: + case X86::VCVTSI642SSrr_Int: + case X86::VCVTSI642SSrm_Int: + case X86::VCVTSI2SDrr: + case X86::VCVTSI2SDrm: + case X86::VCVTSI2SDrr_Int: + case X86::VCVTSI2SDrm_Int: + case X86::VCVTSI642SDrr: + case X86::VCVTSI642SDrm: + case X86::VCVTSI642SDrr_Int: + case X86::VCVTSI642SDrm_Int: + case X86::VCVTSD2SSrr: + case X86::VCVTSD2SSrm: + case X86::VCVTSD2SSrr_Int: + case X86::VCVTSD2SSrm_Int: + case X86::VCVTSS2SDrr: + case X86::VCVTSS2SDrm: + case X86::VCVTSS2SDrr_Int: + case X86::VCVTSS2SDrm_Int: + case X86::VRCPSSr: + case X86::VRCPSSr_Int: + case X86::VRCPSSm: + case X86::VRCPSSm_Int: + case X86::VROUNDSDr: + case X86::VROUNDSDm: + case X86::VROUNDSDr_Int: + case X86::VROUNDSDm_Int: + case X86::VROUNDSSr: + case X86::VROUNDSSm: + case X86::VROUNDSSr_Int: + case X86::VROUNDSSm_Int: + case X86::VRSQRTSSr: + case X86::VRSQRTSSr_Int: + case X86::VRSQRTSSm: + case X86::VRSQRTSSm_Int: + case X86::VSQRTSSr: + case X86::VSQRTSSr_Int: + case X86::VSQRTSSm: + case X86::VSQRTSSm_Int: + case X86::VSQRTSDr: + case X86::VSQRTSDr_Int: + case X86::VSQRTSDm: + case X86::VSQRTSDm_Int: + // AVX-512 + case X86::VCVTSI2SSZrr: + case X86::VCVTSI2SSZrm: + case X86::VCVTSI2SSZrr_Int: + case X86::VCVTSI2SSZrrb_Int: + case X86::VCVTSI2SSZrm_Int: + case X86::VCVTSI642SSZrr: + case X86::VCVTSI642SSZrm: + case X86::VCVTSI642SSZrr_Int: + case X86::VCVTSI642SSZrrb_Int: + case X86::VCVTSI642SSZrm_Int: + case X86::VCVTSI2SDZrr: + case X86::VCVTSI2SDZrm: + case X86::VCVTSI2SDZrr_Int: + case X86::VCVTSI2SDZrrb_Int: + case X86::VCVTSI2SDZrm_Int: + case X86::VCVTSI642SDZrr: + case X86::VCVTSI642SDZrm: + case X86::VCVTSI642SDZrr_Int: + case X86::VCVTSI642SDZrrb_Int: + case X86::VCVTSI642SDZrm_Int: + case X86::VCVTUSI2SSZrr: + case X86::VCVTUSI2SSZrm: + case X86::VCVTUSI2SSZrr_Int: + case X86::VCVTUSI2SSZrrb_Int: + case X86::VCVTUSI2SSZrm_Int: + case X86::VCVTUSI642SSZrr: + case X86::VCVTUSI642SSZrm: + case X86::VCVTUSI642SSZrr_Int: + case X86::VCVTUSI642SSZrrb_Int: + case X86::VCVTUSI642SSZrm_Int: + case X86::VCVTUSI2SDZrr: + case X86::VCVTUSI2SDZrm: + case X86::VCVTUSI2SDZrr_Int: + case X86::VCVTUSI2SDZrm_Int: + case X86::VCVTUSI642SDZrr: + case X86::VCVTUSI642SDZrm: + case X86::VCVTUSI642SDZrr_Int: + case X86::VCVTUSI642SDZrrb_Int: + case X86::VCVTUSI642SDZrm_Int: + case X86::VCVTSD2SSZrr: + case X86::VCVTSD2SSZrr_Int: + case X86::VCVTSD2SSZrrb_Int: + case X86::VCVTSD2SSZrm: + case 
X86::VCVTSD2SSZrm_Int: + case X86::VCVTSS2SDZrr: + case X86::VCVTSS2SDZrr_Int: + case X86::VCVTSS2SDZrrb_Int: + case X86::VCVTSS2SDZrm: + case X86::VCVTSS2SDZrm_Int: + case X86::VRNDSCALESDr: + case X86::VRNDSCALESDr_Int: + case X86::VRNDSCALESDrb_Int: + case X86::VRNDSCALESDm: + case X86::VRNDSCALESDm_Int: + case X86::VRNDSCALESSr: + case X86::VRNDSCALESSr_Int: + case X86::VRNDSCALESSrb_Int: + case X86::VRNDSCALESSm: + case X86::VRNDSCALESSm_Int: + case X86::VRCP14SSrr: + case X86::VRCP14SSrm: + case X86::VRSQRT14SSrr: + case X86::VRSQRT14SSrm: + case X86::VSQRTSSZr: + case X86::VSQRTSSZr_Int: + case X86::VSQRTSSZrb_Int: + case X86::VSQRTSSZm: + case X86::VSQRTSSZm_Int: + case X86::VSQRTSDZr: + case X86::VSQRTSDZr_Int: + case X86::VSQRTSDZrb_Int: + case X86::VSQRTSDZm: + case X86::VSQRTSDZm_Int: + return true; + } + + return false; +} + +/// Inform the ExecutionDepsFix pass how many idle instructions we would like +/// before certain undef register reads. +/// +/// This catches the VCVTSI2SD family of instructions: +/// +/// vcvtsi2sdq %rax, undef %xmm0, %xmm14 +/// +/// We should to be careful *not* to catch VXOR idioms which are presumably +/// handled specially in the pipeline: +/// +/// vxorps undef %xmm1, undef %xmm1, %xmm1 +/// +/// Like getPartialRegUpdateClearance, this makes a strong assumption that the +/// high bits that are passed-through are not live. +unsigned +X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, + const TargetRegisterInfo *TRI) const { + if (!hasUndefRegUpdate(MI.getOpcode())) + return 0; + + // Set the OpNum parameter to the first source operand. + OpNum = 1; + + const MachineOperand &MO = MI.getOperand(OpNum); + if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { + return UndefRegClearance; + } + return 0; +} + +void X86InstrInfo::breakPartialRegDependency( + MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { + unsigned Reg = MI.getOperand(OpNum).getReg(); + // If MI kills this register, the false dependence is already broken. + if (MI.killsRegister(Reg, TRI)) + return; + + if (X86::VR128RegClass.contains(Reg)) { + // These instructions are all floating point domain, so xorps is the best + // choice. + unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg) + .addReg(Reg, RegState::Undef) + .addReg(Reg, RegState::Undef); + MI.addRegisterKilled(Reg, TRI, true); + } else if (X86::VR256RegClass.contains(Reg)) { + // Use vxorps to clear the full ymm register. + // It wants to read and write the xmm sub-register. + unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); + BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg) + .addReg(XReg, RegState::Undef) + .addReg(XReg, RegState::Undef) + .addReg(Reg, RegState::ImplicitDefine); + MI.addRegisterKilled(Reg, TRI, true); + } +} + static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs, int PtrOffset = 0) { unsigned NumAddrOps = MOs.size(); @@ -7976,18 +8402,23 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( unsigned Size, unsigned Align, bool AllowCommute) const { const DenseMap<unsigned, std::pair<uint16_t, uint16_t> > *OpcodeTablePtr = nullptr; - bool isCallRegIndirect = Subtarget.callRegIndirect(); + bool isSlowTwoMemOps = Subtarget.slowTwoMemOps(); bool isTwoAddrFold = false; // For CPUs that favor the register form of a call or push, // do not fold loads into calls or pushes, unless optimizing for size // aggressively. 
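The guards that follow refuse two kinds of folds up front: loads folded into calls or pushes on subtargets where two memory operations in one instruction are slow (unless aggressively minimizing size), and folds into instructions that only partially update their destination (unless optimizing for size). A standalone sketch of those checks, with hypothetical field names:

    // Simplified model of the early-exit checks in foldMemoryOperandImpl.
    struct FoldQuery {
      bool IsCallOrPush;     // CALL32r/CALL64r/PUSH16r/PUSH32r/PUSH64r
      bool PartialRegUpdate; // hasPartialRegUpdate(Opcode)
      bool SlowTwoMemOps;    // subtarget prefers the register form
      bool OptForSize;       // function has optsize
      bool OptForMinSize;    // function has minsize
    };

    bool mayFoldLoad(const FoldQuery &Q) {
      // Folding a load into a call or push adds a second memory operation;
      // on slow-two-mem-ops CPUs only accept that when shrinking code.
      if (Q.SlowTwoMemOps && Q.IsCallOrPush && !Q.OptForMinSize)
        return false;
      // Folding into a partial-register-update instruction re-introduces the
      // false dependency described above; only accept that for size.
      if (Q.PartialRegUpdate && !Q.OptForSize)
        return false;
      return true;
    }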
- if (isCallRegIndirect && !MF.getFunction()->optForMinSize() && + if (isSlowTwoMemOps && !MF.getFunction().optForMinSize() && (MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r || MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r || MI.getOpcode() == X86::PUSH64r)) return nullptr; + // Avoid partial register update stalls unless optimizing for size. + // TODO: we should block undef reg update as well. + if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode())) + return nullptr; + unsigned NumOps = MI.getDesc().getNumOperands(); bool isTwoAddr = NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1; @@ -8142,276 +8573,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( return nullptr; } -/// Return true for all instructions that only update -/// the first 32 or 64-bits of the destination register and leave the rest -/// unmodified. This can be used to avoid folding loads if the instructions -/// only update part of the destination register, and the non-updated part is -/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these -/// instructions breaks the partial register dependency and it can improve -/// performance. e.g.: -/// -/// movss (%rdi), %xmm0 -/// cvtss2sd %xmm0, %xmm0 -/// -/// Instead of -/// cvtss2sd (%rdi), %xmm0 -/// -/// FIXME: This should be turned into a TSFlags. -/// -static bool hasPartialRegUpdate(unsigned Opcode) { - switch (Opcode) { - case X86::CVTSI2SSrr: - case X86::CVTSI2SSrm: - case X86::CVTSI2SS64rr: - case X86::CVTSI2SS64rm: - case X86::CVTSI2SDrr: - case X86::CVTSI2SDrm: - case X86::CVTSI2SD64rr: - case X86::CVTSI2SD64rm: - case X86::CVTSD2SSrr: - case X86::CVTSD2SSrm: - case X86::CVTSS2SDrr: - case X86::CVTSS2SDrm: - case X86::MOVHPDrm: - case X86::MOVHPSrm: - case X86::MOVLPDrm: - case X86::MOVLPSrm: - case X86::RCPSSr: - case X86::RCPSSm: - case X86::RCPSSr_Int: - case X86::RCPSSm_Int: - case X86::ROUNDSDr: - case X86::ROUNDSDm: - case X86::ROUNDSSr: - case X86::ROUNDSSm: - case X86::RSQRTSSr: - case X86::RSQRTSSm: - case X86::RSQRTSSr_Int: - case X86::RSQRTSSm_Int: - case X86::SQRTSSr: - case X86::SQRTSSm: - case X86::SQRTSSr_Int: - case X86::SQRTSSm_Int: - case X86::SQRTSDr: - case X86::SQRTSDm: - case X86::SQRTSDr_Int: - case X86::SQRTSDm_Int: - return true; - } - - return false; -} - -/// Inform the ExecutionDepsFix pass how many idle -/// instructions we would like before a partial register update. -unsigned X86InstrInfo::getPartialRegUpdateClearance( - const MachineInstr &MI, unsigned OpNum, - const TargetRegisterInfo *TRI) const { - if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode())) - return 0; - - // If MI is marked as reading Reg, the partial register update is wanted. - const MachineOperand &MO = MI.getOperand(0); - unsigned Reg = MO.getReg(); - if (TargetRegisterInfo::isVirtualRegister(Reg)) { - if (MO.readsReg() || MI.readsVirtualRegister(Reg)) - return 0; - } else { - if (MI.readsRegister(Reg, TRI)) - return 0; - } - - // If any instructions in the clearance range are reading Reg, insert a - // dependency breaking instruction, which is inexpensive and is likely to - // be hidden in other instruction's cycles. - return PartialRegUpdateClearance; -} - -// Return true for any instruction the copies the high bits of the first source -// operand into the unused high bits of the destination operand. 
-static bool hasUndefRegUpdate(unsigned Opcode) { - switch (Opcode) { - case X86::VCVTSI2SSrr: - case X86::VCVTSI2SSrm: - case X86::Int_VCVTSI2SSrr: - case X86::Int_VCVTSI2SSrm: - case X86::VCVTSI2SS64rr: - case X86::VCVTSI2SS64rm: - case X86::Int_VCVTSI2SS64rr: - case X86::Int_VCVTSI2SS64rm: - case X86::VCVTSI2SDrr: - case X86::VCVTSI2SDrm: - case X86::Int_VCVTSI2SDrr: - case X86::Int_VCVTSI2SDrm: - case X86::VCVTSI2SD64rr: - case X86::VCVTSI2SD64rm: - case X86::Int_VCVTSI2SD64rr: - case X86::Int_VCVTSI2SD64rm: - case X86::VCVTSD2SSrr: - case X86::VCVTSD2SSrm: - case X86::Int_VCVTSD2SSrr: - case X86::Int_VCVTSD2SSrm: - case X86::VCVTSS2SDrr: - case X86::VCVTSS2SDrm: - case X86::Int_VCVTSS2SDrr: - case X86::Int_VCVTSS2SDrm: - case X86::VRCPSSr: - case X86::VRCPSSr_Int: - case X86::VRCPSSm: - case X86::VRCPSSm_Int: - case X86::VROUNDSDr: - case X86::VROUNDSDm: - case X86::VROUNDSDr_Int: - case X86::VROUNDSDm_Int: - case X86::VROUNDSSr: - case X86::VROUNDSSm: - case X86::VROUNDSSr_Int: - case X86::VROUNDSSm_Int: - case X86::VRSQRTSSr: - case X86::VRSQRTSSr_Int: - case X86::VRSQRTSSm: - case X86::VRSQRTSSm_Int: - case X86::VSQRTSSr: - case X86::VSQRTSSr_Int: - case X86::VSQRTSSm: - case X86::VSQRTSSm_Int: - case X86::VSQRTSDr: - case X86::VSQRTSDr_Int: - case X86::VSQRTSDm: - case X86::VSQRTSDm_Int: - // AVX-512 - case X86::VCVTSI2SSZrr: - case X86::VCVTSI2SSZrm: - case X86::VCVTSI2SSZrr_Int: - case X86::VCVTSI2SSZrrb_Int: - case X86::VCVTSI2SSZrm_Int: - case X86::VCVTSI642SSZrr: - case X86::VCVTSI642SSZrm: - case X86::VCVTSI642SSZrr_Int: - case X86::VCVTSI642SSZrrb_Int: - case X86::VCVTSI642SSZrm_Int: - case X86::VCVTSI2SDZrr: - case X86::VCVTSI2SDZrm: - case X86::VCVTSI2SDZrr_Int: - case X86::VCVTSI2SDZrrb_Int: - case X86::VCVTSI2SDZrm_Int: - case X86::VCVTSI642SDZrr: - case X86::VCVTSI642SDZrm: - case X86::VCVTSI642SDZrr_Int: - case X86::VCVTSI642SDZrrb_Int: - case X86::VCVTSI642SDZrm_Int: - case X86::VCVTUSI2SSZrr: - case X86::VCVTUSI2SSZrm: - case X86::VCVTUSI2SSZrr_Int: - case X86::VCVTUSI2SSZrrb_Int: - case X86::VCVTUSI2SSZrm_Int: - case X86::VCVTUSI642SSZrr: - case X86::VCVTUSI642SSZrm: - case X86::VCVTUSI642SSZrr_Int: - case X86::VCVTUSI642SSZrrb_Int: - case X86::VCVTUSI642SSZrm_Int: - case X86::VCVTUSI2SDZrr: - case X86::VCVTUSI2SDZrm: - case X86::VCVTUSI2SDZrr_Int: - case X86::VCVTUSI2SDZrm_Int: - case X86::VCVTUSI642SDZrr: - case X86::VCVTUSI642SDZrm: - case X86::VCVTUSI642SDZrr_Int: - case X86::VCVTUSI642SDZrrb_Int: - case X86::VCVTUSI642SDZrm_Int: - case X86::VCVTSD2SSZrr: - case X86::VCVTSD2SSZrr_Int: - case X86::VCVTSD2SSZrrb_Int: - case X86::VCVTSD2SSZrm: - case X86::VCVTSD2SSZrm_Int: - case X86::VCVTSS2SDZrr: - case X86::VCVTSS2SDZrr_Int: - case X86::VCVTSS2SDZrrb_Int: - case X86::VCVTSS2SDZrm: - case X86::VCVTSS2SDZrm_Int: - case X86::VRNDSCALESDr: - case X86::VRNDSCALESDrb: - case X86::VRNDSCALESDm: - case X86::VRNDSCALESSr: - case X86::VRNDSCALESSrb: - case X86::VRNDSCALESSm: - case X86::VRCP14SSrr: - case X86::VRCP14SSrm: - case X86::VRSQRT14SSrr: - case X86::VRSQRT14SSrm: - case X86::VSQRTSSZr: - case X86::VSQRTSSZr_Int: - case X86::VSQRTSSZrb_Int: - case X86::VSQRTSSZm: - case X86::VSQRTSSZm_Int: - case X86::VSQRTSDZr: - case X86::VSQRTSDZr_Int: - case X86::VSQRTSDZrb_Int: - case X86::VSQRTSDZm: - case X86::VSQRTSDZm_Int: - return true; - } - - return false; -} - -/// Inform the ExecutionDepsFix pass how many idle instructions we would like -/// before certain undef register reads. 
-/// -/// This catches the VCVTSI2SD family of instructions: -/// -/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14 -/// -/// We should to be careful *not* to catch VXOR idioms which are presumably -/// handled specially in the pipeline: -/// -/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1 -/// -/// Like getPartialRegUpdateClearance, this makes a strong assumption that the -/// high bits that are passed-through are not live. -unsigned -X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum, - const TargetRegisterInfo *TRI) const { - if (!hasUndefRegUpdate(MI.getOpcode())) - return 0; - - // Set the OpNum parameter to the first source operand. - OpNum = 1; - - const MachineOperand &MO = MI.getOperand(OpNum); - if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) { - return UndefRegClearance; - } - return 0; -} - -void X86InstrInfo::breakPartialRegDependency( - MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const { - unsigned Reg = MI.getOperand(OpNum).getReg(); - // If MI kills this register, the false dependence is already broken. - if (MI.killsRegister(Reg, TRI)) - return; - - if (X86::VR128RegClass.contains(Reg)) { - // These instructions are all floating point domain, so xorps is the best - // choice. - unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr; - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg) - .addReg(Reg, RegState::Undef) - .addReg(Reg, RegState::Undef); - MI.addRegisterKilled(Reg, TRI, true); - } else if (X86::VR256RegClass.contains(Reg)) { - // Use vxorps to clear the full ymm register. - // It wants to read and write the xmm sub-register. - unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm); - BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg) - .addReg(XReg, RegState::Undef) - .addReg(XReg, RegState::Undef) - .addReg(Reg, RegState::ImplicitDefine); - MI.addRegisterKilled(Reg, TRI, true); - } -} - MachineInstr * X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, ArrayRef<unsigned> Ops, @@ -8423,7 +8584,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI, // Unless optimizing for size, don't fold to avoid partial // register update stalls - if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode())) + // TODO: we should block undef reg update as well. + if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode())) return nullptr; // Don't fold subreg spills, or reloads that use a high subreg. @@ -8498,7 +8660,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, // instruction isn't scalar (SS). switch (UserOpc) { case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int: - case X86::Int_CMPSSrr: case X86::Int_VCMPSSrr: case X86::VCMPSSZrr_Int: + case X86::CMPSSrr_Int: case X86::VCMPSSrr_Int: case X86::VCMPSSZrr_Int: case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int: case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int: case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int: @@ -8549,7 +8711,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI, // instruction isn't scalar (SD). 
switch (UserOpc) { case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int: - case X86::Int_CMPSDrr: case X86::Int_VCMPSDrr: case X86::VCMPSDZrr_Int: + case X86::CMPSDrr_Int: case X86::VCMPSDrr_Int: case X86::VCMPSDZrr_Int: case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int: case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int: case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int: @@ -8621,7 +8783,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( if (NoFusing) return nullptr; // Avoid partial register update stalls unless optimizing for size. - if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode())) + // TODO: we should block undef reg update as well. + if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode())) return nullptr; // Determine the alignment of the load. @@ -8717,16 +8880,16 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl( Type *Ty; unsigned Opc = LoadMI.getOpcode(); if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS) - Ty = Type::getFloatTy(MF.getFunction()->getContext()); + Ty = Type::getFloatTy(MF.getFunction().getContext()); else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD) - Ty = Type::getDoubleTy(MF.getFunction()->getContext()); + Ty = Type::getDoubleTy(MF.getFunction().getContext()); else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES) - Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16); + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16); else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 || Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES) - Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8); + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8); else - Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); + Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4); bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES || Opc == X86::AVX512_512_SETALLONES || @@ -9301,6 +9464,16 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::ORPSrr, X86::ORPDrr, X86::PORrr }, { X86::XORPSrm, X86::XORPDrm, X86::PXORrm }, { X86::XORPSrr, X86::XORPDrr, X86::PXORrr }, + { X86::UNPCKLPDrm, X86::UNPCKLPDrm, X86::PUNPCKLQDQrm }, + { X86::MOVLHPSrr, X86::UNPCKLPDrr, X86::PUNPCKLQDQrr }, + { X86::UNPCKHPDrm, X86::UNPCKHPDrm, X86::PUNPCKHQDQrm }, + { X86::UNPCKHPDrr, X86::UNPCKHPDrr, X86::PUNPCKHQDQrr }, + { X86::UNPCKLPSrm, X86::UNPCKLPSrm, X86::PUNPCKLDQrm }, + { X86::UNPCKLPSrr, X86::UNPCKLPSrr, X86::PUNPCKLDQrr }, + { X86::UNPCKHPSrm, X86::UNPCKHPSrm, X86::PUNPCKHDQrm }, + { X86::UNPCKHPSrr, X86::UNPCKHPSrr, X86::PUNPCKHDQrr }, + { X86::EXTRACTPSmr, X86::EXTRACTPSmr, X86::PEXTRDmr }, + { X86::EXTRACTPSrr, X86::EXTRACTPSrr, X86::PEXTRDrr }, // AVX 128-bit support { X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr }, { X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm }, @@ -9321,6 +9494,16 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VORPSrr, X86::VORPDrr, X86::VPORrr }, { X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm }, { X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr }, + { X86::VUNPCKLPDrm, X86::VUNPCKLPDrm, X86::VPUNPCKLQDQrm }, + { X86::VMOVLHPSrr, X86::VUNPCKLPDrr, X86::VPUNPCKLQDQrr }, + { X86::VUNPCKHPDrm, X86::VUNPCKHPDrm, X86::VPUNPCKHQDQrm }, + { X86::VUNPCKHPDrr, X86::VUNPCKHPDrr, X86::VPUNPCKHQDQrr }, + { X86::VUNPCKLPSrm, X86::VUNPCKLPSrm, 
X86::VPUNPCKLDQrm }, + { X86::VUNPCKLPSrr, X86::VUNPCKLPSrr, X86::VPUNPCKLDQrr }, + { X86::VUNPCKHPSrm, X86::VUNPCKHPSrm, X86::VPUNPCKHDQrm }, + { X86::VUNPCKHPSrr, X86::VUNPCKHPSrr, X86::VPUNPCKHDQrr }, + { X86::VEXTRACTPSmr, X86::VEXTRACTPSmr, X86::VPEXTRDmr }, + { X86::VEXTRACTPSrr, X86::VEXTRACTPSrr, X86::VPEXTRDrr }, // AVX 256-bit support { X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr }, { X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm }, @@ -9328,6 +9511,10 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }, + { X86::VPERMPSYrm, X86::VPERMPSYrm, X86::VPERMDYrm }, + { X86::VPERMPSYrr, X86::VPERMPSYrr, X86::VPERMDYrr }, + { X86::VPERMPDYmi, X86::VPERMPDYmi, X86::VPERMQYmi }, + { X86::VPERMPDYri, X86::VPERMPDYri, X86::VPERMQYri }, // AVX512 support { X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr }, { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr }, @@ -9347,6 +9534,76 @@ static const uint16_t ReplaceableInstrs[][3] = { { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m }, { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr }, { X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm }, + { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrr, X86::VINSERTI32x4Zrr }, + { X86::VINSERTF32x4Zrm, X86::VINSERTF32x4Zrm, X86::VINSERTI32x4Zrm }, + { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrr, X86::VINSERTI32x8Zrr }, + { X86::VINSERTF32x8Zrm, X86::VINSERTF32x8Zrm, X86::VINSERTI32x8Zrm }, + { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrr, X86::VINSERTI64x2Zrr }, + { X86::VINSERTF64x2Zrm, X86::VINSERTF64x2Zrm, X86::VINSERTI64x2Zrm }, + { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrr, X86::VINSERTI64x4Zrr }, + { X86::VINSERTF64x4Zrm, X86::VINSERTF64x4Zrm, X86::VINSERTI64x4Zrm }, + { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rr,X86::VINSERTI32x4Z256rr }, + { X86::VINSERTF32x4Z256rm,X86::VINSERTF32x4Z256rm,X86::VINSERTI32x4Z256rm }, + { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rr,X86::VINSERTI64x2Z256rr }, + { X86::VINSERTF64x2Z256rm,X86::VINSERTF64x2Z256rm,X86::VINSERTI64x2Z256rm }, + { X86::VEXTRACTF32x4Zrr, X86::VEXTRACTF32x4Zrr, X86::VEXTRACTI32x4Zrr }, + { X86::VEXTRACTF32x4Zmr, X86::VEXTRACTF32x4Zmr, X86::VEXTRACTI32x4Zmr }, + { X86::VEXTRACTF32x8Zrr, X86::VEXTRACTF32x8Zrr, X86::VEXTRACTI32x8Zrr }, + { X86::VEXTRACTF32x8Zmr, X86::VEXTRACTF32x8Zmr, X86::VEXTRACTI32x8Zmr }, + { X86::VEXTRACTF64x2Zrr, X86::VEXTRACTF64x2Zrr, X86::VEXTRACTI64x2Zrr }, + { X86::VEXTRACTF64x2Zmr, X86::VEXTRACTF64x2Zmr, X86::VEXTRACTI64x2Zmr }, + { X86::VEXTRACTF64x4Zrr, X86::VEXTRACTF64x4Zrr, X86::VEXTRACTI64x4Zrr }, + { X86::VEXTRACTF64x4Zmr, X86::VEXTRACTF64x4Zmr, X86::VEXTRACTI64x4Zmr }, + { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTI32x4Z256rr }, + { X86::VEXTRACTF32x4Z256mr,X86::VEXTRACTF32x4Z256mr,X86::VEXTRACTI32x4Z256mr }, + { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTI64x2Z256rr }, + { X86::VEXTRACTF64x2Z256mr,X86::VEXTRACTF64x2Z256mr,X86::VEXTRACTI64x2Z256mr }, + { X86::VPERMILPSmi, X86::VPERMILPSmi, X86::VPSHUFDmi }, + { X86::VPERMILPSri, X86::VPERMILPSri, X86::VPSHUFDri }, + { X86::VPERMILPSZ128mi, X86::VPERMILPSZ128mi, X86::VPSHUFDZ128mi }, + { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128ri, X86::VPSHUFDZ128ri }, + { X86::VPERMILPSZ256mi, X86::VPERMILPSZ256mi, X86::VPSHUFDZ256mi }, + { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256ri, 
X86::VPSHUFDZ256ri }, + { X86::VPERMILPSZmi, X86::VPERMILPSZmi, X86::VPSHUFDZmi }, + { X86::VPERMILPSZri, X86::VPERMILPSZri, X86::VPSHUFDZri }, + { X86::VPERMPSZ256rm, X86::VPERMPSZ256rm, X86::VPERMDZ256rm }, + { X86::VPERMPSZ256rr, X86::VPERMPSZ256rr, X86::VPERMDZ256rr }, + { X86::VPERMPDZ256mi, X86::VPERMPDZ256mi, X86::VPERMQZ256mi }, + { X86::VPERMPDZ256ri, X86::VPERMPDZ256ri, X86::VPERMQZ256ri }, + { X86::VPERMPDZ256rm, X86::VPERMPDZ256rm, X86::VPERMQZ256rm }, + { X86::VPERMPDZ256rr, X86::VPERMPDZ256rr, X86::VPERMQZ256rr }, + { X86::VPERMPSZrm, X86::VPERMPSZrm, X86::VPERMDZrm }, + { X86::VPERMPSZrr, X86::VPERMPSZrr, X86::VPERMDZrr }, + { X86::VPERMPDZmi, X86::VPERMPDZmi, X86::VPERMQZmi }, + { X86::VPERMPDZri, X86::VPERMPDZri, X86::VPERMQZri }, + { X86::VPERMPDZrm, X86::VPERMPDZrm, X86::VPERMQZrm }, + { X86::VPERMPDZrr, X86::VPERMPDZrr, X86::VPERMQZrr }, + { X86::VUNPCKLPDZ256rm, X86::VUNPCKLPDZ256rm, X86::VPUNPCKLQDQZ256rm }, + { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rr, X86::VPUNPCKLQDQZ256rr }, + { X86::VUNPCKHPDZ256rm, X86::VUNPCKHPDZ256rm, X86::VPUNPCKHQDQZ256rm }, + { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rr, X86::VPUNPCKHQDQZ256rr }, + { X86::VUNPCKLPSZ256rm, X86::VUNPCKLPSZ256rm, X86::VPUNPCKLDQZ256rm }, + { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rr, X86::VPUNPCKLDQZ256rr }, + { X86::VUNPCKHPSZ256rm, X86::VUNPCKHPSZ256rm, X86::VPUNPCKHDQZ256rm }, + { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rr, X86::VPUNPCKHDQZ256rr }, + { X86::VUNPCKLPDZ128rm, X86::VUNPCKLPDZ128rm, X86::VPUNPCKLQDQZ128rm }, + { X86::VMOVLHPSZrr, X86::VUNPCKLPDZ128rr, X86::VPUNPCKLQDQZ128rr }, + { X86::VUNPCKHPDZ128rm, X86::VUNPCKHPDZ128rm, X86::VPUNPCKHQDQZ128rm }, + { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rr, X86::VPUNPCKHQDQZ128rr }, + { X86::VUNPCKLPSZ128rm, X86::VUNPCKLPSZ128rm, X86::VPUNPCKLDQZ128rm }, + { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rr, X86::VPUNPCKLDQZ128rr }, + { X86::VUNPCKHPSZ128rm, X86::VUNPCKHPSZ128rm, X86::VPUNPCKHDQZ128rm }, + { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rr, X86::VPUNPCKHDQZ128rr }, + { X86::VUNPCKLPDZrm, X86::VUNPCKLPDZrm, X86::VPUNPCKLQDQZrm }, + { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrr, X86::VPUNPCKLQDQZrr }, + { X86::VUNPCKHPDZrm, X86::VUNPCKHPDZrm, X86::VPUNPCKHQDQZrm }, + { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrr, X86::VPUNPCKHQDQZrr }, + { X86::VUNPCKLPSZrm, X86::VUNPCKLPSZrm, X86::VPUNPCKLDQZrm }, + { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrr, X86::VPUNPCKLDQZrr }, + { X86::VUNPCKHPSZrm, X86::VUNPCKHPSZrm, X86::VPUNPCKHDQZrm }, + { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrr, X86::VPUNPCKHDQZrr }, + { X86::VEXTRACTPSZmr, X86::VEXTRACTPSZmr, X86::VPEXTRDZmr }, + { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZrr, X86::VPEXTRDZrr }, }; static const uint16_t ReplaceableInstrsAVX2[][3] = { @@ -9368,6 +9625,20 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = { { X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr}, { X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm}, { X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 }, + { X86::VBLENDPSrri, X86::VBLENDPSrri, X86::VPBLENDDrri }, + { X86::VBLENDPSrmi, X86::VBLENDPSrmi, X86::VPBLENDDrmi }, + { X86::VBLENDPSYrri, X86::VBLENDPSYrri, X86::VPBLENDDYrri }, + { X86::VBLENDPSYrmi, X86::VBLENDPSYrmi, X86::VPBLENDDYrmi }, + { X86::VPERMILPSYmi, X86::VPERMILPSYmi, X86::VPSHUFDYmi }, + { X86::VPERMILPSYri, X86::VPERMILPSYri, X86::VPSHUFDYri }, + { X86::VUNPCKLPDYrm, X86::VUNPCKLPDYrm, X86::VPUNPCKLQDQYrm }, + { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrr, X86::VPUNPCKLQDQYrr }, + { X86::VUNPCKHPDYrm, 
X86::VUNPCKHPDYrm, X86::VPUNPCKHQDQYrm }, + { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrr, X86::VPUNPCKHQDQYrr }, + { X86::VUNPCKLPSYrm, X86::VUNPCKLPSYrm, X86::VPUNPCKLDQYrm }, + { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrr, X86::VPUNPCKLDQYrr }, + { X86::VUNPCKHPSYrm, X86::VUNPCKHPSYrm, X86::VPUNPCKHDQYrm }, + { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrr, X86::VPUNPCKHDQYrr }, }; static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = { @@ -9787,9 +10058,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VDIVPDZ256rr: case X86::VDIVPDZ256rrk: case X86::VDIVPDZ256rrkz: - case X86::VDIVPDZrb: - case X86::VDIVPDZrbk: - case X86::VDIVPDZrbkz: + case X86::VDIVPDZrrb: + case X86::VDIVPDZrrbk: + case X86::VDIVPDZrrbkz: case X86::VDIVPDZrm: case X86::VDIVPDZrmb: case X86::VDIVPDZrmbk: @@ -9817,9 +10088,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VDIVPSZ256rr: case X86::VDIVPSZ256rrk: case X86::VDIVPSZ256rrkz: - case X86::VDIVPSZrb: - case X86::VDIVPSZrbk: - case X86::VDIVPSZrbkz: + case X86::VDIVPSZrrb: + case X86::VDIVPSZrrbk: + case X86::VDIVPSZrrbkz: case X86::VDIVPSZrm: case X86::VDIVPSZrmb: case X86::VDIVPSZrmbk: @@ -9837,9 +10108,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VDIVSDZrr_Int: case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz: - case X86::VDIVSDZrrb: - case X86::VDIVSDZrrbk: - case X86::VDIVSDZrrbkz: + case X86::VDIVSDZrrb_Int: + case X86::VDIVSDZrrb_Intk: + case X86::VDIVSDZrrb_Intkz: case X86::VDIVSSZrm: case X86::VDIVSSZrr: case X86::VDIVSSZrm_Int: @@ -9848,9 +10119,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const { case X86::VDIVSSZrr_Int: case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz: - case X86::VDIVSSZrrb: - case X86::VDIVSSZrrbk: - case X86::VDIVSSZrrbkz: + case X86::VDIVSSZrrb_Int: + case X86::VDIVSSZrrb_Intk: + case X86::VDIVSSZrrb_Intkz: case X86::VSQRTPDZ128m: case X86::VSQRTPDZ128mb: case X86::VSQRTPDZ128mbk: @@ -10419,7 +10690,7 @@ namespace { LDTLSCleanup() : MachineFunctionPass(ID) {} bool runOnMachineFunction(MachineFunction &MF) override { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>(); @@ -10528,29 +10799,72 @@ char LDTLSCleanup::ID = 0; FunctionPass* llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); } -unsigned X86InstrInfo::getOutliningBenefit(size_t SequenceSize, - size_t Occurrences, - bool CanBeTailCall) const { - unsigned NotOutlinedSize = SequenceSize * Occurrences; - unsigned OutlinedSize; - - // Is it a tail call? - if (CanBeTailCall) { - // If yes, we don't have to include a return instruction-- it's already in - // our sequence. So we have one occurrence of the sequence + #Occurrences - // calls. - OutlinedSize = SequenceSize + Occurrences; - } else { - // If not, add one for the return instruction. - OutlinedSize = (SequenceSize + 1) + Occurrences; - } +/// Constants defining how certain sequences should be outlined. +/// +/// \p MachineOutlinerDefault implies that the function is called with a call +/// instruction, and a return must be emitted for the outlined function frame. +/// +/// That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> call OUTLINED_FUNCTION I1 +/// I3 I2 +/// I3 +/// ret +/// +/// * Call construction overhead: 1 (call instruction) +/// * Frame construction overhead: 1 (return instruction) +/// +/// \p MachineOutlinerTailCall implies that the function is being tail called. 
+/// A jump is emitted instead of a call, and the return is already present in +/// the outlined sequence. That is, +/// +/// I1 OUTLINED_FUNCTION: +/// I2 --> jmp OUTLINED_FUNCTION I1 +/// ret I2 +/// ret +/// +/// * Call construction overhead: 1 (jump instruction) +/// * Frame construction overhead: 0 (don't need to return) +/// +enum MachineOutlinerClass { + MachineOutlinerDefault, + MachineOutlinerTailCall +}; - // Return the number of instructions saved by outlining this sequence. - return NotOutlinedSize > OutlinedSize ? NotOutlinedSize - OutlinedSize : 0; +X86GenInstrInfo::MachineOutlinerInfo +X86InstrInfo::getOutlininingCandidateInfo( + std::vector< + std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> + &RepeatedSequenceLocs) const { + + if (RepeatedSequenceLocs[0].second->isTerminator()) + return MachineOutlinerInfo(1, // Number of instructions to emit call. + 0, // Number of instructions to emit frame. + MachineOutlinerTailCall, // Type of call. + MachineOutlinerTailCall // Type of frame. + ); + + return MachineOutlinerInfo(1, 1, MachineOutlinerDefault, + MachineOutlinerDefault); } -bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const { - return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone); +bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF, + bool OutlineFromLinkOnceODRs) const { + const Function &F = MF.getFunction(); + + // Does the function use a red zone? If it does, then we can't risk messing + // with the stack. + if (!F.hasFnAttribute(Attribute::NoRedZone)) + return false; + + // If we *don't* want to outline from things that could potentially be deduped + // then return false. + if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage()) + return false; + + // This function is viable for outlining, so return true. + return true; } X86GenInstrInfo::MachineOutlinerInstrType @@ -10580,7 +10894,7 @@ X86InstrInfo::getOutliningType(MachineInstr &MI) const { // FIXME: There are instructions which are being manually built without // explicit uses/defs so we also have to check the MCInstrDesc. We should be // able to remove the extra checks once those are fixed up. For example, - // sometimes we might get something like %RAX<def> = POP64r 1. This won't be + // sometimes we might get something like %rax = POP64r 1. This won't be // caught by modifiesRegister or readsRegister even though the instruction // really ought to be formed so that modifiesRegister/readsRegister would // catch it. @@ -10610,10 +10924,10 @@ X86InstrInfo::getOutliningType(MachineInstr &MI) const { void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB, MachineFunction &MF, - bool IsTailCall) const { - + const MachineOutlinerInfo &MInfo) + const { // If we're a tail call, we already have a return, so don't do anything. - if (IsTailCall) + if (MInfo.FrameConstructionID == MachineOutlinerTailCall) return; // We're a normal call, so our sequence doesn't have a return instruction. @@ -10624,15 +10938,16 @@ void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB, void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF, - bool IsTailCall) const {} + const MachineOutlinerInfo &MInfo) + const {} MachineBasicBlock::iterator X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It, MachineFunction &MF, - bool IsTailCall) const { + const MachineOutlinerInfo &MInfo) const { // Is it a tail call? 
- if (IsTailCall) { + if (MInfo.CallConstructionID == MachineOutlinerTailCall) { // Yes, just insert a JMP. It = MBB.insert(It, BuildMI(MF, DebugLoc(), get(X86::JMP_1)) diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h index e64876073ccf..02a09c340cef 100644 --- a/lib/Target/X86/X86InstrInfo.h +++ b/lib/Target/X86/X86InstrInfo.h @@ -18,19 +18,19 @@ #include "X86InstrFMA3Info.h" #include "X86RegisterInfo.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #define GET_INSTRINFO_HEADER #include "X86GenInstrInfo.inc" namespace llvm { - class MachineInstrBuilder; - class X86RegisterInfo; - class X86Subtarget; +class MachineInstrBuilder; +class X86RegisterInfo; +class X86Subtarget; namespace X86 { - // X86 specific condition code. These correspond to X86_*_COND in - // X86InstrInfo.td. They must be kept in synch. +// X86 specific condition code. These correspond to X86_*_COND in +// X86InstrInfo.td. They must be kept in synch. enum CondCode { COND_A = 0, COND_AE = 1, @@ -83,18 +83,17 @@ CondCode getCondFromCMovOpc(unsigned Opc); /// GetOppositeBranchCondition - Return the inverse of the specified cond, /// e.g. turning COND_E to COND_NE. CondCode GetOppositeBranchCondition(CondCode CC); -} // end namespace X86; - +} // namespace X86 /// isGlobalStubReference - Return true if the specified TargetFlag operand is /// a reference to a stub for a global, not the global itself. inline static bool isGlobalStubReference(unsigned char TargetFlag) { switch (TargetFlag) { - case X86II::MO_DLLIMPORT: // dllimport stub. - case X86II::MO_GOTPCREL: // rip-relative GOT reference. - case X86II::MO_GOT: // normal GOT reference. - case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref. - case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref. + case X86II::MO_DLLIMPORT: // dllimport stub. + case X86II::MO_GOTPCREL: // rip-relative GOT reference. + case X86II::MO_GOT: // normal GOT reference. + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref. + case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref. return true; default: return false; @@ -106,11 +105,11 @@ inline static bool isGlobalStubReference(unsigned char TargetFlag) { /// is true, the addressing mode has the PIC base register added in (e.g. EBX). inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) { switch (TargetFlag) { - case X86II::MO_GOTOFF: // isPICStyleGOT: local global. - case X86II::MO_GOT: // isPICStyleGOT: other global. - case X86II::MO_PIC_BASE_OFFSET: // Darwin local global. - case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global. - case X86II::MO_TLVP: // ??? Pretty sure.. + case X86II::MO_GOTOFF: // isPICStyleGOT: local global. + case X86II::MO_GOT: // isPICStyleGOT: other global. + case X86II::MO_PIC_BASE_OFFSET: // Darwin local global. + case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global. + case X86II::MO_TLVP: // ??? Pretty sure.. 
return true; default: return false; @@ -118,9 +117,8 @@ inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) { } inline static bool isScale(const MachineOperand &MO) { - return MO.isImm() && - (MO.getImm() == 1 || MO.getImm() == 2 || - MO.getImm() == 4 || MO.getImm() == 8); + return MO.isImm() && (MO.getImm() == 1 || MO.getImm() == 2 || + MO.getImm() == 4 || MO.getImm() == 8); } inline static bool isLeaMem(const MachineInstr &MI, unsigned Op) { @@ -150,8 +148,8 @@ class X86InstrInfo final : public X86GenInstrInfo { /// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1, /// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps. /// - typedef DenseMap<unsigned, - std::pair<uint16_t, uint16_t> > RegOp2MemOpTableType; + typedef DenseMap<unsigned, std::pair<uint16_t, uint16_t>> + RegOp2MemOpTableType; RegOp2MemOpTableType RegOp2MemOpTable2Addr; RegOp2MemOpTableType RegOp2MemOpTable0; RegOp2MemOpTableType RegOp2MemOpTable1; @@ -161,13 +159,13 @@ class X86InstrInfo final : public X86GenInstrInfo { /// MemOp2RegOpTable - Load / store unfolding opcode map. /// - typedef DenseMap<unsigned, - std::pair<uint16_t, uint16_t> > MemOp2RegOpTableType; + typedef DenseMap<unsigned, std::pair<uint16_t, uint16_t>> + MemOp2RegOpTableType; MemOp2RegOpTableType MemOp2RegOpTable; static void AddTableEntry(RegOp2MemOpTableType &R2MTable, - MemOp2RegOpTableType &M2RTable, - uint16_t RegOp, uint16_t MemOp, uint16_t Flags); + MemOp2RegOpTableType &M2RTable, uint16_t RegOp, + uint16_t MemOp, uint16_t Flags); virtual void anchor(); @@ -216,9 +214,8 @@ public: /// true, then it's expected the pre-extension value is available as a subreg /// of the result register. This also returns the sub-register index in /// SubIdx. - bool isCoalescableExtInstr(const MachineInstr &MI, - unsigned &SrcReg, unsigned &DstReg, - unsigned &SubIdx) const override; + bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg, + unsigned &DstReg, unsigned &SubIdx) const override; unsigned isLoadFromStackSlot(const MachineInstr &MI, int &FrameIndex) const override; @@ -253,8 +250,8 @@ public: /// operand to the LEA instruction. bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src, unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc, - bool &isKill, bool &isUndef, - MachineOperand &ImplicitOp, LiveVariables *LV) const; + bool &isKill, bool &isUndef, MachineOperand &ImplicitOp, + LiveVariables *LV) const; /// convertToThreeAddress - This method must be implemented by targets that /// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target @@ -312,8 +309,7 @@ public: /// FMA213 #1, #2, #3 /// results into instruction with adjusted opcode: /// FMA231 #3, #2, #1 - bool findFMA3CommutedOpIndices(const MachineInstr &MI, - unsigned &SrcOpIdx1, + bool findFMA3CommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1, unsigned &SrcOpIdx2, const X86InstrFMA3Group &FMA3Group) const; @@ -332,10 +328,10 @@ public: /// FMA213 #1, #2, #3 /// results into instruction with adjusted opcode: /// FMA231 #3, #2, #1 - unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, - unsigned SrcOpIdx1, - unsigned SrcOpIdx2, - const X86InstrFMA3Group &FMA3Group) const; + unsigned + getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, unsigned SrcOpIdx1, + unsigned SrcOpIdx2, + const X86InstrFMA3Group &FMA3Group) const; // Branch analysis. 
bool isUnpredicatedTerminator(const MachineInstr &MI) const override; @@ -364,8 +360,8 @@ public: MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond, const DebugLoc &DL, int *BytesAdded = nullptr) const override; - bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond, - unsigned, unsigned, int&, int&, int&) const override; + bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond, + unsigned, unsigned, int &, int &, int &) const override; void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, const DebugLoc &DL, unsigned DstReg, ArrayRef<MachineOperand> Cond, unsigned TrueReg, @@ -374,8 +370,8 @@ public: const DebugLoc &DL, unsigned DestReg, unsigned SrcReg, bool KillSrc) const override; void storeRegToStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned SrcReg, bool isKill, int FrameIndex, + MachineBasicBlock::iterator MI, unsigned SrcReg, + bool isKill, int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; @@ -384,12 +380,11 @@ public: const TargetRegisterClass *RC, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, - SmallVectorImpl<MachineInstr*> &NewMIs) const; + SmallVectorImpl<MachineInstr *> &NewMIs) const; void loadRegFromStackSlot(MachineBasicBlock &MBB, - MachineBasicBlock::iterator MI, - unsigned DestReg, int FrameIndex, - const TargetRegisterClass *RC, + MachineBasicBlock::iterator MI, unsigned DestReg, + int FrameIndex, const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const override; void loadRegFromAddr(MachineFunction &MF, unsigned DestReg, @@ -397,7 +392,7 @@ public: const TargetRegisterClass *RC, MachineInstr::mmo_iterator MMOBegin, MachineInstr::mmo_iterator MMOEnd, - SmallVectorImpl<MachineInstr*> &NewMIs) const; + SmallVectorImpl<MachineInstr *> &NewMIs) const; bool expandPostRAPseudo(MachineInstr &MI) const override; @@ -434,7 +429,7 @@ public: SmallVectorImpl<MachineInstr *> &NewMIs) const override; bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N, - SmallVectorImpl<SDNode*> &NewNodes) const override; + SmallVectorImpl<SDNode *> &NewNodes) const override; /// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new /// instruction after load / store are unfolded from an instruction of the @@ -442,9 +437,9 @@ public: /// possible. If LoadRegIndex is non-null, it is filled in with the operand /// index of the operand which will hold the register holding the loaded /// value. - unsigned getOpcodeAfterMemoryUnfold(unsigned Opc, - bool UnfoldLoad, bool UnfoldStore, - unsigned *LoadRegIndex = nullptr) const override; + unsigned + getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore, + unsigned *LoadRegIndex = nullptr) const override; /// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler /// to determine if two loads are loading from the same base address. It @@ -455,15 +450,15 @@ public: int64_t &Offset2) const override; /// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to - /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should - /// be scheduled togther. On some targets if two loads are loading from + /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads + /// should be scheduled togther. On some targets if two loads are loading from /// addresses in the same cache line, it's better if they are scheduled /// together. 
This function takes two integers that represent the load offsets /// from the common base address. It returns true if it decides it's desirable /// to schedule the two loads together. "NumLoads" is the number of loads that /// have already been scheduled after Load1. - bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, - int64_t Offset1, int64_t Offset2, + bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1, + int64_t Offset2, unsigned NumLoads) const override; void getNoop(MCInst &NopInst) const override; @@ -520,9 +515,7 @@ public: const MachineInstr &UseMI, unsigned UseIdx) const override; - bool useMachineCombiner() const override { - return true; - } + bool useMachineCombiner() const override { return true; } bool isAssociativeAndCommutative(const MachineInstr &Inst) const override; @@ -566,28 +559,28 @@ public: ArrayRef<std::pair<unsigned, const char *>> getSerializableDirectMachineOperandTargetFlags() const override; - unsigned getOutliningBenefit(size_t SequenceSize, - size_t Occurrences, - bool CanBeTailCall) const override; + virtual MachineOutlinerInfo getOutlininingCandidateInfo( + std::vector< + std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>> + &RepeatedSequenceLocs) const override; - bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override; + bool isFunctionSafeToOutlineFrom(MachineFunction &MF, + bool OutlineFromLinkOnceODRs) const override; llvm::X86GenInstrInfo::MachineOutlinerInstrType getOutliningType(MachineInstr &MI) const override; - void insertOutlinerEpilogue(MachineBasicBlock &MBB, - MachineFunction &MF, - bool IsTailCall) const override; + void insertOutlinerEpilogue(MachineBasicBlock &MBB, MachineFunction &MF, + const MachineOutlinerInfo &MInfo) const override; - void insertOutlinerPrologue(MachineBasicBlock &MBB, - MachineFunction &MF, - bool isTailCall) const override; + void insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF, + const MachineOutlinerInfo &MInfo) const override; MachineBasicBlock::iterator insertOutlinedCall(Module &M, MachineBasicBlock &MBB, - MachineBasicBlock::iterator &It, - MachineFunction &MF, - bool IsTailCall) const override; + MachineBasicBlock::iterator &It, MachineFunction &MF, + const MachineOutlinerInfo &MInfo) const override; + protected: /// Commutes the operands in the given instruction by changing the operands /// order and/or changing the instruction's opcode and/or the immediate value @@ -643,6 +636,6 @@ private: unsigned &SrcOpIdx2) const; }; -} // End llvm namespace +} // namespace llvm #endif diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index fab70e918b8a..42e89cb4831d 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -82,6 +82,9 @@ def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<1>, SDTCisInt<2>]>; +def SDTLockUnaryArithWithFlags : SDTypeProfile<1, 1, [SDTCisVT<0, i32>, + SDTCisPtrTy<1>]>; + def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, @@ -271,7 +274,12 @@ def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags, [SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>; -def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>; +def X86lock_inc : SDNode<"X86ISD::LINC", SDTLockUnaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, SDNPMayLoad, + SDNPMemOperand]>; +def X86lock_dec : SDNode<"X86ISD::LDEC", SDTLockUnaryArithWithFlags, + [SDNPHasChain, SDNPMayStore, 
SDNPMayLoad, + SDNPMemOperand]>; def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>; @@ -592,19 +600,11 @@ def SSECC : Operand<i8> { let OperandType = "OPERAND_IMMEDIATE"; } -def i8immZExt3 : ImmLeaf<i8, [{ - return Imm >= 0 && Imm < 8; -}]>; - def AVXCC : Operand<i8> { let PrintMethod = "printSSEAVXCC"; let OperandType = "OPERAND_IMMEDIATE"; } -def i8immZExt5 : ImmLeaf<i8, [{ - return Imm >= 0 && Imm < 32; -}]>; - def AVX512ICC : Operand<i8> { let PrintMethod = "printSSEAVXCC"; let OperandType = "OPERAND_IMMEDIATE"; @@ -803,6 +803,7 @@ def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">; def HasSSE42 : Predicate<"Subtarget->hasSSE42()">; def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">; def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">; +def NoAVX : Predicate<"!Subtarget->hasAVX()">; def HasAVX : Predicate<"Subtarget->hasAVX()">; def HasAVX2 : Predicate<"Subtarget->hasAVX2()">; def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">; @@ -831,30 +832,41 @@ def NoVLX : Predicate<"!Subtarget->hasVLX()">; def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">; def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">; def PKU : Predicate<"Subtarget->hasPKU()">; +def HasVNNI : Predicate<"Subtarget->hasVNNI()">; +def HasBITALG : Predicate<"Subtarget->hasBITALG()">; def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">; def HasAES : Predicate<"Subtarget->hasAES()">; +def HasVAES : Predicate<"Subtarget->hasVAES()">; +def NoVLX_Or_NoVAES : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVAES()">; def HasFXSR : Predicate<"Subtarget->hasFXSR()">; def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">; def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">; def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">; def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">; def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">; +def NoVLX_Or_NoVPCLMULQDQ : + Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVPCLMULQDQ()">; +def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">; +def HasGFNI : Predicate<"Subtarget->hasGFNI()">; def HasFMA : Predicate<"Subtarget->hasFMA()">; def HasFMA4 : Predicate<"Subtarget->hasFMA4()">; +def NoFMA4 : Predicate<"!Subtarget->hasFMA4()">; def HasXOP : Predicate<"Subtarget->hasXOP()">; def HasTBM : Predicate<"Subtarget->hasTBM()">; +def NoTBM : Predicate<"!Subtarget->hasTBM()">; def HasLWP : Predicate<"Subtarget->hasLWP()">; def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">; def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">; def HasF16C : Predicate<"Subtarget->hasF16C()">; -def NoF16C : Predicate<"!Subtarget->hasF16C()">; def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">; def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">; def HasBMI : Predicate<"Subtarget->hasBMI()">; def HasBMI2 : Predicate<"Subtarget->hasBMI2()">; +def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">; def HasVBMI : Predicate<"Subtarget->hasVBMI()">, AssemblerPredicate<"FeatureVBMI", "AVX-512 VBMI ISA">; +def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">; def HasIFMA : Predicate<"Subtarget->hasIFMA()">, AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">; def HasRTM : Predicate<"Subtarget->hasRTM()">; @@ -869,7 +881,10 @@ def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">; def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">; def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">; def HasMPX : Predicate<"Subtarget->hasMPX()">; +def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">; +def HasIBT : 
Predicate<"Subtarget->hasIBT()">; def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">; +def HasCLWB : Predicate<"Subtarget->hasCLWB()">; def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">; def Not64BitMode : Predicate<"!Subtarget->is64Bit()">, AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">; @@ -903,15 +918,15 @@ def IsNotPIC : Predicate<"!TM.isPositionIndependent()">; // the Function object through the <Target>Subtarget and objections were raised // to that (see post-commit review comments for r301750). let RecomputePerFunction = 1 in { - def OptForSize : Predicate<"MF->getFunction()->optForSize()">; - def OptForMinSize : Predicate<"MF->getFunction()->optForMinSize()">; - def OptForSpeed : Predicate<"!MF->getFunction()->optForSize()">; + def OptForSize : Predicate<"MF->getFunction().optForSize()">; + def OptForMinSize : Predicate<"MF->getFunction().optForMinSize()">; + def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">; + def UseIncDec : Predicate<"!Subtarget->slowIncDec() || " + "MF->getFunction().optForSize()">; } -def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">; def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">; -def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">; -def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">; +def FavorMemIndirectCall : Predicate<"!Subtarget->slowTwoMemOps()">; def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">; def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">; def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">; @@ -1108,6 +1123,17 @@ let hasSideEffects = 0, SchedRW = [WriteZero] in { "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16; def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero), "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32; + def NOOPQ : RI<0x1f, MRMXm, (outs), (ins i64mem:$zero), + "nop{q}\t$zero", [], IIC_NOP>, TB, + Requires<[In64BitMode]>; + // Also allow register so we can assemble/disassemble + def NOOPWr : I<0x1f, MRMXr, (outs), (ins GR16:$zero), + "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16; + def NOOPLr : I<0x1f, MRMXr, (outs), (ins GR32:$zero), + "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32; + def NOOPQr : RI<0x1f, MRMXr, (outs), (ins GR64:$zero), + "nop{q}\t$zero", [], IIC_NOP>, TB, + Requires<[In64BitMode]>; } @@ -1131,7 +1157,8 @@ def LEAVE64 : I<0xC9, RawFrm, // Miscellaneous Instructions. 
// -let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in +let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1, + SchedRW = [WriteSystem] in def Int_eh_sjlj_setup_dispatch : PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>; @@ -1461,7 +1488,8 @@ def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src), [(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32; def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src), "mov{q}\t{$src, $dst|$dst, $src}", - [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>; + [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>, + Requires<[In64BitMode]>; } // SchedRW let hasSideEffects = 0 in { @@ -1535,33 +1563,39 @@ def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst), let mayLoad = 1 in { let Defs = [AL] in def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src), - "movabs{b}\t{$src, %al|al, $src}", []>, AdSize64; + "movabs{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>, + AdSize64; let Defs = [AX] in def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src), - "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, AdSize64; + "movabs{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>, + OpSize16, AdSize64; let Defs = [EAX] in def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src), - "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32, - AdSize64; + "movabs{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>, + OpSize32, AdSize64; let Defs = [RAX] in def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src), - "movabs{q}\t{$src, %rax|rax, $src}", []>, AdSize64; + "movabs{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>, + AdSize64; } let mayStore = 1 in { let Uses = [AL] in def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst), - "movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64; + "movabs{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>, + AdSize64; let Uses = [AX] in def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst), - "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64; + "movabs{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>, + OpSize16, AdSize64; let Uses = [EAX] in def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst), - "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32, - AdSize64; + "movabs{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>, + OpSize32, AdSize64; let Uses = [RAX] in def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst), - "movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64; + "movabs{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>, + AdSize64; } } // hasSideEffects = 0 @@ -1654,40 +1688,36 @@ let SchedRW = [WriteALU] in { def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>, - OpSize16, TB; + OpSize16, TB, NotMemoryFoldable; def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>, - OpSize32, TB; + OpSize32, TB, NotMemoryFoldable; def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", - [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB; + [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB, + NotMemoryFoldable; } // SchedRW // Unlike with the register+register form, the memory+register 
form of the // bt instruction does not ignore the high bits of the index. From ISel's // perspective, this is pretty bizarre. Make these instructions disassembly -// only for now. +// only for now. These instructions are also slow on modern CPUs so that's +// another reason to avoid generating them. let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in { def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", - // [(X86bt (loadi16 addr:$src1), GR16:$src2), - // (implicit EFLAGS)] [], IIC_BT_MR - >, OpSize16, TB, Requires<[FastBTMem]>; + >, OpSize16, TB, NotMemoryFoldable; def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "bt{l}\t{$src2, $src1|$src1, $src2}", - // [(X86bt (loadi32 addr:$src1), GR32:$src2), - // (implicit EFLAGS)] [], IIC_BT_MR - >, OpSize32, TB, Requires<[FastBTMem]>; + >, OpSize32, TB, NotMemoryFoldable; def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", - // [(X86bt (loadi64 addr:$src1), GR64:$src2), - // (implicit EFLAGS)] [], IIC_BT_MR - >, TB; + >, TB, NotMemoryFoldable; } let SchedRW = [WriteALU] in { @@ -1705,9 +1735,8 @@ def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2), IIC_BT_RI>, TB; } // SchedRW -// Note that these instructions don't need FastBTMem because that -// only applies when the other operand is in a register. When it's -// an immediate, bt is still fast. +// Note that these instructions aren't slow because that only applies when the +// other operand is in a register. When it's an immediate, bt is still fast. let SchedRW = [WriteALU] in { def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2), "bt{w}\t{$src2, $src1|$src1, $src2}", @@ -1720,40 +1749,43 @@ def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2), def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2), "bt{q}\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86bt (loadi64 addr:$src1), - i64immSExt8:$src2))], IIC_BT_MI>, TB; + i64immSExt8:$src2))], IIC_BT_MI>, TB, + Requires<[In64BitMode]>; } // SchedRW let hasSideEffects = 0 in { -let SchedRW = [WriteALU] in { -def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), +let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, - OpSize16, TB; -def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + OpSize16, TB, NotMemoryFoldable; +def BTC32rr : I<0xBB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, - OpSize32, TB; -def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; + OpSize32, TB, NotMemoryFoldable; +def BTC64rr : RI<0xBB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB, + NotMemoryFoldable; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, - OpSize16, TB; + OpSize16, TB, NotMemoryFoldable; def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, - OpSize32, TB; + OpSize32, TB, NotMemoryFoldable; 
def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB, + NotMemoryFoldable; } -let SchedRW = [WriteALU] in { -def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2), +let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize16, TB; -def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2), +def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize32, TB; -def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2), +def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; } // SchedRW @@ -1765,39 +1797,41 @@ def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize32, TB; def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2), - "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; + "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB, + Requires<[In64BitMode]>; } -let SchedRW = [WriteALU] in { -def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), +let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, - OpSize16, TB; -def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + OpSize16, TB, NotMemoryFoldable; +def BTR32rr : I<0xB3, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, - OpSize32, TB; -def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), - "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB; + OpSize32, TB, NotMemoryFoldable; +def BTR64rr : RI<0xB3, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, NotMemoryFoldable; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, - OpSize16, TB; + OpSize16, TB, NotMemoryFoldable; def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, - OpSize32, TB; + OpSize32, TB, NotMemoryFoldable; def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), - "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB, + NotMemoryFoldable; } -let SchedRW = [WriteALU] in { -def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2), +let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize16, TB; -def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2), +def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize32, TB; -def BTR64ri8 : RIi8<0xBA, 
MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2), +def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; } // SchedRW @@ -1809,39 +1843,42 @@ def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize32, TB; def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2), - "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; + "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB, + Requires<[In64BitMode]>; } -let SchedRW = [WriteALU] in { -def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2), +let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, - OpSize16, TB; -def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2), + OpSize16, TB, NotMemoryFoldable; +def BTS32rr : I<0xAB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, - OpSize32, TB; -def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB; + OpSize32, TB, NotMemoryFoldable; +def BTS64rr : RI<0xAB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2), + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB, + NotMemoryFoldable; } // SchedRW let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in { def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, - OpSize16, TB; + OpSize16, TB, NotMemoryFoldable; def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, - OpSize32, TB; + OpSize32, TB, NotMemoryFoldable; def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB; + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB, + NotMemoryFoldable; } -let SchedRW = [WriteALU] in { -def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2), +let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in { +def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2), "bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize16, TB; -def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2), +def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, OpSize32, TB; -def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2), +def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2), "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB; } // SchedRW @@ -1853,7 +1890,8 @@ def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2), "bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, OpSize32, TB; def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2), - "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB; + "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB, + Requires<[In64BitMode]>; } } // hasSideEffects = 0 } // Defs = [EFLAGS] @@ -2000,35 +2038,38 @@ def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst), let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, 
RBX, RCX, RDX] in def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst), "cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>, - TB, Requires<[HasCmpxchg16b]>; + TB, Requires<[HasCmpxchg16b, In64BitMode]>; } // SchedRW // Lock instruction prefix +let SchedRW = [WriteMicrocoded] in def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>; +let SchedRW = [WriteNop] in { + // Rex64 instruction prefix -def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>, +def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", [], IIC_NOP>, Requires<[In64BitMode]>; // Data16 instruction prefix -def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>, +def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", [], IIC_NOP>, Requires<[Not16BitMode]>; // Data instruction prefix -def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", []>, +def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", [], IIC_NOP>, Requires<[In16BitMode]>; +} // SchedRW // Repeat string operation instruction prefixes -// These uses the DF flag in the EFLAGS register to inc or dec ECX -let Defs = [ECX], Uses = [ECX,EFLAGS] in { +// These use the DF flag in the EFLAGS register to inc or dec ECX +let Defs = [ECX], Uses = [ECX,EFLAGS], SchedRW = [WriteMicrocoded] in { // Repeat (used with INS, OUTS, MOVS, LODS and STOS) def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>; // Repeat while not equal (used with CMPS and SCAS) def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>; } - // String manipulation instructions let SchedRW = [WriteMicrocoded] in { // These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI @@ -2174,31 +2215,35 @@ let Predicates = [HasMOVBE] in { //===----------------------------------------------------------------------===// // RDRAND Instruction // -let Predicates = [HasRDRAND], Defs = [EFLAGS] in { +let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in { def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins), "rdrand{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize16, TB; + [(set GR16:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>, + OpSize16, PS; def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins), "rdrand{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86rdrand))]>, OpSize32, TB; + [(set GR32:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>, + OpSize32, PS; def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins), "rdrand{q}\t$dst", - [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB; + [(set GR64:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>, PS; } //===----------------------------------------------------------------------===// // RDSEED Instruction // -let Predicates = [HasRDSEED], Defs = [EFLAGS] in { +let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in { def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins), "rdseed{w}\t$dst", - [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB; + [(set GR16:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>, + OpSize16, PS; def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins), "rdseed{l}\t$dst", - [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB; + [(set GR32:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>, + OpSize32, PS; def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins), "rdseed{q}\t$dst", - [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB; + [(set GR64:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>, PS; } //===----------------------------------------------------------------------===// @@ -2207,30 +2252,33 @@ let Predicates = [HasRDSEED], Defs = [EFLAGS] in { let Predicates = [HasLZCNT], Defs = 
[EFLAGS] in { def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "lzcnt{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS, - OpSize16; + [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)], + IIC_LZCNT_RR>, XS, OpSize16, Sched<[WriteIMul]>; def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "lzcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (ctlz (loadi16 addr:$src))), - (implicit EFLAGS)]>, XS, OpSize16; + (implicit EFLAGS)], IIC_LZCNT_RM>, XS, OpSize16, + Sched<[WriteIMulLd]>; def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "lzcnt{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS, - OpSize32; + [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)], + IIC_LZCNT_RR>, XS, OpSize32, Sched<[WriteIMul]>; def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "lzcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (ctlz (loadi32 addr:$src))), - (implicit EFLAGS)]>, XS, OpSize32; + (implicit EFLAGS)], IIC_LZCNT_RM>, XS, OpSize32, + Sched<[WriteIMulLd]>; def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "lzcnt{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>, - XS; + [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)], + IIC_LZCNT_RR>, XS, Sched<[WriteIMul]>; def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "lzcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (ctlz (loadi64 addr:$src))), - (implicit EFLAGS)]>, XS; + (implicit EFLAGS)], IIC_LZCNT_RM>, XS, + Sched<[WriteIMulLd]>; } //===----------------------------------------------------------------------===// @@ -2239,30 +2287,33 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in { let Predicates = [HasBMI], Defs = [EFLAGS] in { def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src), "tzcnt{w}\t{$src, $dst|$dst, $src}", - [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS, - OpSize16; + [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)], + IIC_TZCNT_RR>, XS, OpSize16, Sched<[WriteIMul]>; def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src), "tzcnt{w}\t{$src, $dst|$dst, $src}", [(set GR16:$dst, (cttz (loadi16 addr:$src))), - (implicit EFLAGS)]>, XS, OpSize16; + (implicit EFLAGS)], IIC_TZCNT_RM>, XS, OpSize16, + Sched<[WriteIMulLd]>; def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), "tzcnt{l}\t{$src, $dst|$dst, $src}", - [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS, - OpSize32; + [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)], + IIC_TZCNT_RR>, XS, OpSize32, Sched<[WriteIMul]>; def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), "tzcnt{l}\t{$src, $dst|$dst, $src}", [(set GR32:$dst, (cttz (loadi32 addr:$src))), - (implicit EFLAGS)]>, XS, OpSize32; + (implicit EFLAGS)], IIC_TZCNT_RM>, XS, OpSize32, + Sched<[WriteIMulLd]>; def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), "tzcnt{q}\t{$src, $dst|$dst, $src}", - [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>, - XS; + [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)], + IIC_TZCNT_RR>, XS, Sched<[WriteIMul]>; def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), "tzcnt{q}\t{$src, $dst|$dst, $src}", [(set GR64:$dst, (cttz (loadi64 addr:$src))), - (implicit EFLAGS)]>, XS; + (implicit EFLAGS)], IIC_TZCNT_RM>, XS, + Sched<[WriteIMulLd]>; } multiclass bmi_bls<string mnemonic, Format RegMRM, Format 
MemMRM, @@ -2270,11 +2321,11 @@ multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM, let hasSideEffects = 0 in { def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), - []>, T8PS, VEX_4V; + [], IIC_UNARY_REG>, T8PS, VEX_4V, Sched<[WriteALU]>; let mayLoad = 1 in def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src), !strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"), - []>, T8PS, VEX_4V; + [], IIC_UNARY_MEM>, T8PS, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>; } } @@ -2309,18 +2360,18 @@ let Predicates = [HasBMI] in { (BLSI64rr GR64:$src)>; } - multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, PatFrag ld_frag> { def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>, - T8PS, VEX; + [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)], IIC_BIN_NONMEM>, + T8PS, VEX, Sched<[WriteALU]>; def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)), - (implicit EFLAGS)]>, T8PS, VEX; + (implicit EFLAGS)], IIC_BIN_MEM>, T8PS, VEX, + Sched<[WriteALULd, ReadAfterLd]>; } let Predicates = [HasBMI], Defs = [EFLAGS] in { @@ -2337,22 +2388,45 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in { int_x86_bmi_bzhi_64, loadi64>, VEX_W; } - def CountTrailingOnes : SDNodeXForm<imm, [{ // Count the trailing ones in the immediate. return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N)); }]>; -def BZHIMask : ImmLeaf<i64, [{ - return isMask_64(Imm) && (countTrailingOnes<uint64_t>(Imm) > 32); +def BEXTRMaskXForm : SDNodeXForm<imm, [{ + unsigned Length = countTrailingOnes(N->getZExtValue()); + return getI32Imm(Length << 8, SDLoc(N)); }]>; -let Predicates = [HasBMI2] in { - def : Pat<(and GR64:$src, BZHIMask:$mask), +def AndMask64 : ImmLeaf<i64, [{ + return isMask_64(Imm) && Imm > UINT32_MAX; +}]>; + +// Use BEXTR for 64-bit 'and' with large immediate 'mask'. +let Predicates = [HasBMI, NoBMI2, NoTBM] in { + def : Pat<(and GR64:$src, AndMask64:$mask), + (BEXTR64rr GR64:$src, + (SUBREG_TO_REG (i64 0), + (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>; + def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), + (BEXTR64rm addr:$src, + (SUBREG_TO_REG (i64 0), + (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>; +} + +// Use BZHI for 64-bit 'and' with large immediate 'mask'. 
+let Predicates = [HasBMI2, NoTBM] in { + def : Pat<(and GR64:$src, AndMask64:$mask), (BZHI64rr GR64:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; + def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), + (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>; +} +let Predicates = [HasBMI2] in { def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)), (BZHI32rr GR32:$src, (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; @@ -2402,27 +2476,17 @@ let Predicates = [HasBMI2] in { (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; } // HasBMI2 -let Predicates = [HasBMI] in { - def : Pat<(X86bextr GR32:$src1, GR32:$src2), - (BEXTR32rr GR32:$src1, GR32:$src2)>; - def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2), - (BEXTR32rm addr:$src1, GR32:$src2)>; - def : Pat<(X86bextr GR64:$src1, GR64:$src2), - (BEXTR64rr GR64:$src1, GR64:$src2)>; - def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2), - (BEXTR64rm addr:$src1, GR64:$src2)>; -} // HasBMI - multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC, X86MemOperand x86memop, Intrinsic Int, PatFrag ld_frag> { def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (Int RC:$src1, RC:$src2))]>, - VEX_4V; + [(set RC:$dst, (Int RC:$src1, RC:$src2))], IIC_BIN_NONMEM>, + VEX_4V, Sched<[WriteALU]>; def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V; + [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], + IIC_BIN_MEM>, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>; } let Predicates = [HasBMI2] in { @@ -2448,14 +2512,14 @@ multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr, def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl), !strconcat(OpcodeStr, "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), - [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>, - XOP, XOPA; + [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))], + IIC_BIN_NONMEM>, XOP, XOPA, Sched<[WriteALU]>; def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, immtype:$cntl), !strconcat(OpcodeStr, "\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"), - [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>, - XOP, XOPA; + [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))], + IIC_BIN_MEM>, XOP, XOPA, Sched<[WriteALULd, ReadAfterLd]>; } defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32, @@ -2471,11 +2535,11 @@ multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem, let hasSideEffects = 0 in { def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src), !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), - []>, XOP_4V, XOP9; + [], IIC_BIN_NONMEM>, XOP_4V, XOP9, Sched<[WriteALU]>; let mayLoad = 1 in def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"), - []>, XOP_4V, XOP9; + [], IIC_BIN_MEM>, XOP_4V, XOP9, Sched<[WriteALULd, ReadAfterLd]>; } } @@ -2498,34 +2562,43 @@ defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>; defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>; } // HasTBM, EFLAGS +// Use BEXTRI for 64-bit 'and' with large immediate 'mask'. 
+let Predicates = [HasTBM] in { + def : Pat<(and GR64:$src, AndMask64:$mask), + (BEXTRI64ri GR64:$src, (BEXTRMaskXForm imm:$mask))>; + + def : Pat<(and (loadi64 addr:$src), AndMask64:$mask), + (BEXTRI64mi addr:$src, (BEXTRMaskXForm imm:$mask))>; +} + //===----------------------------------------------------------------------===// // Lightweight Profiling Instructions -let Predicates = [HasLWP] in { +let Predicates = [HasLWP], SchedRW = [WriteSystem] in { def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src", [(int_x86_llwpcb GR32:$src)], IIC_LWP>, - XOP, XOP9, Requires<[Not64BitMode]>; + XOP, XOP9; def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst", [(set GR32:$dst, (int_x86_slwpcb))], IIC_LWP>, - XOP, XOP9, Requires<[Not64BitMode]>; + XOP, XOP9; def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src", [(int_x86_llwpcb GR64:$src)], IIC_LWP>, - XOP, XOP9, VEX_W, Requires<[In64BitMode]>; + XOP, XOP9, VEX_W; def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst", [(set GR64:$dst, (int_x86_slwpcb))], IIC_LWP>, - XOP, XOP9, VEX_W, Requires<[In64BitMode]>; + XOP, XOP9, VEX_W; multiclass lwpins_intr<RegisterClass RC> { def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>, + [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))], IIC_LWP>, XOP_4V, XOPA; let mayLoad = 1 in def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl), "lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}", - [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>, + [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))], IIC_LWP>, XOP_4V, XOPA; } @@ -2549,7 +2622,7 @@ multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> { defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>; defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W; -} // HasLWP +} // HasLWP, SchedRW //===----------------------------------------------------------------------===// // MONITORX/MWAITX Instructions @@ -2605,15 +2678,6 @@ def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>; //===----------------------------------------------------------------------===// let Predicates = [HasTBM] in { - def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)), - (BEXTRI32ri GR32:$src1, imm:$src2)>; - def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)), - (BEXTRI32mi addr:$src1, imm:$src2)>; - def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2), - (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>; - def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2), - (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>; - // FIXME: patterns for the load versions are not implemented def : Pat<(and GR32:$src, (add GR32:$src, 1)), (BLCFILL32rr GR32:$src)>; @@ -2671,11 +2735,14 @@ let Predicates = [HasTBM] in { // Memory Instructions // -let Predicates = [HasCLFLUSHOPT] in +let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src), - "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD; -def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD; + "clflushopt\t$src", [(int_x86_clflushopt addr:$src)], + IIC_SSE_PREFETCH>, PD; +let Predicates = [HasCLWB], SchedRW = [WriteLoad] in +def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", + [(int_x86_clwb addr:$src)], IIC_SSE_PREFETCH>, PD; 
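A side note on the AndMask64 patterns added a few hunks above: a 64-bit 'and' whose mask is a run of low ones wider than 32 bits cannot be encoded as a sign-extended imm32, so the new patterns select BEXTR (BMI, when BMI2/TBM are absent), BZHI (BMI2), or BEXTRI (TBM) instead, apparently to avoid materializing the mask with a movabsq. The sketch below illustrates the operand values that the CountTrailingOnes and BEXTRMaskXForm transforms compute; it is plain stand-alone C++ for illustration, not LLVM code, and trailingOnes is a local stand-in for llvm::countTrailingOnes.

#include <cassert>
#include <cstdint>

// Local stand-in for llvm::countTrailingOnes: number of contiguous low set
// bits, e.g. 0x000000FFFFFFFFFF -> 40.
static unsigned trailingOnes(uint64_t Imm) {
  unsigned N = 0;
  while (Imm & 1) { Imm >>= 1; ++N; }
  return N;
}

int main() {
  // A mask matching AndMask64: isMask_64(Imm) and Imm > UINT32_MAX.
  uint64_t Mask = (1ULL << 40) - 1;        // 40 low ones, too wide for imm32
  unsigned Len  = trailingOnes(Mask);      // 40

  // BEXTR/BEXTRI control word: bits [7:0] = start bit, bits [15:8] = length.
  // BEXTRMaskXForm therefore produces Length << 8 (start 0, length Len).
  uint32_t BextrControl = Len << 8;        // 0x2800, fits in a 32-bit move

  // BZHI takes the bit index directly, which is what CountTrailingOnes yields.
  unsigned BzhiIndex = Len;                // 40

  // All three forms compute the same value (Len < 64, so the shifts are safe).
  uint64_t Src      = 0x0123456789ABCDEFULL;
  uint64_t ViaAnd   = Src & Mask;
  uint64_t ViaBzhi  = Src & ((1ULL << BzhiIndex) - 1);
  uint64_t ViaBextr = (Src >> (BextrControl & 0xFF)) &
                      ((1ULL << ((BextrControl >> 8) & 0xFF)) - 1);
  assert(ViaAnd == ViaBzhi && ViaAnd == ViaBextr);
  return 0;
}

The TBM pattern can reuse the same control value directly as the BEXTRI64ri/BEXTRI64mi immediate, which is why it needs no extra MOV32ri, while the BMI pattern moves Length << 8 into a register first.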
//===----------------------------------------------------------------------===// // Subsystems. @@ -2719,6 +2786,7 @@ include "X86InstrSystem.td" // Compiler Pseudo Instructions and Pat Patterns include "X86InstrCompiler.td" +include "X86InstrVecCompiler.td" //===----------------------------------------------------------------------===// // Assembler Mnemonic Aliases @@ -2751,6 +2819,7 @@ def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"popf", "popfq", "intel">, Requires<[In64BitMode]>; def : MnemonicAlias<"popfd", "popfl", "att">; // FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in @@ -2762,6 +2831,7 @@ def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>; def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>; def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>; +def : MnemonicAlias<"pushf", "pushfq", "intel">, Requires<[In64BitMode]>; def : MnemonicAlias<"pushfd", "pushfl", "att">; def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>; @@ -2804,6 +2874,10 @@ def : MnemonicAlias<"smovq", "movsq", "att">; def : MnemonicAlias<"ud2a", "ud2", "att">; def : MnemonicAlias<"verrw", "verr", "att">; +// MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release' +def : MnemonicAlias<"acquire", "xacquire", "intel">; +def : MnemonicAlias<"release", "xrelease", "intel">; + // System instruction aliases. def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>; def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>; @@ -3122,8 +3196,8 @@ def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Req // Force mov without a suffix with a segment and mem to prefer the 'l' form of // the move. All segment/mem forms are equivalent, this has the shortest // encoding. -def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>; -def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>; +def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV16sm SEGMENT_REG:$seg, i16mem:$mem), 0>; +def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV16ms i16mem:$mem, SEGMENT_REG:$seg), 0>; // Match 'movq <largeimm>, <reg>' as an alias for movabsq. def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>; @@ -3209,14 +3283,14 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">; FIXME */ // test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms. 
-def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}", - (TEST8rm GR8 :$val, i8mem :$mem), 0>; -def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}", - (TEST16rm GR16:$val, i16mem:$mem), 0>; -def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}", - (TEST32rm GR32:$val, i32mem:$mem), 0>; -def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}", - (TEST64rm GR64:$val, i64mem:$mem), 0>; +def : InstAlias<"test{b}\t{$mem, $val|$val, $mem}", + (TEST8mr i8mem :$mem, GR8 :$val), 0>; +def : InstAlias<"test{w}\t{$mem, $val|$val, $mem}", + (TEST16mr i16mem:$mem, GR16:$val), 0>; +def : InstAlias<"test{l}\t{$mem, $val|$val, $mem}", + (TEST32mr i32mem:$mem, GR32:$val), 0>; +def : InstAlias<"test{q}\t{$mem, $val|$val, $mem}", + (TEST64mr i64mem:$mem, GR64:$val), 0>; // xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms. def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}", diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td index 2c047722db24..039b4a248544 100644 --- a/lib/Target/X86/X86InstrMMX.td +++ b/lib/Target/X86/X86InstrMMX.td @@ -143,7 +143,7 @@ multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr, def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR64:$dst, - (IntId64 (bitconvert (memopmmx addr:$src))))], + (IntId64 (bitconvert (load_mmx addr:$src))))], itins.rm>, Sched<[itins.Sched.Folded]>; } @@ -163,7 +163,7 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), [(set VR64:$dst, (IntId64 VR64:$src1, - (bitconvert (memopmmx addr:$src2))))], itins.rm>, + (bitconvert (load_mmx addr:$src2))))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } @@ -616,7 +616,8 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR64:$src), "pmovmskb\t{$src, $dst|$dst, $src}", [(set GR32orGR64:$dst, - (int_x86_mmx_pmovmskb VR64:$src))]>; + (int_x86_mmx_pmovmskb VR64:$src))], + IIC_MMX_MOVMSK>, Sched<[WriteVecLogic]>; // Low word of XMM to MMX. def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1, diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td index 104ba2a174db..cb2b47b4f0c9 100644 --- a/lib/Target/X86/X86InstrMPX.td +++ b/lib/Target/X86/X86InstrMPX.td @@ -13,13 +13,16 @@ // //===----------------------------------------------------------------------===// +// FIXME: Investigate a better scheduler itinerary once MPX is used inside LLVM. 
+let SchedRW = [WriteSystem] in { + multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> { let mayLoad = 1 in { def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src), - OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, + OpcodeStr#"\t{$src, $dst|$dst, $src}", [], IIC_MPX>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - OpcodeStr#"\t{$src, $dst|$dst, $src}", []>, + OpcodeStr#"\t{$src, $dst|$dst, $src}", [], IIC_MPX>, Requires<[HasMPX, In64BitMode]>; } } @@ -29,17 +32,17 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS; multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> { let mayLoad = 1 in { def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2), - OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>, Requires<[HasMPX, Not64BitMode]>; def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2), - OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>, Requires<[HasMPX, In64BitMode]>; } def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2), - OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>, Requires<[HasMPX, Not64BitMode]>; def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2), - OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>, + OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>, Requires<[HasMPX, In64BitMode]>; } defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS; @@ -47,32 +50,33 @@ defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD; defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD; def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src), - "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD, Requires<[HasMPX]>; let mayLoad = 1 in { def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src), - "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD, Requires<[HasMPX, In64BitMode]>; } def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src), - "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD, Requires<[HasMPX]>; let mayStore = 1 in { def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), - "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD, Requires<[HasMPX, Not64BitMode]>; def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src), - "bndmov\t{$src, $dst|$dst, $src}", []>, PD, + "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD, Requires<[HasMPX, In64BitMode]>; def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src), - "bndstx\t{$src, $dst|$dst, $src}", []>, PS, + "bndstx\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PS, Requires<[HasMPX]>; } let mayLoad = 1 in -def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src), - "bndldx\t{$src, $dst|$dst, $src}", []>, PS, +def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src), + "bndldx\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PS, Requires<[HasMPX]>; +} // SchedRW diff --git a/lib/Target/X86/X86InstrSGX.td b/lib/Target/X86/X86InstrSGX.td index 84119ad5eb35..f4331c5e2d93 
100644 --- a/lib/Target/X86/X86InstrSGX.td +++ b/lib/Target/X86/X86InstrSGX.td @@ -15,6 +15,7 @@ //===----------------------------------------------------------------------===// // SGX instructions +let SchedRW = [WriteSystem] in { // ENCLS - Execute an Enclave System Function of Specified Leaf Number def ENCLS : I<0x01, MRM_CF, (outs), (ins), "encls", []>, TB; @@ -22,3 +23,4 @@ def ENCLS : I<0x01, MRM_CF, (outs), (ins), // ENCLU - Execute an Enclave User Function of Specified Leaf Number def ENCLU : I<0x01, MRM_D7, (outs), (ins), "enclu", []>, TB; +} // SchedRW diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 650e4fc8716c..a86a0bfc168d 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -25,9 +25,15 @@ class SizeItins<OpndItins arg_s, OpndItins arg_d> { OpndItins d = arg_d; } +class MoveLoadStoreItins<InstrItinClass arg_rr, InstrItinClass arg_rm, + InstrItinClass arg_mr> { + InstrItinClass rr = arg_rr; + InstrItinClass rm = arg_rm; + InstrItinClass mr = arg_mr; +} class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm, - InstrItinClass arg_ri> { + InstrItinClass arg_ri> { InstrItinClass rr = arg_rr; InstrItinClass rm = arg_rm; InstrItinClass ri = arg_ri; @@ -120,10 +126,6 @@ def SSE_DIV_ITINS_P : SizeItins< >; let Sched = WriteVecLogic in -def SSE_VEC_BIT_ITINS_P : OpndItins< - IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM ->; - def SSE_BIT_ITINS_P : OpndItins< IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM >; @@ -143,6 +145,11 @@ def SSE_INTMUL_ITINS_P : OpndItins< IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM >; +// FIXME: Merge SSE_INTSHIFT_P + SSE_INTSHIFT_ITINS_P. +def SSE_INTSHIFT_P : OpndItins< + IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM +>; + def SSE_INTSHIFT_ITINS_P : ShiftOpndItins< IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI >; @@ -151,10 +158,18 @@ def SSE_MOVA_ITINS : OpndItins< IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM >; +def SSE_MOVA : MoveLoadStoreItins< + IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM, IIC_SSE_MOVA_P_MR +>; + def SSE_MOVU_ITINS : OpndItins< IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM >; +def SSE_MOVU : MoveLoadStoreItins< + IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM, IIC_SSE_MOVU_P_MR +>; + def SSE_DPPD_ITINS : OpndItins< IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM >; @@ -203,6 +218,11 @@ def SSE_INTALU_ITINS_SHUFF_P : OpndItins< IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM >; +let Sched = WriteShuffle in +def SSE_PACK : OpndItins< + IIC_SSE_PACK, IIC_SSE_PACK +>; + let Sched = WriteMPSAD in def DEFAULT_ITINS_MPSADSCHED : OpndItins< IIC_ALU_NONMEM, IIC_ALU_MEM @@ -312,134 +332,17 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - pat_rr, NoItinerary, d>, + pat_rr, IIC_SSE_BIT_P_RR, d>, Sched<[WriteVecLogic]>; + let hasSideEffects = 0, mayLoad = 1 in def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - pat_rm, NoItinerary, d>, + pat_rm, IIC_SSE_BIT_P_RM, d>, Sched<[WriteVecLogicLd, ReadAfterLd]>; } -//===----------------------------------------------------------------------===// -// Non-instruction patterns -//===----------------------------------------------------------------------===// - -// A vector extract of the first f32/f64 position is a subregister copy -def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 
0))), - (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; -def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), - (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>; - -// A 128-bit subvector extract from the first 256-bit vector position -// is a subregister copy that needs no instruction. -def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))), - (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>; -def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))), - (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>; - -def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))), - (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>; -def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))), - (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>; - -def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))), - (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>; -def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))), - (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>; - -// A 128-bit subvector insert to the first 256-bit vector position -// is a subregister copy that needs no instruction. -let AddedComplexity = 25 in { // to give priority over vinsertf128rm -def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)), - (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)), - (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)), - (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)), - (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)), - (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)), - (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>; -} - -// Implicitly promote a 32-bit scalar to a vector. -def : Pat<(v4f32 (scalar_to_vector FR32:$src)), - (COPY_TO_REGCLASS FR32:$src, VR128)>; -// Implicitly promote a 64-bit scalar to a vector. -def : Pat<(v2f64 (scalar_to_vector FR64:$src)), - (COPY_TO_REGCLASS FR64:$src, VR128)>; - -// Bitcasts between 128-bit vector types. 
Return the original type since -// no instruction is needed for the conversion -def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; -def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; -def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; -def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; -def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; -def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; -def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; -def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; -def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; -def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; -def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; -def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; -def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; -def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; -def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; -def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; -def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; -def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>; -def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>; - -// Bitcasts between 256-bit vector types. 
Return the original type since -// no instruction is needed for the conversion -def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>; -def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>; -def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>; -def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>; -def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>; -def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>; -def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>; -def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>; -def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>; -def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>; -def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>; -def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>; -def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>; -def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>; -def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>; -def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>; -def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>; -def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>; -def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>; -def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>; -def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>; -def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>; -def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>; -def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>; -def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>; -def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>; -def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>; -def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>; -def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>; -def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>; // Alias instructions that map fld0 to xorps for sse or vxorps for avx. // This is expanded by ExpandPostRAPseudos. @@ -505,22 +408,20 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, // don't use movss/movsd for copies. 
//===----------------------------------------------------------------------===// -multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, +multiclass sse12_move_rr<SDNode OpNode, ValueType vt, X86MemOperand x86memop, string base_opc, - string asm_opr, Domain d = GenericDomain, - string Name> { + string asm_opr, Domain d, string Name> { let isCommutable = 1 in def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, RC:$src2), + (ins VR128:$src1, VR128:$src2), !strconcat(base_opc, asm_opr), - [(set VR128:$dst, (vt (OpNode VR128:$src1, - (scalar_to_vector RC:$src2))))], + [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>; // For the disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst), - (ins VR128:$src1, RC:$src2), + (ins VR128:$src1, VR128:$src2), !strconcat(base_opc, asm_opr), [], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>, FoldGenData<Name#rr>; @@ -528,9 +429,9 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt, multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, string OpcodeStr, - Domain d = GenericDomain, string Name> { + Domain d, string Name> { // AVX - defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, + defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d, "V"#Name>, VEX_4V, VEX_LIG, VEX_WIG; @@ -541,7 +442,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, VEX, VEX_LIG, Sched<[WriteStore]>, VEX_WIG; // SSE1 & 2 let Constraints = "$src1 = $dst" in { - defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr, + defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr, "\t{$src2, $dst|$dst, $src2}", d, Name>; } @@ -553,8 +454,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt, // Loading from memory automatically zeroing upper bits. 
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop, - PatFrag mem_pat, string OpcodeStr, - Domain d = GenericDomain> { + PatFrag mem_pat, string OpcodeStr, Domain d> { def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (mem_pat addr:$src))], @@ -627,66 +527,40 @@ let Predicates = [UseAVX] in { // Shuffle with VMOVSS def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), - (VMOVSSrr (v4i32 VR128:$src1), - (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>; - def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), - (VMOVSSrr (v4f32 VR128:$src1), - (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>; - - // 256-bit variants - def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm), - (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)), - sub_xmm)>; - def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)), - (SUBREG_TO_REG (i32 0), - (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm), - (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)), - sub_xmm)>; + (VMOVSSrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))), + (VMOVSSrr VR128:$src1, (COPY_TO_REGCLASS FR32:$src2, VR128))>; // Shuffle with VMOVSD def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), - (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; - def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), - (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; - - // 256-bit variants - def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm), - (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)), - sub_xmm)>; - def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)), - (SUBREG_TO_REG (i32 0), - (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm), - (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)), - sub_xmm)>; + (VMOVSDrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), + (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS FR64:$src2, VR128))>; // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold cause // it has two uses through a bitcast. One use disappears at isel time and the // fold opportunity reappears. def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), - (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + (VMOVSDrr VR128:$src1, VR128:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), - (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + (VMOVSDrr VR128:$src1, VR128:$src2)>; def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), - (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + (VMOVSDrr VR128:$src1, VR128:$src2)>; def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), - (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + (VMOVSDrr VR128:$src1, VR128:$src2)>; } let Predicates = [UseSSE1] in { let Predicates = [NoSSE41], AddedComplexity = 15 in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSS to the lower bits. 
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), - (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; + (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), - (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>; + (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>; } let AddedComplexity = 20 in { @@ -708,9 +582,10 @@ let Predicates = [UseSSE1] in { // Shuffle with MOVSS def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)), - (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>; - def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)), - (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>; + (MOVSSrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))), + (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS FR32:$src2, VR128))>; } let Predicates = [UseSSE2] in { @@ -718,7 +593,7 @@ let Predicates = [UseSSE2] in { // Move scalar to XMM zero-extended, zeroing a VR128 then do a // MOVSD to the lower bits. def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + (MOVSDrr (v2f64 (V_SET0)), (COPY_TO_REGCLASS FR64:$src, VR128))>; } let AddedComplexity = 20 in { @@ -737,22 +612,23 @@ let Predicates = [UseSSE2] in { // Shuffle with MOVSD def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; - def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + (MOVSDrr VR128:$src1, VR128:$src2)>; + + def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))), + (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS FR64:$src2, VR128))>; // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem // is during lowering, where it's not possible to recognize the fold because // it has two uses through a bitcast. One use disappears at isel time and the // fold opportunity reappears. 
def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + (MOVSDrr VR128:$src1, VR128:$src2)>; def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + (MOVSDrr VR128:$src1, VR128:$src2)>; def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + (MOVSDrr VR128:$src1, VR128:$src2)>; def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), - (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>; + (MOVSDrr VR128:$src1, VR128:$src2)>; } // Aliases to help the assembler pick two byte VEX encodings by swapping the @@ -845,11 +721,11 @@ def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src), IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG; def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movaps\t{$src, $dst|$dst, $src}", - [(alignedstore256 (v8f32 VR256:$src), addr:$dst)], + [(alignedstore (v8f32 VR256:$src), addr:$dst)], IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG; def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movapd\t{$src, $dst|$dst, $src}", - [(alignedstore256 (v4f64 VR256:$src), addr:$dst)], + [(alignedstore (v4f64 VR256:$src), addr:$dst)], IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG; def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src), "movups\t{$src, $dst|$dst, $src}", @@ -969,13 +845,13 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVAPSYrm addr:$src)>; def : Pat<(loadv4i64 addr:$src), (VMOVUPSYrm addr:$src)>; - def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst), + def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst), (VMOVAPSYmr addr:$dst, VR256:$src)>; - def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst), + def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst), (VMOVAPSYmr addr:$dst, VR256:$src)>; - def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst), + def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst), (VMOVAPSYmr addr:$dst, VR256:$src)>; - def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst), + def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst), (VMOVAPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v4i64 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; @@ -985,22 +861,6 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVUPSYmr addr:$dst, VR256:$src)>; def : Pat<(store (v32i8 VR256:$src), addr:$dst), (VMOVUPSYmr addr:$dst, VR256:$src)>; - - // Special patterns for storing subvector extracts of lower 128-bits - // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr - def : Pat<(alignedstore (v2f64 (extract_subvector - (v4f64 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4f32 (extract_subvector - (v8f32 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - - def : Pat<(store (v2f64 (extract_subvector - (v4f64 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v4f32 (extract_subvector - (v8f32 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; } // Use movaps / movups for SSE integer load / store (one byte shorter). 
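(Editorial aside, not part of the patch: a minimal C++ sketch, assuming AVX and an aligned destination, of the kind of aligned integer-vector store the 256-bit patterns just above map to VMOVAPSYmr; the function name is illustrative. The trailing source comment gives the motivation for the same choice in SSE code, where the legacy movaps encoding is a byte shorter than movdqa.)

    // Illustrative only: an aligned 256-bit integer store; per the patterns
    // above, isel may emit vmovaps for it rather than vmovdqa.
    #include <immintrin.h>

    void store_vec(__m256i *dst, __m256i v) {
      _mm256_store_si256(dst, v);   // aligned 32-byte integer store
    }
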
@@ -1103,14 +963,10 @@ let Predicates = [UseAVX] in { // Shuffle with VMOVLPS def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), (VMOVLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), - (VMOVLPSrm VR128:$src1, addr:$src2)>; // Shuffle with VMOVLPD def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), (VMOVLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), - (VMOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2f64 (X86Movsd VR128:$src1, (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), (VMOVLPDrm VR128:$src1, addr:$src2)>; @@ -1119,15 +975,9 @@ let Predicates = [UseAVX] in { def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1), (VMOVLPSmr addr:$src1, VR128:$src2)>; - def : Pat<(store (v4i32 (X86Movlps - (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1), - (VMOVLPSmr addr:$src1, VR128:$src2)>; def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), addr:$src1), (VMOVLPDmr addr:$src1, VR128:$src2)>; - def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), - addr:$src1), - (VMOVLPDmr addr:$src1, VR128:$src2)>; } let Predicates = [UseSSE1] in { @@ -1139,8 +989,6 @@ let Predicates = [UseSSE1] in { // Shuffle with MOVLPS def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))), (MOVLPSrm VR128:$src1, addr:$src2)>; - def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))), - (MOVLPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlps VR128:$src1, (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), (MOVLPSrm VR128:$src1, addr:$src2)>; @@ -1149,18 +997,12 @@ let Predicates = [UseSSE1] in { def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)), addr:$src1), (MOVLPSmr addr:$src1, VR128:$src2)>; - def : Pat<(store (v4i32 (X86Movlps - (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), - addr:$src1), - (MOVLPSmr addr:$src1, VR128:$src2)>; } let Predicates = [UseSSE2] in { // Shuffle with MOVLPD def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))), (MOVLPDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))), - (MOVLPDrm VR128:$src1, addr:$src2)>; def : Pat<(v2f64 (X86Movsd VR128:$src1, (v2f64 (scalar_to_vector (loadf64 addr:$src2))))), (MOVLPDrm VR128:$src1, addr:$src2)>; @@ -1169,9 +1011,6 @@ let Predicates = [UseSSE2] in { def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)), addr:$src1), (MOVLPDmr addr:$src1, VR128:$src2)>; - def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)), - addr:$src1), - (MOVLPDmr addr:$src1, VR128:$src2)>; } //===----------------------------------------------------------------------===// @@ -1179,7 +1018,7 @@ let Predicates = [UseSSE2] in { //===----------------------------------------------------------------------===// let AddedComplexity = 20 in { - defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp", + defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Unpckl, "movhp", IIC_SSE_MOV_LH>; } @@ -1218,19 +1057,9 @@ let Predicates = [UseAVX] in { (bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))), (VMOVHPSrm VR128:$src1, addr:$src2)>; def : Pat<(X86Movlhps VR128:$src1, - (bc_v4i32 (v2i64 (X86vzload addr:$src2)))), + (bc_v4f32 (v2i64 (X86vzload addr:$src2)))), (VMOVHPSrm VR128:$src1, addr:$src2)>; - // VMOVHPD patterns - - // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem - // is during lowering, where it's not possible to recognize the 
load fold - // cause it has two uses through a bitcast. One use disappears at isel time - // and the fold opportunity reappears. - def : Pat<(v2f64 (X86Unpckl VR128:$src1, - (scalar_to_vector (loadf64 addr:$src2)))), - (VMOVHPDrm VR128:$src1, addr:$src2)>; - // Also handle an i64 load because that may get selected as a faster way to // load the data. def : Pat<(v2f64 (X86Unpckl VR128:$src1, @@ -1261,14 +1090,6 @@ let Predicates = [UseSSE1] in { let Predicates = [UseSSE2] in { // MOVHPD patterns - // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem - // is during lowering, where it's not possible to recognize the load fold - // cause it has two uses through a bitcast. One use disappears at isel time - // and the fold opportunity reappears. - def : Pat<(v2f64 (X86Unpckl VR128:$src1, - (scalar_to_vector (loadf64 addr:$src2)))), - (MOVHPDrm VR128:$src1, addr:$src2)>; - // Also handle an i64 load because that may get selected as a faster way to // load the data. def : Pat<(v2f64 (X86Unpckl VR128:$src1, @@ -1322,63 +1143,77 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in { IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>; } -let Predicates = [UseAVX] in { - // MOVLHPS patterns - def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), - (VMOVLHPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), - (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; +//===----------------------------------------------------------------------===// +// SSE 1 & 2 - Conversion Instructions +//===----------------------------------------------------------------------===// - // MOVHLPS patterns - def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), - (VMOVHLPSrr VR128:$src1, VR128:$src2)>; -} +let Sched = WriteCvtF2I in { +def SSE_CVT_SS2SI_32 : OpndItins< + IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM +>; -let Predicates = [UseSSE1] in { - // MOVLHPS patterns - def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)), - (MOVLHPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)), - (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>; +let Sched = WriteCvtF2I in +def SSE_CVT_SS2SI_64 : OpndItins< + IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM +>; - // MOVHLPS patterns - def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)), - (MOVHLPSrr VR128:$src1, VR128:$src2)>; -} +def SSE_CVT_SD2SI : OpndItins< + IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM +>; -//===----------------------------------------------------------------------===// -// SSE 1 & 2 - Conversion Instructions -//===----------------------------------------------------------------------===// +def SSE_CVT_PS2I : OpndItins< + IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM +>; -def SSE_CVT_PD : OpndItins< +def SSE_CVT_PD2I : OpndItins< IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM >; +} -let Sched = WriteCvtI2F in -def SSE_CVT_PS : OpndItins< +let Sched = WriteCvtI2F in { +def SSE_CVT_SI2SS : OpndItins< + IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM +>; + +def SSE_CVT_SI2SD : OpndItins< + IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM +>; + +def SSE_CVT_I2PS : OpndItins< IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM >; -let Sched = WriteCvtI2F in -def SSE_CVT_Scalar : OpndItins< +def SSE_CVT_I2PD : OpndItins< + IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM +>; +} + +let Sched = WriteCvtF2F in { +def SSE_CVT_SD2SS : OpndItins< IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM >; -let Sched = WriteCvtF2I in -def SSE_CVT_SS2SI_32 : OpndItins< - IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM +def SSE_CVT_SS2SD 
: OpndItins< + IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM >; -let Sched = WriteCvtF2I in -def SSE_CVT_SS2SI_64 : OpndItins< - IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM +def SSE_CVT_PD2PS : OpndItins< + IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM >; -let Sched = WriteCvtF2I in -def SSE_CVT_SD2SI : OpndItins< - IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM +def SSE_CVT_PS2PD : OpndItins< + IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM >; +def SSE_CVT_PH2PS : OpndItins< + IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM +>; + +def SSE_CVT_PS2PH : OpndItins< + IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM +>; +} + // FIXME: We probably want to match the rm form only when optimizing for // size, to avoid false depenendecies (see sse_fp_unop_s for details) multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, @@ -1410,16 +1245,16 @@ let hasSideEffects = 0 in { // FIXME: We probably want to match the rm form only when optimizing for // size, to avoid false depenendecies (see sse_fp_unop_s for details) multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, - X86MemOperand x86memop, string asm> { + X86MemOperand x86memop, string asm, OpndItins itins> { let hasSideEffects = 0, Predicates = [UseAVX] in { def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src), - !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, - Sched<[WriteCvtI2F]>; + !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [], + itins.rr>, Sched<[itins.Sched]>; let mayLoad = 1 in def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins DstRC:$src1, x86memop:$src), !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>, - Sched<[WriteCvtI2FLd, ReadAfterLd]>; + Sched<[itins.Sched.Folded, ReadAfterLd]>; } // hasSideEffects = 0 } @@ -1462,14 +1297,14 @@ def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}", // register, but the same isn't true when only using memory operands, // provide other assembly "l" and "q" forms to address this explicitly // where appropriate to do so. 
-defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">, - XS, VEX_4V, VEX_LIG; -defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">, - XS, VEX_4V, VEX_W, VEX_LIG; -defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">, - XD, VEX_4V, VEX_LIG; -defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">, - XD, VEX_4V, VEX_W, VEX_LIG; +defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}", + SSE_CVT_SI2SS>, XS, VEX_4V, VEX_LIG; +defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}", + SSE_CVT_SI2SS>, XS, VEX_4V, VEX_W, VEX_LIG; +defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}", + SSE_CVT_SI2SD>, XD, VEX_4V, VEX_LIG; +defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}", + SSE_CVT_SI2SD>, XD, VEX_4V, VEX_W, VEX_LIG; let Predicates = [UseAVX] in { def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}", @@ -1480,20 +1315,20 @@ let Predicates = [UseAVX] in { def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))), - (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>; + (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))), (VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))), - (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>; + (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>; def : Pat<(f32 (sint_to_fp GR32:$src)), (VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>; def : Pat<(f32 (sint_to_fp GR64:$src)), - (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>; + (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>; def : Pat<(f64 (sint_to_fp GR32:$src)), (VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>; def : Pat<(f64 (sint_to_fp GR64:$src)), - (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>; + (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>; } defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32, @@ -1510,16 +1345,16 @@ defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64, SSE_CVT_SD2SI>, XD, REX_W; defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32, "cvtsi2ss{l}\t{$src, $dst|$dst, $src}", - SSE_CVT_Scalar>, XS; -defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, + SSE_CVT_SI2SS>, XS; +defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64, "cvtsi2ss{q}\t{$src, $dst|$dst, $src}", - SSE_CVT_Scalar>, XS, REX_W; + SSE_CVT_SI2SS>, XS, REX_W; defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32, "cvtsi2sd{l}\t{$src, $dst|$dst, $src}", - SSE_CVT_Scalar>, XD; -defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, + SSE_CVT_SI2SD>, XD; +defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64, "cvtsi2sd{q}\t{$src, $dst|$dst, $src}", - SSE_CVT_Scalar>, XD, REX_W; + SSE_CVT_SI2SD>, XD, REX_W; def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}", (CVTTSS2SIrr GR32:$dst, FR32:$src), 0>; @@ -1551,33 +1386,33 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}", multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, Operand memop, ComplexPattern mem_cpat, string asm, OpndItins itins> { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>, - 
Sched<[itins.Sched]>; - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), - !strconcat(asm, "\t{$src, $dst|$dst, $src}"), - [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>, - Sched<[itins.Sched.Folded]>; + def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>, + Sched<[itins.Sched]>; + def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src), + !strconcat(asm, "\t{$src, $dst|$dst, $src}"), + [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>, + Sched<[itins.Sched.Folded]>; } multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop, PatFrag ld_frag, string asm, OpndItins itins, bit Is2Addr = 1> { - def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), - !if(Is2Addr, - !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], - itins.rr>, Sched<[itins.Sched]>; - def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), - (ins DstRC:$src1, x86memop:$src2), - !if(Is2Addr, - !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], - itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; + def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))], + itins.rr>, Sched<[itins.Sched]>; + def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), + (ins DstRC:$src1, x86memop:$src2), + !if(Is2Addr, + !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"), + !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), + [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [UseAVX] in { @@ -1596,34 +1431,34 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64, let isCodeGenOnly = 1 in { let Predicates = [UseAVX] in { - defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}", - SSE_CVT_Scalar, 0>, XS, VEX_4V; - defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + SSE_CVT_SI2SS, 0>, XS, VEX_4V; + defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}", - SSE_CVT_Scalar, 0>, XS, VEX_4V, + SSE_CVT_SI2SS, 0>, XS, VEX_4V, VEX_W; - defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}", - SSE_CVT_Scalar, 0>, XD, VEX_4V; - defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + SSE_CVT_SI2SD, 0>, XD, VEX_4V; + defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}", - SSE_CVT_Scalar, 0>, XD, + SSE_CVT_SI2SD, 0>, XD, VEX_4V, VEX_W; } let Constraints = "$src1 = $dst" in { - defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128, int_x86_sse_cvtsi2ss, i32mem, loadi32, - "cvtsi2ss{l}", SSE_CVT_Scalar>, XS; - defm Int_CVTSI2SS64 : 
sse12_cvt_sint_3addr<0x2A, GR64, VR128, + "cvtsi2ss{l}", SSE_CVT_SI2SS>, XS; + defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse_cvtsi642ss, i64mem, loadi64, - "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W; - defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, + "cvtsi2ss{q}", SSE_CVT_SI2SS>, XS, REX_W; + defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128, int_x86_sse2_cvtsi2sd, i32mem, loadi32, - "cvtsi2sd{l}", SSE_CVT_Scalar>, XD; - defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128, + "cvtsi2sd{l}", SSE_CVT_SI2SD>, XD; + defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128, int_x86_sse2_cvtsi642sd, i64mem, loadi64, - "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W; + "cvtsi2sd{q}", SSE_CVT_SI2SD>, XD, REX_W; } } // isCodeGenOnly = 1 @@ -1632,31 +1467,31 @@ let isCodeGenOnly = 1 in { // Aliases for intrinsics let isCodeGenOnly = 1 in { let Predicates = [UseAVX] in { -defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, - ssmem, sse_load_f32, "cvttss2si", - SSE_CVT_SS2SI_32>, XS, VEX; -defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, - int_x86_sse_cvttss2si64, ssmem, sse_load_f32, - "cvttss2si", SSE_CVT_SS2SI_64>, - XS, VEX, VEX_W; -defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, - sdmem, sse_load_f64, "cvttsd2si", - SSE_CVT_SD2SI>, XD, VEX; -defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, - int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, - "cvttsd2si", SSE_CVT_SD2SI>, - XD, VEX, VEX_W; -} -defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, +defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, + ssmem, sse_load_f32, "cvttss2si", + SSE_CVT_SS2SI_32>, XS, VEX; +defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse_cvttss2si64, ssmem, sse_load_f32, + "cvttss2si", SSE_CVT_SS2SI_64>, + XS, VEX, VEX_W; +defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, + sdmem, sse_load_f64, "cvttsd2si", + SSE_CVT_SD2SI>, XD, VEX; +defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, + int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, + "cvttsd2si", SSE_CVT_SD2SI>, + XD, VEX, VEX_W; +} +defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si, ssmem, sse_load_f32, "cvttss2si", SSE_CVT_SS2SI_32>, XS; -defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, +defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse_cvttss2si64, ssmem, sse_load_f32, "cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W; -defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, +defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si, sdmem, sse_load_f64, "cvttsd2si", SSE_CVT_SD2SI>, XD; -defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, +defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64, "cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W; } // isCodeGenOnly = 1 @@ -1678,53 +1513,53 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64, defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64, "vcvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle, SSE_CVT_PS>, + SSEPackedSingle, SSE_CVT_I2PS>, PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG; defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64, "vcvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle, SSE_CVT_PS>, + SSEPackedSingle, SSE_CVT_I2PS>, PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG; defm CVTDQ2PS : sse12_cvt_p<0x5B, 
VR128, i128mem, v4f32, v4i32, memopv2i64, "cvtdq2ps\t{$src, $dst|$dst, $src}", - SSEPackedSingle, SSE_CVT_PS>, + SSEPackedSingle, SSE_CVT_I2PS>, PS, Requires<[UseSSE2]>; let Predicates = [UseAVX] in { def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", - (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>; + (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0>; def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}", - (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>; + (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0>; def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", - (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>; + (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0>; def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}", - (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>; + (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0>; def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", - (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>; + (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0>; def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}", - (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; + (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0>; def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", - (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>; + (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0>; def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}", - (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; + (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0>; } def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", - (CVTSS2SIrr GR32:$dst, VR128:$src), 0>; + (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0>; def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}", - (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>; + (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0>; def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", - (CVTSD2SIrr GR32:$dst, VR128:$src), 0>; + (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0>; def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}", - (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>; + (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0>; def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", - (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>; + (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0>; def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}", - (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>; + (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0>; def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", - (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>; + (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0>; def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}", - (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>; + (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0>; /// SSE 2 Only @@ -1734,18 +1569,17 @@ def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR64:$src2), "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG, - Sched<[WriteCvtF2F]>, VEX_WIG; + Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable; let mayLoad = 1 in def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins FR32:$src1, f64mem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], IIC_SSE_CVT_Scalar_RM>, - XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG, - Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG; + [], IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable; } def : Pat<(f32 (fpround FR64:$src)), - (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>, + (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>, Requires<[UseAVX]>; def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src), @@ -1760,14 +1594,14 @@ 
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src), Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>; let isCodeGenOnly = 1 in { -def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg, +def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, VEX_WIG, Requires<[HasAVX]>, Sched<[WriteCvtF2F]>; -def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem, +def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss @@ -1776,14 +1610,14 @@ def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem, Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { -def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg, +def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; -def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem, +def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2), "cvtsd2ss\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtsd2ss @@ -1799,20 +1633,18 @@ let hasSideEffects = 0, Predicates = [UseAVX] in { def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR32:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], IIC_SSE_CVT_Scalar_RR>, - XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG, - Sched<[WriteCvtF2F]>, VEX_WIG; + [], IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable; let mayLoad = 1 in def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins FR64:$src1, f32mem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [], IIC_SSE_CVT_Scalar_RM>, - XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>, - Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG; + [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG, + Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable; } def : Pat<(f64 (fpextend FR32:$src)), - (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>; + (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>; def : Pat<(fpextend (loadf32 addr:$src)), (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>; @@ -1845,14 +1677,14 @@ def : Pat<(extloadf32 addr:$src), (CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>; let isCodeGenOnly = 1 in { -def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg, +def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>, Sched<[WriteCvtF2F]>; -def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, +def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", [(set VR128:$dst, @@ -1860,14 +1692,14 @@ def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem, IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_WIG, Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>; let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix -def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg, +def CVTSS2SDrr_Int: 
I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), "cvtss2sd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))], IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>, Sched<[WriteCvtF2F]>; -def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem, +def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2), "cvtss2sd\t{$src2, $dst|$dst, $src2}", [(set VR128:$dst, @@ -1885,33 +1717,33 @@ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), - (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>; + (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), - (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>; + (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), - (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>; + (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), - (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>; + (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), - (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>; + (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), - (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>; + (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>; } // Predicates = [UseAVX] let Predicates = [UseSSE2] in { @@ -1919,35 +1751,35 @@ def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))), - (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>; + (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))), - (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>; + (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))), - (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>; + (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))), - (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>; + (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>; } // Predicates = [UseSSE2] let Predicates = [UseSSE1] in { def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))), - (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>; + (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>; def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))), - (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>; + (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>; } // Predicates = [UseSSE1] // Convert packed single/double fp to doubleword @@ -2115,10 +1947,16 @@ let Predicates = [HasAVX, NoVLX] in { (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), (VCVTPD2DQrr VR128:$src)>; def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))), + (VCVTPD2DQrm addr:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), (VCVTTPD2DQrr VR128:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 
(X86cvttp2si (loadv2f64 addr:$src)))))), + (VCVTTPD2DQrm addr:$src)>; } -} // Predicates = [HasAVX] +} // Predicates = [HasAVX, NoVLX] def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvttpd2dq\t{$src, $dst|$dst, $src}", @@ -2137,8 +1975,14 @@ let Predicates = [UseSSE2] in { (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))), (CVTPD2DQrr VR128:$src)>; def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))), + (CVTPD2DQrm addr:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))), (CVTTPD2DQrr VR128:$src)>; + def : Pat<(X86vzmovl (v2i64 (bitconvert + (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))), + (CVTTPD2DQrm addr:$src)>; } } // Predicates = [UseSSE2] @@ -2180,7 +2024,7 @@ let hasSideEffects = 0, mayLoad = 1 in def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))]>, + (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>, VEX, Sched<[WriteCvtI2FLd]>, VEX_WIG; def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "vcvtdq2pd\t{$src, $dst|$dst, $src}", @@ -2203,7 +2047,7 @@ let hasSideEffects = 0, mayLoad = 1 in def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", [(set VR128:$dst, - (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))], + (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))], IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>; def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), "cvtdq2pd\t{$src, $dst|$dst, $src}", @@ -2215,12 +2059,16 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), let Predicates = [HasAVX, NoVLX] in { def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (VCVTDQ2PDrm addr:$src)>; + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + (VCVTDQ2PDrm addr:$src)>; } // Predicates = [HasAVX, NoVLX] // SSE2 register conversion intrinsics let Predicates = [UseSSE2] in { def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))), (CVTDQ2PDrm addr:$src)>; + def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))), + (CVTDQ2PDrm addr:$src)>; } // Predicates = [UseSSE2] // Convert packed double to packed single @@ -2275,38 +2123,51 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), let Predicates = [HasAVX, NoVLX] in { // Match fpround and fpextend for 128/256-bit conversions - let AddedComplexity = 15 in - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128:$src)))))), - (VCVTPD2PSrr VR128:$src)>; + let AddedComplexity = 15 in { + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128:$src)))))), + (VCVTPD2PSrr VR128:$src)>; + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (loadv2f64 addr:$src)))))), + (VCVTPD2PSrm addr:$src)>; + } } let Predicates = [UseSSE2] in { // Match fpround and fpextend for 128 conversions - let AddedComplexity = 15 in - def : Pat<(X86vzmovl (v2f64 (bitconvert - (v4f32 (X86vfpround (v2f64 VR128:$src)))))), - (CVTPD2PSrr VR128:$src)>; + let AddedComplexity = 15 in { + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (v2f64 VR128:$src)))))), + (CVTPD2PSrr VR128:$src)>; + def : Pat<(X86vzmovl (v2f64 (bitconvert + (v4f32 (X86vfpround (memopv2f64 
addr:$src)))))), + (CVTPD2PSrm addr:$src)>; + } } //===----------------------------------------------------------------------===// // SSE 1 & 2 - Compare Instructions //===----------------------------------------------------------------------===// +let Sched = WriteFAdd in +def SSE_COMIS : OpndItins< + IIC_SSE_COMIS_RR, IIC_SSE_COMIS_RM +>; + // sse12_cmp_scalar - sse 1 & 2 compare scalar instructions multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop, Operand CC, SDNode OpNode, ValueType VT, PatFrag ld_frag, string asm, string asm_alt, - OpndItins itins, ImmLeaf immLeaf> { + OpndItins itins> { let isCommutable = 1 in def rr : SIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, - [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))], + [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))], itins.rr>, Sched<[itins.Sched]>; def rm : SIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, [(set RC:$dst, (OpNode (VT RC:$src1), - (ld_frag addr:$src2), immLeaf:$cc))], + (ld_frag addr:$src2), imm:$cc))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; @@ -2327,41 +2188,41 @@ let ExeDomain = SSEPackedSingle in defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32, "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG, VEX_WIG; + SSE_ALU_F32S>, XS, VEX_4V, VEX_LIG, VEX_WIG; let ExeDomain = SSEPackedDouble in defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64, "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare + SSE_ALU_F32S>, // same latency as 32 bit compare XD, VEX_4V, VEX_LIG, VEX_WIG; let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32, "cmp${cc}ss\t{$src2, $dst|$dst, $src2}", - "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S, - i8immZExt3>, XS; + "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>, + XS; let ExeDomain = SSEPackedDouble in defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64, "cmp${cc}sd\t{$src2, $dst|$dst, $src2}", "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSE_ALU_F64S, i8immZExt3>, XD; + SSE_ALU_F64S>, XD; } multiclass sse12_cmp_scalar_int<Operand memop, Operand CC, Intrinsic Int, string asm, OpndItins itins, - ImmLeaf immLeaf, ComplexPattern mem_cpat> { - def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), + ComplexPattern mem_cpat> { + def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - VR128:$src, immLeaf:$cc))], + VR128:$src, imm:$cc))], itins.rr>, Sched<[itins.Sched]>; let mayLoad = 1 in - def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), + def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, memop:$src, CC:$cc), asm, [(set VR128:$dst, (Int VR128:$src1, - mem_cpat:$src, immLeaf:$cc))], + mem_cpat:$src, imm:$cc))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -2369,25 +2230,23 @@ let mayLoad = 1 in let isCodeGenOnly = 1 in { // Aliases to match intrinsics which expect XMM operand(s). 
let ExeDomain = SSEPackedSingle in - defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss, + defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}", - SSE_ALU_F32S, i8immZExt5, sse_load_f32>, - XS, VEX_4V; + SSE_ALU_F32S, sse_load_f32>, XS, VEX_4V; let ExeDomain = SSEPackedDouble in - defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd, + defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}", - SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32 + SSE_ALU_F32S, sse_load_f64>, // same latency as f32 XD, VEX_4V; let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in - defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss, + defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss, "cmp${cc}ss\t{$src, $dst|$dst, $src}", - SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS; + SSE_ALU_F32S, sse_load_f32>, XS; let ExeDomain = SSEPackedDouble in - defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd, + defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd, "cmp${cc}sd\t{$src, $dst|$dst, $src}", - SSE_ALU_F64S, i8immZExt3, sse_load_f64>, - XD; + SSE_ALU_F64S, sse_load_f64>, XD; } } @@ -2395,102 +2254,106 @@ let isCodeGenOnly = 1 in { // sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode, ValueType vt, X86MemOperand x86memop, - PatFrag ld_frag, string OpcodeStr> { + PatFrag ld_frag, string OpcodeStr, + OpndItins itins> { +let hasSideEffects = 0 in { def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], - IIC_SSE_COMIS_RR>, - Sched<[WriteFAdd]>; + itins.rr>, + Sched<[itins.Sched]>; let mayLoad = 1 in def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), (ld_frag addr:$src2)))], - IIC_SSE_COMIS_RM>, - Sched<[WriteFAddLd, ReadAfterLd]>; + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} } // sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode, ValueType vt, Operand memop, - ComplexPattern mem_cpat, string OpcodeStr> { + ComplexPattern mem_cpat, string OpcodeStr, + OpndItins itins> { def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))], - IIC_SSE_COMIS_RR>, - Sched<[WriteFAdd]>; + itins.rr>, + Sched<[itins.Sched]>; let mayLoad = 1 in def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"), [(set EFLAGS, (OpNode (vt RC:$src1), mem_cpat:$src2))], - IIC_SSE_COMIS_RM>, - Sched<[WriteFAddLd, ReadAfterLd]>; + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Defs = [EFLAGS] in { defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss">, PS, VEX, VEX_LIG, VEX_WIG; + "ucomiss", SSE_COMIS>, PS, VEX, VEX_LIG, VEX_WIG; defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd">, PD, VEX, VEX_LIG, VEX_WIG; + "ucomisd", SSE_COMIS>, PD, VEX, VEX_LIG, VEX_WIG; let Pattern = []<dag> in { defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, 
loadf32, - "comiss">, PS, VEX, VEX_LIG, VEX_WIG; + "comiss", SSE_COMIS>, PS, VEX, VEX_LIG, VEX_WIG; defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, - "comisd">, PD, VEX, VEX_LIG, VEX_WIG; + "comisd", SSE_COMIS>, PD, VEX, VEX_LIG, VEX_WIG; } let isCodeGenOnly = 1 in { defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss">, PS, VEX, VEX_WIG; + sse_load_f32, "ucomiss", SSE_COMIS>, PS, VEX, VEX_WIG; defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd">, PD, VEX, VEX_WIG; + sse_load_f64, "ucomisd", SSE_COMIS>, PD, VEX, VEX_WIG; defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss">, PS, VEX, VEX_WIG; + sse_load_f32, "comiss", SSE_COMIS>, PS, VEX, VEX_WIG; defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd">, PD, VEX, VEX_WIG; + sse_load_f64, "comisd", SSE_COMIS>, PD, VEX, VEX_WIG; } defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32, - "ucomiss">, PS; + "ucomiss", SSE_COMIS>, PS; defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64, - "ucomisd">, PD; + "ucomisd", SSE_COMIS>, PD; let Pattern = []<dag> in { defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32, - "comiss">, PS; + "comiss", SSE_COMIS>, PS; defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64, - "comisd">, PD; + "comisd", SSE_COMIS>, PD; } let isCodeGenOnly = 1 in { defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem, - sse_load_f32, "ucomiss">, PS; + sse_load_f32, "ucomiss", SSE_COMIS>, PS; defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem, - sse_load_f64, "ucomisd">, PD; + sse_load_f64, "ucomisd", SSE_COMIS>, PD; defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem, - sse_load_f32, "comiss">, PS; + sse_load_f32, "comiss", SSE_COMIS>, PS; defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem, - sse_load_f64, "comisd">, PD; + sse_load_f64, "comisd", SSE_COMIS>, PD; } } // Defs = [EFLAGS] // sse12_cmp_packed - sse 1 & 2 compare packed instructions multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, Operand CC, ValueType VT, string asm, - string asm_alt, Domain d, ImmLeaf immLeaf, + string asm_alt, Domain d, PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> { let isCommutable = 1 in def rri : PIi8<0xC2, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm, - [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, immLeaf:$cc)))], + [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))], itins.rr, d>, Sched<[WriteFAdd]>; def rmi : PIi8<0xC2, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm, [(set RC:$dst, - (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), immLeaf:$cc)))], + (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))], itins.rm, d>, Sched<[WriteFAddLd, ReadAfterLd]>; @@ -2510,181 +2373,200 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop, defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V, VEX_WIG; + SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG; defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - 
SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V, VEX_WIG; + SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG; defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32, "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L; + SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L; defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64, "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}", "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}", - SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L; + SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L; let Constraints = "$src1 = $dst" in { defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32, "cmp${cc}ps\t{$src2, $dst|$dst, $src2}", "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS; + SSEPackedSingle, memopv4f32, SSE_ALU_F32P>, PS; defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64, "cmp${cc}pd\t{$src2, $dst|$dst, $src2}", "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}", - SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD; + SSEPackedDouble, memopv2f64, SSE_ALU_F64P>, PD; +} + +def CommutableCMPCC : PatLeaf<(imm), [{ + return (N->getZExtValue() == 0x00 || N->getZExtValue() == 0x03 || + N->getZExtValue() == 0x04 || N->getZExtValue() == 0x07); +}]>; + +// Patterns to select compares with loads in first operand. +let Predicates = [HasAVX] in { + def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1, + CommutableCMPCC:$cc)), + (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1, + CommutableCMPCC:$cc)), + (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), + (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), + (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, + CommutableCMPCC:$cc)), + (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, + CommutableCMPCC:$cc)), + (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; +} + +let Predicates = [UseSSE2] in { + def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), + (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1, + CommutableCMPCC:$cc)), + (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>; +} + +let Predicates = [UseSSE1] in { + def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1, + CommutableCMPCC:$cc)), + (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>; + + def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1, + CommutableCMPCC:$cc)), + (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>; } //===----------------------------------------------------------------------===// // SSE 1 & 2 - Shuffle Instructions //===----------------------------------------------------------------------===// +let Sched = WriteFShuffle in +def SSE_SHUFP : OpndItins< + IIC_SSE_SHUFP, IIC_SSE_SHUFP +>; + /// sse12_shuffle - sse 1 & 2 fp shuffle instructions multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop, ValueType vt, string asm, PatFrag mem_frag, - Domain d> { + OpndItins itins, Domain d> { def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, 
x86memop:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2), - (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, - Sched<[WriteFShuffleLd, ReadAfterLd]>; + (i8 imm:$src3))))], itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), asm, [(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2, - (i8 imm:$src3))))], IIC_SSE_SHUFP, d>, - Sched<[WriteFShuffle]>; + (i8 imm:$src3))))], itins.rr, d>, + Sched<[itins.Sched]>; } let Predicates = [HasAVX, NoVLX] in { defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv4f32, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; + loadv4f32, SSE_SHUFP, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32, "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; + loadv8f32, SSE_SHUFP, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv2f64, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; + loadv2f64, SSE_SHUFP, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64, "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; + loadv4f64, SSE_SHUFP, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32, "shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv4f32, SSEPackedSingle>, PS; + memopv4f32, SSE_SHUFP, SSEPackedSingle>, PS; defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64, "shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}", - memopv2f64, SSEPackedDouble>, PD; -} - -let Predicates = [HasAVX, NoVLX] in { - def : Pat<(v4i32 (X86Shufp VR128:$src1, - (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))), - (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; - def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; - - def : Pat<(v2i64 (X86Shufp VR128:$src1, - (loadv2i64 addr:$src2), (i8 imm:$imm))), - (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; - def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; - - // 256-bit patterns - def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>; - def : Pat<(v8i32 (X86Shufp VR256:$src1, - (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), - (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>; - - def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>; - def : Pat<(v4i64 (X86Shufp VR256:$src1, - (loadv4i64 addr:$src2), (i8 imm:$imm))), - (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>; -} - -let Predicates = [UseSSE1] in { - def : Pat<(v4i32 (X86Shufp VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))), - (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>; - def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>; -} - -let Predicates = [UseSSE2] in { - // Generic SHUFPD patterns - def : Pat<(v2i64 (X86Shufp VR128:$src1, - (memopv2i64 addr:$src2), (i8 imm:$imm))), - (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>; - def : 
Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>; + memopv2f64, SSE_SHUFP, SSEPackedDouble>, PD; } //===----------------------------------------------------------------------===// // SSE 1 & 2 - Unpack FP Instructions //===----------------------------------------------------------------------===// +let Sched = WriteFShuffle in +def SSE_UNPCK : OpndItins< + IIC_SSE_UNPCK, IIC_SSE_UNPCK +>; + /// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt, PatFrag mem_frag, RegisterClass RC, X86MemOperand x86memop, string asm, - Domain d, bit IsCommutable = 0> { + OpndItins itins, Domain d, bit IsCommutable = 0> { let isCommutable = IsCommutable in def rr : PI<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), asm, [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], - IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>; + itins.rr, d>, Sched<[itins.Sched]>; def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), asm, [(set RC:$dst, (vt (OpNode RC:$src1, (mem_frag addr:$src2))))], - IIC_SSE_UNPCK, d>, - Sched<[WriteFShuffleLd, ReadAfterLd]>; + itins.rm, d>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX, NoVLX] in { defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32, VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, VEX_4V, VEX_WIG; + SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64, VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, VEX_4V, VEX_WIG; + SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32, VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, VEX_4V, VEX_WIG; + SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_WIG; defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64, VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, VEX_4V, VEX_WIG; + SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_WIG; defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32, VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; + SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64, VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; + SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32, VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; + SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG; defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64, VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}", - SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; + SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG; }// Predicates = [HasAVX, NoVLX] + let Constraints = "$src1 = $dst" in { defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32, VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, PS; + SSE_UNPCK, SSEPackedSingle>, PS; defm UNPCKHPD: 
sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64, VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}", - SSEPackedDouble, 1>, PD; + SSE_UNPCK, SSEPackedDouble, 1>, PD; defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32, VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}", - SSEPackedSingle>, PS; + SSE_UNPCK, SSEPackedSingle>, PS; defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64, VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}", - SSEPackedDouble>, PD; + SSE_UNPCK, SSEPackedDouble>, PD; } // Constraints = "$src1 = $dst" let Predicates = [HasAVX1Only] in { @@ -2787,13 +2669,13 @@ let Predicates = [HasAVX2, prd] in // These are ordered here for pattern ordering requirements with the fp versions defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1, NoVLX>; + SSE_BIT_ITINS_P, 1, NoVLX>; defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1, NoVLX>; + SSE_BIT_ITINS_P, 1, NoVLX>; defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 1, NoVLX>; + SSE_BIT_ITINS_P, 1, NoVLX>; defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, - SSE_VEC_BIT_ITINS_P, 0, NoVLX>; + SSE_BIT_ITINS_P, 0, NoVLX>; //===----------------------------------------------------------------------===// // SSE 1 & 2 - Logical Instructions @@ -2801,54 +2683,36 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64, /// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops /// +/// There are no patterns here because isel prefers integer versions for SSE2 +/// and later. There are SSE1 v4f32 patterns later. multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr, SDNode OpNode> { let Predicates = [HasAVX, NoVLX] in { defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f256mem, - [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), - (bc_v4i64 (v8f32 VR256:$src2))))], - [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)), - (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L, VEX_WIG; + [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG; defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f256mem, - [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), - (bc_v4i64 (v4f64 VR256:$src2))))], - [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)), - (loadv4i64 addr:$src2)))], 0>, - PD, VEX_4V, VEX_L, VEX_WIG; + [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG; defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, - [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), - (bc_v2i64 (v4f32 VR128:$src2))))], - [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), - (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_WIG; + [], [], 0>, PS, VEX_4V, VEX_WIG; defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, - [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (v2f64 VR128:$src2))))], - [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), - (loadv2i64 addr:$src2)))], 0>, - PD, VEX_4V, VEX_WIG; + [], [], 0>, PD, VEX_4V, VEX_WIG; } let Constraints = "$src1 = $dst" in { defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle, !strconcat(OpcodeStr, "ps"), f128mem, - [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), - (bc_v2i64 (v4f32 VR128:$src2))))], - [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)), - 
(memopv2i64 addr:$src2)))]>, PS; + [], []>, PS; defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble, !strconcat(OpcodeStr, "pd"), f128mem, - [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), - (bc_v2i64 (v2f64 VR128:$src2))))], - [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)), - (memopv2i64 addr:$src2)))]>, PD; + [], []>, PD; } } @@ -3146,22 +3010,6 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; } - // With SSE 4.1, blendi is preferred to movsd, so match that too. - let Predicates = [UseSSE41] in { - // extracted scalar math op with insert via blend - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, - (COPY_TO_REGCLASS FR32:$src, VR128))>; - - // vector math op with insert via blend - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>; - - } - // Repeat everything for AVX. let Predicates = [UseAVX] in { // extracted scalar math op with insert via movss @@ -3171,22 +3019,10 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> { (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, (COPY_TO_REGCLASS FR32:$src, VR128))>; - // extracted scalar math op with insert via blend - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector - (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))), - FR32:$src))), (i8 1))), - (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, - (COPY_TO_REGCLASS FR32:$src, VR128))>; - // vector math op with insert via movss def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst), (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))), (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; - - // vector math op with insert via blend - def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), - (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))), - (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>; } } @@ -3210,21 +3046,6 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; } - // With SSE 4.1, blendi is preferred to movsd, so match those too. - let Predicates = [UseSSE41] in { - // extracted scalar math op with insert via blend - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, - (COPY_TO_REGCLASS FR64:$src, VR128))>; - - // vector math op with insert via blend - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; - } - // Repeat everything for AVX. 
let Predicates = [UseAVX] in { // extracted scalar math op with insert via movsd @@ -3234,22 +3055,10 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> { (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, (COPY_TO_REGCLASS FR64:$src, VR128))>; - // extracted scalar math op with insert via blend - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector - (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))), - FR64:$src))), (i8 1))), - (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, - (COPY_TO_REGCLASS FR64:$src, VR128))>; - // vector math op with insert via movsd def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst), (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))), (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; - - // vector math op with insert via blend - def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), - (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))), - (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>; } } @@ -3295,6 +3104,14 @@ def SSE_RSQRTSS : OpndItins< >; } +def SSE_RSQRT_P : SizeItins< + SSE_RSQRTPS, SSE_RSQRTPS +>; + +def SSE_RSQRT_S : SizeItins< + SSE_RSQRTSS, SSE_RSQRTSS +>; + let Sched = WriteFRcp in { def SSE_RCPP : OpndItins< IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM @@ -3305,12 +3122,21 @@ def SSE_RCPS : OpndItins< >; } +def SSE_RCP_P : SizeItins< + SSE_RCPP, SSE_RCPP +>; + +def SSE_RCP_S : SizeItins< + SSE_RCPS, SSE_RCPS +>; + /// sse_fp_unop_s - SSE1 unops in scalar form /// For the non-AVX defs, we need $src1 to be tied to $dst because /// the HW instructions are 2 operand / destructive. multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, ValueType vt, ValueType ScalarVT, X86MemOperand x86memop, + Operand intmemop, ComplexPattern int_cpat, Intrinsic Intr, SDNode OpNode, Domain d, OpndItins itins, Predicate target, string Suffix> { @@ -3331,7 +3157,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; let mayLoad = 1 in - def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2), + def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2), !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -3351,7 +3177,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, // which has a clobber before the rcp, vs. 
// rcpss mem, %xmm0 let Predicates = [target, OptForSize] in { - def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))), + def : Pat<(Intr int_cpat:$src2), (!cast<Instruction>(NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), addr:$src2)>; } @@ -3360,8 +3186,9 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, ValueType vt, ValueType ScalarVT, X86MemOperand x86memop, + Operand intmemop, ComplexPattern int_cpat, Intrinsic Intr, SDNode OpNode, Domain d, - OpndItins itins, string Suffix> { + OpndItins itins, Predicate target, string Suffix> { let hasSideEffects = 0 in { def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -3377,7 +3204,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, []>, Sched<[itins.Sched.Folded]>; let mayLoad = 1 in def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, x86memop:$src2), + (ins VR128:$src1, intmemop:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -3392,21 +3219,17 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC, // vrcpss mem, %xmm0, %xmm0 // TODO: In theory, we could fold the load, and avoid the stall caused by // the partial register store, either in ExecutionDepsFix or with smarter RA. - let Predicates = [UseAVX] in { + let Predicates = [target] in { def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r) (ScalarVT (IMPLICIT_DEF)), RC:$src)>; - } - let Predicates = [HasAVX] in { def : Pat<(Intr VR128:$src), (!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src, VR128:$src)>; } - let Predicates = [HasAVX, OptForSize] in { - def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))), + let Predicates = [target, OptForSize] in { + def : Pat<(Intr int_cpat:$src2), (!cast<Instruction>("V"#NAME#Suffix##m_Int) (vt (IMPLICIT_DEF)), addr:$src2)>; - } - let Predicates = [UseAVX, OptForSize] in { def : Pat<(ScalarVT (OpNode (load addr:$src))), (!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)), addr:$src)>; @@ -3452,7 +3275,7 @@ let Predicates = prds in { /// sse2_fp_unop_p - SSE2 unops in vector forms. 
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr, SDNode OpNode, OpndItins itins> { -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat("v", OpcodeStr, "pd\t{$src, $dst|$dst, $src}"), @@ -3486,40 +3309,43 @@ let Predicates = [HasAVX] in { } multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { + OpndItins itins, Predicate AVXTarget> { defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem, + ssmem, sse_load_f32, !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, SSEPackedSingle, itins, UseSSE1, "SS">, XS; defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32, - f32mem, + f32mem, ssmem, sse_load_f32, !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode, - SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG; + SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V, + VEX_LIG, VEX_WIG, NotMemoryFoldable; } multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode, - OpndItins itins> { + OpndItins itins, Predicate AVXTarget> { defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem, + sdmem, sse_load_f64, !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD; defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64, - f64mem, + f64mem, sdmem, sse_load_f64, !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd), - OpNode, SSEPackedDouble, itins, "SD">, - XD, VEX_4V, VEX_LIG, VEX_WIG; + OpNode, SSEPackedDouble, itins, AVXTarget, "SD">, + XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable; } // Square root. -defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>, - sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>, - sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>, +defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS, UseAVX>, + sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>, + sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD, UseAVX>, sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>; // Reciprocal approximations. Note that these typically require refinement // in order to obtain suitable precision. -defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>, - sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >; -defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>, - sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>; +defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS, HasAVX>, + sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>; +defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS, HasAVX>, + sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>; // There is no f64 version of the reciprocal approximation instructions. @@ -3535,19 +3361,10 @@ multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix, (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; } - // With SSE 4.1, blendi is preferred to movs*, so match that too. - let Predicates = [UseSSE41] in { - def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))), - (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>; - } - // Repeat for AVX versions of the instructions. 
let Predicates = [HasAVX] in { def : Pat<(VT (Move VT:$dst, (Intr VT:$src))), (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; - - def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))), - (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>; } } @@ -3893,34 +3710,6 @@ let Predicates = [HasAVX, NoVLX] in { (VMOVDQUmr addr:$dst, VR128:$src)>; def : Pat<(store (v16i8 VR128:$src), addr:$dst), (VMOVDQUmr addr:$dst, VR128:$src)>; - - // Special patterns for storing subvector extracts of lower 128-bits - // Its cheaper to just use VMOVDQA/VMOVDQU instead of VEXTRACTF128mr - def : Pat<(alignedstore (v2i64 (extract_subvector - (v4i64 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQAmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v4i32 (extract_subvector - (v8i32 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQAmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v8i16 (extract_subvector - (v16i16 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQAmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(alignedstore (v16i8 (extract_subvector - (v32i8 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQAmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - - def : Pat<(store (v2i64 (extract_subvector - (v4i64 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQUmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v4i32 (extract_subvector - (v8i32 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQUmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v8i16 (extract_subvector - (v16i16 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQUmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; - def : Pat<(store (v16i8 (extract_subvector - (v32i8 VR256:$src), (iPTR 0))), addr:$dst), - (VMOVDQUmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>; } //===---------------------------------------------------------------------===// @@ -4166,9 +3955,14 @@ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32, // SSE2 - Packed Integer Shuffle Instructions //===---------------------------------------------------------------------===// +let Sched = WriteShuffle in +def SSE_PSHUF : OpndItins< + IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI +>; + let ExeDomain = SSEPackedInt in { multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256, - SDNode OpNode, Predicate prd> { + SDNode OpNode, OpndItins itins, Predicate prd> { let Predicates = [HasAVX, prd] in { def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, u8imm:$src2), @@ -4176,15 +3970,15 @@ let Predicates = [HasAVX, prd] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>, VEX_WIG; + itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG; def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)), - (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, - Sched<[WriteShuffleLd]>, VEX_WIG; + (i8 imm:$src2))))], itins.rm>, VEX, + Sched<[itins.Sched.Folded]>, VEX_WIG; } let Predicates = [HasAVX2, prd] in { @@ -4194,15 +3988,15 @@ let Predicates = [HasAVX2, prd] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF_RI>, VEX, VEX_L, 
Sched<[WriteShuffle]>, VEX_WIG; + itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG; def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src1, u8imm:$src2), !strconcat("v", OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)), - (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L, - Sched<[WriteShuffleLd]>, VEX_WIG; + (i8 imm:$src2))))], itins.rm>, VEX, VEX_L, + Sched<[itins.Sched.Folded]>, VEX_WIG; } let Predicates = [UseSSE2] in { @@ -4212,23 +4006,24 @@ let Predicates = [UseSSE2] in { "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))], - IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>; + itins.rr>, Sched<[itins.Sched]>; def mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)), - (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, - Sched<[WriteShuffleLd, ReadAfterLd]>; + (i8 imm:$src2))))], itins.rm>, + Sched<[itins.Sched.Folded]>; } } } // ExeDomain = SSEPackedInt -defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, NoVLX>, PD; -defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, +defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, SSE_PSHUF, + NoVLX>, PD; +defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, SSE_PSHUF, NoVLX_Or_NoBWI>, XS; -defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, +defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, SSE_PSHUF, NoVLX_Or_NoBWI>, XD; //===---------------------------------------------------------------------===// @@ -4237,126 +4032,94 @@ defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, let ExeDomain = SSEPackedInt in { multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, - ValueType ArgVT, SDNode OpNode, PatFrag ld_frag, + ValueType ArgVT, SDNode OpNode, RegisterClass RC, + X86MemOperand x86memop, OpndItins itins, PatFrag ld_frag, bit Is2Addr = 1> { def rr : PDI<opc, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, - (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, - Sched<[WriteShuffle]>; + [(set RC:$dst, + (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))], + itins.rr>, Sched<[itins.Sched]>; def rm : PDI<opc, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, - (OutVT (OpNode (ArgVT VR128:$src1), - (bitconvert (ld_frag addr:$src2)))))]>, - Sched<[WriteShuffleLd, ReadAfterLd]>; -} - -multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, - ValueType ArgVT, SDNode OpNode> { - def Yrr : PDI<opc, MRMSrcReg, - (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, - (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, - Sched<[WriteShuffle]>; - def Yrm : PDI<opc, MRMSrcMem, - (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set 
VR256:$dst, - (OutVT (OpNode (ArgVT VR256:$src1), - (bitconvert (loadv4i64 addr:$src2)))))]>, - Sched<[WriteShuffleLd, ReadAfterLd]>; + [(set RC:$dst, + (OutVT (OpNode (ArgVT RC:$src1), + (bitconvert (ld_frag addr:$src2)))))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT, - ValueType ArgVT, SDNode OpNode, PatFrag ld_frag, + ValueType ArgVT, SDNode OpNode, RegisterClass RC, + X86MemOperand x86memop, OpndItins itins, PatFrag ld_frag, bit Is2Addr = 1> { def rr : SS48I<opc, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, - (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>, - Sched<[WriteShuffle]>; + [(set RC:$dst, + (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))], + itins.rr>, Sched<[itins.Sched]>; def rm : SS48I<opc, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, - (OutVT (OpNode (ArgVT VR128:$src1), - (bitconvert (ld_frag addr:$src2)))))]>, - Sched<[WriteShuffleLd, ReadAfterLd]>; -} - -multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT, - ValueType ArgVT, SDNode OpNode> { - def Yrr : SS48I<opc, MRMSrcReg, - (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, - (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>, - Sched<[WriteShuffle]>; - def Yrm : SS48I<opc, MRMSrcMem, - (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), - !strconcat(OpcodeStr, - "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, - (OutVT (OpNode (ArgVT VR256:$src1), - (bitconvert (loadv4i64 addr:$src2)))))]>, - Sched<[WriteShuffleLd, ReadAfterLd]>; + [(set RC:$dst, + (OutVT (OpNode (ArgVT RC:$src1), + (bitconvert (ld_frag addr:$src2)))))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, - loadv2i64, 0>, VEX_4V, VEX_WIG; - defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, - loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128, + i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128, + i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG; - defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, - loadv2i64, 0>, VEX_4V, VEX_WIG; - defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, - loadv2i64, 0>, VEX_4V; + defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128, + i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128, + i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>, - VEX_4V, VEX_L, VEX_WIG; - defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>, - VEX_4V, VEX_L, VEX_WIG; + defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, 
X86Packss, + VR256, i256mem, SSE_PACK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss, + VR256, i256mem, SSE_PACK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; - defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>, - VEX_4V, VEX_L, VEX_WIG; - defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>, - VEX_4V, VEX_L; + defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus, + VR256,i256mem, SSE_PACK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus, + VR256, i256mem, SSE_PACK, loadv4i64, 0>, + VEX_4V, VEX_L; } let Constraints = "$src1 = $dst" in { - defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, - memopv2i64>; - defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, - memopv2i64>; + defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128, + i128mem, SSE_PACK, memopv2i64>; + defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128, + i128mem, SSE_PACK, memopv2i64>; - defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, - memopv2i64>; + defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128, + i128mem, SSE_PACK, memopv2i64>; - defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, - memopv2i64>; + defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128, + i128mem, SSE_PACK, memopv2i64>; } } // ExeDomain = SSEPackedInt @@ -4364,103 +4127,107 @@ let Constraints = "$src1 = $dst" in { // SSE2 - Packed Integer Unpack Instructions //===---------------------------------------------------------------------===// +let Sched = WriteShuffle in +def SSE_PUNPCK : OpndItins< + IIC_SSE_UNPCK, IIC_SSE_UNPCK +>; + let ExeDomain = SSEPackedInt in { multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt, - SDNode OpNode, PatFrag ld_frag, bit Is2Addr = 1> { + SDNode OpNode, RegisterClass RC, X86MemOperand x86memop, + OpndItins itins, PatFrag ld_frag, bit Is2Addr = 1> { def rr : PDI<opc, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), + (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))], - IIC_SSE_UNPCK>, Sched<[WriteShuffle]>; + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], + itins.rr>, Sched<[itins.Sched]>; def rm : PDI<opc, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), + (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (vt (OpNode VR128:$src1, + [(set RC:$dst, (vt (OpNode RC:$src1, (bitconvert (ld_frag addr:$src2)))))], - IIC_SSE_UNPCK>, - Sched<[WriteShuffleLd, ReadAfterLd]>; -} - -multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt, - SDNode OpNode> { - def Yrr : PDI<opc, MRMSrcReg, - (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), - !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>, - Sched<[WriteShuffle]>; - def Yrm : PDI<opc, MRMSrcMem, - (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), - !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (vt (OpNode VR256:$src1, 
- (bitconvert (loadv4i64 addr:$src2)))))]>, - Sched<[WriteShuffleLd, ReadAfterLd]>; + itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } - let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, - loadv2i64, 0>, VEX_4V, VEX_WIG; - defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, - loadv2i64, 0>, VEX_4V, VEX_WIG; - defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, - loadv2i64, 0>, VEX_4V, VEX_WIG; - defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, - loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128, + i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128, + i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128, + i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128, + i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG; } + let Predicates = [HasAVX, NoVLX] in { - defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, - loadv2i64, 0>, VEX_4V, VEX_WIG; - defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, - loadv2i64, 0>, VEX_4V, VEX_WIG; - defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, - loadv2i64, 0>, VEX_4V, VEX_WIG; - defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, - loadv2i64, 0>, VEX_4V, VEX_WIG; + defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128, + i128mem, SSE_PUNPCK, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128, + i128mem, SSE_PUNPCK, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128, + i128mem, SSE_PUNPCK, loadv2i64, 0>, + VEX_4V, VEX_WIG; + defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128, + i128mem, SSE_PUNPCK, loadv2i64, 0>, + VEX_4V, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>, - VEX_4V, VEX_L, VEX_WIG; - defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>, - VEX_4V, VEX_L, VEX_WIG; - defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>, - VEX_4V, VEX_L, VEX_WIG; - defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>, - VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256, + i256mem, SSE_PUNPCK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256, + i256mem, SSE_PUNPCK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256, + i256mem, SSE_PUNPCK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256, + i256mem, SSE_PUNPCK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; } + let Predicates = [HasAVX2, NoVLX] in { - defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>, - VEX_4V, VEX_L, VEX_WIG; - defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>, - VEX_4V, VEX_L, VEX_WIG; - defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>, - VEX_4V, VEX_L, VEX_WIG; - defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", 
v4i64, X86Unpckh>, - VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256, + i256mem, SSE_PUNPCK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256, + i256mem, SSE_PUNPCK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256, + i256mem, SSE_PUNPCK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; + defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256, + i256mem, SSE_PUNPCK, loadv4i64, 0>, + VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { - defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, - memopv2i64>; - defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, - memopv2i64>; - defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, - memopv2i64>; - defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, - memopv2i64>; - - defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, - memopv2i64>; - defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, - memopv2i64>; - defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, - memopv2i64>; - defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, - memopv2i64>; + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128, + i128mem, SSE_PUNPCK, memopv2i64>; + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128, + i128mem, SSE_PUNPCK, memopv2i64>; + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128, + i128mem, SSE_PUNPCK, memopv2i64>; + defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128, + i128mem, SSE_PUNPCK, memopv2i64>; + + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128, + i128mem, SSE_PUNPCK, memopv2i64>; + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128, + i128mem, SSE_PUNPCK, memopv2i64>; + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128, + i128mem, SSE_PUNPCK, memopv2i64>; + defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128, + i128mem, SSE_PUNPCK, memopv2i64>; } } // ExeDomain = SSEPackedInt @@ -5014,6 +4781,12 @@ let Predicates = [UseSSE3] in { // SSE3 - Replicate Double FP - MOVDDUP //===---------------------------------------------------------------------===// +// FIXME: Improve MOVDDUP/BROADCAST reg/mem scheduling itineraries. +let Sched = WriteFShuffle in +def SSE_MOVDDUP : OpndItins< + IIC_SSE_MOV_LH, IIC_SSE_MOV_LH +>; + multiclass sse3_replicate_dfp<string OpcodeStr> { def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), @@ -5051,23 +4824,11 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup">; let Predicates = [HasAVX, NoVLX] in { def : Pat<(X86Movddup (loadv2f64 addr:$src)), (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>; - - // 256-bit version - def : Pat<(X86Movddup (loadv4i64 addr:$src)), - (VMOVDDUPYrm addr:$src)>; - def : Pat<(X86Movddup (v4i64 VR256:$src)), - (VMOVDDUPYrr VR256:$src)>; } -let Predicates = [HasAVX, NoVLX] in -def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), - (VMOVDDUPrm addr:$src)>; -let Predicates = [HasAVX1Only] in -def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), - (VMOVDDUPrm addr:$src)>; - let Predicates = [UseSSE3] in { - def : Pat<(X86Movddup (memopv2f64 addr:$src)), + // No need for aligned memory as this only loads 64-bits. 
+ def : Pat<(X86Movddup (loadv2f64 addr:$src)), (MOVDDUPrm addr:$src)>; } @@ -5095,7 +4856,7 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), // SSE3 - Arithmetic //===---------------------------------------------------------------------===// -multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, +multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC, X86MemOperand x86memop, OpndItins itins, PatFrag ld_frag, bit Is2Addr = 1> { def rr : I<0xD0, MRMSrcReg, @@ -5103,147 +4864,124 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>, + [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))], itins.rr>, Sched<[itins.Sched]>; def rm : I<0xD0, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rr>, - Sched<[itins.Sched.Folded, ReadAfterLd]>; + [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))], + itins.rr>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { - defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, - f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V, VEX_WIG; - defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, - f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L, VEX_WIG; + defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem, + SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V, + VEX_WIG; + defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem, + SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, + VEX_L, VEX_WIG; } let ExeDomain = SSEPackedDouble in { - defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, - f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V, VEX_WIG; - defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, - f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L, VEX_WIG; + defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem, + SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V, + VEX_WIG; + defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem, + SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, + VEX_L, VEX_WIG; } } let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in { let ExeDomain = SSEPackedSingle in - defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, - f128mem, SSE_ALU_F32P, memopv4f32>, XD; + defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, SSE_ALU_F32P, + memopv4f32>, XD; let ExeDomain = SSEPackedDouble in - defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, - f128mem, SSE_ALU_F64P, memopv2f64>, PD; -} - -// Patterns used to select 'addsub' instructions. 
-let Predicates = [HasAVX] in { - def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), - (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))), - (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; - def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), - (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))), - (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; - - def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))), - (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))), - (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>; - def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))), - (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>; - def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))), - (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>; -} - -let Predicates = [UseSSE3] in { - def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))), - (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))), - (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>; - def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))), - (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>; - def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))), - (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>; + defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, SSE_ALU_F64P, + memopv2f64>, PD; } //===---------------------------------------------------------------------===// // SSE3 Instructions //===---------------------------------------------------------------------===// +let Sched = WriteFHAdd in +def SSE_HADDSUB : OpndItins< + IIC_SSE_HADDSUB_RR, IIC_SSE_HADDSUB_RM +>; + // Horizontal ops multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, - X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, - bit Is2Addr = 1> { + X86MemOperand x86memop, SDNode OpNode, OpndItins itins, + PatFrag ld_frag, bit Is2Addr = 1> { def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, - Sched<[WriteFHAdd]>; + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr>, + Sched<[itins.Sched]>; def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], - IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>; + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, - X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag, - bit Is2Addr = 1> { + X86MemOperand x86memop, SDNode OpNode, OpndItins itins, + PatFrag ld_frag, bit Is2Addr = 1> { def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>, - Sched<[WriteFHAdd]>; + [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr>, + Sched<[itins.Sched]>; def rm : 
S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2), !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))], - IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>; + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Predicates = [HasAVX] in { let ExeDomain = SSEPackedSingle in { defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, - X86fhadd, loadv4f32, 0>, VEX_4V, VEX_WIG; + X86fhadd, SSE_HADDSUB, loadv4f32, 0>, VEX_4V, VEX_WIG; defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, - X86fhsub, loadv4f32, 0>, VEX_4V, VEX_WIG; + X86fhsub, SSE_HADDSUB, loadv4f32, 0>, VEX_4V, VEX_WIG; defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, - X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; + X86fhadd, SSE_HADDSUB, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, - X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; + X86fhsub, SSE_HADDSUB, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG; } let ExeDomain = SSEPackedDouble in { defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, - X86fhadd, loadv2f64, 0>, VEX_4V, VEX_WIG; + X86fhadd, SSE_HADDSUB, loadv2f64, 0>, VEX_4V, VEX_WIG; defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, - X86fhsub, loadv2f64, 0>, VEX_4V, VEX_WIG; + X86fhsub, SSE_HADDSUB, loadv2f64, 0>, VEX_4V, VEX_WIG; defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, - X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; + X86fhadd, SSE_HADDSUB, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, - X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; + X86fhsub, SSE_HADDSUB, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG; } } let Constraints = "$src1 = $dst" in { let ExeDomain = SSEPackedSingle in { defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd, - memopv4f32>; + SSE_HADDSUB, memopv4f32>; defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub, - memopv4f32>; + SSE_HADDSUB, memopv4f32>; } let ExeDomain = SSEPackedDouble in { defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd, - memopv2f64>; + SSE_HADDSUB, memopv2f64>; defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub, - memopv2f64>; + SSE_HADDSUB, memopv2f64>; } } @@ -5251,59 +4989,63 @@ let Constraints = "$src1 = $dst" in { // SSSE3 - Packed Absolute Instructions //===---------------------------------------------------------------------===// +let Sched = WriteVecALU in +def SSE_PABS : OpndItins< + IIC_SSE_PABS_RR, IIC_SSE_PABS_RM +>; /// SS3I_unop_rm_int - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt, - SDNode OpNode, PatFrag ld_frag> { + SDNode OpNode, OpndItins itins, PatFrag ld_frag> { def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (vt (OpNode VR128:$src)))], - IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>; + itins.rr>, Sched<[itins.Sched]>; def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, (vt (OpNode (bitconvert (ld_frag addr:$src)))))], - IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>; + itins.rm>, Sched<[itins.Sched.Folded]>; } /// SS3I_unop_rm_int_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}. 
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt, - SDNode OpNode> { + SDNode OpNode, OpndItins itins> { def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (vt (OpNode VR256:$src)))]>, - Sched<[WriteVecALU]>; + [(set VR256:$dst, (vt (OpNode VR256:$src)))], itins.rr>, + Sched<[itins.Sched]>; def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR256:$dst, - (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>, - Sched<[WriteVecALULd]>; + (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))], itins.rm>, + Sched<[itins.Sched.Folded]>; } let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { - defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, loadv2i64>, VEX, VEX_WIG; - defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, loadv2i64>, VEX, VEX_WIG; + defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG; + defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG; } let Predicates = [HasAVX, NoVLX] in { - defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, loadv2i64>, VEX, VEX_WIG; + defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG; } let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { - defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs>, VEX, VEX_L, VEX_WIG; - defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs>, VEX, VEX_L, VEX_WIG; + defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG; + defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG; } let Predicates = [HasAVX2, NoVLX] in { - defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs>, VEX, VEX_L, VEX_WIG; + defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG; } -defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, memopv2i64>; -defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, memopv2i64>; -defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, memopv2i64>; +defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SSE_PABS, memopv2i64>; +defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SSE_PABS, memopv2i64>; +defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SSE_PABS, memopv2i64>; //===---------------------------------------------------------------------===// // SSSE3 - Packed Binary Operator Instructions @@ -5367,7 +5109,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !if(Is2Addr, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, + [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], itins.rr>, Sched<[itins.Sched]>; def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), @@ -5376,7 +5118,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr, !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), [(set VR128:$dst, (IntId128 VR128:$src1, - (bitconvert (ld_frag addr:$src2))))]>, + (bitconvert (ld_frag addr:$src2))))], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } @@ -5523,81 +5265,46 @@ defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16, // SSSE3 - Packed Align Instruction Patterns //===---------------------------------------------------------------------===// -multiclass ssse3_palignr<string asm, 
bit Is2Addr = 1> { +let Sched = WriteShuffle in +def SSE_PALIGN : OpndItins< + IIC_SSE_PALIGNRR, IIC_SSE_PALIGNRM +>; + +multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC, + PatFrag memop_frag, X86MemOperand x86memop, + OpndItins itins, bit Is2Addr = 1> { let hasSideEffects = 0 in { - def rri : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, u8imm:$src3), + def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>; + [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))], + itins.rr>, Sched<[itins.Sched]>; let mayLoad = 1 in - def rmi : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), !if(Is2Addr, !strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(asm, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>; - } -} - -multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> { - let hasSideEffects = 0 in { - def Yrri : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2, u8imm:$src3), - !strconcat(asm, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, Sched<[WriteShuffle]>; - let mayLoad = 1 in - def Yrmi : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, i256mem:$src2, u8imm:$src3), - !strconcat(asm, - "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, Sched<[WriteShuffleLd, ReadAfterLd]>; + [(set RC:$dst, (VT (X86PAlignr RC:$src1, + (bitconvert (memop_frag addr:$src2)), + (i8 imm:$src3))))], + itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>; } } -let Predicates = [HasAVX] in - defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V, VEX_WIG; -let Predicates = [HasAVX2] in - defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L, VEX_WIG; +let Predicates = [HasAVX, NoVLX_Or_NoBWI] in + defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64, + i128mem, SSE_PALIGN, 0>, VEX_4V, VEX_WIG; +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in + defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64, + i256mem, SSE_PALIGN, 0>, VEX_4V, VEX_L, VEX_WIG; let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in - defm PALIGNR : ssse3_palignr<"palignr">; - -let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { -def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>; -} - -let Predicates = [HasAVX, NoVLX_Or_NoBWI] in { -def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>; -def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>; -def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNRrri 
VR128:$src1, VR128:$src2, imm:$imm)>; -def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>; -} - -let Predicates = [UseSSSE3] in { -def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>; -def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>; -def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>; -def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))), - (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>; -} + defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64, + i128mem, SSE_PALIGN>; //===---------------------------------------------------------------------===// // SSSE3 - Thread synchronization @@ -5911,8 +5618,8 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> { (ins i8mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1), - imm:$src2)))), addr:$dst)]>; + [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))), + addr:$dst)]>; } let Predicates = [HasAVX, NoBWI] in @@ -5936,8 +5643,8 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> { (ins i16mem:$dst, VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1), - imm:$src2)))), addr:$dst)]>; + [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))), + addr:$dst)]>; } let Predicates = [HasAVX, NoBWI] in @@ -6147,18 +5854,6 @@ let ExeDomain = SSEPackedSingle in { defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>; } -let Predicates = [UseSSE41] in { - // If we're inserting an element from a load or a null pshuf of a load, - // fold the load into the insertps instruction. - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32 - (scalar_to_vector (loadf32 addr:$src2))), (i8 0)), - imm:$src3)), - (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; - def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd - (loadv4f32 addr:$src2), (i8 0)), imm:$src3)), - (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>; -} - let Predicates = [UseAVX] in { // If we're inserting an element from a vbroadcast of a load, fold the // load into the X86insertps instruction. @@ -6176,8 +5871,9 @@ let Predicates = [UseAVX] in { multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, + ValueType VT32, ValueType VT64, PatFrag mem_frag32, PatFrag mem_frag64, - Intrinsic V4F32Int, Intrinsic V2F64Int> { + SDNode OpNode> { let ExeDomain = SSEPackedSingle in { // Intrinsic operation, reg. 
// Vector intrinsic operation, reg @@ -6185,7 +5881,7 @@ let ExeDomain = SSEPackedSingle in { (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))], + [(set RC:$dst, (VT32 (OpNode RC:$src1, imm:$src2)))], IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>; // Vector intrinsic operation, mem @@ -6194,7 +5890,7 @@ let ExeDomain = SSEPackedSingle in { !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (V4F32Int (mem_frag32 addr:$src1),imm:$src2))], + (VT32 (OpNode (mem_frag32 addr:$src1),imm:$src2)))], IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>; } // ExeDomain = SSEPackedSingle @@ -6204,8 +5900,8 @@ let ExeDomain = SSEPackedDouble in { (outs RC:$dst), (ins RC:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))], - IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>; + [(set RC:$dst, (VT64 (OpNode RC:$src1, imm:$src2)))], + IIC_SSE_ROUNDPD_REG>, Sched<[WriteFAdd]>; // Vector intrinsic operation, mem def PDm : SS4AIi8<opcpd, MRMSrcMem, @@ -6213,14 +5909,14 @@ let ExeDomain = SSEPackedDouble in { !strconcat(OpcodeStr, "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, - (V2F64Int (mem_frag64 addr:$src1),imm:$src2))], - IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>; + (VT64 (OpNode (mem_frag64 addr:$src1),imm:$src2)))], + IIC_SSE_ROUNDPD_REG>, Sched<[WriteFAddLd]>; } // ExeDomain = SSEPackedDouble } multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd, string OpcodeStr> { -let ExeDomain = GenericDomain, hasSideEffects = 0 in { +let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3), !strconcat(OpcodeStr, @@ -6233,7 +5929,9 @@ let ExeDomain = GenericDomain, hasSideEffects = 0 in { !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 +let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { def SDr : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3), !strconcat(OpcodeStr, @@ -6246,12 +5944,12 @@ let ExeDomain = GenericDomain, hasSideEffects = 0 in { !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>, Sched<[WriteFAddLd, ReadAfterLd]>; -} // ExeDomain = GenericDomain, hasSideEffects = 0 +} // ExeDomain = SSEPackedDouble, hasSideEffects = 0 } multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd, string OpcodeStr> { -let ExeDomain = GenericDomain, hasSideEffects = 0 in { +let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in { def SSr : SS4AIi8<opcss, MRMSrcReg, (outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, @@ -6264,7 +5962,9 @@ let ExeDomain = GenericDomain, hasSideEffects = 0 in { !strconcat(OpcodeStr, "ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = SSEPackedSingle, hasSideEffects = 0 +let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in { def SDr : SS4AIi8<opcsd, MRMSrcReg, (outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2), !strconcat(OpcodeStr, @@ -6277,14 +5977,13 @@ let ExeDomain = GenericDomain, hasSideEffects = 0 in { !strconcat(OpcodeStr, "sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), []>, Sched<[WriteFAddLd, ReadAfterLd]>; -} // ExeDomain = GenericDomain, hasSideEffects = 0 +} 
// ExeDomain = SSEPackedDouble, hasSideEffects = 0 } multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd, - string OpcodeStr, - Intrinsic F32Int, - Intrinsic F64Int, bit Is2Addr = 1> { -let ExeDomain = GenericDomain, isCodeGenOnly = 1 in { + string OpcodeStr, ValueType VT32, ValueType VT64, + SDNode OpNode, bit Is2Addr = 1> { +let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in { def SSr_Int : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, @@ -6292,7 +5991,7 @@ let ExeDomain = GenericDomain, isCodeGenOnly = 1 in { "ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>, + [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, Sched<[WriteFAdd]>; def SSm_Int : SS4AIi8<opcss, MRMSrcMem, @@ -6303,9 +6002,11 @@ let ExeDomain = GenericDomain, isCodeGenOnly = 1 in { !strconcat(OpcodeStr, "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, + (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>, Sched<[WriteFAddLd, ReadAfterLd]>; +} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 +let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in { def SDr_Int : SS4AIi8<opcsd, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3), !if(Is2Addr, @@ -6313,7 +6014,7 @@ let ExeDomain = GenericDomain, isCodeGenOnly = 1 in { "sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"), !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), - [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>, + [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>, Sched<[WriteFAdd]>; def SDm_Int : SS4AIi8<opcsd, MRMSrcMem, @@ -6324,26 +6025,25 @@ let ExeDomain = GenericDomain, isCodeGenOnly = 1 in { !strconcat(OpcodeStr, "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), [(set VR128:$dst, - (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, + (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, Sched<[WriteFAddLd, ReadAfterLd]>; -} // ExeDomain = GenericDomain, isCodeGenOnly = 1 +} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 } // FP round - roundss, roundps, roundsd, roundpd -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { // Intrinsic form - defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128, - loadv4f32, loadv2f64, - int_x86_sse41_round_ps, - int_x86_sse41_round_pd>, VEX, VEX_WIG; - defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256, - loadv8f32, loadv4f64, - int_x86_avx_round_ps_256, - int_x86_avx_round_pd_256>, VEX, VEX_L, VEX_WIG; - defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", - int_x86_sse41_round_ss, - int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG, VEX_WIG; - defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG; + defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128, v4f32, + v2f64, loadv4f32, loadv2f64, X86VRndScale>, + VEX, VEX_WIG; + defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256, v8f32, + v4f64, loadv8f32, loadv4f64, X86VRndScale>, + VEX, VEX_L, VEX_WIG; +} +let Predicates = [HasAVX, NoAVX512] in { + defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", v4f32, v2f64, + X86RndScales, 0>, VEX_4V, VEX_LIG, VEX_WIG; + defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG, VEX_WIG; } let 
Predicates = [UseAVX] in { @@ -6369,7 +6069,7 @@ let Predicates = [UseAVX] in { (VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>; } -let Predicates = [HasAVX] in { +let Predicates = [HasAVX, NoVLX] in { def : Pat<(v4f32 (ffloor VR128:$src)), (VROUNDPSr VR128:$src, (i32 0x9))>; def : Pat<(v4f32 (fnearbyint VR128:$src)), @@ -6415,15 +6115,13 @@ let Predicates = [HasAVX] in { (VROUNDYPDr VR256:$src, (i32 0xB))>; } -defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128, - memopv4f32, memopv2f64, int_x86_sse41_round_ps, - int_x86_sse41_round_pd>; +defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128, v4f32, v2f64, + memopv4f32, memopv2f64, X86VRndScale>; defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">; let Constraints = "$src1 = $dst" in -defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", - int_x86_sse41_round_ss, int_x86_sse41_round_sd>; +defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", v4f32, v2f64, X86RndScales>; let Predicates = [UseSSE41] in { def : Pat<(ffloor FR32:$src), @@ -6474,6 +6172,11 @@ let Predicates = [UseSSE41] in { // SSE4.1 - Packed Bit Test //===----------------------------------------------------------------------===// +let Sched = WriteVecLogic in +def SSE_PTEST : OpndItins< + IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM +>; + // ptest instruction we'll lower to this in X86ISelLowering primarily from // the intel intrinsic that corresponds to this. let Defs = [EFLAGS], Predicates = [HasAVX] in { @@ -6572,22 +6275,20 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in { Sched<[WriteFAddLd]>, XS; } - - // SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16. multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, - Intrinsic IntId128, PatFrag ld_frag, + SDNode OpNode, PatFrag ld_frag, X86FoldableSchedWrite Sched> { def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (IntId128 VR128:$src))]>, + [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>, Sched<[Sched]>; def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set VR128:$dst, - (IntId128 (bitconvert (ld_frag addr:$src))))]>, + (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>, Sched<[Sched.Folded]>; } @@ -6595,10 +6296,10 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr, // model, although the naming is misleading. let Predicates = [HasAVX] in defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw", - int_x86_sse41_phminposuw, loadv2i64, + X86phminpos, loadv2i64, WriteVecIMul>, VEX, VEX_WIG; defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw", - int_x86_sse41_phminposuw, memopv2i64, + X86phminpos, memopv2i64, WriteVecIMul>; /// SS48I_binop_rm - Simple SSE41 binary operator. 
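For reference, the ffloor/fceil/ftrunc/fnearbyint/frint patterns above hard-code
the ROUND*/VROUND* immediates 0x9, 0xA, 0xB, 0xC and 0x4. A minimal C++ sketch
(not part of the patch) of how those immediates decompose into the SSE4.1
rounding-control fields, matching the _MM_FROUND_* macros from smmintrin.h:

  // imm[1:0] = rounding mode (00 nearest, 01 down, 10 up, 11 toward zero)
  // imm[2]   = 1 -> use the MXCSR rounding mode instead of imm[1:0]
  // imm[3]   = 1 -> suppress the precision (inexact) exception
  #include <cstdint>
  #include <cstdio>

  int main() {
    struct { const char *Node; uint8_t Imm; } Rows[] = {
        {"ffloor",     0x9}, // suppress inexact | round down
        {"fceil",      0xA}, // suppress inexact | round up
        {"ftrunc",     0xB}, // suppress inexact | round toward zero
        {"fnearbyint", 0xC}, // suppress inexact | use MXCSR mode
        {"frint",      0x4}, // use MXCSR mode, inexact may be raised
    };
    for (auto &R : Rows)
      std::printf("%-10s -> 0x%X\n", R.Node, (unsigned)R.Imm);
    return 0;
  }

Bit 3 is set for the nodes that must not raise the inexact exception, which is
why frint (0x4) is the only one without it.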
@@ -6763,8 +6464,8 @@ let Constraints = "$src1 = $dst" in { /// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, Intrinsic IntId, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, bit Is2Addr = 1, - OpndItins itins = DEFAULT_ITINS> { + X86MemOperand x86memop, bit Is2Addr, + OpndItins itins> { let isCommutable = 1 in def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), @@ -6791,8 +6492,8 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, /// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop, bit Is2Addr = 1, - OpndItins itins = DEFAULT_ITINS> { + X86MemOperand x86memop, bit Is2Addr, + OpndItins itins> { let isCommutable = 1 in def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), @@ -6816,6 +6517,21 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, Sched<[itins.Sched.Folded, ReadAfterLd]>; } +def BlendCommuteImm2 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue() & 0x03; + return getI8Imm(Imm ^ 0x03, SDLoc(N)); +}]>; + +def BlendCommuteImm4 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue() & 0x0f; + return getI8Imm(Imm ^ 0x0f, SDLoc(N)); +}]>; + +def BlendCommuteImm8 : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue() & 0xff; + return getI8Imm(Imm ^ 0xff, SDLoc(N)); +}]>; + let Predicates = [HasAVX] in { let isCommutable = 0 in { defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, @@ -6823,26 +6539,6 @@ let Predicates = [HasAVX] in { DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG; } - let ExeDomain = SSEPackedSingle in { - defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32, - VR128, loadv4f32, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG; - defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32, - VR256, loadv8f32, f256mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG; - } - let ExeDomain = SSEPackedDouble in { - defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64, - VR128, loadv2f64, f128mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG; - defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64, - VR256, loadv4f64, f256mem, 0, - DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG; - } - defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16, - VR128, loadv2i64, i128mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_WIG; - let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, loadv4f32, f128mem, 0, @@ -6863,9 +6559,6 @@ let Predicates = [HasAVX2] in { VR256, loadv4i64, i256mem, 0, DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG; } - defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16, - VR256, loadv4i64, i256mem, 0, - DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { @@ -6874,17 +6567,7 @@ let Constraints = "$src1 = $dst" in { VR128, memopv2i64, i128mem, 1, SSE_MPSADBW_ITINS>; } - let ExeDomain = SSEPackedSingle in - defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32, - VR128, memopv4f32, f128mem, - 1, SSE_INTALU_ITINS_FBLEND_P>; - let ExeDomain = SSEPackedDouble in - defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64, - VR128, memopv2f64, f128mem, - 1, 
SSE_INTALU_ITINS_FBLEND_P>; - defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16, - VR128, memopv2i64, i128mem, - 1, SSE_INTALU_ITINS_BLEND_P>; + let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memopv4f32, f128mem, 1, @@ -6895,6 +6578,82 @@ let Constraints = "$src1 = $dst" in { SSE_DPPD_ITINS>; } +/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate +multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType OpVT, RegisterClass RC, PatFrag memop_frag, + X86MemOperand x86memop, bit Is2Addr, Domain d, + OpndItins itins, SDNodeXForm commuteXForm> { +let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in { + let isCommutable = 1 in + def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))], + itins.rr>, Sched<[itins.Sched]>; + def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, x86memop:$src2, u8imm:$src3), + !if(Is2Addr, + !strconcat(OpcodeStr, + "\t{$src3, $src2, $dst|$dst, $src2, $src3}"), + !strconcat(OpcodeStr, + "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")), + [(set RC:$dst, + (OpVT (OpNode RC:$src1, + (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>, + Sched<[itins.Sched.Folded, ReadAfterLd]>; +} + + // Pattern to commute if load is in first source. + def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), + RC:$src1, imm:$src3)), + (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, + (commuteXForm imm:$src3))>; +} + +let Predicates = [HasAVX] in { + defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32, + VR128, loadv4f32, f128mem, 0, SSEPackedSingle, + DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>, + VEX_4V, VEX_WIG; + defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32, + VR256, loadv8f32, f256mem, 0, SSEPackedSingle, + DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm8>, + VEX_4V, VEX_L, VEX_WIG; + defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64, + VR128, loadv2f64, f128mem, 0, SSEPackedDouble, + DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm2>, + VEX_4V, VEX_WIG; + defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64, + VR256, loadv4f64, f256mem, 0, SSEPackedDouble, + DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>, + VEX_4V, VEX_L, VEX_WIG; + defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16, + VR128, loadv2i64, i128mem, 0, SSEPackedInt, + DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>, + VEX_4V, VEX_WIG; +} + +let Predicates = [HasAVX2] in { + defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16, + VR256, loadv4i64, i256mem, 0, SSEPackedInt, + DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>, + VEX_4V, VEX_L, VEX_WIG; +} + +defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32, + VR128, memopv4f32, f128mem, 1, SSEPackedSingle, + SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm4>; +defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64, + VR128, memopv2f64, f128mem, 1, SSEPackedDouble, + SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm2>; +defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16, + VR128, memopv2i64, i128mem, 1, SSEPackedInt, + SSE_INTALU_ITINS_BLEND_P, BlendCommuteImm8>; + // For insertion into the zero index (low half) of a 256-bit vector, 
it is // more efficient to generate a blend with immediate instead of an insert*128. let Predicates = [HasAVX] in { @@ -7003,16 +6762,12 @@ let Predicates = [HasAVX2] in { // movs[s/d] are 1-2 byte shorter instructions. let Predicates = [UseAVX] in { let AddedComplexity = 15 in { - // Move scalar to XMM zero-extended, zeroing a VR128 then do a - // MOVS{S,D} to the lower bits. - def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))), - (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>; def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))), (VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>; def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))), (VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>; def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))), - (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>; + (VMOVSDrr (v2f64 (V_SET0)), (COPY_TO_REGCLASS FR64:$src, VR128))>; // Move low f32 and clear high bits. def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))), @@ -7049,7 +6804,7 @@ let Predicates = [UseSSE41], AddedComplexity = 15 in { let Uses = [XMM0], Constraints = "$src1 = $dst" in { multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag, X86MemOperand x86memop, Intrinsic IntId, - OpndItins itins = DEFAULT_ITINS> { + OpndItins itins> { def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, @@ -7210,7 +6965,7 @@ multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> { (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; } -let Defs = [EFLAGS], usesCustomInserter = 1 in { +let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>, Requires<[HasAVX]>, VEX_WIG; defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>, @@ -7247,7 +7002,7 @@ multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> { (bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>; } -let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { +let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>, Requires<[HasAVX]>; defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>, @@ -7284,7 +7039,7 @@ multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> { (bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>; } -let Defs = [EFLAGS], usesCustomInserter = 1 in { +let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>, Requires<[HasAVX]>, VEX_WIG; defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>, @@ -7322,7 +7077,7 @@ multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> { imm:$src5))]>; } -let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in { +let Defs = [EFLAGS], Uses = [EAX, EDX], hasNoSchedulingInfo = 1, usesCustomInserter = 1 in { defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>, Requires<[HasAVX]>; defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>, @@ -7400,8 +7155,9 @@ let Constraints = "$src1 = $dst" in { // SHA-NI Instructions //===----------------------------------------------------------------------===// +// FIXME: Is there a better scheduler itinerary for SHA than WriteVecIMul? 
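For reference, the BlendCommuteImm2/4/8 transforms and the "commute if load is
in first source" pattern in SS41I_blend_rmi above work because each bit of a
blend immediate selects one source for one element, so swapping the two sources
is just a bitwise complement of the mask bits in use. A minimal C++ sketch (not
part of the patch; commuteBlendImm is an illustrative name):

  #include <cassert>
  #include <cstdint>

  // NumElts is 2 for BLENDPD/VBLENDPD, 4 for BLENDPS/VBLENDPDY and
  // 8 for PBLENDW/VBLENDPSY, mirroring BlendCommuteImm2/4/8.
  uint8_t commuteBlendImm(uint8_t Imm, unsigned NumElts) {
    uint8_t Mask = uint8_t((1u << NumElts) - 1); // 0x03, 0x0F or 0xFF
    return uint8_t((Imm & Mask) ^ Mask);         // flip every element selector
  }

  int main() {
    // blendps $0x5 takes elements 0 and 2 from src2; with the sources swapped
    // the same result needs elements 1 and 3 from the (new) src2.
    assert(commuteBlendImm(0x5, 4) == 0xA);
    assert(commuteBlendImm(0x1, 2) == 0x2);
    assert(commuteBlendImm(0x0F, 8) == 0xF0);
    return 0;
  }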
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, - bit UsesXMM0 = 0> { + OpndItins itins, bit UsesXMM0 = 0> { def rr : I<Opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !if(UsesXMM0, @@ -7409,7 +7165,8 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")), [!if(UsesXMM0, (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)), - (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8; + (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))], itins.rr>, + T8, Sched<[itins.Sched]>; def rm : I<Opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), @@ -7420,7 +7177,8 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId, (set VR128:$dst, (IntId VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)), (set VR128:$dst, (IntId VR128:$src1, - (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8; + (bc_v4i32 (memopv2i64 addr:$src2)))))], itins.rm>, T8, + Sched<[itins.Sched.Folded, ReadAfterLd]>; } let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { @@ -7429,24 +7187,32 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in { "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, VR128:$src2, - (i8 imm:$src3)))]>, TA; + (i8 imm:$src3)))], IIC_SSE_INTMUL_P_RR>, TA, + Sched<[WriteVecIMul]>; def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), "sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}", [(set VR128:$dst, (int_x86_sha1rnds4 VR128:$src1, (bc_v4i32 (memopv2i64 addr:$src2)), - (i8 imm:$src3)))]>, TA; + (i8 imm:$src3)))], IIC_SSE_INTMUL_P_RM>, TA, + Sched<[WriteVecIMulLd, ReadAfterLd]>; - defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>; - defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>; - defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>; + defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte, + SSE_INTMUL_ITINS_P>; + defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1, + SSE_INTMUL_ITINS_P>; + defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2, + SSE_INTMUL_ITINS_P>; let Uses=[XMM0] in - defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>; + defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, + SSE_INTMUL_ITINS_P, 1>; - defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>; - defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>; + defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1, + SSE_INTMUL_ITINS_P>; + defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2, + SSE_INTMUL_ITINS_P>; } // Aliases with explicit %xmm0 @@ -7459,46 +7225,60 @@ def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}", // AES-NI Instructions //===----------------------------------------------------------------------===// -multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128, - PatFrag ld_frag, bit Is2Addr = 1> { - def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>, - Sched<[WriteAESDecEnc]>; - def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, 
$dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, - (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>, - Sched<[WriteAESDecEncLd, ReadAfterLd]>; +multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, + Intrinsic IntId, PatFrag ld_frag, + bit Is2Addr = 0, RegisterClass RC = VR128, + X86MemOperand MemOp = i128mem> { + let AsmString = OpcodeStr## + !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}", + "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { + def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2), "", + [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>, + Sched<[WriteAESDecEnc]>; + def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, MemOp:$src2), "", + [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>, + Sched<[WriteAESDecEncLd, ReadAfterLd]>; + } } // Perform One Round of an AES Encryption/Decryption Flow -let Predicates = [HasAVX, HasAES] in { +let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in { defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc", - int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V, VEX_WIG; + int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG; defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast", - int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V, VEX_WIG; + int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG; defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec", - int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V, VEX_WIG; + int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG; defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast", - int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V, VEX_WIG; + int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG; +} + +let Predicates = [NoVLX, HasVAES] in { + defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc", + int_x86_aesni_aesenc_256, loadv4i64, 0, VR256, + i256mem>, VEX_4V, VEX_L, VEX_WIG; + defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast", + int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256, + i256mem>, VEX_4V, VEX_L, VEX_WIG; + defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec", + int_x86_aesni_aesdec_256, loadv4i64, 0, VR256, + i256mem>, VEX_4V, VEX_L, VEX_WIG; + defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast", + int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256, + i256mem>, VEX_4V, VEX_L, VEX_WIG; } let Constraints = "$src1 = $dst" in { defm AESENC : AESI_binop_rm_int<0xDC, "aesenc", - int_x86_aesni_aesenc, memopv2i64>; + int_x86_aesni_aesenc, memopv2i64, 1>; defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast", - int_x86_aesni_aesenclast, memopv2i64>; + int_x86_aesni_aesenclast, memopv2i64, 1>; defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec", - int_x86_aesni_aesdec, memopv2i64>; + int_x86_aesni_aesdec, memopv2i64, 1>; defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast", - int_x86_aesni_aesdeclast, memopv2i64>; + int_x86_aesni_aesdeclast, memopv2i64, 1>; } // Perform the AES InvMixColumn Transformation @@ -7558,63 +7338,103 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst), // PCLMUL Instructions //===----------------------------------------------------------------------===// -// AVX carry-less Multiplication instructions -let isCommutable = 1 in -def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, u8imm:$src3), - "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR128:$dst, - (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>, - Sched<[WriteCLMul]>, VEX_WIG; - -def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs 
VR128:$dst), - (ins VR128:$src1, i128mem:$src2, u8imm:$src3), - "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, - (loadv2i64 addr:$src2), imm:$src3))]>, - Sched<[WriteCLMulLd, ReadAfterLd]>, VEX_WIG; - -// Carry-less Multiplication instructions -let Constraints = "$src1 = $dst" in { -let isCommutable = 1 in -def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2, u8imm:$src3), - "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set VR128:$dst, - (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))], - IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>; - -def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2, u8imm:$src3), - "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", - [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1, - (memopv2i64 addr:$src2), imm:$src3))], - IIC_SSE_PCLMULQDQ_RM>, - Sched<[WriteCLMulLd, ReadAfterLd]>; -} // Constraints = "$src1 = $dst" +// Immediate transform to help with commuting. +def PCLMULCommuteImm : SDNodeXForm<imm, [{ + uint8_t Imm = N->getZExtValue(); + return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N)); +}]>; - -multiclass pclmul_alias<string asm, int immop> { - def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), - (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>; - - def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"), - (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>; - - def : InstAlias<!strconcat("vpclmul", asm, - "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), - (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop), - 0>; - - def : InstAlias<!strconcat("vpclmul", asm, - "dq {$src2, $src1, $dst|$dst, $src1, $src2}"), - (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop), - 0>; +// SSE carry-less Multiplication instructions +let Predicates = [NoAVX, HasPCLMUL] in { + let Constraints = "$src1 = $dst" in { + let isCommutable = 1 in + def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst), + (ins VR128:$src1, VR128:$src2, u8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))], + IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>; + + def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst), + (ins VR128:$src1, i128mem:$src2, u8imm:$src3), + "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}", + [(set VR128:$dst, + (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2), + imm:$src3))], + IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMulLd, ReadAfterLd]>; + } // Constraints = "$src1 = $dst" + + def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1, + (i8 imm:$src3)), + (PCLMULQDQrm VR128:$src1, addr:$src2, + (PCLMULCommuteImm imm:$src3))>; +} // Predicates = [NoAVX, HasPCLMUL] + +// SSE aliases +foreach HI = ["hq","lq"] in +foreach LO = ["hq","lq"] in { + def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", + (PCLMULQDQrr VR128:$dst, VR128:$src, + !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; + def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}", + (PCLMULQDQrm VR128:$dst, i128mem:$src, + !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>; } -defm : pclmul_alias<"hqhq", 0x11>; -defm : pclmul_alias<"hqlq", 0x01>; -defm : pclmul_alias<"lqhq", 0x10>; -defm : pclmul_alias<"lqlq", 0x00>; + +// AVX carry-less Multiplication instructions +multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp, + PatFrag 
LdFrag, Intrinsic IntId> { + let isCommutable = 1 in + def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set RC:$dst, + (IntId RC:$src1, RC:$src2, imm:$src3))]>, + Sched<[WriteCLMul]>; + + def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, MemOp:$src2, u8imm:$src3), + "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", + [(set RC:$dst, + (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>, + Sched<[WriteCLMulLd, ReadAfterLd]>; + + // We can commute a load in the first operand by swapping the sources and + // rotating the immediate. + def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)), + (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2, + (PCLMULCommuteImm imm:$src3))>; +} + +let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in +defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64, + int_x86_pclmulqdq>, VEX_4V, VEX_WIG; + +let Predicates = [NoVLX, HasVPCLMULQDQ] in +defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64, + int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG; + +multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC, + X86MemOperand MemOp, string Hi, string Lo> { + def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2, + !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; + def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}", + (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2, + !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>; +} + +multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC, + X86MemOperand MemOp> { + defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">; + defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">; + defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">; + defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">; +} + +// AVX aliases +defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>; +defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>; //===----------------------------------------------------------------------===// // SSE4A Instructions @@ -7628,29 +7448,33 @@ def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst), (ins VR128:$src, u8imm:$len, u8imm:$idx), "extrq\t{$idx, $len, $src|$src, $len, $idx}", [(set VR128:$dst, (X86extrqi VR128:$src, imm:$len, - imm:$idx))]>, PD; + imm:$idx))], IIC_SSE_INTALU_P_RR>, + PD, Sched<[WriteVecALU]>; def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), "extrq\t{$mask, $src|$src, $mask}", [(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src, - VR128:$mask))]>, PD; + VR128:$mask))], IIC_SSE_INTALU_P_RR>, + PD, Sched<[WriteVecALU]>; def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx), "insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}", [(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2, - imm:$len, imm:$idx))]>, XD; + imm:$len, imm:$idx))], IIC_SSE_INTALU_P_RR>, + XD, Sched<[WriteVecALU]>; def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src, VR128:$mask), "insertq\t{$mask, $src|$src, $mask}", [(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src, - VR128:$mask))]>, XD; + VR128:$mask))], IIC_SSE_INTALU_P_RR>, + XD, Sched<[WriteVecALU]>; } } // ExeDomain = SSEPackedInt // Non-temporal (unaligned) scalar stores. 
let AddedComplexity = 400 in { // Prefer non-temporal versions -let mayStore = 1, SchedRW = [WriteStore] in { +let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in { def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src), "movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS; @@ -7712,6 +7536,15 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256, v4f64, v2f64, WriteFShuffle256>, VEX_L; +let Predicates = [HasAVX, NoVLX] in { + def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (VBROADCASTSSrm addr:$src)>; + def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))), + (VBROADCASTSSYrm addr:$src)>; + def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))), + (VBROADCASTSDYrm addr:$src)>; +} + //===----------------------------------------------------------------------===// // VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both // halves of a 256-bit vector. @@ -7852,21 +7685,23 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>, - VEX_4V; + [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))], + IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>; def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, - VEX_4V, VEX_L; + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))], + IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>; def mr : AVX8I<opc_mr, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + [(IntSt addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>, + VEX_4V, Sched<[WriteStore]>; def Ymr : AVX8I<opc_mr, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L; + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>, + VEX_4V, VEX_L, Sched<[WriteStore]>; } let ExeDomain = SSEPackedSingle in @@ -7885,6 +7720,17 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", //===----------------------------------------------------------------------===// // VPERMIL - Permute Single and Double Floating-Point Values // + +let Sched = WriteFShuffle in +def AVX_VPERMILV : OpndItins< + IIC_SSE_SHUFP, IIC_SSE_SHUFP +>; + +let Sched = WriteFShuffle in +def AVX_VPERMIL : OpndItins< + IIC_SSE_SHUFP, IIC_SSE_SHUFP +>; + multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, RegisterClass RC, X86MemOperand x86memop_f, X86MemOperand x86memop_i, PatFrag i_frag, @@ -7937,83 +7783,81 @@ let isCommutable = 1 in def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2, + [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$src3))))]>, VEX_4V, VEX_L, Sched<[WriteFShuffle]>; def 
VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, f256mem:$src2, u8imm:$src3), "vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}", - [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2), + [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2), (i8 imm:$src3)))]>, VEX_4V, VEX_L, Sched<[WriteFShuffleLd, ReadAfterLd]>; } +// Immediate transform to help with commuting. +def Perm2XCommuteImm : SDNodeXForm<imm, [{ + return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N)); +}]>; + let Predicates = [HasAVX] in { -def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, - (loadv4f64 addr:$src2), (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +// Pattern with load in other operand. +def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2), + VR256:$src1, (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; } let Predicates = [HasAVX1Only] in { -def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>; - -def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, - (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, (loadv4i64 addr:$src2), (i8 imm:$imm))), (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; -def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, - (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; -def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, - (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))), - (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>; +// Pattern with load in other operand. +def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2), + VR256:$src1, (i8 imm:$imm))), + (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; } //===----------------------------------------------------------------------===// // VZERO - Zero YMM registers // // Note, these instruction do not affect the YMM16-YMM31. 
+let SchedRW = [WriteSystem] in { let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7, YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in { // Zero All YMM registers def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall", - [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>, VEX_WIG; + [(int_x86_avx_vzeroall)], IIC_AVX_ZERO>, PS, VEX, VEX_L, + Requires<[HasAVX]>, VEX_WIG; // Zero Upper bits of YMM registers def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper", - [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>, VEX_WIG; -} + [(int_x86_avx_vzeroupper)], IIC_AVX_ZERO>, PS, VEX, + Requires<[HasAVX]>, VEX_WIG; +} // Defs +} // SchedRW //===----------------------------------------------------------------------===// // Half precision conversion instructions //===----------------------------------------------------------------------===// -multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { +multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop> { def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src), "vcvtph2ps\t{$src, $dst|$dst, $src}", - [(set RC:$dst, (Int VR128:$src))]>, + [(set RC:$dst, (X86cvtph2ps VR128:$src))]>, T8PD, VEX, Sched<[WriteCvtF2F]>; let hasSideEffects = 0, mayLoad = 1 in def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), - "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX, - Sched<[WriteCvtF2FLd]>; + "vcvtph2ps\t{$src, $dst|$dst, $src}", + [(set RC:$dst, (X86cvtph2ps (bc_v8i16 + (loadv2i64 addr:$src))))]>, + T8PD, VEX, Sched<[WriteCvtF2FLd]>; } -multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { +multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop> { def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst), (ins RC:$src1, i32u8imm:$src2), "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>, + [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>, TAPD, VEX, Sched<[WriteCvtF2F]>; let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteCvtF2FLd, WriteRMW] in @@ -8023,32 +7867,31 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> { TAPD, VEX; } -let Predicates = [HasF16C] in { - defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>; - defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L; - defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>; - defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L; +let Predicates = [HasF16C, NoVLX] in { + defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem>; + defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem>, VEX_L; + defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem>; + defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem>, VEX_L; // Pattern match vcvtph2ps of a scalar i64 load. 
- def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)), + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)), + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(int_x86_vcvtph2ps_128 (bitconvert - (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert + (v2i64 (scalar_to_vector (loadi64 addr:$src))))))), (VCVTPH2PSrm addr:$src)>; - def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16 - (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), - addr:$dst), - (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16 - (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))), - addr:$dst), - (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; - def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)), - addr:$dst), - (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; + def : Pat<(store (f64 (extractelt + (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (i64 (extractelt + (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))), + (iPTR 0))), addr:$dst), + (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>; + def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst), + (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>; } // Patterns for matching conversions from float to half-float and vice versa. @@ -8075,10 +7918,10 @@ let Predicates = [HasF16C, NoVLX] in { // AVX2 Instructions //===----------------------------------------------------------------------===// -/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate -multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, +/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate +multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, ValueType OpVT, RegisterClass RC, PatFrag memop_frag, - X86MemOperand x86memop> { + X86MemOperand x86memop, SDNodeXForm commuteXForm> { let isCommutable = 1 in def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$src3), @@ -8094,12 +7937,19 @@ multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode, (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)), imm:$src3)))]>, Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V; + + // Pattern to commute if load is in first source. + def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)), + RC:$src1, imm:$src3)), + (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2, + (commuteXForm imm:$src3))>; } -defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32, - VR128, loadv2i64, i128mem>; -defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32, - VR256, loadv4i64, i256mem>, VEX_L; +defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32, + VR128, loadv2i64, i128mem, BlendCommuteImm4>; +defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32, + VR256, loadv4i64, i256mem, BlendCommuteImm8>, + VEX_L; // For insertion into the zero index (low half) of a 256-bit vector, it is // more efficient to generate a blend with immediate instead of an insert*128. 
@@ -8187,12 +8037,23 @@ defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32, defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, v2i64, v4i64, NoVLX>; -let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { +let Predicates = [HasAVX2, NoVLX] in { // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD. def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))), (VPBROADCASTQrm addr:$src)>; def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))), (VPBROADCASTQYrm addr:$src)>; + + def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VPBROADCASTDrm addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))), + (VPBROADCASTDYrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VPBROADCASTQrm addr:$src)>; + def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))), + (VPBROADCASTQYrm addr:$src)>; +} +let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in { // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably. // This means we'll encounter truncated i32 loads; match that here. def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))), @@ -8279,6 +8140,13 @@ let Predicates = [HasAVX, NoVLX] in { // 128bit broadcasts: def : Pat<(v2f64 (X86VBroadcast f64:$src)), (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>; + def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))), + (VMOVDDUPrm addr:$src)>; + + def : Pat<(v2f64 (X86VBroadcast v2f64:$src)), + (VMOVDDUPrr VR128:$src)>; + def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))), + (VMOVDDUPrm addr:$src)>; } let Predicates = [HasAVX1Only] in { @@ -8306,12 +8174,24 @@ let Predicates = [HasAVX1Only] in { def : Pat<(v2i64 (X86VBroadcast i64:$src)), (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VMOVDDUPrm addr:$src)>; } //===----------------------------------------------------------------------===// // VPERM - Permute instructions // +let Sched = WriteFShuffle256 in +def AVX2_PERMV_F : OpndItins< + IIC_SSE_SHUFP, IIC_SSE_SHUFP +>; + +let Sched = WriteShuffle256 in +def AVX2_PERMV_I : OpndItins< + IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI +>; + multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, ValueType OpVT, X86FoldableSchedWrite Sched, X86MemOperand memOp> { @@ -8385,24 +8265,10 @@ def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst), (i8 imm:$src3)))]>, Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L; -let Predicates = [HasAVX2] in { -def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; -def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))), - (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>; - -def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)), - (i8 imm:$imm))), - (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; -def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, - (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))), - (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; -def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)), - (i8 imm:$imm))), - (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>; -} +let Predicates = [HasAVX2] in +def : Pat<(v4i64 
(X86VPerm2x128 (loadv4i64 addr:$src2), + VR256:$src1, (i8 imm:$imm))), + (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>; //===----------------------------------------------------------------------===// @@ -8456,20 +8322,23 @@ multiclass avx2_pmovmask<string OpcodeStr, def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V; + [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))], + IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>; def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>, - VEX_4V, VEX_L; + [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))], + IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>; def mr : AVX28I<0x8e, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V; + [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>, + VEX_4V, Sched<[WriteStore]>; def Ymr : AVX28I<0x8e, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L; + [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>, + VEX_4V, VEX_L, Sched<[WriteStore]>; } defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", @@ -8616,40 +8485,63 @@ let Predicates = [HasAVX2, NoVLX] in { (VPSRAVDYrm VR256:$src1, addr:$src2)>; } - - //===----------------------------------------------------------------------===// // VGATHER - GATHER Operations -multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256, - X86MemOperand memop128, X86MemOperand memop256> { + +// FIXME: Improve scheduling of gather instructions. 
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx, + ValueType VTy, PatFrag GatherNode128, + PatFrag GatherNode256, RegisterClass RC256, + X86MemOperand memop128, X86MemOperand memop256, + ValueType MTx = VTx, ValueType MTy = VTy> { def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb), (ins VR128:$src1, memop128:$src2, VR128:$mask), !strconcat(OpcodeStr, "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), - []>, VEX; + [(set (VTx VR128:$dst), (MTx VR128:$mask_wb), + (GatherNode128 VR128:$src1, VR128:$mask, + vectoraddr:$src2))]>, + VEX, Sched<[WriteLoad]>; def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb), (ins RC256:$src1, memop256:$src2, RC256:$mask), !strconcat(OpcodeStr, "\t{$mask, $src2, $dst|$dst, $src2, $mask}"), - []>, VEX, VEX_L; -} - -let mayLoad = 1, hasSideEffects = 0, Constraints - = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" - in { - defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W; - defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W; - defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>; - defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>; - - let ExeDomain = SSEPackedDouble in { - defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W; - defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W; - } - - let ExeDomain = SSEPackedSingle in { - defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>; - defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>; + [(set (VTy RC256:$dst), (MTy RC256:$mask_wb), + (GatherNode256 RC256:$src1, RC256:$mask, + vectoraddr:$src2))]>, + VEX, VEX_L, Sched<[WriteLoad]>; +} + +let Predicates = [UseAVX2] in { + let mayLoad = 1, hasSideEffects = 0, Constraints + = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb" + in { + defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32, + mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W; + defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64, + mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W; + defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32, + mgatherv8i32, VR256, vx128mem, vy256mem>; + defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64, + mgatherv4i64, VR128, vx64mem, vy128mem>; + + let ExeDomain = SSEPackedDouble in { + defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32, + mgatherv4i32, VR256, vx128mem, vx256mem, + v2i64, v4i64>, VEX_W; + defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64, + mgatherv4i64, VR256, vx128mem, vy256mem, + v2i64, v4i64>, VEX_W; + } + + let ExeDomain = SSEPackedSingle in { + defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32, + mgatherv8i32, VR256, vx128mem, vy256mem, + v4i32, v8i32>; + defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64, + mgatherv4i64, VR128, vx64mem, vy128mem, + v4i32, v4i32>; + } } } @@ -8708,3 +8600,82 @@ def : Pat<(xor FR128:$src1, FR128:$src2), (COPY_TO_REGCLASS (XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128), (COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>; + +//===----------------------------------------------------------------------===// +// GFNI instructions 
+//===----------------------------------------------------------------------===// + +multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT, + RegisterClass RC, PatFrag MemOpFrag, + X86MemOperand X86MemOp, bit Is2Addr = 0> { + let ExeDomain = SSEPackedInt, + AsmString = !if(Is2Addr, + OpcodeStr##"\t{$src2, $dst|$dst, $src2}", + OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in { + let isCommutable = 1 in + def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "", + [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))], + SSE_INTALU_ITINS_P.rr>, + Sched<[SSE_INTALU_ITINS_P.Sched]>, T8PD; + + def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "", + [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, + (bitconvert (MemOpFrag addr:$src2)))))], + SSE_INTALU_ITINS_P.rm>, + Sched<[SSE_INTALU_ITINS_P.Sched.Folded, ReadAfterLd]>, T8PD; + } +} + +multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT, + SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag, + X86MemOperand X86MemOp, bit Is2Addr = 0> { + let AsmString = !if(Is2Addr, + OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}", + OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in { + def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst), + (ins RC:$src1, RC:$src2, u8imm:$src3), "", + [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))], + SSE_INTALU_ITINS_P.rr, SSEPackedInt>, + Sched<[WriteVecALU]>; + def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst), + (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "", + [(set RC:$dst, (OpVT (OpNode RC:$src1, + (bitconvert (MemOpFrag addr:$src2)), + imm:$src3)))], + SSE_INTALU_ITINS_P.rm, SSEPackedInt>, + Sched<[WriteVecALU.Folded, ReadAfterLd]>; + } +} + +multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> { + let Constraints = "$src1 = $dst", + Predicates = [HasGFNI, UseSSE2] in + defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode, + VR128, loadv2i64, i128mem, 1>; + let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in { + defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128, + loadv2i64, i128mem>, VEX_4V, VEX_W; + defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256, + loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W; + } +} + +// GF2P8MULB +let Constraints = "$src1 = $dst", + Predicates = [HasGFNI, UseSSE2] in +defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64, + i128mem, 1>; +let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in { + defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64, + i128mem>, VEX_4V; + defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64, + i256mem>, VEX_4V, VEX_L; +} +// GF2P8AFFINEINVQB, GF2P8AFFINEQB +let isCommutable = 0 in { + defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb", + X86GF2P8affineinvqb>, TAPD; + defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb", + X86GF2P8affineqb>, TAPD; +} + diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td index c847be7ec099..bdf478600279 100644 --- a/lib/Target/X86/X86InstrSVM.td +++ b/lib/Target/X86/X86InstrSVM.td @@ -15,48 +15,49 @@ //===----------------------------------------------------------------------===// // SVM instructions +let SchedRW = [WriteSystem] in { // 0F 01 D9 -def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB; +def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", [], IIC_SVM>, TB; // 0F 01 DC -def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB; +def STGI : I<0x01, MRM_DC, 
(outs), (ins), "stgi", [], IIC_STGI>, TB; // 0F 01 DD -def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB; +def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", [], IIC_CLGI>, TB; // 0F 01 DE let Uses = [EAX] in -def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB; +def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", [], IIC_SKINIT>, TB; // 0F 01 D8 let Uses = [EAX] in def VMRUN32 : I<0x01, MRM_D8, (outs), (ins), - "vmrun\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>; + "vmrun\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>; let Uses = [RAX] in def VMRUN64 : I<0x01, MRM_D8, (outs), (ins), - "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; + "vmrun\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>; // 0F 01 DA let Uses = [EAX] in def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins), - "vmload\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>; + "vmload\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>; let Uses = [RAX] in def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins), - "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; + "vmload\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>; // 0F 01 DB let Uses = [EAX] in def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins), - "vmsave\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>; + "vmsave\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>; let Uses = [RAX] in def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins), - "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>; + "vmsave\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>; // 0F 01 DF let Uses = [EAX, ECX] in def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins), - "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[Not64BitMode]>; + "invlpga\t{%ecx, %eax|eax, ecx}", [], IIC_INVLPG>, TB, Requires<[Not64BitMode]>; let Uses = [RAX, ECX] in def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins), - "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>; - + "invlpga\t{%ecx, %rax|rax, ecx}", [], IIC_INVLPG>, TB, Requires<[In64BitMode]>; +} // SchedRW diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td index 0efb383e1c8d..43e1752f2df2 100644 --- a/lib/Target/X86/X86InstrShiftRotate.td +++ b/lib/Target/X86/X86InstrShiftRotate.td @@ -83,7 +83,8 @@ def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst), OpSize32; def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst), "shl{q}\t{%cl, $dst|$dst, cl}", - [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; + [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>, + Requires<[In64BitMode]>; } def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src), "shl{b}\t{$src, $dst|$dst, $src}", @@ -100,7 +101,7 @@ def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src), def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src), "shl{q}\t{$src, $dst|$dst, $src}", [(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; // Shift by 1 def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst), @@ -118,7 +119,7 @@ def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst), def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst), "shl{q}\t$dst", [(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; } // SchedRW let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { @@ -183,7 +184,8 @@ def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst), OpSize32; def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst), "shr{q}\t{%cl, 
$dst|$dst, cl}", - [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>; + [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>, + Requires<[In64BitMode]>; } def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src), "shr{b}\t{$src, $dst|$dst, $src}", @@ -200,7 +202,7 @@ def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src), def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src), "shr{q}\t{$src, $dst|$dst, $src}", [(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; // Shift by 1 def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst), @@ -218,7 +220,7 @@ def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst), def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst), "shr{q}\t$dst", [(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; } // SchedRW let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { @@ -296,7 +298,7 @@ def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst), def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst), "sar{q}\t{%cl, $dst|$dst, cl}", [(store (sra (loadi64 addr:$dst), CL), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; } def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src), "sar{b}\t{$src, $dst|$dst, $src}", @@ -313,7 +315,7 @@ def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src), def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src), "sar{q}\t{$src, $dst|$dst, $src}", [(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; // Shift by 1 def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst), @@ -331,7 +333,7 @@ def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst), def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst), "sar{q}\t$dst", [(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; } // SchedRW //===----------------------------------------------------------------------===// @@ -418,9 +420,10 @@ def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst), def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt), "rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst), - "rcl{q}\t$dst", [], IIC_SR>; + "rcl{q}\t$dst", [], IIC_SR>, Requires<[In64BitMode]>; def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt), - "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; + "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, + Requires<[In64BitMode]>; def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst), "rcr{b}\t$dst", [], IIC_SR>; @@ -435,9 +438,10 @@ def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst), def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt), "rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32; def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst), - "rcr{q}\t$dst", [], IIC_SR>; + "rcr{q}\t$dst", [], IIC_SR>, Requires<[In64BitMode]>; def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt), - "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>; + "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, + Requires<[In64BitMode]>; } // Uses = [EFLAGS] let Uses = [CL, EFLAGS] in { @@ -448,7 +452,8 @@ def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst), def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst), "rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst), - 
"rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, + Requires<[In64BitMode]>; def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst), "rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; @@ -457,7 +462,8 @@ def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst), def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst), "rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32; def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst), - "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>; + "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, + Requires<[In64BitMode]>; } // Uses = [CL, EFLAGS] } // SchedRW } // hasSideEffects = 0 @@ -532,7 +538,7 @@ def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst), def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst), "rol{q}\t{%cl, $dst|$dst, cl}", [(store (rotl (loadi64 addr:$dst), CL), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; } def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1), "rol{b}\t{$src1, $dst|$dst, $src1}", @@ -549,7 +555,7 @@ def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1), def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1), "rol{q}\t{$src1, $dst|$dst, $src1}", [(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; // Rotate by 1 def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst), @@ -567,7 +573,7 @@ def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst), def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst), "rol{q}\t$dst", [(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; } // SchedRW let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in { @@ -640,7 +646,7 @@ def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst), def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst), "ror{q}\t{%cl, $dst|$dst, cl}", [(store (rotr (loadi64 addr:$dst), CL), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; } def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src), "ror{b}\t{$src, $dst|$dst, $src}", @@ -657,7 +663,7 @@ def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src), def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src), "ror{q}\t{$src, $dst|$dst, $src}", [(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; // Rotate by 1 def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst), @@ -675,7 +681,7 @@ def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst), def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst), "ror{q}\t$dst", [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)], - IIC_SR>; + IIC_SR>, Requires<[In64BitMode]>; } // SchedRW @@ -961,16 +967,40 @@ let Predicates = [HasBMI2] in { (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; } - // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor - // + // Artificially lower the complexity so that we'll favor // mov (%ecx), %esi // shl $imm, $esi // // over // - // movb $imm %al + // movb $imm, %al // shlx %al, (%ecx), %esi - // - // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole - // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible. 
+ let AddedComplexity = -20 in { + def : Pat<(sra (loadi32 addr:$src1), GR8:$src2), + (SARX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(sra (loadi64 addr:$src1), GR8:$src2), + (SARX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(srl (loadi32 addr:$src1), GR8:$src2), + (SHRX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(srl (loadi64 addr:$src1), GR8:$src2), + (SHRX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + + def : Pat<(shl (loadi32 addr:$src1), GR8:$src2), + (SHLX32rm addr:$src1, + (INSERT_SUBREG + (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(shl (loadi64 addr:$src1), GR8:$src2), + (SHLX64rm addr:$src1, + (INSERT_SUBREG + (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + } } diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td index 2e5350ce979e..40d2dca4f9ec 100644 --- a/lib/Target/X86/X86InstrSystem.td +++ b/lib/Target/X86/X86InstrSystem.td @@ -19,7 +19,8 @@ let Defs = [RAX, RDX] in TB; let Defs = [RAX, RCX, RDX] in - def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB; + def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)], + IIC_RDTSCP>, TB; // CPU flow control instructions @@ -33,7 +34,7 @@ def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB; // Interrupt and SysCall Instructions. let Uses = [EFLAGS] in - def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>; + def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>, Requires<[Not64BitMode]>; def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3", [(int_x86_int (i8 3))], IIC_INT3>; } // SchedRW @@ -154,13 +155,14 @@ def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src), //===----------------------------------------------------------------------===// // Segment override instruction prefixes -def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>; -def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>; -def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>; -def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>; -def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>; -def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>; - +let SchedRW = [WriteNop] in { +def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", [], IIC_NOP>; +def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", [], IIC_NOP>; +def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", [], IIC_NOP>; +def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", [], IIC_NOP>; +def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", [], IIC_NOP>; +def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", [], IIC_NOP>; +} // SchedRW //===----------------------------------------------------------------------===// // Moves to and from segment registers. 
@@ -175,11 +177,7 @@ def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>; let mayStore = 1 in { def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src), - "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16; -def MOV32ms : I<0x8C, MRMDestMem, (outs), (ins i32mem:$dst, SEGMENT_REG:$src), - "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32; -def MOV64ms : RI<0x8C, MRMDestMem, (outs), (ins i64mem:$dst, SEGMENT_REG:$src), - "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSizeIgnore; } def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src), "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16; @@ -189,11 +187,7 @@ def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src), "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>; let mayLoad = 1 in { def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src), - "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16; -def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src), - "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32; -def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src), - "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>; + "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSizeIgnore; } } // SchedRW @@ -489,6 +483,60 @@ def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB; } // SchedRW //===----------------------------------------------------------------------===// +// CET instructions +let SchedRW = [WriteSystem], Predicates = [HasSHSTK] in{ + let Uses = [SSP] in { + let Defs = [SSP] in { + def INCSSPD : I<0xAE, MRM5r, (outs), (ins GR32:$src), "incsspd\t$src", + [(int_x86_incsspd GR32:$src)]>, XS; + def INCSSPQ : RI<0xAE, MRM5r, (outs), (ins GR64:$src), "incsspq\t$src", + [(int_x86_incsspq GR64:$src)]>, XS; + } // Defs SSP + + let Constraints = "$src = $dst" in { + def RDSSPD : I<0x1E, MRM1r, (outs GR32:$dst), (ins GR32:$src), + "rdsspd\t$dst", + [(set GR32:$dst, (int_x86_rdsspd GR32:$src))]>, XS; + def RDSSPQ : RI<0x1E, MRM1r, (outs GR64:$dst), (ins GR64:$src), + "rdsspq\t$dst", + [(set GR64:$dst, (int_x86_rdsspq GR64:$src))]>, XS; + } + + let Defs = [SSP] in { + def SAVEPREVSSP : I<0x01, MRM_EA, (outs), (ins), "saveprevssp", + [(int_x86_saveprevssp)]>, XS; + def RSTORSSP : I<0x01, MRM5m, (outs), (ins i32mem:$src), + "rstorssp\t$src", + [(int_x86_rstorssp addr:$src)]>, XS; + } // Defs SSP + } // Uses SSP + + def WRSSD : I<0xF6, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "wrssd\t{$src, $dst|$dst, $src}", + [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8PS; + def WRSSQ : RI<0xF6, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "wrssq\t{$src, $dst|$dst, $src}", + [(int_x86_wrssq GR64:$src, addr:$dst)]>, T8PS; + def WRUSSD : I<0xF5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), + "wrussd\t{$src, $dst|$dst, $src}", + [(int_x86_wrussd GR32:$src, addr:$dst)]>, T8PD; + def WRUSSQ : RI<0xF5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), + "wrussq\t{$src, $dst|$dst, $src}", + [(int_x86_wrussq GR64:$src, addr:$dst)]>, T8PD; + + let Defs = [SSP] in { + let Uses = [SSP] in { + def SETSSBSY : I<0x01, MRM_E8, (outs), (ins), "setssbsy", + [(int_x86_setssbsy)]>, XS; + } // Uses SSP + + def CLRSSBSY : I<0xAE, MRM6m, (outs), (ins i32mem:$src), + "clrssbsy\t$src", + 
[(int_x86_clrssbsy addr:$src)]>, XS; + } // Defs SSP +} // SchedRW && HasSHSTK + +//===----------------------------------------------------------------------===// // XSAVE instructions let SchedRW = [WriteSystem] in { let Predicates = [HasXSAVE] in { @@ -496,67 +544,60 @@ let Defs = [EDX, EAX], Uses = [ECX] in def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB; let Uses = [EDX, EAX, ECX] in - def XSETBV : I<0x01, MRM_D1, (outs), (ins), - "xsetbv", + def XSETBV : I<0x01, MRM_D1, (outs), (ins), + "xsetbv", [(int_x86_xsetbv ECX, EDX, EAX)]>, TB; } // HasXSAVE let Uses = [EDX, EAX] in { -let Predicates = [HasXSAVE] in { - def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), - "xsave\t$dst", - [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB; - def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), - "xsave64\t$dst", - [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; - def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor\t$dst", - [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB; - def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), - "xrstor64\t$dst", - [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; -} -let Predicates = [HasXSAVEOPT] in { - def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), - "xsaveopt\t$dst", - [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS; - def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), - "xsaveopt64\t$dst", - [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>; -} -let Predicates = [HasXSAVEC] in { - def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), - "xsavec\t$dst", - [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB; - def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), - "xsavec64\t$dst", - [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; -} -let Predicates = [HasXSAVES] in { - def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), - "xsaves\t$dst", - [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB; - def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), - "xsaves64\t$dst", - [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; - def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), - "xrstors\t$dst", - [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB; - def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), - "xrstors64\t$dst", - [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>; -} +def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave\t$dst", + [(int_x86_xsave addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>; +def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst), + "xsave64\t$dst", + [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>; +def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), + "xrstor\t$dst", + [(int_x86_xrstor addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>; +def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst), + "xrstor64\t$dst", + [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>; +def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt\t$dst", + [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT]>; +def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst), + "xsaveopt64\t$dst", + [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>; +def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec\t$dst", + 
[(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC]>; +def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst), + "xsavec64\t$dst", + [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC, In64BitMode]>; +def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves\t$dst", + [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>; +def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst), + "xsaves64\t$dst", + [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>; +def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), + "xrstors\t$dst", + [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>; +def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst), + "xrstors64\t$dst", + [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>; } // Uses } // SchedRW //===----------------------------------------------------------------------===// // VIA PadLock crypto instructions -let Defs = [RAX, RDI], Uses = [RDX, RDI] in +let Defs = [RAX, RDI], Uses = [RDX, RDI], SchedRW = [WriteSystem] in def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB; def : InstAlias<"xstorerng", (XSTORE)>; +let SchedRW = [WriteSystem] in { let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in { def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB; def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB; @@ -571,67 +612,110 @@ let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in { } let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB; +} // SchedRW + //==-----------------------------------------------------------------------===// // PKU - enable protection key -let usesCustomInserter = 1 in { +let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { def WRPKRU : PseudoI<(outs), (ins GR32:$src), [(int_x86_wrpkru GR32:$src)]>; def RDPKRU : PseudoI<(outs GR32:$dst), (ins), [(set GR32:$dst, (int_x86_rdpkru))]>; } +let SchedRW = [WriteSystem] in { let Defs = [EAX, EDX], Uses = [ECX] in - def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB; + def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", [], IIC_PKU>, TB; let Uses = [EAX, ECX, EDX] in - def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB; + def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", [], IIC_PKU>, TB; +} // SchedRW //===----------------------------------------------------------------------===// // FS/GS Base Instructions -let Predicates = [HasFSGSBase, In64BitMode] in { +let Predicates = [HasFSGSBase, In64BitMode], SchedRW = [WriteSystem] in { def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins), "rdfsbase{l}\t$dst", - [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS; + [(set GR32:$dst, (int_x86_rdfsbase_32))], + IIC_SEGMENT_BASE_R>, XS; def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins), "rdfsbase{q}\t$dst", - [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS; + [(set GR64:$dst, (int_x86_rdfsbase_64))], + IIC_SEGMENT_BASE_R>, XS; def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins), "rdgsbase{l}\t$dst", - [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS; + [(set GR32:$dst, (int_x86_rdgsbase_32))], + IIC_SEGMENT_BASE_R>, XS; def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins), "rdgsbase{q}\t$dst", - [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS; + [(set GR64:$dst, (int_x86_rdgsbase_64))], + IIC_SEGMENT_BASE_R>, XS; def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src), "wrfsbase{l}\t$src", - 
[(int_x86_wrfsbase_32 GR32:$src)]>, XS; + [(int_x86_wrfsbase_32 GR32:$src)], + IIC_SEGMENT_BASE_W>, XS; def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src), "wrfsbase{q}\t$src", - [(int_x86_wrfsbase_64 GR64:$src)]>, XS; + [(int_x86_wrfsbase_64 GR64:$src)], + IIC_SEGMENT_BASE_W>, XS; def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src), "wrgsbase{l}\t$src", - [(int_x86_wrgsbase_32 GR32:$src)]>, XS; + [(int_x86_wrgsbase_32 GR32:$src)], IIC_SEGMENT_BASE_W>, XS; def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src), "wrgsbase{q}\t$src", - [(int_x86_wrgsbase_64 GR64:$src)]>, XS; + [(int_x86_wrgsbase_64 GR64:$src)], + IIC_SEGMENT_BASE_W>, XS; } //===----------------------------------------------------------------------===// // INVPCID Instruction +let SchedRW = [WriteSystem] in { def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invpcid\t{$src2, $src1|$src1, $src2}", [], IIC_INVPCID>, T8PD, Requires<[Not64BitMode]>; def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invpcid\t{$src2, $src1|$src1, $src2}", [], IIC_INVPCID>, T8PD, Requires<[In64BitMode]>; +} // SchedRW //===----------------------------------------------------------------------===// // SMAP Instruction -let Defs = [EFLAGS] in { - def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB; - def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB; +let Defs = [EFLAGS], SchedRW = [WriteSystem] in { + def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", [], IIC_SMAP>, TB; + def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", [], IIC_SMAP>, TB; } //===----------------------------------------------------------------------===// // SMX Instruction +let SchedRW = [WriteSystem] in { let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in { - def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB; -} + def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", [], IIC_SMX>, TB; +} // Uses, Defs +} // SchedRW + +//===----------------------------------------------------------------------===// +// RDPID Instruction +let SchedRW = [WriteSystem] in { +def RDPID32 : I<0xC7, MRM7r, (outs GR32:$src), (ins), + "rdpid\t$src", [], IIC_RDPID>, XS, + Requires<[Not64BitMode]>; +def RDPID64 : I<0xC7, MRM7r, (outs GR64:$src), (ins), + "rdpid\t$src", [], IIC_RDPID>, XS, + Requires<[In64BitMode]>; +} // SchedRW + +//===----------------------------------------------------------------------===// +// PTWRITE Instruction +let SchedRW = [WriteSystem] in { + +def PTWRITEm: I<0xAE, MRM4m, (outs), (ins i32mem:$dst), + "ptwrite{l}\t$dst", [], IIC_PTWRITE>, XS; +def PTWRITE64m : RI<0xAE, MRM4m, (outs), (ins i64mem:$dst), + "ptwrite{q}\t$dst", [], IIC_PTWRITE>, XS, + Requires<[In64BitMode]>; + +def PTWRITEr : I<0xAE, MRM4r, (outs), (ins GR32:$dst), + "ptwrite{l}\t$dst", [], IIC_PTWRITE>, XS; +def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst), + "ptwrite{q}\t$dst", [], IIC_PTWRITE>, XS, + Requires<[In64BitMode]>; +} // SchedRW diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td index 61aac58a491f..10c6eef78639 100644 --- a/lib/Target/X86/X86InstrTSX.td +++ b/lib/Target/X86/X86InstrTSX.td @@ -18,6 +18,8 @@ def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>, [SDNPHasChain, SDNPSideEffect]>; +let SchedRW = [WriteSystem] in { + let usesCustomInserter = 1 in def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins), "# XBEGIN", [(set GR32:$dst, 
(int_x86_xbegin))]>, @@ -45,11 +47,14 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins), def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm), "xabort\t$imm", [(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>; +} // SchedRW // HLE prefixes +let SchedRW = [WriteSystem] in { let isAsmParserOnly = 1 in { def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>; def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>; } +} // SchedRW diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td index 315a69e6a2a2..4bb2c204b368 100644 --- a/lib/Target/X86/X86InstrVMX.td +++ b/lib/Target/X86/X86InstrVMX.td @@ -15,56 +15,66 @@ //===----------------------------------------------------------------------===// // VMX instructions +let SchedRW = [WriteSystem] in { // 66 0F 38 80 def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invept\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD, Requires<[Not64BitMode]>; def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invept\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD, Requires<[In64BitMode]>; + // 66 0F 38 81 def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2), - "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invvpid\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD, Requires<[Not64BitMode]>; def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2), - "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD, + "invvpid\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD, Requires<[In64BitMode]>; + // 0F 01 C1 -def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB; +def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", [], IIC_VMX>, TB; def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), "vmclear\t$vmcs", []>, PD; + // OF 01 D4 -def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB; +def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", [], IIC_VMX>, TB; + // 0F 01 C2 -def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB; +def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", [], IIC_VMX>, TB; + // 0F 01 C3 -def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB; +def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", [], IIC_VMX>, TB; def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs), - "vmptrld\t$vmcs", []>, PS; + "vmptrld\t$vmcs", [], IIC_VMX>, PS; def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs), - "vmptrst\t$vmcs", []>, TB; + "vmptrst\t$vmcs", [], IIC_VMX>, PS; def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src), - "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; + "vmread{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>; def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src), - "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; + "vmread{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>; + let mayStore = 1 in { def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src), - "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; + "vmread{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>; def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src), - "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; -} + "vmread{l}\t{$src, $dst|$dst, $src}", 
[], IIC_VMX>, PS, Requires<[Not64BitMode]>; +} // mayStore + def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src), - "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; + "vmwrite{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>; def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src), - "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; + "vmwrite{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>; + let mayLoad = 1 in { def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src), - "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>; + "vmwrite{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>; def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src), - "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>; -} + "vmwrite{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>; +} // mayLoad + // 0F 01 C4 def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB; def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon), "vmxon\t$vmxon", []>, XS; - +} // SchedRW diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td new file mode 100644 index 000000000000..c1cb4dcb16be --- /dev/null +++ b/lib/Target/X86/X86InstrVecCompiler.td @@ -0,0 +1,586 @@ +//===- X86InstrVecCompiler.td - Vector Compiler Patterns ---*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file describes the various vector pseudo instructions used by the +// compiler, as well as Pat patterns used during instruction selection. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// No op bitconverts +//===----------------------------------------------------------------------===// + +// Bitcasts between 128-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>; +def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>; +def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>; + +// Bitcasts between 256-bit vector types. 
Return the original type since +// no instruction is needed for the conversion +def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>; +def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>; +def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>; +def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>; +def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>; +def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>; +def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>; +def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>; +def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>; +def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>; +def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>; +def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>; +def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>; +def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>; +def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>; +def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>; +def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>; +def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>; +def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>; +def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>; +def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>; +def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>; +def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>; +def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>; +def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>; +def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>; +def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>; +def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>; +def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>; +def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>; + +// Bitcasts between 512-bit vector types. Return the original type since +// no instruction is needed for the conversion. 
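As an aside on why every one of these bitconvert Pats can be a plain register reuse (my own illustration, not part of the patch; it assumes an SSE2-capable compiler): a same-width vector bitcast changes no bits, so at the machine level the value simply stays in its XMM/YMM/ZMM register. The 512-bit set below follows the same scheme.

    #include <immintrin.h>

    // Reinterpreting v4i32 as v4f32: the bits are untouched, so the cast
    // intrinsic should vanish during selection and the register is reused.
    __m128 as_float_lanes(__m128i v) {
      return _mm_castsi128_ps(v);
    }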
+def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>; +def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>; +def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>; +def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>; +def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>; +def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>; +def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>; +def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>; +def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>; +def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>; +def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>; +def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>; +def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>; +def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>; +def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>; +def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>; +def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>; +def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>; +def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>; +def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>; +def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>; +def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>; +def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>; +def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>; +def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>; + + +//===----------------------------------------------------------------------===// +// Non-instruction patterns +//===----------------------------------------------------------------------===// + +// A vector extract of the first f32/f64 position is a subregister copy +def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>; +def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))), + (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>; + +// Implicitly promote a 32-bit scalar to a vector. +def : Pat<(v4f32 (scalar_to_vector FR32:$src)), + (COPY_TO_REGCLASS FR32:$src, VR128)>; +// Implicitly promote a 64-bit scalar to a vector. 
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)), + (COPY_TO_REGCLASS FR64:$src, VR128)>; + + +//===----------------------------------------------------------------------===// +// Subvector tricks +//===----------------------------------------------------------------------===// + +// Patterns for insert_subvector/extract_subvector to/from index=0 +multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT, + RegisterClass RC, ValueType VT, + SubRegIndex subIdx> { + def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))), + (subVT (EXTRACT_SUBREG RC:$src, subIdx))>; + + let AddedComplexity = 25 in // to give priority over vinsertf128rm + def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))), + (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>; +} + +// A 128-bit subvector extract from the first 256-bit vector position is a +// subregister copy that needs no instruction. Likewise, a 128-bit subvector +// insert to the first 256-bit vector position is a subregister copy that needs +// no instruction. +defm : subvector_subreg_lowering<VR128, v4i32, VR256, v8i32, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v4f32, VR256, v8f32, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v2i64, VR256, v4i64, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v2f64, VR256, v4f64, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v8i16, VR256, v16i16, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v16i8, VR256, v32i8, sub_xmm>; + +// A 128-bit subvector extract from the first 512-bit vector position is a +// subregister copy that needs no instruction. Likewise, a 128-bit subvector +// insert to the first 512-bit vector position is a subregister copy that needs +// no instruction. +defm : subvector_subreg_lowering<VR128, v4i32, VR512, v16i32, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v4f32, VR512, v16f32, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v2i64, VR512, v8i64, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v2f64, VR512, v8f64, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v8i16, VR512, v32i16, sub_xmm>; +defm : subvector_subreg_lowering<VR128, v16i8, VR512, v64i8, sub_xmm>; + +// A 256-bit subvector extract from the first 512-bit vector position is a +// subregister copy that needs no instruction. Likewise, a 256-bit subvector +// insert to the first 512-bit vector position is a subregister copy that needs +// no instruction. 
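Before the 256-in-512 instantiations that follow, a rough intrinsics-level picture of what subvector_subreg_lowering buys (a sketch of mine assuming an AVX target, not text from the patch): reading the low 128 bits of a 256-bit value, or placing a 128-bit value into the low lanes of an otherwise undef 256-bit value, is just a different view of the same register.

    #include <immintrin.h>

    // Low-half extract: xmm0 is architecturally the low 128 bits of ymm0, so
    // this should select to EXTRACT_SUBREG, i.e. no instruction at all.
    __m128i low128(__m256i v) { return _mm256_castsi256_si128(v); }

    // Insert into an undef 256-bit value at index 0: likewise a subregister
    // write (INSERT_SUBREG over IMPLICIT_DEF); the upper lanes stay undefined.
    __m256i widen_undef(__m128i v) { return _mm256_castsi128_si256(v); }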
+defm : subvector_subreg_lowering<VR256, v8i32, VR512, v16i32, sub_ymm>; +defm : subvector_subreg_lowering<VR256, v8f32, VR512, v16f32, sub_ymm>; +defm : subvector_subreg_lowering<VR256, v4i64, VR512, v8i64, sub_ymm>; +defm : subvector_subreg_lowering<VR256, v4f64, VR512, v8f64, sub_ymm>; +defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>; +defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>; + + +multiclass subvector_store_lowering<string AlignedStr, string UnalignedStr, + RegisterClass RC, ValueType DstTy, + ValueType SrcTy, SubRegIndex SubIdx> { + def : Pat<(alignedstore (DstTy (extract_subvector + (SrcTy RC:$src), (iPTR 0))), addr:$dst), + (!cast<Instruction>("VMOV"#AlignedStr#"mr") addr:$dst, + (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>; + + def : Pat<(store (DstTy (extract_subvector + (SrcTy RC:$src), (iPTR 0))), addr:$dst), + (!cast<Instruction>("VMOV"#UnalignedStr#"mr") addr:$dst, + (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64, sub_xmm>; + defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>; + defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64, sub_xmm>; + defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32, sub_xmm>; + defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>; + defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8, sub_xmm>; +} + +let Predicates = [HasVLX] in { + // Special patterns for storing subvector extracts of lower 128-bits + // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64, + sub_xmm>; + defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32, + sub_xmm>; + defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v2i64, + v4i64, sub_xmm>; + defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v4i32, + v8i32, sub_xmm>; + defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v8i16, + v16i16, sub_xmm>; + defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v16i8, + v32i8, sub_xmm>; + + // Special patterns for storing subvector extracts of lower 128-bits of 512. + // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64, + sub_xmm>; + defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32, + sub_xmm>; + defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v2i64, + v8i64, sub_xmm>; + defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v4i32, + v16i32, sub_xmm>; + defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v8i16, + v32i16, sub_xmm>; + defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v16i8, + v64i8, sub_xmm>; + + // Special patterns for storing subvector extracts of lower 256-bits of 512. 
+ // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr + defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64, + sub_ymm>; + defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32, + sub_ymm>; + defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v4i64, + v8i64, sub_ymm>; + defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v8i32, + v16i32, sub_ymm>; + defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v16i16, + v32i16, sub_ymm>; + defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v32i8, + v64i8, sub_ymm>; +} + +// If we're inserting into an all zeros vector, just use a plain move which +// will zero the upper bits. +// TODO: Is there a safe way to detect whether the producing instruction +// already zeroed the upper bits? +multiclass subvector_zero_lowering<string MoveStr, RegisterClass RC, + ValueType DstTy, ValueType SrcTy, + ValueType ZeroTy, PatFrag memop, + SubRegIndex SubIdx> { + def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)), + (SrcTy RC:$src), (iPTR 0))), + (SUBREG_TO_REG (i64 0), + (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src), SubIdx)>; + + def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)), + (SrcTy (bitconvert (memop addr:$src))), + (iPTR 0))), + (SUBREG_TO_REG (i64 0), + (!cast<Instruction>("VMOV"#MoveStr#"rm") addr:$src), SubIdx)>; +} + +let Predicates = [HasAVX, NoVLX] in { + defm : subvector_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, loadv2f64, + sub_xmm>; + defm : subvector_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, loadv4f32, + sub_xmm>; + defm : subvector_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, loadv2i64, + sub_xmm>; + defm : subvector_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, loadv2i64, + sub_xmm>; + defm : subvector_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, loadv2i64, + sub_xmm>; + defm : subvector_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, loadv2i64, + sub_xmm>; +} + +let Predicates = [HasVLX] in { + defm : subvector_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32, + loadv2f64, sub_xmm>; + defm : subvector_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32, + loadv4f32, sub_xmm>; + defm : subvector_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32, + loadv2i64, sub_xmm>; + defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32, + loadv2i64, sub_xmm>; + defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32, + loadv2i64, sub_xmm>; + defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32, + loadv2i64, sub_xmm>; + + defm : subvector_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32, + loadv2f64, sub_xmm>; + defm : subvector_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32, + loadv4f32, sub_xmm>; + defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32, + loadv2i64, sub_xmm>; + defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32, + loadv2i64, sub_xmm>; + defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32, + loadv2i64, sub_xmm>; + defm : subvector_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32, + loadv2i64, sub_xmm>; + + defm : subvector_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32, + loadv4f64, sub_ymm>; + defm : subvector_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32, + loadv8f32, sub_ymm>; + defm : subvector_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32, + loadv4i64, sub_ymm>; + defm : 
subvector_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32, + loadv4i64, sub_ymm>; + defm : subvector_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32, + loadv4i64, sub_ymm>; + defm : subvector_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32, + loadv4i64, sub_ymm>; +} + +let Predicates = [HasAVX512, NoVLX] in { + defm : subvector_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, loadv2f64, + sub_xmm>; + defm : subvector_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, loadv4f32, + sub_xmm>; + defm : subvector_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, loadv2i64, + sub_xmm>; + defm : subvector_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, loadv2i64, + sub_xmm>; + defm : subvector_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, loadv2i64, + sub_xmm>; + defm : subvector_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, loadv2i64, + sub_xmm>; + + defm : subvector_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32, + loadv4f64, sub_ymm>; + defm : subvector_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32, + loadv8f32, sub_ymm>; + defm : subvector_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32, + loadv4i64, sub_ymm>; + defm : subvector_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32, + loadv4i64, sub_ymm>; + defm : subvector_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32, + loadv4i64, sub_ymm>; + defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32, + loadv4i64, sub_ymm>; +} + +// List of opcodes that guaranteed to zero the upper elements of vector regs. +// TODO: Ideally this would be a blacklist instead of a whitelist. But SHA +// intrinsics and some MMX->XMM move instructions that aren't VEX encoded make +// this difficult. So starting with a couple opcodes used by reduction loops +// where we explicitly insert zeros. +class veczeroupper<ValueType vt, RegisterClass RC> : + PatLeaf<(vt RC:$src), [{ + return N->getOpcode() == X86ISD::VPMADDWD || + N->getOpcode() == X86ISD::PSADBW; + }]>; + +def zeroupperv2f64 : veczeroupper<v2f64, VR128>; +def zeroupperv4f32 : veczeroupper<v4f32, VR128>; +def zeroupperv2i64 : veczeroupper<v2i64, VR128>; +def zeroupperv4i32 : veczeroupper<v4i32, VR128>; +def zeroupperv8i16 : veczeroupper<v8i16, VR128>; +def zeroupperv16i8 : veczeroupper<v16i8, VR128>; + +def zeroupperv4f64 : veczeroupper<v4f64, VR256>; +def zeroupperv8f32 : veczeroupper<v8f32, VR256>; +def zeroupperv4i64 : veczeroupper<v4i64, VR256>; +def zeroupperv8i32 : veczeroupper<v8i32, VR256>; +def zeroupperv16i16 : veczeroupper<v16i16, VR256>; +def zeroupperv32i8 : veczeroupper<v32i8, VR256>; + + +// If we can guarantee the upper elements have already been zeroed we can elide +// an explicit zeroing. 
+multiclass subvector_zero_ellision<RegisterClass RC, ValueType DstTy, + ValueType SrcTy, ValueType ZeroTy, + SubRegIndex SubIdx, PatLeaf Zeroupper> { + def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)), + Zeroupper:$src, (iPTR 0))), + (SUBREG_TO_REG (i64 0), RC:$src, SubIdx)>; +} + +// 128->256 +defm: subvector_zero_ellision<VR128, v4f64, v2f64, v8i32, sub_xmm, zeroupperv2f64>; +defm: subvector_zero_ellision<VR128, v8f32, v4f32, v8i32, sub_xmm, zeroupperv4f32>; +defm: subvector_zero_ellision<VR128, v4i64, v2i64, v8i32, sub_xmm, zeroupperv2i64>; +defm: subvector_zero_ellision<VR128, v8i32, v4i32, v8i32, sub_xmm, zeroupperv4i32>; +defm: subvector_zero_ellision<VR128, v16i16, v8i16, v8i32, sub_xmm, zeroupperv8i16>; +defm: subvector_zero_ellision<VR128, v32i8, v16i8, v8i32, sub_xmm, zeroupperv16i8>; + +// 128->512 +defm: subvector_zero_ellision<VR128, v8f64, v2f64, v16i32, sub_xmm, zeroupperv2f64>; +defm: subvector_zero_ellision<VR128, v16f32, v4f32, v16i32, sub_xmm, zeroupperv4f32>; +defm: subvector_zero_ellision<VR128, v8i64, v2i64, v16i32, sub_xmm, zeroupperv2i64>; +defm: subvector_zero_ellision<VR128, v16i32, v4i32, v16i32, sub_xmm, zeroupperv4i32>; +defm: subvector_zero_ellision<VR128, v32i16, v8i16, v16i32, sub_xmm, zeroupperv8i16>; +defm: subvector_zero_ellision<VR128, v64i8, v16i8, v16i32, sub_xmm, zeroupperv16i8>; + +// 256->512 +defm: subvector_zero_ellision<VR256, v8f64, v4f64, v16i32, sub_ymm, zeroupperv4f64>; +defm: subvector_zero_ellision<VR256, v16f32, v8f32, v16i32, sub_ymm, zeroupperv8f32>; +defm: subvector_zero_ellision<VR256, v8i64, v4i64, v16i32, sub_ymm, zeroupperv4i64>; +defm: subvector_zero_ellision<VR256, v16i32, v8i32, v16i32, sub_ymm, zeroupperv8i32>; +defm: subvector_zero_ellision<VR256, v32i16, v16i16, v16i32, sub_ymm, zeroupperv16i16>; +defm: subvector_zero_ellision<VR256, v64i8, v32i8, v16i32, sub_ymm, zeroupperv32i8>; + + +class maskzeroupper<ValueType vt, RegisterClass RC> : + PatLeaf<(vt RC:$src), [{ + return isMaskZeroExtended(N); + }]>; + +def maskzeroupperv2i1 : maskzeroupper<v2i1, VK2>; +def maskzeroupperv4i1 : maskzeroupper<v4i1, VK4>; +def maskzeroupperv8i1 : maskzeroupper<v8i1, VK8>; +def maskzeroupperv16i1 : maskzeroupper<v16i1, VK16>; +def maskzeroupperv32i1 : maskzeroupper<v32i1, VK32>; + +// The patterns determine if we can depend on the upper bits of a mask register +// being zeroed by the previous operation so that we can skip explicit +// zeroing. 
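A concrete case the maskzeroupper PatLeafs are meant to catch (my own sketch assuming an AVX-512F target; not from the patch): a compare over 8 lanes can only set the low 8 bits of the mask, so widening that result to a 16-bit mask should become a plain copy instead of a kshift pair.

    #include <immintrin.h>

    // The upper bits of the __mmask16 are already zero because the compare
    // defines only 8 lanes; isMaskZeroExtended() is what lets the selector
    // rely on that and drop the explicit zero-extension.
    __mmask16 widen_cmp_mask(__m512d a, __m512d b) {
      __mmask8 m = _mm512_cmp_pd_mask(a, b, _CMP_LT_OQ);
      return (__mmask16)m;
    }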
+let Predicates = [HasBWI] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + maskzeroupperv8i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK8:$src, VK32)>; + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + maskzeroupperv16i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK16:$src, VK32)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv8i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK8:$src, VK64)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv16i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK16:$src, VK64)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv32i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK32:$src, VK64)>; +} + +let Predicates = [HasAVX512] in { + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + maskzeroupperv8i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK8:$src, VK16)>; +} + +let Predicates = [HasVLX, HasDQI] in { + def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV), + maskzeroupperv2i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK2:$src, VK8)>; + def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV), + maskzeroupperv4i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK4:$src, VK8)>; +} + +let Predicates = [HasVLX] in { + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + maskzeroupperv2i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK2:$src, VK16)>; + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + maskzeroupperv4i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK4:$src, VK16)>; +} + +let Predicates = [HasBWI, HasVLX] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + maskzeroupperv2i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK2:$src, VK32)>; + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + maskzeroupperv4i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK4:$src, VK32)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv2i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK2:$src, VK64)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + maskzeroupperv4i1:$src, (iPTR 0))), + (COPY_TO_REGCLASS VK4:$src, VK64)>; +} + +// If the bits are not zero we have to fall back to explicitly zeroing by +// using shifts. 
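The kshift pairs below are the usual "shift the payload to the top and back down" way of clearing high bits when nothing is known about them. The same trick on an ordinary integer, as a hedged scalar stand-in for what KSHIFTLW/KSHIFTRW do to a k-register:

    #include <cstdint>

    // Keep only the low 8 mask bits of a 16-bit value: shifting left by 8
    // inside 16 bits discards the old high byte, and shifting back right by 8
    // leaves bits 15:8 cleared - the scalar analogue of a KSHIFTLWri/KSHIFTRWri
    // pair by 8.
    static inline uint16_t zero_upper_mask8(uint16_t m) {
      return (uint16_t)((uint16_t)(m << 8) >> 8);
    }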
+let Predicates = [HasAVX512, NoDQI] in { + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK8:$mask, VK16), + (i8 8)), (i8 8))>; +} + +let Predicates = [HasDQI] in { + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK16)>; +} + +let Predicates = [HasVLX, HasDQI] in { + def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV), + (v2i1 VK2:$mask), (iPTR 0))), + (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK2:$mask, VK8), + (i8 6)), (i8 6))>; + def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV), + (v4i1 VK4:$mask), (iPTR 0))), + (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK4:$mask, VK8), + (i8 4)), (i8 4))>; +} + +let Predicates = [HasVLX] in { + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + (v2i1 VK2:$mask), (iPTR 0))), + (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK2:$mask, VK16), + (i8 14)), (i8 14))>; + def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV), + (v4i1 VK4:$mask), (iPTR 0))), + (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK4:$mask, VK16), + (i8 12)), (i8 12))>; +} + +let Predicates = [HasBWI] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + (v16i1 VK16:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK32)>; + + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v16i1 VK16:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK64)>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v32i1 VK32:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVDkk VK32:$mask), VK64)>; +} + +let Predicates = [HasBWI, NoDQI] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK8:$mask, VK32), + (i8 24)), (i8 24))>; + + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK8:$mask, VK64), + (i8 56)), (i8 56))>; +} + +let Predicates = [HasBWI, HasDQI] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK32)>; + + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v8i1 VK8:$mask), (iPTR 0))), + (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK64)>; +} + +let Predicates = [HasBWI, HasVLX] in { + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + (v2i1 VK2:$mask), (iPTR 0))), + (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK2:$mask, VK32), + (i8 30)), (i8 30))>; + def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV), + (v4i1 VK4:$mask), (iPTR 0))), + (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK4:$mask, VK32), + (i8 28)), (i8 28))>; + + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v2i1 VK2:$mask), (iPTR 0))), + (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK2:$mask, VK64), + (i8 62)), (i8 62))>; + def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV), + (v4i1 VK4:$mask), (iPTR 0))), + (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK4:$mask, VK64), + (i8 60)), (i8 60))>; +} diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td index 5dde2d07babe..c4b8e3e90d29 100644 --- a/lib/Target/X86/X86InstrXOP.td +++ b/lib/Target/X86/X86InstrXOP.td @@ -14,10 +14,11 @@ multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int 
VR128:$src))]>, XOP; + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WritePHAdd]>; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, + Sched<[WritePHAddLd, ReadAfterLd]>; } let ExeDomain = SSEPackedInt in { @@ -43,30 +44,33 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int, Operand memop, ComplexPattern mem_cpat> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int VR128:$src))]>, XOP; + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WriteFAdd]>; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP; + [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP, + Sched<[WriteFAddLd, ReadAfterLd]>; } multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int VR128:$src))]>, XOP; + [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WriteFAdd]>; def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP; + [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, + Sched<[WriteFAddLd, ReadAfterLd]>; } multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> { def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L; + [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[WriteFAdd]>; def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src), !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), - [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L; + [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L, + Sched<[WriteFAddLd, ReadAfterLd]>; } let ExeDomain = SSEPackedSingle in { @@ -97,14 +101,14 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))))))]>, - XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>; + XOP_4V, VEX_W, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst), (ins i128mem:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), (vt128 VR128:$src2))))]>, - XOP, Sched<[WriteVarVecShift, ReadAfterLd]>; + XOP, Sched<[WriteVarVecShiftLd, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : IXOP<opc, MRMSrcReg, (outs VR128:$dst), @@ -115,10 +119,10 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode, } let ExeDomain = SSEPackedInt in { - defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>; - defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>; - defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>; - defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>; + defm VPROTB : xop3op<0x90, "vprotb", rotl, v16i8>; + defm 
VPROTD : xop3op<0x92, "vprotd", rotl, v4i32>; + defm VPROTQ : xop3op<0x93, "vprotq", rotl, v2i64>; + defm VPROTW : xop3op<0x91, "vprotw", rotl, v8i16>; defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>; defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>; defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>; @@ -135,19 +139,21 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode, (ins VR128:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP; + (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, + XOP, Sched<[WriteVecShift]>; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP; + (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, + XOP, Sched<[WriteVecShiftLd, ReadAfterLd]>; } let ExeDomain = SSEPackedInt in { - defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>; - defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>; - defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>; - defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>; + defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8>; + defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32>; + defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64>; + defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16>; } // Instruction where second source can be memory, but third must be register @@ -158,14 +164,15 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> { !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, - (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V; + (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V, + Sched<[WriteVecIMul]>; def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set VR128:$dst, (Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)), - VR128:$src3))]>, XOP_4V; + VR128:$src3))]>, XOP_4V, Sched<[WriteVecIMulLd, ReadAfterLd]>; } let ExeDomain = SSEPackedInt in { @@ -213,8 +220,8 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), - i8immZExt3:$cc)))]>, - XOP_4V; + imm:$cc)))]>, + XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>; def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, XOPCC:$cc), !strconcat("vpcom${cc}", Suffix, @@ -222,20 +229,20 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128> [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))), - i8immZExt3:$cc)))]>, - XOP_4V; + imm:$cc)))]>, + XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>; let isAsmParserOnly = 1, hasSideEffects = 0 in { def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, u8imm:$src3), !strconcat("vpcom", Suffix, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V; + []>, XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>; let mayLoad = 1 in def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, u8imm:$src3), !strconcat("vpcom", 
Suffix, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V; + []>, XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>; } } @@ -259,7 +266,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), (vt128 VR128:$src3))))]>, - XOP_4V; + XOP_4V, Sched<[WriteShuffle]>; def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i128mem:$src3), !strconcat(OpcodeStr, @@ -267,7 +274,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR128:$dst, (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2), (vt128 (bitconvert (loadv2i64 addr:$src3))))))]>, - XOP_4V, VEX_W; + XOP_4V, VEX_W, Sched<[WriteShuffleLd, ReadAfterLd]>; def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2, VR128:$src3), !strconcat(OpcodeStr, @@ -275,14 +282,14 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set VR128:$dst, (v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))), (vt128 VR128:$src3))))]>, - XOP_4V; + XOP_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, VR128:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_W, FoldGenData<NAME#rrr>; + []>, XOP_4V, VEX_W, Sched<[WriteShuffle]>, FoldGenData<NAME#rrr>; } let ExeDomain = SSEPackedInt in { @@ -297,28 +304,29 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), - (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V; + (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V, + Sched<[WriteShuffle]>; def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst), (ins RC:$src1, RC:$src2, x86memop:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1), (X86andnp (load addr:$src3), RC:$src2))))]>, - XOP_4V, VEX_W; + XOP_4V, VEX_W, Sched<[WriteShuffleLd, ReadAfterLd]>; def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1), (X86andnp RC:$src3, (load addr:$src2)))))]>, - XOP_4V; + XOP_4V, Sched<[WriteShuffleLd, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3), !strconcat(OpcodeStr, "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, XOP_4V, VEX_W, FoldGenData<NAME#rrr>; + []>, XOP_4V, VEX_W, Sched<[WriteShuffle]>, FoldGenData<NAME#rrr>; } let ExeDomain = SSEPackedInt in { @@ -335,7 +343,8 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set RC:$dst, - (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>; + (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>, + Sched<[WriteFShuffle]>; def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst), (ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4), !strconcat(OpcodeStr, @@ -343,21 +352,23 @@ 
multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (VT (X86vpermil2 RC:$src1, RC:$src2, (bitconvert (IntLdFrag addr:$src3)), - (i8 imm:$src4))))]>, VEX_W; + (i8 imm:$src4))))]>, VEX_W, + Sched<[WriteFShuffleLd, ReadAfterLd]>; def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), [(set RC:$dst, (VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2), - RC:$src3, (i8 imm:$src4))))]>; + RC:$src3, (i8 imm:$src4))))]>, + Sched<[WriteFShuffleLd, ReadAfterLd]>; // For disassembler let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst), (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4), !strconcat(OpcodeStr, "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"), - []>, VEX_W, FoldGenData<NAME#rr>; + []>, VEX_W, Sched<[WriteFShuffle]>, FoldGenData<NAME#rr>; } let ExeDomain = SSEPackedDouble in { diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp index 859d3288db89..44bbc3f1b3fa 100644 --- a/lib/Target/X86/X86InstructionSelector.cpp +++ b/lib/Target/X86/X86InstructionSelector.cpp @@ -1,4 +1,4 @@ -//===- X86InstructionSelector.cpp ----------------------------*- C++ -*-==// +//===- X86InstructionSelector.cpp -----------------------------------------===// // // The LLVM Compiler Infrastructure // @@ -12,6 +12,7 @@ /// \todo This should be generated by TableGen. //===----------------------------------------------------------------------===// +#include "MCTargetDesc/X86BaseInfo.h" #include "X86InstrBuilder.h" #include "X86InstrInfo.h" #include "X86RegisterBankInfo.h" @@ -19,27 +20,36 @@ #include "X86Subtarget.h" #include "X86TargetMachine.h" #include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" +#include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/Utils.h" #include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/IR/Type.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/LowLevelTypeImpl.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" +#include <cassert> +#include <cstdint> +#include <tuple> #define DEBUG_TYPE "X86-isel" -#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h" - using namespace llvm; -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif - namespace { #define GET_GLOBALISEL_PREDICATE_BITSET @@ -51,15 +61,16 @@ public: X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI, const X86RegisterBankInfo &RBI); - bool select(MachineInstr &I) const override; + bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override; + static const char *getName() { return DEBUG_TYPE; } private: /// tblgen-erated 'select' implementation, used as the 
initial selector for /// the patterns that don't require complex C++. - bool selectImpl(MachineInstr &I) const; + bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const; // TODO: remove after supported by Tablegen-erated instruction selection. - unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc, + unsigned getLoadStoreOp(const LLT &Ty, const RegisterBank &RB, unsigned Opc, uint64_t Alignment) const; bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI, @@ -74,19 +85,28 @@ private: MachineFunction &MF) const; bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectAnyext(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const; bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF) const; + MachineFunction &MF, + CodeGenCoverage &CoverageInfo) const; bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI, - MachineFunction &MF) const; + MachineFunction &MF, + CodeGenCoverage &CoverageInfo) const; bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const; + bool selectCondBranch(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + bool materializeFP(MachineInstr &I, MachineRegisterInfo &MRI, + MachineFunction &MF) const; + bool selectImplicitDefOrPHI(MachineInstr &I, MachineRegisterInfo &MRI) const; // emit insert subreg instruction and insert it before MachineInstr &I bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I, @@ -171,21 +191,71 @@ X86InstructionSelector::getRegClass(LLT Ty, unsigned Reg, return getRegClass(Ty, RegBank); } +static unsigned getSubRegIndex(const TargetRegisterClass *RC) { + unsigned SubIdx = X86::NoSubRegister; + if (RC == &X86::GR32RegClass) { + SubIdx = X86::sub_32bit; + } else if (RC == &X86::GR16RegClass) { + SubIdx = X86::sub_16bit; + } else if (RC == &X86::GR8RegClass) { + SubIdx = X86::sub_8bit; + } + + return SubIdx; +} + +static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) { + assert(TargetRegisterInfo::isPhysicalRegister(Reg)); + if (X86::GR64RegClass.contains(Reg)) + return &X86::GR64RegClass; + if (X86::GR32RegClass.contains(Reg)) + return &X86::GR32RegClass; + if (X86::GR16RegClass.contains(Reg)) + return &X86::GR16RegClass; + if (X86::GR8RegClass.contains(Reg)) + return &X86::GR8RegClass; + + llvm_unreachable("Unknown RegClass for PhysReg!"); +} + // Set X86 Opcode and constrain DestReg. 
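Rough shape of the widening-copy handling that selectCopy gains below, written as a before/after sketch (register and vreg names are invented for illustration; this is my reading of the new code, not text from the patch):

    // Before selection: ABI lowering produced a copy from a narrow virtual
    // register into a wider physical register, which has no direct encoding:
    //   $eax = COPY %val(s8)
    // After the new handling: the source is first widened with SUBREG_TO_REG,
    // using the sub-register index that matches the source class (sub_8bit
    // here), and only then copied, so the widths agree:
    //   %ext:gr32 = SUBREG_TO_REG 0, %val, sub_8bit
    //   $eax = COPY %ext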
bool X86InstructionSelector::selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const { - unsigned DstReg = I.getOperand(0).getReg(); + const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI); + const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); + + unsigned SrcReg = I.getOperand(1).getReg(); + const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); + const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI); + if (TargetRegisterInfo::isPhysicalRegister(DstReg)) { assert(I.isCopy() && "Generic operators do not allow physical registers"); + + if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID && + DstRegBank.getID() == X86::GPRRegBankID) { + + const TargetRegisterClass *SrcRC = + getRegClass(MRI.getType(SrcReg), SrcRegBank); + const TargetRegisterClass *DstRC = getRegClassFromGRPhysReg(DstReg); + + if (SrcRC != DstRC) { + // This case can be generated by ABI lowering, performe anyext + unsigned ExtSrc = MRI.createVirtualRegister(DstRC); + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::SUBREG_TO_REG)) + .addDef(ExtSrc) + .addImm(0) + .addReg(SrcReg) + .addImm(getSubRegIndex(SrcRC)); + + I.getOperand(1).setReg(ExtSrc); + } + } + return true; } - const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); - const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - unsigned SrcReg = I.getOperand(1).getReg(); - const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); - assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) && "No phys reg on generic operators"); assert((DstSize == SrcSize || @@ -195,38 +265,28 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) && "Copy with different width?!"); - const TargetRegisterClass *RC = nullptr; + const TargetRegisterClass *DstRC = + getRegClass(MRI.getType(DstReg), DstRegBank); - switch (RegBank.getID()) { - case X86::GPRRegBankID: - assert((DstSize <= 64) && "GPRs cannot get more than 64-bit width values."); - RC = getRegClass(MRI.getType(DstReg), RegBank); + if (SrcRegBank.getID() == X86::GPRRegBankID && + DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize && + TargetRegisterInfo::isPhysicalRegister(SrcReg)) { + // Change the physical register to performe truncate. - // Change the physical register - if (SrcSize > DstSize && TargetRegisterInfo::isPhysicalRegister(SrcReg)) { - if (RC == &X86::GR32RegClass) - I.getOperand(1).setSubReg(X86::sub_32bit); - else if (RC == &X86::GR16RegClass) - I.getOperand(1).setSubReg(X86::sub_16bit); - else if (RC == &X86::GR8RegClass) - I.getOperand(1).setSubReg(X86::sub_8bit); + const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg); + if (DstRC != SrcRC) { + I.getOperand(1).setSubReg(getSubRegIndex(DstRC)); I.getOperand(1).substPhysReg(SrcReg, TRI); } - break; - case X86::VECRRegBankID: - RC = getRegClass(MRI.getType(DstReg), RegBank); - break; - default: - llvm_unreachable("Unknown RegBank!"); } // No need to constrain SrcReg. It will get constrained when // we hit another of its use or its defs. // Copies do not have constraints. 
const TargetRegisterClass *OldRC = MRI.getRegClassOrNull(DstReg); - if (!OldRC || !RC->hasSubClassEq(OldRC)) { - if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) { + if (!OldRC || !DstRC->hasSubClassEq(OldRC)) { + if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) << " operand\n"); return false; @@ -236,7 +296,8 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I, return true; } -bool X86InstructionSelector::select(MachineInstr &I) const { +bool X86InstructionSelector::select(MachineInstr &I, + CodeGenCoverage &CoverageInfo) const { assert(I.getParent() && "Instruction should be in a basic block!"); assert(I.getParent()->getParent() && "Instruction should be in a function!"); @@ -248,51 +309,69 @@ bool X86InstructionSelector::select(MachineInstr &I) const { if (!isPreISelGenericOpcode(Opcode)) { // Certain non-generic instructions also need some special handling. + if (Opcode == TargetOpcode::LOAD_STACK_GUARD) + return false; + if (I.isCopy()) return selectCopy(I, MRI); - // TODO: handle more cases - LOAD_STACK_GUARD, PHI return true; } assert(I.getNumOperands() == I.getNumExplicitOperands() && "Generic instruction has unexpected implicit operands\n"); - if (selectImpl(I)) + if (selectImpl(I, CoverageInfo)) return true; DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs())); // TODO: This should be implemented by tblgen. - if (selectLoadStoreOp(I, MRI, MF)) - return true; - if (selectFrameIndexOrGep(I, MRI, MF)) - return true; - if (selectGlobalValue(I, MRI, MF)) - return true; - if (selectConstant(I, MRI, MF)) - return true; - if (selectTrunc(I, MRI, MF)) - return true; - if (selectZext(I, MRI, MF)) - return true; - if (selectCmp(I, MRI, MF)) - return true; - if (selectUadde(I, MRI, MF)) - return true; - if (selectUnmergeValues(I, MRI, MF)) - return true; - if (selectMergeValues(I, MRI, MF)) - return true; - if (selectExtract(I, MRI, MF)) - return true; - if (selectInsert(I, MRI, MF)) - return true; + switch (I.getOpcode()) { + default: + return false; + case TargetOpcode::G_STORE: + case TargetOpcode::G_LOAD: + return selectLoadStoreOp(I, MRI, MF); + case TargetOpcode::G_GEP: + case TargetOpcode::G_FRAME_INDEX: + return selectFrameIndexOrGep(I, MRI, MF); + case TargetOpcode::G_GLOBAL_VALUE: + return selectGlobalValue(I, MRI, MF); + case TargetOpcode::G_CONSTANT: + return selectConstant(I, MRI, MF); + case TargetOpcode::G_FCONSTANT: + return materializeFP(I, MRI, MF); + case TargetOpcode::G_TRUNC: + return selectTrunc(I, MRI, MF); + case TargetOpcode::G_ZEXT: + return selectZext(I, MRI, MF); + case TargetOpcode::G_ANYEXT: + return selectAnyext(I, MRI, MF); + case TargetOpcode::G_ICMP: + return selectCmp(I, MRI, MF); + case TargetOpcode::G_UADDE: + return selectUadde(I, MRI, MF); + case TargetOpcode::G_UNMERGE_VALUES: + return selectUnmergeValues(I, MRI, MF, CoverageInfo); + case TargetOpcode::G_MERGE_VALUES: + return selectMergeValues(I, MRI, MF, CoverageInfo); + case TargetOpcode::G_EXTRACT: + return selectExtract(I, MRI, MF); + case TargetOpcode::G_INSERT: + return selectInsert(I, MRI, MF); + case TargetOpcode::G_BRCOND: + return selectCondBranch(I, MRI, MF); + case TargetOpcode::G_IMPLICIT_DEF: + case TargetOpcode::G_PHI: + return selectImplicitDefOrPHI(I, MRI); + } return false; } -unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, +unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty, + const RegisterBank &RB, unsigned Opc, uint64_t Alignment) const 
{ bool Isload = (Opc == TargetOpcode::G_LOAD); @@ -366,9 +445,9 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, } // Fill in an address from the given instruction. -void X86SelectAddress(const MachineInstr &I, const MachineRegisterInfo &MRI, - X86AddressMode &AM) { - +static void X86SelectAddress(const MachineInstr &I, + const MachineRegisterInfo &MRI, + X86AddressMode &AM) { assert(I.getOperand(0).isReg() && "unsupported opperand."); assert(MRI.getType(I.getOperand(0).getReg()).isPointer() && "unsupported type."); @@ -390,17 +469,15 @@ void X86SelectAddress(const MachineInstr &I, const MachineRegisterInfo &MRI, // Default behavior. AM.Base.Reg = I.getOperand(0).getReg(); - return; } bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - unsigned Opc = I.getOpcode(); - if (Opc != TargetOpcode::G_STORE && Opc != TargetOpcode::G_LOAD) - return false; + assert((Opc == TargetOpcode::G_STORE || Opc == TargetOpcode::G_LOAD) && + "unexpected instruction"); const unsigned DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); @@ -447,8 +524,8 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I, MachineFunction &MF) const { unsigned Opc = I.getOpcode(); - if (Opc != TargetOpcode::G_FRAME_INDEX && Opc != TargetOpcode::G_GEP) - return false; + assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_GEP) && + "unexpected instruction"); const unsigned DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); @@ -473,10 +550,8 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I, bool X86InstructionSelector::selectGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - unsigned Opc = I.getOpcode(); - - if (Opc != TargetOpcode::G_GLOBAL_VALUE) - return false; + assert((I.getOpcode() == TargetOpcode::G_GLOBAL_VALUE) && + "unexpected instruction"); auto GV = I.getOperand(1).getGlobal(); if (GV->isThreadLocal()) { @@ -485,7 +560,7 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I, // Can't handle alternate code models yet. 
if (TM.getCodeModel() != CodeModel::Small) - return 0; + return false; X86AddressMode AM; AM.GV = GV; @@ -521,8 +596,8 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I, bool X86InstructionSelector::selectConstant(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - if (I.getOpcode() != TargetOpcode::G_CONSTANT) - return false; + assert((I.getOpcode() == TargetOpcode::G_CONSTANT) && + "unexpected instruction"); const unsigned DefReg = I.getOperand(0).getReg(); LLT Ty = MRI.getType(DefReg); @@ -550,14 +625,13 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I, case 32: NewOpc = X86::MOV32ri; break; - case 64: { + case 64: // TODO: in case isUInt<32>(Val), X86::MOV32ri can be used if (isInt<32>(Val)) NewOpc = X86::MOV64ri32; else NewOpc = X86::MOV64ri; break; - } default: llvm_unreachable("Can't select G_CONSTANT, unsupported type."); } @@ -569,8 +643,7 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I, bool X86InstructionSelector::selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - if (I.getOpcode() != TargetOpcode::G_TRUNC) - return false; + assert((I.getOpcode() == TargetOpcode::G_TRUNC) && "unexpected instruction"); const unsigned DstReg = I.getOperand(0).getReg(); const unsigned SrcReg = I.getOperand(1).getReg(); @@ -628,8 +701,7 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I, bool X86InstructionSelector::selectZext(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - if (I.getOpcode() != TargetOpcode::G_ZEXT) - return false; + assert((I.getOpcode() == TargetOpcode::G_ZEXT) && "unexpected instruction"); const unsigned DstReg = I.getOperand(0).getReg(); const unsigned SrcReg = I.getOperand(1).getReg(); @@ -673,11 +745,59 @@ bool X86InstructionSelector::selectZext(MachineInstr &I, return true; } +bool X86InstructionSelector::selectAnyext(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + assert((I.getOpcode() == TargetOpcode::G_ANYEXT) && "unexpected instruction"); + + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + + const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI); + const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI); + + assert(DstRB.getID() == SrcRB.getID() && + "G_ANYEXT input/output on different banks\n"); + + assert(DstTy.getSizeInBits() > SrcTy.getSizeInBits() && + "G_ANYEXT incorrect operand size"); + + if (DstRB.getID() != X86::GPRRegBankID) + return false; + + const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB); + const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB); + + if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) || + !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) { + DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) + << " operand\n"); + return false; + } + + if (SrcRC == DstRC) { + I.setDesc(TII.get(X86::COPY)); + return true; + } + + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::SUBREG_TO_REG)) + .addDef(DstReg) + .addImm(0) + .addReg(SrcReg) + .addImm(getSubRegIndex(SrcRC)); + + I.eraseFromParent(); + return true; +} + bool X86InstructionSelector::selectCmp(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - if (I.getOpcode() != TargetOpcode::G_ICMP) - return false; + assert((I.getOpcode() == TargetOpcode::G_ICMP) && "unexpected instruction"); X86::CondCode CC; 
bool SwapArgs; @@ -729,8 +849,7 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I, bool X86InstructionSelector::selectUadde(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - if (I.getOpcode() != TargetOpcode::G_UADDE) - return false; + assert((I.getOpcode() == TargetOpcode::G_UADDE) && "unexpected instruction"); const unsigned DstReg = I.getOperand(0).getReg(); const unsigned CarryOutReg = I.getOperand(1).getReg(); @@ -789,9 +908,8 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I, bool X86InstructionSelector::selectExtract(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - - if (I.getOpcode() != TargetOpcode::G_EXTRACT) - return false; + assert((I.getOpcode() == TargetOpcode::G_EXTRACT) && + "unexpected instruction"); const unsigned DstReg = I.getOperand(0).getReg(); const unsigned SrcReg = I.getOperand(1).getReg(); @@ -848,7 +966,6 @@ bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); unsigned SubIdx = X86::NoSubRegister; @@ -887,7 +1004,6 @@ bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - const LLT DstTy = MRI.getType(DstReg); const LLT SrcTy = MRI.getType(SrcReg); unsigned SubIdx = X86::NoSubRegister; @@ -925,9 +1041,7 @@ bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg, bool X86InstructionSelector::selectInsert(MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF) const { - - if (I.getOpcode() != TargetOpcode::G_INSERT) - return false; + assert((I.getOpcode() == TargetOpcode::G_INSERT) && "unexpected instruction"); const unsigned DstReg = I.getOperand(0).getReg(); const unsigned SrcReg = I.getOperand(1).getReg(); @@ -982,11 +1096,11 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I, return constrainSelectedInstRegOperands(I, TII, TRI, RBI); } -bool X86InstructionSelector::selectUnmergeValues(MachineInstr &I, - MachineRegisterInfo &MRI, - MachineFunction &MF) const { - if (I.getOpcode() != TargetOpcode::G_UNMERGE_VALUES) - return false; +bool X86InstructionSelector::selectUnmergeValues( + MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF, + CodeGenCoverage &CoverageInfo) const { + assert((I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES) && + "unexpected instruction"); // Split to extracts. 
unsigned NumDefs = I.getNumOperands() - 1; @@ -994,14 +1108,13 @@ bool X86InstructionSelector::selectUnmergeValues(MachineInstr &I, unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits(); for (unsigned Idx = 0; Idx < NumDefs; ++Idx) { - MachineInstr &ExtrInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::G_EXTRACT), I.getOperand(Idx).getReg()) .addReg(SrcReg) .addImm(Idx * DefSize); - if (!select(ExtrInst)) + if (!select(ExtrInst, CoverageInfo)) return false; } @@ -1009,11 +1122,11 @@ bool X86InstructionSelector::selectUnmergeValues(MachineInstr &I, return true; } -bool X86InstructionSelector::selectMergeValues(MachineInstr &I, - MachineRegisterInfo &MRI, - MachineFunction &MF) const { - if (I.getOpcode() != TargetOpcode::G_MERGE_VALUES) - return false; +bool X86InstructionSelector::selectMergeValues( + MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF, + CodeGenCoverage &CoverageInfo) const { + assert((I.getOpcode() == TargetOpcode::G_MERGE_VALUES) && + "unexpected instruction"); // Split to inserts. unsigned DstReg = I.getOperand(0).getReg(); @@ -1032,7 +1145,6 @@ bool X86InstructionSelector::selectMergeValues(MachineInstr &I, return false; for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) { - unsigned Tmp = MRI.createGenericVirtualRegister(DstTy); MRI.setRegBank(Tmp, RegBank); @@ -1044,7 +1156,7 @@ bool X86InstructionSelector::selectMergeValues(MachineInstr &I, DefReg = Tmp; - if (!select(InsertInst)) + if (!select(InsertInst, CoverageInfo)) return false; } @@ -1052,12 +1164,127 @@ bool X86InstructionSelector::selectMergeValues(MachineInstr &I, TII.get(TargetOpcode::COPY), DstReg) .addReg(DefReg); - if (!select(CopyInst)) + if (!select(CopyInst, CoverageInfo)) return false; I.eraseFromParent(); return true; } + +bool X86InstructionSelector::selectCondBranch(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + assert((I.getOpcode() == TargetOpcode::G_BRCOND) && "unexpected instruction"); + + const unsigned CondReg = I.getOperand(0).getReg(); + MachineBasicBlock *DestMBB = I.getOperand(1).getMBB(); + + MachineInstr &TestInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::TEST8ri)) + .addReg(CondReg) + .addImm(1); + BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JNE_1)) + .addMBB(DestMBB); + + constrainSelectedInstRegOperands(TestInst, TII, TRI, RBI); + + I.eraseFromParent(); + return true; +} + +bool X86InstructionSelector::materializeFP(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + assert((I.getOpcode() == TargetOpcode::G_FCONSTANT) && + "unexpected instruction"); + + // Can't handle alternate code models yet. + CodeModel::Model CM = TM.getCodeModel(); + if (CM != CodeModel::Small && CM != CodeModel::Large) + return false; + + const unsigned DstReg = I.getOperand(0).getReg(); + const LLT DstTy = MRI.getType(DstReg); + const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); + unsigned Align = DstTy.getSizeInBits(); + const DebugLoc &DbgLoc = I.getDebugLoc(); + + unsigned Opc = getLoadStoreOp(DstTy, RegBank, TargetOpcode::G_LOAD, Align); + + // Create the load from the constant pool. 
+ const ConstantFP *CFP = I.getOperand(1).getFPImm(); + unsigned CPI = MF.getConstantPool()->getConstantPoolIndex(CFP, Align); + MachineInstr *LoadInst = nullptr; + unsigned char OpFlag = STI.classifyLocalReference(nullptr); + + if (CM == CodeModel::Large && STI.is64Bit()) { + // Under X86-64 non-small code model, GV (and friends) are 64-bits, so + // they cannot be folded into immediate fields. + + unsigned AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass); + BuildMI(*I.getParent(), I, DbgLoc, TII.get(X86::MOV64ri), AddrReg) + .addConstantPoolIndex(CPI, 0, OpFlag); + + MachineMemOperand *MMO = MF.getMachineMemOperand( + MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad, + MF.getDataLayout().getPointerSize(), Align); + + LoadInst = + addDirectMem(BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg), + AddrReg) + .addMemOperand(MMO); + + } else if (CM == CodeModel::Small || !STI.is64Bit()) { + // Handle the case when globals fit in our immediate field. + // This is true for X86-32 always and X86-64 when in -mcmodel=small mode. + + // x86-32 PIC requires a PIC base register for constant pools. + unsigned PICBase = 0; + if (OpFlag == X86II::MO_PIC_BASE_OFFSET || OpFlag == X86II::MO_GOTOFF) { + // PICBase can be allocated by TII.getGlobalBaseReg(&MF). + // In DAGISEL the code that initialize it generated by the CGBR pass. + return false; // TODO support the mode. + } else if (STI.is64Bit() && TM.getCodeModel() == CodeModel::Small) + PICBase = X86::RIP; + + LoadInst = addConstantPoolReference( + BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg), CPI, PICBase, + OpFlag); + } else + return false; + + constrainSelectedInstRegOperands(*LoadInst, TII, TRI, RBI); + I.eraseFromParent(); + return true; +} + +bool X86InstructionSelector::selectImplicitDefOrPHI( + MachineInstr &I, MachineRegisterInfo &MRI) const { + assert((I.getOpcode() == TargetOpcode::G_IMPLICIT_DEF || + I.getOpcode() == TargetOpcode::G_PHI) && + "unexpected instruction"); + + unsigned DstReg = I.getOperand(0).getReg(); + + if (!MRI.getRegClassOrNull(DstReg)) { + const LLT DstTy = MRI.getType(DstReg); + const TargetRegisterClass *RC = getRegClass(DstTy, DstReg, MRI); + + if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) { + DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode()) + << " operand\n"); + return false; + } + } + + if (I.getOpcode() == TargetOpcode::G_IMPLICIT_DEF) + I.setDesc(TII.get(X86::IMPLICIT_DEF)); + else + I.setDesc(TII.get(X86::PHI)); + + return true; +} + InstructionSelector * llvm::createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &Subtarget, diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp index f0ed4bc16e2f..cdb24b9d40a6 100644 --- a/lib/Target/X86/X86InterleavedAccess.cpp +++ b/lib/Target/X86/X86InterleavedAccess.cpp @@ -1,26 +1,44 @@ -//===--------- X86InterleavedAccess.cpp ----------------------------------===// +//===- X86InterleavedAccess.cpp -------------------------------------------===// // // The LLVM Compiler Infrastructure // // This file is distributed under the University of Illinois Open Source // License. See LICENSE.TXT for details. 
// -//===--------------------------------------------------------------------===// -/// +//===----------------------------------------------------------------------===// +// /// \file /// This file contains the X86 implementation of the interleaved accesses /// optimization generating X86-specific instructions/intrinsics for /// interleaved access groups. -/// -//===--------------------------------------------------------------------===// +// +//===----------------------------------------------------------------------===// #include "X86ISelLowering.h" -#include "X86TargetMachine.h" +#include "X86Subtarget.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/Analysis/VectorUtils.h" +#include "llvm/CodeGen/MachineValueType.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include <algorithm> +#include <cassert> +#include <cmath> +#include <cstdint> using namespace llvm; namespace { + /// \brief This class holds necessary information to represent an interleaved /// access group and supports utilities to lower the group into /// X86-specific instructions/intrinsics. @@ -69,7 +87,18 @@ class X86InterleavedAccessGroup { /// Out-V2 = p3, q3, r3, s3 /// Out-V3 = P4, q4, r4, s4 void transpose_4x4(ArrayRef<Instruction *> InputVectors, - SmallVectorImpl<Value *> &TrasposedVectors); + SmallVectorImpl<Value *> &TransposedMatrix); + void interleave8bitStride4(ArrayRef<Instruction *> InputVectors, + SmallVectorImpl<Value *> &TransposedMatrix, + unsigned NumSubVecElems); + void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors, + SmallVectorImpl<Value *> &TransposedMatrix); + void interleave8bitStride3(ArrayRef<Instruction *> InputVectors, + SmallVectorImpl<Value *> &TransposedMatrix, + unsigned NumSubVecElems); + void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors, + SmallVectorImpl<Value *> &TransposedMatrix, + unsigned NumSubVecElems); public: /// In order to form an interleaved access group X86InterleavedAccessGroup @@ -94,38 +123,58 @@ public: /// instructions/intrinsics. bool lowerIntoOptimizedSequence(); }; + } // end anonymous namespace bool X86InterleavedAccessGroup::isSupported() const { VectorType *ShuffleVecTy = Shuffles[0]->getType(); - uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy); Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType(); + unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy); + unsigned WideInstSize; + + // Currently, lowering is supported for the following vectors: + // Stride 4: + // 1. Store and load of 4-element vectors of 64 bits on AVX. + // 2. Store of 16/32-element vectors of 8 bits on AVX. + // Stride 3: + // 1. Load of 16/32-element vectors of 8 bits on AVX. + if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3)) + return false; - // Currently, lowering is supported for 4-element vectors of 64 bits on AVX. 
- uint64_t ExpectedShuffleVecSize; - if (isa<LoadInst>(Inst)) - ExpectedShuffleVecSize = 256; - else - ExpectedShuffleVecSize = 1024; + if (isa<LoadInst>(Inst)) { + WideInstSize = DL.getTypeSizeInBits(Inst->getType()); + if (cast<LoadInst>(Inst)->getPointerAddressSpace()) + return false; + } else + WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType()); + + // We support shuffle represents stride 4 for byte type with size of + // WideInstSize. + if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4) + return true; + + if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 && + (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 || + WideInstSize == 2048)) + return true; - if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize || - DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4) - return false; + if (ShuffleElemSize == 8 && Factor == 3 && + (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536)) + return true; - return true; + return false; } void X86InterleavedAccessGroup::decompose( Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy, SmallVectorImpl<Instruction *> &DecomposedVectors) { - assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) && "Expected Load or Shuffle"); - Type *VecTy = VecInst->getType(); - (void)VecTy; - assert(VecTy->isVectorTy() && - DL.getTypeSizeInBits(VecTy) >= + Type *VecWidth = VecInst->getType(); + (void)VecWidth; + assert(VecWidth->isVectorTy() && + DL.getTypeSizeInBits(VecWidth) >= DL.getTypeSizeInBits(SubVecTy) * NumSubVectors && "Invalid Inst-size!!!"); @@ -137,19 +186,30 @@ void X86InterleavedAccessGroup::decompose( for (unsigned i = 0; i < NumSubVectors; ++i) DecomposedVectors.push_back( cast<ShuffleVectorInst>(Builder.CreateShuffleVector( - Op0, Op1, createSequentialMask(Builder, Indices[i], - SubVecTy->getVectorNumElements(), 0)))); + Op0, Op1, + createSequentialMask(Builder, Indices[i], + SubVecTy->getVectorNumElements(), 0)))); return; } // Decompose the load instruction. LoadInst *LI = cast<LoadInst>(VecInst); Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace()); - Value *VecBasePtr = - Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); - + Value *VecBasePtr; + unsigned int NumLoads = NumSubVectors; + // In the case of stride 3 with a vector of 32 elements load the information + // in the following way: + // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1] + unsigned VecLength = DL.getTypeSizeInBits(VecWidth); + if (VecLength == 768 || VecLength == 1536) { + Type *VecTran = + VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo(); + VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran); + NumLoads = NumSubVectors * (VecLength / 384); + } else + VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy); // Generate N loads of T type. - for (unsigned i = 0; i < NumSubVectors; i++) { + for (unsigned i = 0; i < NumLoads; i++) { // TODO: Support inbounds GEP. Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i)); Instruction *NewLoad = @@ -158,6 +218,470 @@ void X86InterleavedAccessGroup::decompose( } } +// Changing the scale of the vector type by reducing the number of elements and +// doubling the scalar size. 
+static MVT scaleVectorType(MVT VT) { + unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2; + return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize), + VT.getVectorNumElements() / 2); +} + +static uint32_t Concat[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 }; + +// genShuffleBland - Creates a shuffle mask from two vectors. This function +// only works on instructions whose lanes lie inside 256-bit registers. From +// the mask 'Mask' it creates a new mask 'Out' by adding an offset to the mask. The +// offset amount depends on the two integers 'LowOffset' and 'HighOffset', +// where 'LowOffset' refers to the first vector and 'HighOffset' refers to +// the second vector. +// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20| +// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25| +// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31| +// For the sequence to work as a mirror to the load, +// we must consider the element order as above. +// In this function we are combining two types of shuffles: +// the first one is a vpshuf and the second is a type of "blend" shuffle. +// By computing the shuffle on a sequence of 16 elements (one lane) and adding the +// correct offset, we are creating a vpshuf + blend sequence between two +// shuffles. +static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask, + SmallVectorImpl<uint32_t> &Out, int LowOffset, + int HighOffset) { + assert(VT.getSizeInBits() >= 256 && + "This function doesn't accept width smaller than 256"); + unsigned NumOfElm = VT.getVectorNumElements(); + for (unsigned i = 0; i < Mask.size(); i++) + Out.push_back(Mask[i] + LowOffset); + for (unsigned i = 0; i < Mask.size(); i++) + Out.push_back(Mask[i] + HighOffset + NumOfElm); +} + +// reorderSubVector returns the data to its original state, and is de facto +// the opposite of the function concatSubVector.
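For reference, the mask construction that genShuffleBland describes can be exercised in isolation. The following standalone sketch is not part of the patch; it uses plain std::vector instead of LLVM's MVT/SmallVectorImpl types, and the mask, offsets, and element count are illustrative values only. The first half of 'Out' indexes the first source vector via Mask + LowOffset; the second half indexes the second source via Mask + HighOffset + NumOfElm, because a shuffle mask indexes the concatenation of both sources.

// Illustrative sketch of the genShuffleBland mask arithmetic (not LLVM code).
#include <cstdint>
#include <iostream>
#include <vector>

static void genShuffleBlandSketch(const std::vector<uint32_t> &Mask,
                                  std::vector<uint32_t> &Out, int LowOffset,
                                  int HighOffset, unsigned NumOfElm) {
  // Elements taken from the first source vector.
  for (uint32_t M : Mask)
    Out.push_back(M + LowOffset);
  // Elements taken from the second source vector; indices are shifted by
  // NumOfElm because the mask indexes the concatenation of both sources.
  for (uint32_t M : Mask)
    Out.push_back(M + HighOffset + NumOfElm);
}

int main() {
  // An 8-element per-lane stride-3 pattern; offsets chosen for illustration.
  std::vector<uint32_t> Mask = {0, 3, 6, 1, 4, 7, 2, 5};
  std::vector<uint32_t> Out;
  genShuffleBlandSketch(Mask, Out, /*LowOffset=*/0, /*HighOffset=*/16,
                        /*NumOfElm=*/32);
  for (uint32_t V : Out)
    std::cout << V << ' ';
  std::cout << '\n';
}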
+ +// For VecElems = 16 +// Invec[0] - |0| TransposedMatrix[0] - |0| +// Invec[1] - |1| => TransposedMatrix[1] - |1| +// Invec[2] - |2| TransposedMatrix[2] - |2| + +// For VecElems = 32 +// Invec[0] - |0|3| TransposedMatrix[0] - |0|1| +// Invec[1] - |1|4| => TransposedMatrix[1] - |2|3| +// Invec[2] - |2|5| TransposedMatrix[2] - |4|5| + +// For VecElems = 64 +// Invec[0] - |0|3|6|9 | TransposedMatrix[0] - |0|1|2 |3 | +// Invec[1] - |1|4|7|10| => TransposedMatrix[1] - |4|5|6 |7 | +// Invec[2] - |2|5|8|11| TransposedMatrix[2] - |8|9|10|11| + +static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix, + ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf, + unsigned VecElems, unsigned Stride, + IRBuilder<> Builder) { + + if (VecElems == 16) { + for (unsigned i = 0; i < Stride; i++) + TransposedMatrix[i] = Builder.CreateShuffleVector( + Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf); + return; + } + + SmallVector<uint32_t, 32> OptimizeShuf; + Value *Temp[8]; + + for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) { + genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16, + (i + 1) / Stride * 16); + Temp[i / 2] = Builder.CreateShuffleVector( + Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf); + OptimizeShuf.clear(); + } + + if (VecElems == 32) { + std::copy(Temp, Temp + Stride, TransposedMatrix.begin()); + return; + } + else + for (unsigned i = 0; i < Stride; i++) + TransposedMatrix[i] = + Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat); +} + +void X86InterleavedAccessGroup::interleave8bitStride4VF8( + ArrayRef<Instruction *> Matrix, + SmallVectorImpl<Value *> &TransposedMatrix) { + // Assuming we start from the following vectors: + // Matrix[0]= c0 c1 c2 c3 c4 ... c7 + // Matrix[1]= m0 m1 m2 m3 m4 ... m7 + // Matrix[2]= y0 y1 y2 y3 y4 ... y7 + // Matrix[3]= k0 k1 k2 k3 k4 ... k7 + + MVT VT = MVT::v8i16; + TransposedMatrix.resize(2); + SmallVector<uint32_t, 16> MaskLow; + SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord; + SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord; + + for (unsigned i = 0; i < 8; ++i) { + MaskLow.push_back(i); + MaskLow.push_back(i + 8); + } + + createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false); + createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false); + scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord); + scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord); + // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7 + // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7 + Value *IntrVec1Low = + Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow); + Value *IntrVec2Low = + Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow); + + // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3 + // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7 + + TransposedMatrix[0] = + Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord); + TransposedMatrix[1] = + Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord); +} + +void X86InterleavedAccessGroup::interleave8bitStride4( + ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix, + unsigned NumOfElm) { + // Example: Assuming we start from the following vectors: + // Matrix[0]= c0 c1 c2 c3 c4 ... c31 + // Matrix[1]= m0 m1 m2 m3 m4 ... m31 + // Matrix[2]= y0 y1 y2 y3 y4 ... y31 + // Matrix[3]= k0 k1 k2 k3 k4 ... 
k31 + + MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm); + MVT HalfVT = scaleVectorType(VT); + + TransposedMatrix.resize(4); + SmallVector<uint32_t, 32> MaskHigh; + SmallVector<uint32_t, 32> MaskLow; + SmallVector<uint32_t, 32> LowHighMask[2]; + SmallVector<uint32_t, 32> MaskHighTemp; + SmallVector<uint32_t, 32> MaskLowTemp; + + // MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86 + // shuffle pattern. + + createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false); + createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false); + + // MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86 + // shuffle pattern. + + createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false); + createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false); + scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]); + scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]); + + // IntrVec1Low = c0 m0 c1 m1 ... c7 m7 | c16 m16 c17 m17 ... c23 m23 + // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31 + // IntrVec2Low = y0 k0 y1 k1 ... y7 k7 | y16 k16 y17 k17 ... y23 k23 + // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31 + Value *IntrVec[4]; + + IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow); + IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh); + IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow); + IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh); + + // cmyk4 cmyk5 cmyk6 cmyk7 | cmyk20 cmyk21 cmyk22 cmyk23 + // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31 + // cmyk0 cmyk1 cmyk2 cmyk3 | cmyk16 cmyk17 cmyk18 cmyk19 + // cmyk8 cmyk9 cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27 + + Value *VecOut[4]; + for (int i = 0; i < 4; i++) + VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2], + LowHighMask[i % 2]); + + // cmyk0 cmyk1 cmyk2 cmyk3 | cmyk4 cmyk5 cmyk6 cmyk7 + // cmyk8 cmyk9 cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15 + // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23 + // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31 + + if (VT == MVT::v16i8) { + std::copy(VecOut, VecOut + 4, TransposedMatrix.begin()); + return; + } + + reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16), + NumOfElm, 4, Builder); +} + +// createShuffleStride returns shuffle mask of size N. +// The shuffle pattern is as following : +// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane), +// (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),..., +// (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)} +// Where Lane is the # of lanes in a register: +// VectorSize = 128 => Lane = 1 +// VectorSize = 256 => Lane = 2 +// For example shuffle pattern for VF 16 register size 256 -> lanes = 2 +// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>} +static void createShuffleStride(MVT VT, int Stride, + SmallVectorImpl<uint32_t> &Mask) { + int VectorSize = VT.getSizeInBits(); + int VF = VT.getVectorNumElements(); + int LaneCount = std::max(VectorSize / 128, 1); + for (int Lane = 0; Lane < LaneCount; Lane++) + for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i) + Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane); +} + +// setGroupSize sets 'SizeInfo' to the size(number of elements) of group +// inside mask a shuffleMask. A mask contains exactly 3 groups, where +// each group is a monotonically increasing sequence with stride 3. 
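As a sanity check on the createShuffleStride formula above, this standalone sketch (not part of the patch, written with plain STL types) recomputes the documented example: VF = 16 in a 256-bit register (two lanes) with Stride = 3 yields {0,3,6,1,4,7,2,5, 8,11,14,9,12,15,10,13}.

// Illustrative sketch of createShuffleStride's per-lane formula:
// Mask[i] = (i * Stride) % LaneSize + LaneSize * Lane.
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <vector>

static std::vector<uint32_t> strideMask(int VectorSizeBits, int VF, int Stride) {
  std::vector<uint32_t> Mask;
  int LaneCount = std::max(VectorSizeBits / 128, 1);
  int LaneSize = VF / LaneCount;
  for (int Lane = 0; Lane < LaneCount; ++Lane)
    for (int i = 0; i < LaneSize; ++i)
      Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
  return Mask;
}

int main() {
  // Expected: 0 3 6 1 4 7 2 5 8 11 14 9 12 15 10 13
  for (uint32_t V : strideMask(256, 16, 3))
    std::cout << V << ' ';
  std::cout << '\n';
}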
+// For example, shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2} +static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) { + int VectorSize = VT.getSizeInBits(); + int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1); + for (int i = 0, FirstGroupElement = 0; i < 3; i++) { + int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0); + SizeInfo.push_back(GroupSize); + FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF; + } +} + +// DecodePALIGNRMask returns the shuffle mask of the vpalign instruction. +// vpalign works per lane, +// where Lane is the # of lanes in a register: +// VectorWide = 128 => Lane = 1 +// VectorWide = 256 => Lane = 2 +// For Lane = 1 the shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}. +// For Lane = 2 the shuffle pattern is: +// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}. +// The Imm variable sets the offset amount. The result of the +// function is stored in the ShuffleMask vector and is built as described +// above. AlignDirection is a boolean that indicates the +// direction of the alignment (false - align to the "right" side, true - +// align to the "left" side). +static void DecodePALIGNRMask(MVT VT, unsigned Imm, + SmallVectorImpl<uint32_t> &ShuffleMask, + bool AlignDirection = true, bool Unary = false) { + unsigned NumElts = VT.getVectorNumElements(); + unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1); + unsigned NumLaneElts = NumElts / NumLanes; + + Imm = AlignDirection ? Imm : (NumLaneElts - Imm); + unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8); + + for (unsigned l = 0; l != NumElts; l += NumLaneElts) { + for (unsigned i = 0; i != NumLaneElts; ++i) { + unsigned Base = i + Offset; + // If i+Offset is out of this lane then we actually need the other source. + // If Unary, the other source is the first source. + if (Base >= NumLaneElts) + Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts; + ShuffleMask.push_back(Base + l); + } + } +} + +// concatSubVector - The function rebuilds the data into the expected +// order. An assumption (the shape of the matrix) was made so that the +// deinterleave works with lane instructions like 'vpalign' or 'vpshuf'. +// This function ensures that the data is built in the correct way for the lane +// instructions. Each lane inside the vector is 128 bits long. +// +// The 'InVec' argument contains the data in increasing order: InVec[0] holds +// the first 128 bits of data. The number of different lanes inside a +// vector depends on 'VecElems'. In general, the formula is +// VecElems * type / 128. The size of the 'InVec' array is equal to +// 'VecElems'.
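The group-size arithmetic in setGroupSize can also be checked on its own. This standalone sketch (not part of the patch; the lane width of 8 elements comes from the documented example) reproduces {0,3,6,1,4,7,2,5} => {3,3,2}.

// Illustrative sketch of setGroupSize's arithmetic for one 128-bit lane.
#include <cmath>
#include <iostream>

int main() {
  const int VF = 8; // elements per 128-bit lane in the documented example
  int First = 0;
  for (int i = 0; i < 3; ++i) {
    int GroupSize = (int)std::ceil((VF - First) / 3.0);
    std::cout << GroupSize << ' '; // prints: 3 3 2
    First = (GroupSize * 3 + First) % VF;
  }
  std::cout << '\n';
}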
+ +// For VecElems = 16 +// Invec[0] - |0| Vec[0] - |0| +// Invec[1] - |1| => Vec[1] - |1| +// Invec[2] - |2| Vec[2] - |2| + +// For VecElems = 32 +// Invec[0] - |0|1| Vec[0] - |0|3| +// Invec[1] - |2|3| => Vec[1] - |1|4| +// Invec[2] - |4|5| Vec[2] - |2|5| + +// For VecElems = 64 +// Invec[0] - |0|1|2 |3 | Vec[0] - |0|3|6|9 | +// Invec[1] - |4|5|6 |7 | => Vec[1] - |1|4|7|10| +// Invec[2] - |8|9|10|11| Vec[2] - |2|5|8|11| + +static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec, + unsigned VecElems, IRBuilder<> Builder) { + if (VecElems == 16) { + for (int i = 0; i < 3; i++) + Vec[i] = InVec[i]; + return; + } + + for (unsigned j = 0; j < VecElems / 32; j++) + for (int i = 0; i < 3; i++) + Vec[i + j * 3] = Builder.CreateShuffleVector( + InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32)); + + if (VecElems == 32) + return; + + for (int i = 0; i < 3; i++) + Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat); +} + +void X86InterleavedAccessGroup::deinterleave8bitStride3( + ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix, + unsigned VecElems) { + // Example: Assuming we start from the following vectors: + // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2 + // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5 + // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7 + + TransposedMatrix.resize(3); + SmallVector<uint32_t, 32> VPShuf; + SmallVector<uint32_t, 32> VPAlign[2]; + SmallVector<uint32_t, 32> VPAlign2; + SmallVector<uint32_t, 32> VPAlign3; + SmallVector<uint32_t, 3> GroupSize; + Value *Vec[6], *TempVector[3]; + + MVT VT = MVT::getVT(Shuffles[0]->getType()); + + createShuffleStride(VT, 3, VPShuf); + setGroupSize(VT, GroupSize); + + for (int i = 0; i < 2; i++) + DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false); + + DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true); + DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true); + + concatSubVector(Vec, InVec, VecElems, Builder); + // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1 + // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4 + // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7 + + for (int i = 0; i < 3; i++) + Vec[i] = Builder.CreateShuffleVector( + Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf); + + // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2 + // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5 + // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7 + + for (int i = 0; i < 3; i++) + TempVector[i] = + Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]); + + // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2 + // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4 + // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7 + + for (int i = 0; i < 3; i++) + Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i], + VPAlign[1]); + + // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7 + // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7 + // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7 + + Value *TempVec = Builder.CreateShuffleVector( + Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3); + TransposedMatrix[0] = Builder.CreateShuffleVector( + Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2); + TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec; + TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2]; +} + +// group2Shuffle reorder the shuffle stride back into continuous order. +// For example For VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} => +// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}. 
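To make explicit what the vpshuf/vpalign sequence in deinterleave8bitStride3 computes, here is a standalone scalar reference. It is not part of the patch; the element values and the vector length VF are arbitrary illustrative choices. It splits an interleaved a0 b0 c0 a1 b1 c1 ... byte stream back into three contiguous vectors, which is the end result the shuffle sequence produces in vector registers.

// Scalar reference for a stride-3 byte deinterleave (not LLVM code).
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const unsigned VF = 8; // elements per output vector (illustrative)
  std::vector<uint8_t> Wide(3 * VF);
  for (unsigned i = 0; i < VF; ++i) { // interleaved input: a0 b0 c0 a1 b1 c1 ...
    Wide[3 * i + 0] = 0xA0 + i;
    Wide[3 * i + 1] = 0xB0 + i;
    Wide[3 * i + 2] = 0xC0 + i;
  }

  // The three outputs end up holding all a's, all b's and all c's.
  std::vector<std::vector<uint8_t>> Out(3, std::vector<uint8_t>(VF));
  for (unsigned i = 0; i < VF; ++i)
    for (unsigned j = 0; j < 3; ++j)
      Out[j][i] = Wide[3 * i + j];

  for (const auto &V : Out) {
    for (uint8_t B : V)
      std::printf("%02x ", B);
    std::printf("\n");
  }
}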
+static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask, + SmallVectorImpl<uint32_t> &Output) { + int IndexGroup[3] = {0, 0, 0}; + int Index = 0; + int VectorWidth = VT.getSizeInBits(); + int VF = VT.getVectorNumElements(); + // Find the index of the different groups. + int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1; + for (int i = 0; i < 3; i++) { + IndexGroup[(Index * 3) % (VF / Lane)] = Index; + Index += Mask[i]; + } + // According to the index compute the convert mask. + for (int i = 0; i < VF / Lane; i++) { + Output.push_back(IndexGroup[i % 3]); + IndexGroup[i % 3]++; + } +} + +void X86InterleavedAccessGroup::interleave8bitStride3( + ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix, + unsigned VecElems) { + // Example: Assuming we start from the following vectors: + // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7 + // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7 + // Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7 + + TransposedMatrix.resize(3); + SmallVector<uint32_t, 3> GroupSize; + SmallVector<uint32_t, 32> VPShuf; + SmallVector<uint32_t, 32> VPAlign[3]; + SmallVector<uint32_t, 32> VPAlign2; + SmallVector<uint32_t, 32> VPAlign3; + + Value *Vec[3], *TempVector[3]; + MVT VT = MVT::getVectorVT(MVT::i8, VecElems); + + setGroupSize(VT, GroupSize); + + for (int i = 0; i < 3; i++) + DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]); + + DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true); + DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true); + + // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2 + // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4 + // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7 + + Vec[0] = Builder.CreateShuffleVector( + InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2); + Vec[1] = Builder.CreateShuffleVector( + InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3); + Vec[2] = InVec[2]; + + // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2 + // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5 + // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7 + + for (int i = 0; i < 3; i++) + TempVector[i] = + Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]); + + // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1 + // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4 + // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7 + + for (int i = 0; i < 3; i++) + Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3], + VPAlign[2]); + + // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2 + // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5 + // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7 + + unsigned NumOfElm = VT.getVectorNumElements(); + group2Shuffle(VT, GroupSize, VPShuf); + reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder); +} + void X86InterleavedAccessGroup::transpose_4x4( ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix) { @@ -200,10 +724,26 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // Try to generate target-sized register(/instruction). decompose(Inst, Factor, ShuffleTy, DecomposedVectors); + Type *ShuffleEltTy = Inst->getType(); + unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor; // Perform matrix-transposition in order to compute interleaved // results by generating some sort of (optimized) target-specific // instructions. 
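The index bookkeeping in group2Shuffle is easy to misread, so here is a standalone sketch of the same arithmetic for a single 128-bit lane. It is not part of the patch; the group sizes {6,5,5} are what setGroupSize produces for VF = 16, and running the loop reproduces the documented MaskResult {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.

// Illustrative sketch of group2Shuffle's index arithmetic for one lane.
#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  const int VF = 16;                       // i8 elements in one 128-bit lane
  const uint32_t GroupSize[3] = {6, 5, 5}; // from setGroupSize for VF = 16

  // Find the start index of each of the three groups.
  int IndexGroup[3] = {0, 0, 0};
  int Index = 0;
  for (int i = 0; i < 3; ++i) {
    IndexGroup[(Index * 3) % VF] = Index; // lands on slots 0, 2, 1 for this input
    Index += GroupSize[i];
  }

  // Emit the mask that puts the groups back into continuous order.
  std::vector<uint32_t> Output;
  for (int i = 0; i < VF; ++i) {
    Output.push_back(IndexGroup[i % 3]);
    IndexGroup[i % 3]++;
  }

  for (uint32_t V : Output)
    std::cout << V << ' '; // 0 11 6 1 12 7 2 13 8 3 14 9 4 15 10 5
  std::cout << '\n';
}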
- transpose_4x4(DecomposedVectors, TransposedVectors); + + switch (NumSubVecElems) { + default: + return false; + case 4: + transpose_4x4(DecomposedVectors, TransposedVectors); + break; + case 8: + case 16: + case 32: + case 64: + deinterleave8bitStride3(DecomposedVectors, TransposedVectors, + NumSubVecElems); + break; + } // Now replace the unoptimized-interleaved-vectors with the // transposed-interleaved vectors. @@ -219,12 +759,31 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() { // Lower the interleaved stores: // 1. Decompose the interleaved wide shuffle into individual shuffle // vectors. - decompose(Shuffles[0], Factor, - VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors); + decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems), + DecomposedVectors); // 2. Transpose the interleaved-vectors into vectors of contiguous // elements. - transpose_4x4(DecomposedVectors, TransposedVectors); + switch (NumSubVecElems) { + case 4: + transpose_4x4(DecomposedVectors, TransposedVectors); + break; + case 8: + interleave8bitStride4VF8(DecomposedVectors, TransposedVectors); + break; + case 16: + case 32: + case 64: + if (Factor == 4) + interleave8bitStride4(DecomposedVectors, TransposedVectors, + NumSubVecElems); + if (Factor == 3) + interleave8bitStride3(DecomposedVectors, TransposedVectors, + NumSubVecElems); + break; + default: + return false; + } // 3. Concatenate the contiguous-vectors back into a wide vector. Value *WideVec = concatenateVectors(Builder, TransposedVectors); diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 6b1add8ff8ed..0782d5598746 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -30,13 +30,15 @@ enum IntrinsicType : uint16_t { INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK, FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3, FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3, + IFMA_OP_MASK, IFMA_OP_MASKZ, VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK, - INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM, - COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC, + INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK, + COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, - TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, + TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, + ROUNDP, ROUNDS }; struct IntrinsicData { @@ -118,6 +120,12 @@ static const IntrinsicData IntrinsicsWithChain[] = { X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH, X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm), + X86_INTRINSIC_DATA(avx512_mask_compress_store_b_128, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_b_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_b_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128, COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256, @@ -142,6 +150,18 @@ static const IntrinsicData IntrinsicsWithChain[] = { COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512, COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_w_128, + 
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_w_256, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_store_w_512, + COMPRESS_TO_MEM, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_b_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_b_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_b_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256, @@ -166,6 +186,12 @@ static const IntrinsicData IntrinsicsWithChain[] = { EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512, EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_w_128, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_w_256, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_load_w_512, + EXPAND_FROM_MEM, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8, X86ISD::VTRUNC, 0), X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8, @@ -342,6 +368,8 @@ static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) { * the alphabetical order. */ static const IntrinsicData IntrinsicsWithoutChain[] = { + X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0), + X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0), X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0), @@ -360,19 +388,15 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0), + X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0), X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0), X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0), - X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), - X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), - X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), - X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, ISD::ABS, 0), X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0), @@ -381,8 +405,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0), 
X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -427,13 +449,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0), X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0), X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0), - X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0), - X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0), - X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0), X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0), @@ -464,9 +479,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0), X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0), X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0), - X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0), - X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0), - X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0), X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0), X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD, X86ISD::FADD_RND), @@ -476,40 +488,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FADDS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FADDS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, BRCST32x2_TO_VEC, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, BRCST32x2_TO_VEC, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, BRCST32x2_TO_VEC, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - 
X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC, - X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0), X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM, @@ -522,6 +500,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FSETCCM, X86ISD::FSETCCM_RND), X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC, X86ISD::FSETCCM, X86ISD::FSETCCM_RND), + + X86_INTRINSIC_DATA(avx512_mask_compress_b_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_b_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_b_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG, @@ -546,6 +531,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_w_128, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_w_256, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), + X86_INTRINSIC_DATA(avx512_mask_compress_w_512, COMPRESS_EXPAND_IN_REG, + X86ISD::COMPRESS, 0), X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK, X86ISD::CONFLICT, 0), X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK, @@ -720,6 +711,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FDIVS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FDIVS_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_b_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_b_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_b_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG, @@ -744,6 +741,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG, X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_w_128, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_w_256, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), + X86_INTRINSIC_DATA(avx512_mask_expand_w_512, COMPRESS_EXPAND_IN_REG, + X86ISD::EXPAND, 0), X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0), @@ -776,22 +779,22 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FGETEXPS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FGETEXPS_RND, 0), - 
X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM, - X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK, + X86ISD::VGETMANT, X86ISD::VGETMANT_RND), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM, - X86ISD::VGETMANT, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM, - X86ISD::VGETMANTS, 0), - X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM, - X86ISD::VGETMANTS, 0), + X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK, + X86ISD::VGETMANT, X86ISD::VGETMANT_RND), + X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK, + X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND), + X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK, + X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND), X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, X86ISD::FMAX_RND), X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX, @@ -816,18 +819,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FMULS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FMULS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0), - X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0), X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0), @@ -840,36 +831,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), 
- X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_512, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_128, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_256, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_512, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_128, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_256, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_512, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_128, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_256, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), - X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_512, INTR_TYPE_1OP_MASK, - X86ISD::VBROADCAST, 0), X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK, X86ISD::VPERMV, 0), X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK, @@ -1081,32 +1042,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK, X86ISD::VPTERNLOG, 0), - X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0), - X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), - 
X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::VRNDSCALES, 0), - X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM, - X86ISD::VRNDSCALES, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND), + X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0), + X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND), + X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND), + X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0), + X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND), + X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND), + X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND), + X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK, + X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND), + X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK, + X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::SCALEF, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM, @@ -1123,22 +1084,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::SCALEFS, 0), X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::SCALEFS, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, 
INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUF128, 0), - X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK, - X86ISD::SHUF128, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0), X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT, @@ -1159,29 +1104,29 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::FSUBS_RND, 0), X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM, X86ISD::FSUBS_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM, - X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK, X86ISD::CVTPH2PS, 0), - X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM, + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK, X86ISD::CVTPH2PS, 0), + X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK, + X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK, X86ISD::CVTPS2PH, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK, X86ISD::CVTPS2PH, 0), X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK, X86ISD::CVTPS2PH, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD, + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, ISD::FMA, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD, + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, ISD::FMA, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0), - X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), + X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB, @@ -1209,7 +1154,20 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB, X86ISD::FNMSUB_RND), - 
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_128, FMA_OP_MASK, X86ISD::VPDPBUSD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_256, FMA_OP_MASK, X86ISD::VPDPBUSD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_512, FMA_OP_MASK, X86ISD::VPDPBUSD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_128, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_256, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_512, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_128, FMA_OP_MASK, X86ISD::VPDPWSSD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_256, FMA_OP_MASK, X86ISD::VPDPWSSD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_512, FMA_OP_MASK, X86ISD::VPDPWSSD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_128, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_256, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0), + X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_512, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0), + + X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK, X86ISD::VPERMIV3, 0), @@ -1281,29 +1239,74 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , FMA_OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK, X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , FMA_OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK, X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , FMA_OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK, X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , FMA_OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK, X86ISD::VPMADD52L, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , FMA_OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK, X86ISD::VPMADD52L, 0), - X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , FMA_OP_MASK, + X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK, X86ISD::VPMADD52L, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD, + + X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshld_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshld_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshld_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshld_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshld_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshld_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshld_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_128, FMA_OP_MASK, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_256, 
FMA_OP_MASK, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_512, FMA_OP_MASK, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshldv_q_128, FMA_OP_MASK, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshldv_q_256, FMA_OP_MASK, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshldv_q_512, FMA_OP_MASK, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_128, FMA_OP_MASK, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_256, FMA_OP_MASK, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_512, FMA_OP_MASK, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_128, FMA_OP_MASK, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_256, FMA_OP_MASK, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_512, FMA_OP_MASK, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrdv_q_128, FMA_OP_MASK, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrdv_q_256, FMA_OP_MASK, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrdv_q_512, FMA_OP_MASK, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrdv_w_128, FMA_OP_MASK, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrdv_w_256, FMA_OP_MASK, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshrdv_w_512, FMA_OP_MASK, X86ISD::VSHRDV, 0), + + X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_128, CMP_MASK, + X86ISD::VPSHUFBITQMB, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_256, CMP_MASK, + X86ISD::VPSHUFBITQMB, 0), + X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_512, CMP_MASK, + X86ISD::VPSHUFBITQMB, 0), + + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, ISD::FMA, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD, + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, ISD::FMA, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND), 
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB, @@ -1321,8 +1324,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB, X86ISD::FMSUB_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND), X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0), @@ -1341,8 +1344,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB, X86ISD::FNMSUB_RND), - X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0), - X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND), + X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ, X86ISD::VFIXUPIMM, 0), X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ, @@ -1371,17 +1374,17 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPTERNLOG, 0), X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ, X86ISD::VPTERNLOG, 0), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD, + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, ISD::FMA, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD, + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, ISD::FMA, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, ISD::FMA, X86ISD::FMADD_RND), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0), - X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), + X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND), X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), 
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, @@ -1391,6 +1394,19 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB, X86ISD::FMADDSUB_RND), + X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_128, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_256, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_512, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_128, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_256, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_512, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_128, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_256, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_512, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_128, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_256, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_512, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ, X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ, @@ -1427,18 +1443,38 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86ISD::VPERMV3, 0), X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ, X86ISD::VPERMV3, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, FMA_OP_MASKZ, + X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ, X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, FMA_OP_MASKZ, + X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ, X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, FMA_OP_MASKZ, + X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ, X86ISD::VPMADD52H, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, FMA_OP_MASKZ, + X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ, X86ISD::VPMADD52L, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, FMA_OP_MASKZ, + X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ, X86ISD::VPMADD52L, 0), - X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ, + X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ, X86ISD::VPMADD52L, 0), + + X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshldv_q_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshldv_q_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshldv_q_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshldv_w_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshldv_w_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshldv_w_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_d_128, FMA_OP_MASKZ, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_d_256, FMA_OP_MASKZ, 
X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_d_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_q_128, FMA_OP_MASKZ, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_q_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_q_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_128, FMA_OP_MASKZ, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0), + X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0), X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0), @@ -1486,50 +1522,26 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0), X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0), - X86_INTRINSIC_DATA(avx512_ptestm_b_128, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_b_256, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_b_512, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_d_128, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_d_256, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_d_512, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_q_128, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_q_256, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_q_512, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_w_128, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_w_256, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestm_w_512, CMP_MASK, X86ISD::TESTM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_b_128, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_b_256, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_b_512, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_d_128, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_d_256, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_d_512, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_q_128, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_q_256, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_q_512, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0), - X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0), - X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0), - X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, 
X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0), + X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), + X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0), X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0), X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0), X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0), - X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), + X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0), X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0), @@ -1546,10 +1558,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0), X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0), - X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0), - X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, X86ISD::FMADD, 0), + X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, ISD::FMA, 0), + X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, ISD::FMA, 0), + X86_INTRINSIC_DATA(fma_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADDS1, 0), + X86_INTRINSIC_DATA(fma_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADDS1, 0), 
X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0), @@ -1558,6 +1572,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0), X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0), + X86_INTRINSIC_DATA(fma_vfmsub_sd, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0), + X86_INTRINSIC_DATA(fma_vfmsub_ss, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0), X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0), @@ -1566,10 +1582,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0), X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_sd, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0), + X86_INTRINSIC_DATA(fma_vfnmadd_ss, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0), X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0), + X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0), + X86_INTRINSIC_DATA(fma4_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADD4S, 0), + X86_INTRINSIC_DATA(fma4_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADD4S, 0), X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0), X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ), X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE), @@ -1615,8 +1637,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0), X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0), X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0), - X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0), - X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0), X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0), X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0), X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0), @@ -1650,18 +1670,22 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(sse2_ucomile_sd, COMI, X86ISD::UCOMI, ISD::SETLE), X86_INTRINSIC_DATA(sse2_ucomilt_sd, COMI, X86ISD::UCOMI, ISD::SETLT), X86_INTRINSIC_DATA(sse2_ucomineq_sd, COMI, X86ISD::UCOMI, ISD::SETNE), + X86_INTRINSIC_DATA(sse3_addsub_pd, INTR_TYPE_2OP, X86ISD::ADDSUB, 0), + X86_INTRINSIC_DATA(sse3_addsub_ps, INTR_TYPE_2OP, X86ISD::ADDSUB, 0), X86_INTRINSIC_DATA(sse3_hadd_pd, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0), X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0), X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0), X86_INTRINSIC_DATA(sse41_packusdw, 
INTR_TYPE_2OP, X86ISD::PACKUS, 0), + X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0), X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0), + X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0), + X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0), + X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0), X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0), X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0), - X86_INTRINSIC_DATA(ssse3_pabs_b_128, INTR_TYPE_1OP, ISD::ABS, 0), - X86_INTRINSIC_DATA(ssse3_pabs_d_128, INTR_TYPE_1OP, ISD::ABS, 0), - X86_INTRINSIC_DATA(ssse3_pabs_w_128, INTR_TYPE_1OP, ISD::ABS, 0), X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0), X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0), @@ -1669,6 +1693,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0), X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0), X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0), + X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), + X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0), + X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), + X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0), + + X86_INTRINSIC_DATA(vgf2p8affineinvqb_128, INTR_TYPE_3OP, + X86ISD::GF2P8AFFINEINVQB, 0), + X86_INTRINSIC_DATA(vgf2p8affineinvqb_256, INTR_TYPE_3OP, + X86ISD::GF2P8AFFINEINVQB, 0), + X86_INTRINSIC_DATA(vgf2p8affineinvqb_512, INTR_TYPE_3OP, + X86ISD::GF2P8AFFINEINVQB, 0), + X86_INTRINSIC_DATA(vgf2p8affineqb_128, INTR_TYPE_3OP, + X86ISD::GF2P8AFFINEQB, 0), + X86_INTRINSIC_DATA(vgf2p8affineqb_256, INTR_TYPE_3OP, + X86ISD::GF2P8AFFINEQB, 0), + X86_INTRINSIC_DATA(vgf2p8affineqb_512, INTR_TYPE_3OP, + X86ISD::GF2P8AFFINEQB, 0), + X86_INTRINSIC_DATA(vgf2p8mulb_128, INTR_TYPE_2OP, + X86ISD::GF2P8MULB, 0), + X86_INTRINSIC_DATA(vgf2p8mulb_256, INTR_TYPE_2OP, + X86ISD::GF2P8MULB, 0), + X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP, + X86ISD::GF2P8MULB, 0), + X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0), X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0), X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0), @@ -1682,14 +1730,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = { X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0), X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0), - X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0), - X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), - X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0), - X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), - X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0), - X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), - X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0), - X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0), + X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, ISD::ROTL, 0), + X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VROTLI, 0), + 
X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, ISD::ROTL, 0), + X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, ISD::ROTL, 0), + X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VROTLI, 0), + X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, ISD::ROTL, 0), + X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VROTLI, 0), X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0), X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0), X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0), diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 744ba21011af..4108a58fa7a5 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -14,17 +14,45 @@ #include "X86LegalizerInfo.h" #include "X86Subtarget.h" #include "X86TargetMachine.h" +#include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/ValueTypes.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" -#include "llvm/Target/TargetOpcodes.h" using namespace llvm; using namespace TargetOpcode; -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif +/// FIXME: The following static functions are SizeChangeStrategy functions +/// that are meant to temporarily mimic the behaviour of the old legalization +/// based on doubling/halving non-legal types as closely as possible. This is +/// not entirely possible as only legalizing the types that are exactly a power +/// of 2 times the size of the legal types would require specifying all those +/// sizes explicitly. +/// In practice, not specifying those isn't a problem, and the below functions +/// should disappear quickly as we add support for legalizing non-power-of-2 +/// sized types further.
+static void +addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result, + const LegalizerInfo::SizeAndActionsVec &v) { + for (unsigned i = 0; i < v.size(); ++i) { + result.push_back(v[i]); + if (i + 1 < v[i].first && i + 1 < v.size() && + v[i + 1].first != v[i].first + 1) + result.push_back({v[i].first + 1, LegalizerInfo::Unsupported}); + } +} + +static LegalizerInfo::SizeAndActionsVec +widen_1(const LegalizerInfo::SizeAndActionsVec &v) { + assert(v.size() >= 1); + assert(v[0].first > 1); + LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar}, + {2, LegalizerInfo::Unsupported}}; + addAndInterleaveWithUnsupported(result, v); + auto Largest = result.back().first; + result.push_back({Largest + 1, LegalizerInfo::Unsupported}); + return result; +} X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM) @@ -41,21 +69,35 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI, setLegalizerInfoAVX512DQ(); setLegalizerInfoAVX512BW(); + setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1); + for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR}) + setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1); + for (unsigned MemOp : {G_LOAD, G_STORE}) + setLegalizeScalarToDifferentSizeStrategy(MemOp, 0, + narrowToSmallerAndWidenToSmallest); + setLegalizeScalarToDifferentSizeStrategy( + G_GEP, 1, widenToLargerTypesUnsupportedOtherwise); + setLegalizeScalarToDifferentSizeStrategy( + G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest); + computeTables(); } void X86LegalizerInfo::setLegalizerInfo32bit() { - if (Subtarget.is64Bit()) - return; - - const LLT p0 = LLT::pointer(0, 32); + const LLT p0 = LLT::pointer(0, TM.getPointerSize() * 8); const LLT s1 = LLT::scalar(1); const LLT s8 = LLT::scalar(8); const LLT s16 = LLT::scalar(16); const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); + for (auto Ty : {p0, s1, s8, s16, s32}) + setAction({G_IMPLICIT_DEF, Ty}, Legal); + + for (auto Ty : {s8, s16, s32, p0}) + setAction({G_PHI, Ty}, Legal); + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) for (auto Ty : {s8, s16, s32}) setAction({BinOp, Ty}, Legal); @@ -69,7 +111,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { for (auto Ty : {s8, s16, s32, p0}) setAction({MemOp, Ty}, Legal); - setAction({MemOp, s1}, WidenScalar); // And everything's fine in addrspace 0. 
setAction({MemOp, 1, p0}, Legal); } @@ -81,25 +122,18 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({G_GEP, p0}, Legal); setAction({G_GEP, 1, s32}, Legal); - for (auto Ty : {s1, s8, s16}) - setAction({G_GEP, 1, Ty}, WidenScalar); + // Control-flow + setAction({G_BRCOND, s1}, Legal); // Constants for (auto Ty : {s8, s16, s32, p0}) setAction({TargetOpcode::G_CONSTANT, Ty}, Legal); - setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar); - setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar); - // Extensions for (auto Ty : {s8, s16, s32}) { setAction({G_ZEXT, Ty}, Legal); setAction({G_SEXT, Ty}, Legal); - } - - for (auto Ty : {s1, s8, s16}) { - setAction({G_ZEXT, 1, Ty}, Legal); - setAction({G_SEXT, 1, Ty}, Legal); + setAction({G_ANYEXT, Ty}, Legal); } // Comparison @@ -107,6 +141,16 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { for (auto Ty : {s8, s16, s32, p0}) setAction({G_ICMP, 1, Ty}, Legal); + + // Merge/Unmerge + for (const auto &Ty : {s16, s32, s64}) { + setAction({G_MERGE_VALUES, Ty}, Legal); + setAction({G_UNMERGE_VALUES, 1, Ty}, Legal); + } + for (const auto &Ty : {s8, s16, s32}) { + setAction({G_MERGE_VALUES, 1, Ty}, Legal); + setAction({G_UNMERGE_VALUES, Ty}, Legal); + } } void X86LegalizerInfo::setLegalizerInfo64bit() { @@ -114,59 +158,38 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { if (!Subtarget.is64Bit()) return; - const LLT p0 = LLT::pointer(0, TM.getPointerSize() * 8); - const LLT s1 = LLT::scalar(1); - const LLT s8 = LLT::scalar(8); - const LLT s16 = LLT::scalar(16); - const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); + const LLT s128 = LLT::scalar(128); - for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) - for (auto Ty : {s8, s16, s32, s64}) - setAction({BinOp, Ty}, Legal); + setAction({G_IMPLICIT_DEF, s64}, Legal); - for (unsigned MemOp : {G_LOAD, G_STORE}) { - for (auto Ty : {s8, s16, s32, s64, p0}) - setAction({MemOp, Ty}, Legal); + setAction({G_PHI, s64}, Legal); - setAction({MemOp, s1}, WidenScalar); - // And everything's fine in addrspace 0. 
- setAction({MemOp, 1, p0}, Legal); - } + for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR}) + setAction({BinOp, s64}, Legal); - // Pointer-handling - setAction({G_FRAME_INDEX, p0}, Legal); - setAction({G_GLOBAL_VALUE, p0}, Legal); + for (unsigned MemOp : {G_LOAD, G_STORE}) + setAction({MemOp, s64}, Legal); - setAction({G_GEP, p0}, Legal); - setAction({G_GEP, 1, s32}, Legal); + // Pointer-handling setAction({G_GEP, 1, s64}, Legal); - for (auto Ty : {s1, s8, s16}) - setAction({G_GEP, 1, Ty}, WidenScalar); - // Constants - for (auto Ty : {s8, s16, s32, s64, p0}) - setAction({TargetOpcode::G_CONSTANT, Ty}, Legal); - - setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar); + setAction({TargetOpcode::G_CONSTANT, s64}, Legal); // Extensions - for (auto Ty : {s8, s16, s32, s64}) { - setAction({G_ZEXT, Ty}, Legal); - setAction({G_SEXT, Ty}, Legal); - } - - for (auto Ty : {s1, s8, s16, s32}) { - setAction({G_ZEXT, 1, Ty}, Legal); - setAction({G_SEXT, 1, Ty}, Legal); + for (unsigned extOp : {G_ZEXT, G_SEXT, G_ANYEXT}) { + setAction({extOp, s64}, Legal); } // Comparison - setAction({G_ICMP, s1}, Legal); + setAction({G_ICMP, 1, s64}, Legal); - for (auto Ty : {s8, s16, s32, s64, p0}) - setAction({G_ICMP, 1, Ty}, Legal); + // Merge/Unmerge + setAction({G_MERGE_VALUES, s128}, Legal); + setAction({G_UNMERGE_VALUES, 1, s128}, Legal); + setAction({G_MERGE_VALUES, 1, s128}, Legal); + setAction({G_UNMERGE_VALUES, s128}, Legal); } void X86LegalizerInfo::setLegalizerInfoSSE1() { @@ -174,6 +197,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE1() { return; const LLT s32 = LLT::scalar(32); + const LLT s64 = LLT::scalar(64); const LLT v4s32 = LLT::vector(4, 32); const LLT v2s64 = LLT::vector(2, 64); @@ -184,18 +208,35 @@ void X86LegalizerInfo::setLegalizerInfoSSE1() { for (unsigned MemOp : {G_LOAD, G_STORE}) for (auto Ty : {v4s32, v2s64}) setAction({MemOp, Ty}, Legal); + + // Constants + setAction({TargetOpcode::G_FCONSTANT, s32}, Legal); + + // Merge/Unmerge + for (const auto &Ty : {v4s32, v2s64}) { + setAction({G_MERGE_VALUES, Ty}, Legal); + setAction({G_UNMERGE_VALUES, 1, Ty}, Legal); + } + setAction({G_MERGE_VALUES, 1, s64}, Legal); + setAction({G_UNMERGE_VALUES, s64}, Legal); } void X86LegalizerInfo::setLegalizerInfoSSE2() { if (!Subtarget.hasSSE2()) return; + const LLT s32 = LLT::scalar(32); const LLT s64 = LLT::scalar(64); const LLT v16s8 = LLT::vector(16, 8); const LLT v8s16 = LLT::vector(8, 16); const LLT v4s32 = LLT::vector(4, 32); const LLT v2s64 = LLT::vector(2, 64); + const LLT v32s8 = LLT::vector(32, 8); + const LLT v16s16 = LLT::vector(16, 16); + const LLT v8s32 = LLT::vector(8, 32); + const LLT v4s64 = LLT::vector(4, 64); + for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV}) for (auto Ty : {s64, v2s64}) setAction({BinOp, Ty}, Legal); @@ -205,6 +246,23 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() { setAction({BinOp, Ty}, Legal); setAction({G_MUL, v8s16}, Legal); + + setAction({G_FPEXT, s64}, Legal); + setAction({G_FPEXT, 1, s32}, Legal); + + // Constants + setAction({TargetOpcode::G_FCONSTANT, s64}, Legal); + + // Merge/Unmerge + for (const auto &Ty : + {v16s8, v32s8, v8s16, v16s16, v4s32, v8s32, v2s64, v4s64}) { + setAction({G_MERGE_VALUES, Ty}, Legal); + setAction({G_UNMERGE_VALUES, 1, Ty}, Legal); + } + for (const auto &Ty : {v16s8, v8s16, v4s32, v2s64}) { + setAction({G_MERGE_VALUES, 1, Ty}, Legal); + setAction({G_UNMERGE_VALUES, Ty}, Legal); + } } void X86LegalizerInfo::setLegalizerInfoSSE41() { @@ -226,9 +284,13 @@ void X86LegalizerInfo::setLegalizerInfoAVX() { const LLT 
v2s64 = LLT::vector(2, 64); const LLT v32s8 = LLT::vector(32, 8); + const LLT v64s8 = LLT::vector(64, 8); const LLT v16s16 = LLT::vector(16, 16); + const LLT v32s16 = LLT::vector(32, 16); const LLT v8s32 = LLT::vector(8, 32); + const LLT v16s32 = LLT::vector(16, 32); const LLT v4s64 = LLT::vector(4, 64); + const LLT v8s64 = LLT::vector(8, 64); for (unsigned MemOp : {G_LOAD, G_STORE}) for (auto Ty : {v8s32, v4s64}) @@ -242,6 +304,17 @@ void X86LegalizerInfo::setLegalizerInfoAVX() { setAction({G_INSERT, 1, Ty}, Legal); setAction({G_EXTRACT, Ty}, Legal); } + // Merge/Unmerge + for (const auto &Ty : + {v32s8, v64s8, v16s16, v32s16, v8s32, v16s32, v4s64, v8s64}) { + setAction({G_MERGE_VALUES, Ty}, Legal); + setAction({G_UNMERGE_VALUES, 1, Ty}, Legal); + } + for (const auto &Ty : + {v16s8, v32s8, v8s16, v16s16, v4s32, v8s32, v2s64, v4s64}) { + setAction({G_MERGE_VALUES, 1, Ty}, Legal); + setAction({G_UNMERGE_VALUES, Ty}, Legal); + } } void X86LegalizerInfo::setLegalizerInfoAVX2() { @@ -253,12 +326,27 @@ void X86LegalizerInfo::setLegalizerInfoAVX2() { const LLT v8s32 = LLT::vector(8, 32); const LLT v4s64 = LLT::vector(4, 64); + const LLT v64s8 = LLT::vector(64, 8); + const LLT v32s16 = LLT::vector(32, 16); + const LLT v16s32 = LLT::vector(16, 32); + const LLT v8s64 = LLT::vector(8, 64); + for (unsigned BinOp : {G_ADD, G_SUB}) for (auto Ty : {v32s8, v16s16, v8s32, v4s64}) setAction({BinOp, Ty}, Legal); for (auto Ty : {v16s16, v8s32}) setAction({G_MUL, Ty}, Legal); + + // Merge/Unmerge + for (const auto &Ty : {v64s8, v32s16, v16s32, v8s64}) { + setAction({G_MERGE_VALUES, Ty}, Legal); + setAction({G_UNMERGE_VALUES, 1, Ty}, Legal); + } + for (const auto &Ty : {v32s8, v16s16, v8s32, v4s64}) { + setAction({G_MERGE_VALUES, 1, Ty}, Legal); + setAction({G_UNMERGE_VALUES, Ty}, Legal); + } } void X86LegalizerInfo::setLegalizerInfoAVX512() { diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index fd2837b79103..8a7179e48a0b 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -15,6 +15,7 @@ #include "InstPrinter/X86ATTInstPrinter.h" #include "InstPrinter/X86InstComments.h" #include "MCTargetDesc/X86BaseInfo.h" +#include "MCTargetDesc/X86TargetStreamer.h" #include "Utils/X86ShuffleDecode.h" #include "X86AsmPrinter.h" #include "X86RegisterInfo.h" @@ -22,12 +23,12 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/iterator_range.h" -#include "llvm/BinaryFormat/ELF.h" #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/GlobalValue.h" #include "llvm/IR/Mangler.h" @@ -40,12 +41,9 @@ #include "llvm/MC/MCInstBuilder.h" #include "llvm/MC/MCSection.h" #include "llvm/MC/MCSectionELF.h" -#include "llvm/MC/MCSectionMachO.h" #include "llvm/MC/MCStreamer.h" #include "llvm/MC/MCSymbol.h" #include "llvm/MC/MCSymbolELF.h" -#include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetLoweringObjectFile.h" using namespace llvm; @@ -102,7 +100,9 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding( } void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) { - OutStreamer->EmitInstruction(Inst, getSubtargetInfo(), EnablePrintSchedInfo); + OutStreamer->EmitInstruction(Inst, getSubtargetInfo(), + EnablePrintSchedInfo && + !(Inst.getFlags() & 
X86::NO_SCHED_INFO)); SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get()); } @@ -960,7 +960,7 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, // This is an optimization that lets us get away without emitting a nop in // many cases. // - // NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %R9) takes two + // NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %r9) takes two // bytes too, so the check on MinSize is important. MCI.setOpcode(X86::PUSH64rmr); } else { @@ -1047,20 +1047,20 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI, // We want to emit the following pattern, which follows the x86 calling // convention to prepare for the trampoline call to be patched in. // - // <args placement according SysV64 calling convention> // .p2align 1, ... // .Lxray_event_sled_N: - // jmp +N // jump across the call instruction - // callq __xray_CustomEvent // force relocation to symbol - // <args cleanup, jump to here> - // - // The relative jump needs to jump forward 24 bytes: - // 10 (args) + 5 (nops) + 9 (cleanup) + // jmp +N // jump across the instrumentation sled + // ... // set up arguments in register + // callq __xray_CustomEvent@plt // force dependency to symbol + // ... + // <jump here> // // After patching, it would look something like: // // nopw (2-byte nop) + // ... // callq __xrayCustomEvent // already lowered + // ... // // --- // First we emit the label and the jump. @@ -1072,49 +1072,57 @@ // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as // an operand (computed as an offset from the jmp instruction). // FIXME: Find another less hacky way to force the relative jump. - OutStreamer->EmitBytes("\xeb\x14"); + OutStreamer->EmitBinaryData("\xeb\x0f"); // The default C calling convention will place two arguments into %rcx and // %rdx -- so we only work with those. - unsigned UsedRegs[] = {X86::RDI, X86::RSI, X86::RAX}; - - // Because we will use %rax, we preserve that across the call. - EmitAndCountInstruction(MCInstBuilder(X86::PUSH64r).addReg(X86::RAX)); - - // Then we put the operands in the %rdi and %rsi registers. + unsigned UsedRegs[] = {X86::RDI, X86::RSI}; + bool UsedMask[] = {false, false}; + + // Then we put the operands in the %rdi and %rsi registers. We spill the + // values in the register before we clobber them, and mark them as used in + // UsedMask. In case the arguments are already in the correct register, we + // emit nops appropriately sized to keep the sled the same size in every + // situation.
for (unsigned I = 0; I < MI.getNumOperands(); ++I) if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) { - if (Op->isImm()) - EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri) + assert(Op->isReg() && "Only support arguments in registers"); + if (Op->getReg() != UsedRegs[I]) { + UsedMask[I] = true; + EmitAndCountInstruction( + MCInstBuilder(X86::PUSH64r).addReg(UsedRegs[I])); + EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr) .addReg(UsedRegs[I]) - .addImm(Op->getImm())); - else if (Op->isReg()) { - if (Op->getReg() != UsedRegs[I]) - EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr) - .addReg(UsedRegs[I]) - .addReg(Op->getReg())); - else - EmitNops(*OutStreamer, 3, Subtarget->is64Bit(), getSubtargetInfo()); + .addReg(Op->getReg())); + } else { + EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo()); } } // We emit a hard dependency on the __xray_CustomEvent symbol, which is the - // name of the trampoline to be implemented by the XRay runtime. We put this - // explicitly in the %rax register. + // name of the trampoline to be implemented by the XRay runtime. auto TSym = OutContext.getOrCreateSymbol("__xray_CustomEvent"); MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym); - EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri) - .addReg(X86::RAX) - .addOperand(MCIL.LowerSymbolOperand(TOp, TSym))); + if (isPositionIndependent()) + TOp.setTargetFlags(X86II::MO_PLT); // Emit the call instruction. - EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(X86::RAX)); + EmitAndCountInstruction(MCInstBuilder(X86::CALL64pcrel32) + .addOperand(MCIL.LowerSymbolOperand(TOp, TSym))); // Restore caller-saved and used registers. + for (unsigned I = sizeof UsedMask; I-- > 0;) + if (UsedMask[I]) + EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(UsedRegs[I])); + else + EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo()); + OutStreamer->AddComment("xray custom event end."); - EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(X86::RAX)); - recordSled(CurSled, MI, SledKind::CUSTOM_EVENT); + // Record the sled version. Older versions of this sled were spelled + // differently, so we let the runtime handle the different offsets we're + // using. + recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 1); } void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, @@ -1125,7 +1133,6 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, // .Lxray_sled_N: // jmp .tmpN // # 9 bytes worth of noops - // .tmpN // // We need the 9 bytes because at runtime, we'd be patching over the full 11 // bytes with the following pattern: @@ -1136,14 +1143,12 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI, auto CurSled = OutContext.createTempSymbol("xray_sled_", true); OutStreamer->EmitCodeAlignment(2); OutStreamer->EmitLabel(CurSled); - auto Target = OutContext.createTempSymbol(); // Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as // an operand (computed as an offset from the jmp instruction). // FIXME: Find another less hacky way do force the relative jump. 
OutStreamer->EmitBytes("\xeb\x09"); EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo()); - OutStreamer->EmitLabel(Target); recordSled(CurSled, MI, SledKind::FUNCTION_ENTER); } @@ -1358,6 +1363,82 @@ static void printConstant(const Constant *COp, raw_ostream &CS) { } } +void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { + assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); + assert(getSubtarget().isOSWindows() && "SEH_ instruction Windows only"); + const X86RegisterInfo *RI = + MF->getSubtarget<X86Subtarget>().getRegisterInfo(); + + // Use the .cv_fpo directives if we're emitting CodeView on 32-bit x86. + if (EmitFPOData) { + X86TargetStreamer *XTS = + static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()); + switch (MI->getOpcode()) { + case X86::SEH_PushReg: + XTS->emitFPOPushReg(MI->getOperand(0).getImm()); + break; + case X86::SEH_StackAlloc: + XTS->emitFPOStackAlloc(MI->getOperand(0).getImm()); + break; + case X86::SEH_SetFrame: + assert(MI->getOperand(1).getImm() == 0 && + ".cv_fpo_setframe takes no offset"); + XTS->emitFPOSetFrame(MI->getOperand(0).getImm()); + break; + case X86::SEH_EndPrologue: + XTS->emitFPOEndPrologue(); + break; + case X86::SEH_SaveReg: + case X86::SEH_SaveXMM: + case X86::SEH_PushFrame: + llvm_unreachable("SEH_ directive incompatible with FPO"); + break; + default: + llvm_unreachable("expected SEH_ instruction"); + } + return; + } + + // Otherwise, use the .seh_ directives for all other Windows platforms. + switch (MI->getOpcode()) { + case X86::SEH_PushReg: + OutStreamer->EmitWinCFIPushReg( + RI->getSEHRegNum(MI->getOperand(0).getImm())); + break; + + case X86::SEH_SaveReg: + OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + break; + + case X86::SEH_SaveXMM: + OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + break; + + case X86::SEH_StackAlloc: + OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm()); + break; + + case X86::SEH_SetFrame: + OutStreamer->EmitWinCFISetFrame( + RI->getSEHRegNum(MI->getOperand(0).getImm()), + MI->getOperand(1).getImm()); + break; + + case X86::SEH_PushFrame: + OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm()); + break; + + case X86::SEH_EndPrologue: + OutStreamer->EmitWinCFIEndProlog(); + break; + + default: + llvm_unreachable("expected SEH_ instruction"); + } +} + void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { X86MCInstLower MCInstLowering(*MF, *this); const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo(); @@ -1535,41 +1616,13 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { return; case X86::SEH_PushReg: - assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); - OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm())); - return; - case X86::SEH_SaveReg: - assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); - OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()), - MI->getOperand(1).getImm()); - return; - case X86::SEH_SaveXMM: - assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); - OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()), - MI->getOperand(1).getImm()); - return; - case X86::SEH_StackAlloc: - assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); - OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm()); - 
return; - case X86::SEH_SetFrame: - assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); - OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()), - MI->getOperand(1).getImm()); - return; - case X86::SEH_PushFrame: - assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); - OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm()); - return; - case X86::SEH_EndPrologue: - assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); - OutStreamer->EmitWinCFIEndProlog(); + EmitSEHInstruction(MI); return; case X86::SEH_Epilogue: { @@ -1949,6 +2002,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInst TmpInst; MCInstLowering.Lower(MI, TmpInst); + if (MI->getAsmPrinterFlag(MachineInstr::NoSchedComment)) + TmpInst.setFlags(TmpInst.getFlags() | X86::NO_SCHED_INFO); // Stackmap shadows cannot include branch targets, so we can count the bytes // in a call towards the shadow, but must ensure that the no thread returns diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp index 3fcb642424ad..5433033671f3 100644 --- a/lib/Target/X86/X86MachineFunctionInfo.cpp +++ b/lib/Target/X86/X86MachineFunctionInfo.cpp @@ -10,7 +10,7 @@ #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetSubtargetInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" using namespace llvm; diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp index 8fdf10617059..67d95c2233de 100644 --- a/lib/Target/X86/X86MacroFusion.cpp +++ b/lib/Target/X86/X86MacroFusion.cpp @@ -14,8 +14,8 @@ #include "X86MacroFusion.h" #include "X86Subtarget.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/TargetInstrInfo.h" using namespace llvm; @@ -27,10 +27,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, const MachineInstr *FirstMI, const MachineInstr &SecondMI) { const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI); - // Check if this processor supports macro-fusion. Since this is a minor - // heuristic, we haven't specifically reserved a feature. hasAVX is a decent - // proxy for SandyBridge+. - if (!ST.hasAVX()) + // Check if this processor supports macro-fusion. 
+ if (!ST.hasMacroFusion()) return false; enum { @@ -84,10 +82,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII, case X86::TEST32i32: case X86::TEST64i32: case X86::TEST64ri32: - case X86::TEST8rm: - case X86::TEST16rm: - case X86::TEST32rm: - case X86::TEST64rm: + case X86::TEST8mr: + case X86::TEST16mr: + case X86::TEST32mr: + case X86::TEST64mr: case X86::TEST8ri_NOREX: case X86::AND16i16: case X86::AND16ri: diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp index e6756b975c10..1fc6f07b79fa 100644 --- a/lib/Target/X86/X86OptimizeLEAs.cpp +++ b/lib/Target/X86/X86OptimizeLEAs.cpp @@ -1,4 +1,4 @@ -//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===// +//===- X86OptimizeLEAs.cpp - optimize usage of LEA instructions -----------===// // // The LLVM Compiler Infrastructure // @@ -17,22 +17,36 @@ // //===----------------------------------------------------------------------===// +#include "MCTargetDesc/X86BaseInfo.h" #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseMapInfo.h" +#include "llvm/ADT/Hashing.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" -#include "llvm/CodeGen/LiveVariables.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" -#include "llvm/IR/DIBuilder.h" +#include "llvm/CodeGen/TargetOpcodes.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DebugLoc.h" #include "llvm/IR/Function.h" +#include "llvm/MC/MCInstrDesc.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" +#include <cassert> +#include <cstdint> +#include <iterator> using namespace llvm; @@ -60,6 +74,7 @@ static bool isSimilarDispOp(const MachineOperand &MO1, static inline bool isLEA(const MachineInstr &MI); namespace { + /// A key based on instruction's memory operands. class MemOpKey { public: @@ -92,12 +107,14 @@ public: // Address' displacement operand. const MachineOperand *Disp; }; + } // end anonymous namespace /// Provide DenseMapInfo for MemOpKey. namespace llvm { + template <> struct DenseMapInfo<MemOpKey> { - typedef DenseMapInfo<const MachineOperand *> PtrInfo; + using PtrInfo = DenseMapInfo<const MachineOperand *>; static inline MemOpKey getEmptyKey() { return MemOpKey(PtrInfo::getEmptyKey(), PtrInfo::getEmptyKey(), @@ -164,7 +181,8 @@ template <> struct DenseMapInfo<MemOpKey> { return LHS == RHS; } }; -} + +} // end namespace llvm /// \brief Returns a hash table key based on memory operands of \p MI. The /// number of the first memory operand of \p MI is specified through \p N. 
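The MemOpKey hunk above relies on LLVM's DenseMapInfo customization point: a specialization supplies two reserved sentinel keys plus hashing and equality before the type can be used as a DenseMap key (which is what the new MemOpMap alias in this patch does). Purely as an illustrative sketch of that pattern, using a hypothetical key type that is not part of this patch:

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Hashing.h"
#include <climits>

// Hypothetical two-field key, used only to show the shape of the trait.
struct ExampleKey {
  int Base;
  int Disp;
  bool operator==(const ExampleKey &Other) const {
    return Base == Other.Base && Disp == Other.Disp;
  }
};

namespace llvm {
template <> struct DenseMapInfo<ExampleKey> {
  // Two distinct sentinel values that real keys must never take.
  static inline ExampleKey getEmptyKey() { return {INT_MIN, 0}; }
  static inline ExampleKey getTombstoneKey() { return {INT_MIN + 1, 0}; }
  // Hash and compare real keys; DenseMap calls these on lookup and insert.
  static unsigned getHashValue(const ExampleKey &K) {
    return hash_combine(K.Base, K.Disp);
  }
  static bool isEqual(const ExampleKey &LHS, const ExampleKey &RHS) {
    return LHS == RHS;
  }
};
} // end namespace llvm

// Usage once the trait is visible:
//   llvm::DenseMap<ExampleKey, unsigned> Counts;
//   ++Counts[{/*Base=*/1, /*Disp=*/8}];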
@@ -217,6 +235,7 @@ static inline bool isLEA(const MachineInstr &MI) { } namespace { + class OptimizeLEAPass : public MachineFunctionPass { public: OptimizeLEAPass() : MachineFunctionPass(ID) {} @@ -229,7 +248,7 @@ public: bool runOnMachineFunction(MachineFunction &MF) override; private: - typedef DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>> MemOpMap; + using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>; /// \brief Returns a distance between two instructions inside one basic block. /// Negative result means, that instructions occur in reverse order. @@ -281,8 +300,10 @@ private: static char ID; }; + +} // end anonymous namespace + char OptimizeLEAPass::ID = 0; -} FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); } @@ -547,16 +568,18 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI, if (AddrDispShift != 0) Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, AddrDispShift, + DIExpression::NoDeref, DIExpression::WithStackValue); // Replace DBG_VALUE instruction with modified version. MachineBasicBlock *MBB = MI.getParent(); DebugLoc DL = MI.getDebugLoc(); bool IsIndirect = MI.isIndirectDebugValue(); - int64_t Offset = IsIndirect ? MI.getOperand(1).getImm() : 0; const MDNode *Var = MI.getDebugVariable(); + if (IsIndirect) + assert(MI.getOperand(1).getImm() == 0 && "DBG_VALUE with nonzero offset"); return BuildMI(*MBB, MBB->erase(&MI), DL, TII->get(TargetOpcode::DBG_VALUE), - IsIndirect, VReg, Offset, Var, Expr); + IsIndirect, VReg, Var, Expr); } // Try to find similar LEAs in the list and replace one with another. @@ -649,7 +672,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) { bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { bool Changed = false; - if (DisableX86LEAOpt || skipFunction(*MF.getFunction())) + if (DisableX86LEAOpt || skipFunction(MF.getFunction())) return false; MRI = &MF.getRegInfo(); @@ -673,7 +696,7 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) { // Remove redundant address calculations. Do it only for -Os/-Oz since only // a code size gain is expected from this part of the pass. - if (MF.getFunction()->optForSize()) + if (MF.getFunction().optForSize()) Changed |= removeRedundantAddrCalc(LEAs); } diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp index 3069d1fd3497..1da0fad8b6cf 100644 --- a/lib/Target/X86/X86PadShortFunction.cpp +++ b/lib/Target/X86/X86PadShortFunction.cpp @@ -13,7 +13,6 @@ // //===----------------------------------------------------------------------===// -#include <algorithm> #include "X86.h" #include "X86InstrInfo.h" @@ -21,12 +20,11 @@ #include "llvm/ADT/Statistic.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; @@ -98,10 +96,10 @@ FunctionPass *llvm::createX86PadShortFunctions() { /// runOnMachineFunction - Loop over all of the basic blocks, inserting /// NOOP instructions before early exits. 
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) { - if (skipFunction(*MF.getFunction())) + if (skipFunction(MF.getFunction())) return false; - if (MF.getFunction()->optForSize()) { + if (MF.getFunction().optForSize()) { return false; } diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp index efd3df26dd42..aa0e3743c948 100644 --- a/lib/Target/X86/X86RegisterBankInfo.cpp +++ b/lib/Target/X86/X86RegisterBankInfo.cpp @@ -16,7 +16,7 @@ #include "llvm/CodeGen/GlobalISel/RegisterBank.h" #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_TARGET_REGBANK_IMPL #include "X86GenRegisterBank.inc" @@ -26,10 +26,6 @@ using namespace llvm; #define GET_TARGET_REGBANK_INFO_IMPL #include "X86GenRegisterBankInfo.def" -#ifndef LLVM_BUILD_GLOBAL_ISEL -#error "You shouldn't build this" -#endif - X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI) : X86GenRegisterBankInfo() { @@ -164,7 +160,7 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // Try the default logic for non-generic instructions that are either copies // or already have some operands assigned to banks. - if (!isPreISelGenericOpcode(Opc)) { + if (!isPreISelGenericOpcode(Opc) || Opc == TargetOpcode::G_PHI) { const InstructionMapping &Mapping = getInstrMappingImpl(MI); if (Mapping.isValid()) return Mapping; @@ -186,10 +182,19 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } unsigned NumOperands = MI.getNumOperands(); - - // Track the bank of each register, use NotFP mapping (all scalars in GPRs) SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands); - getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx); + + switch (Opc) { + case TargetOpcode::G_FPEXT: + case TargetOpcode::G_FCONSTANT: + // Instruction having only floating-point operands (all scalars in VECRReg) + getInstrPartialMappingIdxs(MI, MRI, /* isFP */ true, OpRegBankIdx); + break; + default: + // Track the bank of each register, use NotFP mapping (all scalars in GPRs) + getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx); + break; + } // Finally construct the computed mapping. 
SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands); @@ -215,7 +220,8 @@ X86RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const { switch (MI.getOpcode()) { case TargetOpcode::G_LOAD: - case TargetOpcode::G_STORE: { + case TargetOpcode::G_STORE: + case TargetOpcode::G_IMPLICIT_DEF: { // we going to try to map 32/64 bit to PMI_FP32/PMI_FP64 unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI); if (Size != 32 && Size != 64) diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp index 343da2573b55..bc31e95aa6b5 100644 --- a/lib/Target/X86/X86RegisterInfo.cpp +++ b/lib/Target/X86/X86RegisterInfo.cpp @@ -15,26 +15,21 @@ #include "X86RegisterInfo.h" #include "X86FrameLowering.h" -#include "X86InstrBuilder.h" #include "X86MachineFunctionInfo.h" #include "X86Subtarget.h" -#include "X86TargetMachine.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetFrameLowering.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" #include "llvm/IR/Type.h" -#include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" -#include "llvm/Target/TargetFrameLowering.h" -#include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetMachine.h" #include "llvm/Target/TargetOptions.h" @@ -223,13 +218,13 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF, const TargetRegisterClass * X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const { - const Function *F = MF.getFunction(); - if (IsWin64 || (F && F->getCallingConv() == CallingConv::Win64)) + const Function &F = MF.getFunction(); + if (IsWin64 || (F.getCallingConv() == CallingConv::Win64)) return &X86::GR64_TCW64RegClass; else if (Is64Bit) return &X86::GR64_TCRegClass; - bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false); + bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE); if (hasHipeCC) return &X86::GR32RegClass; return &X86::GR32_TCRegClass; @@ -271,16 +266,17 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { assert(MF && "MachineFunction required"); const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>(); + const Function &F = MF->getFunction(); bool HasSSE = Subtarget.hasSSE1(); bool HasAVX = Subtarget.hasAVX(); bool HasAVX512 = Subtarget.hasAVX512(); bool CallsEHReturn = MF->callsEHReturn(); - CallingConv::ID CC = MF->getFunction()->getCallingConv(); + CallingConv::ID CC = F.getCallingConv(); // If attribute NoCallerSavedRegisters exists then we set X86_INTR calling // convention because it has the CSR list. - if (MF->getFunction()->hasFnAttribute("no_caller_saved_registers")) + if (MF->getFunction().hasFnAttribute("no_caller_saved_registers")) CC = CallingConv::X86_INTR; switch (CC) { @@ -365,28 +361,26 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { } if (Is64Bit) { - if (IsWin64) { - if (!HasSSE) - return CSR_Win64_NoSSE_SaveList; - return CSR_Win64_SaveList; - } + bool IsSwiftCC = Subtarget.getTargetLowering()->supportSwiftError() && + F.getAttributes().hasAttrSomewhere(Attribute::SwiftError); + if (IsSwiftCC) + return IsWin64 ? 
CSR_Win64_SwiftError_SaveList + : CSR_64_SwiftError_SaveList; + + if (IsWin64) + return HasSSE ? CSR_Win64_SaveList : CSR_Win64_NoSSE_SaveList; if (CallsEHReturn) return CSR_64EHRet_SaveList; - if (Subtarget.getTargetLowering()->supportSwiftError() && - MF->getFunction()->getAttributes().hasAttrSomewhere( - Attribute::SwiftError)) - return CSR_64_SwiftError_SaveList; return CSR_64_SaveList; } - if (CallsEHReturn) - return CSR_32EHRet_SaveList; - return CSR_32_SaveList; + + return CallsEHReturn ? CSR_32EHRet_SaveList : CSR_32_SaveList; } const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy( const MachineFunction *MF) const { assert(MF && "Invalid MachineFunction pointer."); - if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS && + if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS && MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR()) return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList; return nullptr; @@ -479,14 +473,14 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF, // Unlike getCalleeSavedRegs(), we don't have MMI so we can't check // callsEHReturn(). if (Is64Bit) { - if (IsWin64) - return CSR_Win64_RegMask; - if (Subtarget.getTargetLowering()->supportSwiftError() && - MF.getFunction()->getAttributes().hasAttrSomewhere( - Attribute::SwiftError)) - return CSR_64_SwiftError_RegMask; - return CSR_64_RegMask; + const Function &F = MF.getFunction(); + bool IsSwiftCC = Subtarget.getTargetLowering()->supportSwiftError() && + F.getAttributes().hasAttrSomewhere(Attribute::SwiftError); + if (IsSwiftCC) + return IsWin64 ? CSR_Win64_SwiftError_RegMask : CSR_64_SwiftError_RegMask; + return IsWin64 ? CSR_Win64_RegMask : CSR_64_RegMask; } + return CSR_32_RegMask; } @@ -508,6 +502,9 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { ++I) Reserved.set(*I); + // Set the Shadow Stack Pointer as reserved. + Reserved.set(X86::SSP); + // Set the instruction pointer register and its aliases as reserved. for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid(); ++I) @@ -522,7 +519,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const { // Set the base-pointer register and its aliases as reserved if needed. if (hasBasePointer(MF)) { - CallingConv::ID CC = MF.getFunction()->getCallingConv(); + CallingConv::ID CC = MF.getFunction().getCallingConv(); const uint32_t *RegMask = getCallPreservedMask(MF, CC); if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister())) report_fatal_error( diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h index 25958f0c3106..29401dadead0 100644 --- a/lib/Target/X86/X86RegisterInfo.h +++ b/lib/Target/X86/X86RegisterInfo.h @@ -14,7 +14,7 @@ #ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H #define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H -#include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" #define GET_REGINFO_HEADER #include "X86GenRegisterInfo.inc" diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td index 3a61a7247c72..2341e1fb0fac 100644 --- a/lib/Target/X86/X86RegisterInfo.td +++ b/lib/Target/X86/X86RegisterInfo.td @@ -308,6 +308,9 @@ def BND1 : X86Reg<"bnd1", 1>; def BND2 : X86Reg<"bnd2", 2>; def BND3 : X86Reg<"bnd3", 3>; +// CET registers - Shadow Stack Pointer +def SSP : X86Reg<"ssp", 0>; + //===----------------------------------------------------------------------===// // Register Class Definitions... 
now that we have all of the pieces, define the // top-level register classes. The order specified in the register list is @@ -357,7 +360,7 @@ def GR64 : RegisterClass<"X86", [i64], 64, def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>; // Debug registers. -def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>; +def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 15)>; // Control registers. def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>; diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td new file mode 100755 index 000000000000..e4e0ed435103 --- /dev/null +++ b/lib/Target/X86/X86SchedBroadwell.td @@ -0,0 +1,3869 @@ +//=- X86SchedBroadwell.td - X86 Broadwell Scheduling ---------*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Broadwell to support instruction +// scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// +def BroadwellModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and HW can decode 4 + // instructions per cycle. + let IssueWidth = 4; + let MicroOpBufferSize = 192; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 16; + + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = BroadwellModel in { + +// Broadwell can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, and 6 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. +def BWPort0 : ProcResource<1>; +def BWPort1 : ProcResource<1>; +def BWPort2 : ProcResource<1>; +def BWPort3 : ProcResource<1>; +def BWPort4 : ProcResource<1>; +def BWPort5 : ProcResource<1>; +def BWPort6 : ProcResource<1>; +def BWPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def BWPort01 : ProcResGroup<[BWPort0, BWPort1]>; +def BWPort23 : ProcResGroup<[BWPort2, BWPort3]>; +def BWPort237 : ProcResGroup<[BWPort2, BWPort3, BWPort7]>; +def BWPort04 : ProcResGroup<[BWPort0, BWPort4]>; +def BWPort05 : ProcResGroup<[BWPort0, BWPort5]>; +def BWPort06 : ProcResGroup<[BWPort0, BWPort6]>; +def BWPort15 : ProcResGroup<[BWPort1, BWPort5]>; +def BWPort16 : ProcResGroup<[BWPort1, BWPort6]>; +def BWPort56 : ProcResGroup<[BWPort5, BWPort6]>; +def BWPort015 : ProcResGroup<[BWPort0, BWPort1, BWPort5]>; +def BWPort056 : ProcResGroup<[BWPort0, BWPort5, BWPort6]>; +def BWPort0156: ProcResGroup<[BWPort0, BWPort1, BWPort5, BWPort6]>; + +// 60 Entry Unified Scheduler +def BWPortAny : ProcResGroup<[BWPort0, BWPort1, BWPort2, BWPort3, BWPort4, + BWPort5, BWPort6, BWPort7]> { + let BufferSize=60; +} + +// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 +// cycles after the memory operand. 
+def : ReadAdvance<ReadAfterLd, 5>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass BWWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [BWPort23, ExePort]> { + let Latency = !add(Lat, 5); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [BWPort4]>; + +// Arithmetic. +defm : BWWriteResPair<WriteALU, BWPort0156, 1>; // Simple integer ALU op. +defm : BWWriteResPair<WriteIMul, BWPort1, 3>; // Integer multiplication. +def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. +def BWDivider : ProcResource<1>; // Integer division issued on port 0. +def : WriteRes<WriteIDiv, [BWPort0, BWDivider]> { // Integer division. + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [BWPort23, BWPort0, BWDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads. + +// Integer shifts and rotates. +defm : BWWriteResPair<WriteShift, BWPort06, 1>; + +// Loads, stores, and moves, not folded with other operations. +def : WriteRes<WriteLoad, [BWPort23]> { let Latency = 5; } +def : WriteRes<WriteStore, [BWPort237, BWPort4]>; +def : WriteRes<WriteMove, [BWPort0156]>; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +def : WriteRes<WriteZero, []>; + +// Treat misc copies as a move. +def : InstRW<[WriteMove], (instrs COPY)>; + +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +defm : BWWriteResPair<WriteJump, BWPort06, 1>; + +// Floating point. This covers both scalar and vector operations. +defm : BWWriteResPair<WriteFAdd, BWPort1, 3>; // Floating point add/sub/compare. +defm : BWWriteResPair<WriteFMul, BWPort0, 5>; // Floating point multiplication. +defm : BWWriteResPair<WriteFDiv, BWPort0, 12>; // 10-14 cycles. // Floating point division. +defm : BWWriteResPair<WriteFSqrt, BWPort0, 15>; // Floating point square root. +defm : BWWriteResPair<WriteFRcp, BWPort0, 5>; // Floating point reciprocal estimate. +defm : BWWriteResPair<WriteFRsqrt, BWPort0, 5>; // Floating point reciprocal square root estimate. +defm : BWWriteResPair<WriteFMA, BWPort01, 5>; // Fused Multiply Add. +defm : BWWriteResPair<WriteFShuffle, BWPort5, 1>; // Floating point vector shuffles. +defm : BWWriteResPair<WriteFBlend, BWPort015, 1>; // Floating point vector blends. +def : WriteRes<WriteFVarBlend, [BWPort5]> { // Fp vector variable blends. + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteFVarBlendLd, [BWPort5, BWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +// FMA Scheduling helper class. +// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } + +// Vector integer operations. 
+defm : BWWriteResPair<WriteVecALU, BWPort15, 1>; // Vector integer ALU op, no logicals. +defm : BWWriteResPair<WriteVecShift, BWPort0, 1>; // Vector integer shifts. +defm : BWWriteResPair<WriteVecIMul, BWPort0, 5>; // Vector integer multiply. +defm : BWWriteResPair<WriteShuffle, BWPort5, 1>; // Vector shuffles. +defm : BWWriteResPair<WriteBlend, BWPort15, 1>; // Vector blends. + +def : WriteRes<WriteVarBlend, [BWPort5]> { // Vector variable blends. + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVarBlendLd, [BWPort5, BWPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteMPSAD, [BWPort0, BWPort5]> { // Vector MPSAD. + let Latency = 6; + let ResourceCycles = [1, 2]; +} +def : WriteRes<WriteMPSADLd, [BWPort23, BWPort0, BWPort5]> { + let Latency = 6; + let ResourceCycles = [1, 1, 2]; +} + +// Vector bitwise operations. +// These are often used on both floating point and integer vectors. +defm : BWWriteResPair<WriteVecLogic, BWPort015, 1>; // Vector and/or/xor. + +// Conversion between integer and float. +defm : BWWriteResPair<WriteCvtF2I, BWPort1, 3>; // Float -> Integer. +defm : BWWriteResPair<WriteCvtI2F, BWPort1, 4>; // Integer -> Float. +defm : BWWriteResPair<WriteCvtF2F, BWPort1, 3>; // Float -> Float size conversion. + +// Strings instructions. +// Packed Compare Implicit Length Strings, Return Mask +// String instructions. +def : WriteRes<WritePCmpIStrM, [BWPort0]> { + let Latency = 10; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [BWPort0, BWPort23]> { + let Latency = 10; + let ResourceCycles = [3, 1]; +} +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [BWPort0, BWPort16, BWPort5]> { + let Latency = 10; + let ResourceCycles = [3, 2, 4]; +} +def : WriteRes<WritePCmpEStrMLd, [BWPort05, BWPort16, BWPort23]> { + let Latency = 10; + let ResourceCycles = [6, 2, 1]; +} + // Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [BWPort0]> { + let Latency = 11; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [BWPort0, BWPort23]> { + let Latency = 11; + let ResourceCycles = [3, 1]; +} +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [BWPort05, BWPort16]> { + let Latency = 11; + let ResourceCycles = [6, 2]; +} +def : WriteRes<WritePCmpEStrILd, [BWPort0, BWPort16, BWPort5, BWPort23]> { + let Latency = 11; + let ResourceCycles = [3, 2, 2, 1]; +} + +// AES instructions. +def : WriteRes<WriteAESDecEnc, [BWPort5]> { // Decryption, encryption. + let Latency = 7; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESDecEncLd, [BWPort5, BWPort23]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteAESIMC, [BWPort5]> { // InvMixColumn. + let Latency = 14; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [BWPort5, BWPort23]> { + let Latency = 14; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteAESKeyGen, [BWPort0, BWPort5]> { // Key Generation. + let Latency = 10; + let ResourceCycles = [2, 8]; +} +def : WriteRes<WriteAESKeyGenLd, [BWPort0, BWPort5, BWPort23]> { + let Latency = 10; + let ResourceCycles = [2, 7, 1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [BWPort0, BWPort5]> { + let Latency = 7; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteCLMulLd, [BWPort0, BWPort5, BWPort23]> { + let Latency = 7; + let ResourceCycles = [2, 1, 1]; +} + +// Catch-all for expensive system instructions. 
+def : WriteRes<WriteSystem, [BWPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
+
+// AVX2.
+defm : BWWriteResPair<WriteFShuffle256, BWPort5, 3>; // Fp 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteShuffle256, BWPort5, 3>; // 256-bit width vector shuffles.
+def : WriteRes<WriteVarVecShift, [BWPort0, BWPort5]> { // Variable vector shifts.
+  let Latency = 2;
+  let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteVarVecShiftLd, [BWPort0, BWPort5, BWPort23]> {
+  let Latency = 6;
+  let ResourceCycles = [2, 1, 1];
+}
+
+// Old microcoded instructions that nobody uses.
+def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def : WriteRes<WriteFence, [BWPort23, BWPort4]>;
+
+// Nop, not very useful except that it provides a model for nops!
+def : WriteRes<WriteNop, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [BWPort1]> {
+  let Latency = 3;
+}
+
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [BWPort1, BWPort23]> {
+  let Latency = 7;
+  let ResourceCycles = [1, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [BWPort15]>;
+
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [BWPort15, BWPort23]> {
+  let Latency = 5;
+  let ResourceCycles = [1, 1];
+}
+
+// Remaining instrs.
+
+def BWWriteResGroup1 : SchedWriteRes<[BWPort0]> {
+  let Latency = 1;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_MOVD64grr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PMOVMSKBrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLDri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLDrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLQri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLQrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLWrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRADri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRADrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRAWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRAWrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLDri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLDrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLQri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLQrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLWrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MOVPQIto64rr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSLLDri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSLLQri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSLLWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSRADri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSRAWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSRLDri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSRLQri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSRLWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VMOVPQIto64rr")>;
+def:
InstRW<[BWWriteResGroup1], (instregex "VPSLLDYri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSLLDri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSLLQYri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSLLQri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQYrr")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQrr")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSLLWYri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSLLWri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRADYri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRADri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRAWYri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRAWri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRLDYri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRLDri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRLQYri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRLQri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRLVQYrr")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRLVQrr")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRLWYri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VPSRLWri")>; +def: InstRW<[BWWriteResGroup1], (instregex "VTESTPDYrr")>; +def: InstRW<[BWWriteResGroup1], (instregex "VTESTPDrr")>; +def: InstRW<[BWWriteResGroup1], (instregex "VTESTPSYrr")>; +def: InstRW<[BWWriteResGroup1], (instregex "VTESTPSrr")>; + +def BWWriteResGroup2 : SchedWriteRes<[BWPort1]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup2], (instregex "COMP_FST0r")>; +def: InstRW<[BWWriteResGroup2], (instregex "COM_FST0r")>; +def: InstRW<[BWWriteResGroup2], (instregex "MMX_MASKMOVQ64")>; +def: InstRW<[BWWriteResGroup2], (instregex "MMX_MASKMOVQ64")>; +def: InstRW<[BWWriteResGroup2], (instregex "UCOM_FPr")>; +def: InstRW<[BWWriteResGroup2], (instregex "UCOM_Fr")>; +def: InstRW<[BWWriteResGroup2], (instregex "VMASKMOVDQU")>; + +def BWWriteResGroup3 : SchedWriteRes<[BWPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup3], (instregex "ANDNPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "ANDNPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "ANDPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "ANDPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "INSERTPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVD64rr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVD64to64rr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVQ2DQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_PALIGNR64irr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_PSHUFBrr64")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_PSHUFWri")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOV64toPQIrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVAPDrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVAPSrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVDDUPrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVDI2PDIrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVHLPSrr")>; +def: InstRW<[BWWriteResGroup3], 
(instregex "MOVLHPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVSDrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVSHDUPrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVSLDUPrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVSSrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "ORPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "ORPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PACKSSDWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PACKSSWBrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PACKUSDWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PACKUSWBrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PALIGNRrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "PBLENDWrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXWDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXWQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXWDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXWQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PSHUFBrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PSHUFDri")>; +def: InstRW<[BWWriteResGroup3], (instregex "PSHUFHWri")>; +def: InstRW<[BWWriteResGroup3], (instregex "PSHUFLWri")>; +def: InstRW<[BWWriteResGroup3], (instregex "PSLLDQri")>; +def: InstRW<[BWWriteResGroup3], (instregex "PSRLDQri")>; +def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHBWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHQDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHWDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLBWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLQDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLWDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "SHUFPDrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "SHUFPSrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "UNPCKHPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "UNPCKHPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "UNPCKLPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "UNPCKLPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VANDNPDYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VANDNPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VANDNPSYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VANDNPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VANDPDYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VANDPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VANDPSYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VANDPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VBROADCASTSSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VINSERTPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOV64toPQIrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDYrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], 
(instregex "VMOVAPDrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSYrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVDDUPYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVDDUPrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVDI2PDIrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVHLPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVLHPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVSHDUPYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVSHDUPrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVSLDUPYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVSLDUPrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVSSrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>; +def: InstRW<[BWWriteResGroup3], (instregex "VORPDYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VORPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VORPSYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VORPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSDWYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSDWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSWBYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSWBrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSDWYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSDWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSWBYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSWBrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPALIGNRYrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPALIGNRrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPBLENDWYrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPBLENDWrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPBROADCASTDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPBROADCASTQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDYri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSYri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXWDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXWQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXWDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXWQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFBYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFBrr")>; 
+def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFDYri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFDri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFHWYri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFHWri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFLWYri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFLWri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSLLDQYri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSLLDQri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSRLDQYri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPSRLDQri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHBWYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHBWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHDQYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHQDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHWDYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHWDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLBWYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLBWrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLDQYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLQDQrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLWDYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLWDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPDYrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPDrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPSYrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPSrri")>; +def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPDYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPSYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPDYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPSYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VXORPDYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VXORPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VXORPSYrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "VXORPSrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "XORPDrr")>; +def: InstRW<[BWWriteResGroup3], (instregex "XORPSrr")>; + +def BWWriteResGroup4 : SchedWriteRes<[BWPort6]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup4], (instregex "JMP(16|32|64)r")>; + +def BWWriteResGroup5 : SchedWriteRes<[BWPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup5], (instregex "FINCSTP")>; +def: InstRW<[BWWriteResGroup5], (instregex "FNOP")>; + +def BWWriteResGroup6 : SchedWriteRes<[BWPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)rr(_REV)?")>; +def: InstRW<[BWWriteResGroup6], (instregex "ADC8rr(_REV)?")>; +def: InstRW<[BWWriteResGroup6], (instregex "ADCX(32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex 
"ADOX(32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)ri8")>; +def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "BTC(16|32|64)ri8")>; +def: InstRW<[BWWriteResGroup6], (instregex "BTC(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "BTR(16|32|64)ri8")>; +def: InstRW<[BWWriteResGroup6], (instregex "BTR(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "BTS(16|32|64)ri8")>; +def: InstRW<[BWWriteResGroup6], (instregex "BTS(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CDQ")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVAE(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVB(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVE(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVG(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVGE(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVL(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVLE(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVNE(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVNO(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVNP(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVNS(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVO(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVP(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CMOVS(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "CQO")>; +def: InstRW<[BWWriteResGroup6], (instregex "JAE_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JAE_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JA_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JA_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JBE_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JBE_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JB_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JB_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JE_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JE_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JGE_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JGE_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JG_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JG_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JLE_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JLE_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JL_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JL_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JMP_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JMP_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JNE_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JNE_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JNO_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JNO_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JNP_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JNP_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JNS_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JNS_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JO_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JO_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JP_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JP_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "JS_1")>; +def: InstRW<[BWWriteResGroup6], (instregex "JS_4")>; +def: InstRW<[BWWriteResGroup6], (instregex "RORX(32|64)ri")>; +def: InstRW<[BWWriteResGroup6], (instregex 
"SAR(16|32|64)r1")>; +def: InstRW<[BWWriteResGroup6], (instregex "SAR(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup6], (instregex "SAR8r1")>; +def: InstRW<[BWWriteResGroup6], (instregex "SAR8ri")>; +def: InstRW<[BWWriteResGroup6], (instregex "SARX(32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)rr(_REV)?")>; +def: InstRW<[BWWriteResGroup6], (instregex "SBB8rr(_REV)?")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETAEr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETBr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETEr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETGEr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETGr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETLEr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETLr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETNEr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETNOr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETNPr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETNSr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETOr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETPr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SETSr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SHL(16|32|64)r1")>; +def: InstRW<[BWWriteResGroup6], (instregex "SHL(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup6], (instregex "SHL8r1")>; +def: InstRW<[BWWriteResGroup6], (instregex "SHL8ri")>; +def: InstRW<[BWWriteResGroup6], (instregex "SHLX(32|64)rr")>; +def: InstRW<[BWWriteResGroup6], (instregex "SHR(16|32|64)r1")>; +def: InstRW<[BWWriteResGroup6], (instregex "SHR(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup6], (instregex "SHR8r1")>; +def: InstRW<[BWWriteResGroup6], (instregex "SHR8ri")>; +def: InstRW<[BWWriteResGroup6], (instregex "SHRX(32|64)rr")>; + +def BWWriteResGroup7 : SchedWriteRes<[BWPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup7], (instregex "ANDN(32|64)rr")>; +def: InstRW<[BWWriteResGroup7], (instregex "BLSI(32|64)rr")>; +def: InstRW<[BWWriteResGroup7], (instregex "BLSMSK(32|64)rr")>; +def: InstRW<[BWWriteResGroup7], (instregex "BLSR(32|64)rr")>; +def: InstRW<[BWWriteResGroup7], (instregex "BZHI(32|64)rr")>; +def: InstRW<[BWWriteResGroup7], (instregex "LEA(16|32|64)(_32)?r")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSBrr64")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSDrr64")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSWrr64")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDDirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDQirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDSBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDSWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDUSBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDUSWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PAVGBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PAVGWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQDirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTDirr")>; +def: InstRW<[BWWriteResGroup7], (instregex 
"MMX_PCMPGTWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMAXSWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMAXUBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMINSWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMINUBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNBrr64")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNDrr64")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNWrr64")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBDirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBQirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBSBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBSWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBUSBirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBUSWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBWirr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PABSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PABSDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PABSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PADDBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PADDDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PADDQrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PADDSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PADDSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PADDUSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PADDUSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PADDWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PAVGBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PAVGWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQQrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMAXSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMAXSDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMAXSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMAXUBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMAXUDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMAXUWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMINSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMINSDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMINSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMINUBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMINUDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PMINUWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSIGNBrr128")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSIGNDrr128")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSIGNWrr128")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSUBBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSUBDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSUBQrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSUBSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSUBSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSUBUSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSUBUSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "PSUBWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPABSBYrr")>; +def: 
InstRW<[BWWriteResGroup7], (instregex "VPABSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPABSDYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPABSDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPABSWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPABSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDDYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDQYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDQrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDSBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDSWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPADDWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPAVGBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPAVGBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPAVGWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPAVGWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQDYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQQYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQQrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTDYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSDYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUDYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINSBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINSDYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINSDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINSWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINUBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINUBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex 
"VPMINUDYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINUDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINUWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPMINUWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNBYrr256")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNBrr128")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNDYrr256")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNDrr128")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNWYrr256")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNWrr128")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBDYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBDrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBQYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBQrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSBYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSBrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSWrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBWYrr")>; +def: InstRW<[BWWriteResGroup7], (instregex "VPSUBWrr")>; + +def BWWriteResGroup8 : SchedWriteRes<[BWPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup8], (instregex "BLENDPDrri")>; +def: InstRW<[BWWriteResGroup8], (instregex "BLENDPSrri")>; +def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVQ64rr(_REV)?")>; +def: InstRW<[BWWriteResGroup8], (instregex "MMX_PANDNirr")>; +def: InstRW<[BWWriteResGroup8], (instregex "MMX_PANDirr")>; +def: InstRW<[BWWriteResGroup8], (instregex "MMX_PORirr")>; +def: InstRW<[BWWriteResGroup8], (instregex "MMX_PXORirr")>; +def: InstRW<[BWWriteResGroup8], (instregex "MOVDQArr(_REV)?")>; +def: InstRW<[BWWriteResGroup8], (instregex "MOVDQUrr(_REV)?")>; +def: InstRW<[BWWriteResGroup8], (instregex "MOVPQI2QIrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "PANDNrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "PANDrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "PORrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "PXORrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPDYrri")>; +def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPDrri")>; +def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPSYrri")>; +def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPSrri")>; +def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQAYrr(_REV)?")>; +def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQArr(_REV)?")>; +def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUYrr(_REV)?")>; +def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUrr(_REV)?")>; +def: InstRW<[BWWriteResGroup8], (instregex "VMOVPQI2QIrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "VMOVZPQILo2PQIrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "VPANDNYrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "VPANDNrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "VPANDYrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "VPANDrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDDYrri")>; +def: InstRW<[BWWriteResGroup8], (instregex 
"VPBLENDDrri")>; +def: InstRW<[BWWriteResGroup8], (instregex "VPORYrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "VPORrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "VPXORYrr")>; +def: InstRW<[BWWriteResGroup8], (instregex "VPXORrr")>; + +def BWWriteResGroup9 : SchedWriteRes<[BWPort0156]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "ADD8i8")>; +def: InstRW<[BWWriteResGroup9], (instregex "ADD8ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "ADD8rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "AND8i8")>; +def: InstRW<[BWWriteResGroup9], (instregex "AND8ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "AND8rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "CBW")>; +def: InstRW<[BWWriteResGroup9], (instregex "CLC")>; +def: InstRW<[BWWriteResGroup9], (instregex "CMC")>; +def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "CMP8i8")>; +def: InstRW<[BWWriteResGroup9], (instregex "CMP8ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "CMP8rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "CWDE")>; +def: InstRW<[BWWriteResGroup9], (instregex "DEC(16|32|64)r")>; +def: InstRW<[BWWriteResGroup9], (instregex "DEC8r")>; +def: InstRW<[BWWriteResGroup9], (instregex "INC(16|32|64)r")>; +def: InstRW<[BWWriteResGroup9], (instregex "INC8r")>; +def: InstRW<[BWWriteResGroup9], (instregex "LAHF")>; +def: InstRW<[BWWriteResGroup9], (instregex "MOV(16|32|64)rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "MOV8ri(_alt)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "MOV8rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr16")>; +def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr32")>; +def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr8")>; +def: InstRW<[BWWriteResGroup9], (instregex "MOVZX(16|32|64)rr16")>; +def: InstRW<[BWWriteResGroup9], (instregex "MOVZX(16|32|64)rr8")>; +def: InstRW<[BWWriteResGroup9], (instregex "NEG(16|32|64)r")>; +def: InstRW<[BWWriteResGroup9], (instregex "NEG8r")>; +def: InstRW<[BWWriteResGroup9], (instregex "NOOP")>; +def: InstRW<[BWWriteResGroup9], (instregex "NOT(16|32|64)r")>; +def: InstRW<[BWWriteResGroup9], (instregex "NOT8r")>; +def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "OR8i8")>; +def: InstRW<[BWWriteResGroup9], (instregex "OR8ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "OR8rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "SAHF")>; +def: InstRW<[BWWriteResGroup9], (instregex "SGDT64m")>; +def: InstRW<[BWWriteResGroup9], (instregex "SIDT64m")>; +def: InstRW<[BWWriteResGroup9], (instregex "SLDT64m")>; +def: InstRW<[BWWriteResGroup9], (instregex "SMSW16m")>; +def: InstRW<[BWWriteResGroup9], (instregex "STC")>; +def: InstRW<[BWWriteResGroup9], (instregex "STRm")>; +def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "SUB8i8")>; +def: 
InstRW<[BWWriteResGroup9], (instregex "SUB8ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "SUB8rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "SYSCALL")>; +def: InstRW<[BWWriteResGroup9], (instregex "TEST(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup9], (instregex "TEST8i8")>; +def: InstRW<[BWWriteResGroup9], (instregex "TEST8ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "TEST8rr")>; +def: InstRW<[BWWriteResGroup9], (instregex "XCHG(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)rr(_REV)?")>; +def: InstRW<[BWWriteResGroup9], (instregex "XOR8i8")>; +def: InstRW<[BWWriteResGroup9], (instregex "XOR8ri")>; +def: InstRW<[BWWriteResGroup9], (instregex "XOR8rr(_REV)?")>; + +def BWWriteResGroup10 : SchedWriteRes<[BWPort4,BWPort237]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup10], (instregex "FBSTPm")>; +def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVD64mr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVNTQmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVQ64mr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOV(16|32|64)mr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOV8mi")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOV8mr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVAPDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVAPSmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVDQAmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVDQUmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVHPDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVHPSmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVLPDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVLPSmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVNTDQmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVNTI_64mr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVNTImr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVNTPDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVNTPSmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVPDI2DImr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVPQI2QImr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVPQIto64mr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVSDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVSSmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVUPDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "MOVUPSmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "ST_FP32m")>; +def: InstRW<[BWWriteResGroup10], (instregex "ST_FP64m")>; +def: InstRW<[BWWriteResGroup10], (instregex "ST_FP80m")>; +def: InstRW<[BWWriteResGroup10], (instregex "VEXTRACTF128mr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VEXTRACTI128mr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPDYmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPSYmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPSmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQAYmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQAmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQUYmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQUmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVHPDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVHPSmr")>; +def: 
InstRW<[BWWriteResGroup10], (instregex "VMOVLPDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVLPSmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTDQYmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTDQmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPDYmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPSYmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPSmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVPDI2DImr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVPQI2QImr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVPQIto64mr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVSDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVSSmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPDYmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPDmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPSYmr")>; +def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPSmr")>; + +def BWWriteResGroup11 : SchedWriteRes<[BWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup11], (instregex "BLENDVPDrr0")>; +def: InstRW<[BWWriteResGroup11], (instregex "BLENDVPSrr0")>; +def: InstRW<[BWWriteResGroup11], (instregex "MMX_PINSRWirri")>; +def: InstRW<[BWWriteResGroup11], (instregex "PBLENDVBrr0")>; +def: InstRW<[BWWriteResGroup11], (instregex "PINSRBrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "PINSRDrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "PINSRQrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "PINSRWrri")>; +def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPDYrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPDrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPSYrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPSrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "VPBLENDVBYrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "VPBLENDVBrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "VPINSRBrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "VPINSRDrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "VPINSRQrr")>; +def: InstRW<[BWWriteResGroup11], (instregex "VPINSRWrri")>; + +def BWWriteResGroup12 : SchedWriteRes<[BWPort01]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup12], (instregex "FDECSTP")>; + +def BWWriteResGroup13 : SchedWriteRes<[BWPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup13], (instregex "ROL(16|32|64)r1")>; +def: InstRW<[BWWriteResGroup13], (instregex "ROL(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup13], (instregex "ROL8r1")>; +def: InstRW<[BWWriteResGroup13], (instregex "ROL8ri")>; +def: InstRW<[BWWriteResGroup13], (instregex "ROR(16|32|64)r1")>; +def: InstRW<[BWWriteResGroup13], (instregex "ROR(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup13], (instregex "ROR8r1")>; +def: InstRW<[BWWriteResGroup13], (instregex "ROR8ri")>; + +def BWWriteResGroup14 : SchedWriteRes<[BWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup14], (instregex "LFENCE")>; +def: InstRW<[BWWriteResGroup14], (instregex "MFENCE")>; +def: InstRW<[BWWriteResGroup14], (instregex "WAIT")>; +def: InstRW<[BWWriteResGroup14], (instregex "XGETBV")>; + +def BWWriteResGroup15 : SchedWriteRes<[BWPort0,BWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = 
[1,1]; +} +def: InstRW<[BWWriteResGroup15], (instregex "CVTPS2PDrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "CVTSS2SDrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "EXTRACTPSrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "MMX_PEXTRWirri")>; +def: InstRW<[BWWriteResGroup15], (instregex "PEXTRBrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PEXTRDrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PEXTRQrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PEXTRWri")>; +def: InstRW<[BWWriteResGroup15], (instregex "PEXTRWrr_REV")>; +def: InstRW<[BWWriteResGroup15], (instregex "PSLLDrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PSLLQrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PSLLWrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PSRADrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PSRAWrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PSRLDrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PSRLQrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PSRLWrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "PTESTrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PSYrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PSrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VCVTPS2PDrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VCVTSS2SDrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VEXTRACTPSrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRBrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRDrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRQrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRWri")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRWrr_REV")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPSLLDrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPSLLQrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPSLLWrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPSRADrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPSRAWrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPSRLDrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPSRLQrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPSRLWrr")>; +def: InstRW<[BWWriteResGroup15], (instregex "VPTESTrr")>; + +def BWWriteResGroup16 : SchedWriteRes<[BWPort6,BWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup16], (instregex "CLFLUSH")>; + +def BWWriteResGroup17 : SchedWriteRes<[BWPort01,BWPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup17], (instregex "MMX_MOVDQ2Qrr")>; + +def BWWriteResGroup18 : SchedWriteRes<[BWPort237,BWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup18], (instregex "SFENCE")>; + +def BWWriteResGroup19 : SchedWriteRes<[BWPort06,BWPort15]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup19], (instregex "BEXTR(32|64)rr")>; +def: InstRW<[BWWriteResGroup19], (instregex "BSWAP(16|32|64)r")>; + +def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup20], (instregex "ADC8i8")>; +def: InstRW<[BWWriteResGroup20], (instregex "ADC8ri")>; +def: InstRW<[BWWriteResGroup20], (instregex "CMOVA(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup20], (instregex "CMOVBE(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup20], (instregex "CWD")>; +def: 
InstRW<[BWWriteResGroup20], (instregex "JRCXZ")>; +def: InstRW<[BWWriteResGroup20], (instregex "SBB8i8")>; +def: InstRW<[BWWriteResGroup20], (instregex "SBB8ri")>; +def: InstRW<[BWWriteResGroup20], (instregex "SETAr")>; +def: InstRW<[BWWriteResGroup20], (instregex "SETBEr")>; + +def BWWriteResGroup21 : SchedWriteRes<[BWPort4,BWPort5,BWPort237]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup21], (instregex "EXTRACTPSmr")>; +def: InstRW<[BWWriteResGroup21], (instregex "PEXTRBmr")>; +def: InstRW<[BWWriteResGroup21], (instregex "PEXTRDmr")>; +def: InstRW<[BWWriteResGroup21], (instregex "PEXTRQmr")>; +def: InstRW<[BWWriteResGroup21], (instregex "PEXTRWmr")>; +def: InstRW<[BWWriteResGroup21], (instregex "STMXCSR")>; +def: InstRW<[BWWriteResGroup21], (instregex "VEXTRACTPSmr")>; +def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRBmr")>; +def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRDmr")>; +def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRQmr")>; +def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRWmr")>; +def: InstRW<[BWWriteResGroup21], (instregex "VSTMXCSR")>; + +def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup22], (instregex "FNSTCW16m")>; + +def BWWriteResGroup23 : SchedWriteRes<[BWPort4,BWPort237,BWPort06]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup23], (instregex "SETAEm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETBm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETEm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETGEm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETGm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETLEm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETLm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETNEm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETNOm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETNPm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETNSm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETOm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETPm")>; +def: InstRW<[BWWriteResGroup23], (instregex "SETSm")>; + +def BWWriteResGroup24 : SchedWriteRes<[BWPort4,BWPort237,BWPort15]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup24], (instregex "MOVBE(16|32|64)mr")>; + +def BWWriteResGroup25 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)r(mr)?")>; +def: InstRW<[BWWriteResGroup25], (instregex "PUSH64i8")>; +def: InstRW<[BWWriteResGroup25], (instregex "STOSB")>; +def: InstRW<[BWWriteResGroup25], (instregex "STOSL")>; +def: InstRW<[BWWriteResGroup25], (instregex "STOSQ")>; +def: InstRW<[BWWriteResGroup25], (instregex "STOSW")>; + +def BWWriteResGroup26 : SchedWriteRes<[BWPort0]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup26], (instregex "MOVMSKPDrr")>; +def: InstRW<[BWWriteResGroup26], (instregex "MOVMSKPSrr")>; +def: InstRW<[BWWriteResGroup26], (instregex "PMOVMSKBrr")>; +def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPDYrr")>; +def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPDrr")>; +def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPSYrr")>; +def: InstRW<[BWWriteResGroup26], 
(instregex "VMOVMSKPSrr")>; +def: InstRW<[BWWriteResGroup26], (instregex "VPMOVMSKBYrr")>; +def: InstRW<[BWWriteResGroup26], (instregex "VPMOVMSKBrr")>; + +def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup27], (instregex "ADDPDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "ADDPSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "ADDSDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "ADDSSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "ADDSUBPDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "ADDSUBPSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "ADD_FPrST0")>; +def: InstRW<[BWWriteResGroup27], (instregex "ADD_FST0r")>; +def: InstRW<[BWWriteResGroup27], (instregex "ADD_FrST0")>; +def: InstRW<[BWWriteResGroup27], (instregex "BSF(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup27], (instregex "BSR(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup27], (instregex "CMPPDrri")>; +def: InstRW<[BWWriteResGroup27], (instregex "CMPPSrri")>; +def: InstRW<[BWWriteResGroup27], (instregex "CMPSDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "CMPSSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "COMISDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "COMISSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "CVTDQ2PSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "CVTPS2DQrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "CVTTPS2DQrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "IMUL(32|64)rr(i8)?")>; +def: InstRW<[BWWriteResGroup27], (instregex "IMUL8r")>; +def: InstRW<[BWWriteResGroup27], (instregex "LZCNT(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)PDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)PSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)SDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)SSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)PDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)PSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)SDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)SSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "MMX_CVTPI2PSirr")>; +def: InstRW<[BWWriteResGroup27], (instregex "MUL8r")>; +def: InstRW<[BWWriteResGroup27], (instregex "PDEP(32|64)rr")>; +def: InstRW<[BWWriteResGroup27], (instregex "PEXT(32|64)rr")>; +def: InstRW<[BWWriteResGroup27], (instregex "POPCNT(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup27], (instregex "SHLD(16|32|64)rri8")>; +def: InstRW<[BWWriteResGroup27], (instregex "SHRD(16|32|64)rri8")>; +def: InstRW<[BWWriteResGroup27], (instregex "SUBPDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "SUBPSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FPrST0")>; +def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FST0r")>; +def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FrST0")>; +def: InstRW<[BWWriteResGroup27], (instregex "SUBSDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "SUBSSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "SUB_FPrST0")>; +def: InstRW<[BWWriteResGroup27], (instregex "SUB_FST0r")>; +def: InstRW<[BWWriteResGroup27], (instregex "SUB_FrST0")>; +def: InstRW<[BWWriteResGroup27], (instregex "TZCNT(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup27], (instregex "UCOMISDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "UCOMISSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VADDPDYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VADDPDrr")>; +def: 
InstRW<[BWWriteResGroup27], (instregex "VADDPSYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VADDPSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VADDSDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VADDSSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPDYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPSYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCMPPDYrri")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCMPPDrri")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCMPPSYrri")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCMPPSrri")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCMPSDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCMPSSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCOMISDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCOMISSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCVTDQ2PSYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCVTDQ2PSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCVTPS2DQYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCVTPS2DQrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCVTTPS2DQYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VCVTTPS2DQrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PDYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PSYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)SDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)SSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PDYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PSYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)SDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)SSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VSUBPDYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VSUBPDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VSUBPSYrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VSUBPSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VSUBSDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VSUBSSrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VUCOMISDrr")>; +def: InstRW<[BWWriteResGroup27], (instregex "VUCOMISSrr")>; + +def BWWriteResGroup27_16 : SchedWriteRes<[BWPort1, BWPort0156]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup27_16], (instregex "IMUL16rr(i8)?")>; + +def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup28], (instregex "VBROADCASTSDYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VBROADCASTSSYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VEXTRACTF128rr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VEXTRACTI128rr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VINSERTF128rr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VINSERTI128rr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTDYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex 
"VPBROADCASTQYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTWYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTWrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPERM2F128rr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPERM2I128rr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPERMDYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPERMPDYri")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPERMPSYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPERMQYri")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBDYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBQYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBWYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXDQYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXWDYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXWQYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBDYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBQYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBWYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXDQYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXWDYrr")>; +def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXWQYrr")>; + +def BWWriteResGroup29 : SchedWriteRes<[BWPort01]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup29], (instregex "MULPDrr")>; +def: InstRW<[BWWriteResGroup29], (instregex "MULPSrr")>; +def: InstRW<[BWWriteResGroup29], (instregex "MULSDrr")>; +def: InstRW<[BWWriteResGroup29], (instregex "MULSSrr")>; +def: InstRW<[BWWriteResGroup29], (instregex "VMULPDYrr")>; +def: InstRW<[BWWriteResGroup29], (instregex "VMULPDrr")>; +def: InstRW<[BWWriteResGroup29], (instregex "VMULPSYrr")>; +def: InstRW<[BWWriteResGroup29], (instregex "VMULPSrr")>; +def: InstRW<[BWWriteResGroup29], (instregex "VMULSDrr")>; +def: InstRW<[BWWriteResGroup29], (instregex "VMULSSrr")>; + +def BWWriteResGroup30 : SchedWriteRes<[BWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[BWWriteResGroup30], (instregex "XADD(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup30], (instregex "XADD8rr")>; +def: InstRW<[BWWriteResGroup30], (instregex "XCHG8rr")>; + +def BWWriteResGroup31 : SchedWriteRes<[BWPort0,BWPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup31], (instregex "VPSLLVDYrr")>; +def: InstRW<[BWWriteResGroup31], (instregex "VPSLLVDrr")>; +def: InstRW<[BWWriteResGroup31], (instregex "VPSRAVDYrr")>; +def: InstRW<[BWWriteResGroup31], (instregex "VPSRAVDrr")>; +def: InstRW<[BWWriteResGroup31], (instregex "VPSRLVDYrr")>; +def: InstRW<[BWWriteResGroup31], (instregex "VPSRLVDrr")>; + +def BWWriteResGroup32 : SchedWriteRes<[BWPort5,BWPort15]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDSWrr64")>; +def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDWrr64")>; +def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDrr64")>; +def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBDrr64")>; +def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBSWrr64")>; +def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBWrr64")>; +def: InstRW<[BWWriteResGroup32], (instregex "PHADDDrr")>; +def: InstRW<[BWWriteResGroup32], (instregex "PHADDSWrr128")>; +def: InstRW<[BWWriteResGroup32], (instregex "PHADDWrr")>; +def: InstRW<[BWWriteResGroup32], 
(instregex "PHSUBDrr")>; +def: InstRW<[BWWriteResGroup32], (instregex "PHSUBSWrr128")>; +def: InstRW<[BWWriteResGroup32], (instregex "PHSUBWrr")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHADDDYrr")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHADDDrr")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHADDSWrr128")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHADDSWrr256")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHADDWYrr")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHADDWrr")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBDYrr")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBDrr")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBSWrr128")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBSWrr256")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBWYrr")>; +def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBWrr")>; + +def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSDWirr")>; +def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSWBirr")>; +def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKUSWBirr")>; + +def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[BWWriteResGroup34], (instregex "CLD")>; + +def BWWriteResGroup35 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[BWWriteResGroup35], (instregex "RCL(16|32|64)r1")>; +def: InstRW<[BWWriteResGroup35], (instregex "RCL(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup35], (instregex "RCL8r1")>; +def: InstRW<[BWWriteResGroup35], (instregex "RCL8ri")>; +def: InstRW<[BWWriteResGroup35], (instregex "RCR(16|32|64)r1")>; +def: InstRW<[BWWriteResGroup35], (instregex "RCR(16|32|64)ri")>; +def: InstRW<[BWWriteResGroup35], (instregex "RCR8r1")>; +def: InstRW<[BWWriteResGroup35], (instregex "RCR8ri")>; + +def BWWriteResGroup36 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup36], (instregex "ROL(16|32|64)rCL")>; +def: InstRW<[BWWriteResGroup36], (instregex "ROL8rCL")>; +def: InstRW<[BWWriteResGroup36], (instregex "ROR(16|32|64)rCL")>; +def: InstRW<[BWWriteResGroup36], (instregex "ROR8rCL")>; +def: InstRW<[BWWriteResGroup36], (instregex "SAR(16|32|64)rCL")>; +def: InstRW<[BWWriteResGroup36], (instregex "SAR8rCL")>; +def: InstRW<[BWWriteResGroup36], (instregex "SHL(16|32|64)rCL")>; +def: InstRW<[BWWriteResGroup36], (instregex "SHL8rCL")>; +def: InstRW<[BWWriteResGroup36], (instregex "SHR(16|32|64)rCL")>; +def: InstRW<[BWWriteResGroup36], (instregex "SHR8rCL")>; + +def BWWriteResGroup37 : SchedWriteRes<[BWPort4,BWPort6,BWPort237,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup37], (instregex "CALL(16|32|64)r")>; + +def BWWriteResGroup38 : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup38], (instregex "CALL64pcrel32")>; +def: InstRW<[BWWriteResGroup38], (instregex "SETAm")>; +def: InstRW<[BWWriteResGroup38], (instregex "SETBEm")>; + +def BWWriteResGroup39 : SchedWriteRes<[BWPort0,BWPort1]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: 
InstRW<[BWWriteResGroup39], (instregex "CVTSD2SI64rr")>; +def: InstRW<[BWWriteResGroup39], (instregex "CVTSD2SIrr")>; +def: InstRW<[BWWriteResGroup39], (instregex "CVTSS2SI64rr")>; +def: InstRW<[BWWriteResGroup39], (instregex "CVTSS2SIrr")>; +def: InstRW<[BWWriteResGroup39], (instregex "CVTTSD2SI64rr")>; +def: InstRW<[BWWriteResGroup39], (instregex "CVTTSD2SIrr")>; +def: InstRW<[BWWriteResGroup39], (instregex "CVTTSS2SI64rr")>; +def: InstRW<[BWWriteResGroup39], (instregex "CVTTSS2SIrr")>; +def: InstRW<[BWWriteResGroup39], (instregex "VCVTSD2SI64rr")>; +def: InstRW<[BWWriteResGroup39], (instregex "VCVTSD2SIrr")>; +def: InstRW<[BWWriteResGroup39], (instregex "VCVTSS2SI64rr")>; +def: InstRW<[BWWriteResGroup39], (instregex "VCVTSS2SIrr")>; +def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSD2SI64rr")>; +def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSD2SIrr")>; +def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSS2SI64rr")>; +def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSS2SIrr")>; + +def BWWriteResGroup40 : SchedWriteRes<[BWPort0,BWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup40], (instregex "VCVTPS2PDYrr")>; +def: InstRW<[BWWriteResGroup40], (instregex "VPSLLDYrr")>; +def: InstRW<[BWWriteResGroup40], (instregex "VPSLLQYrr")>; +def: InstRW<[BWWriteResGroup40], (instregex "VPSLLWYrr")>; +def: InstRW<[BWWriteResGroup40], (instregex "VPSRADYrr")>; +def: InstRW<[BWWriteResGroup40], (instregex "VPSRAWYrr")>; +def: InstRW<[BWWriteResGroup40], (instregex "VPSRLDYrr")>; +def: InstRW<[BWWriteResGroup40], (instregex "VPSRLQYrr")>; +def: InstRW<[BWWriteResGroup40], (instregex "VPSRLWYrr")>; +def: InstRW<[BWWriteResGroup40], (instregex "VPTESTYrr")>; + +def BWWriteResGroup41 : SchedWriteRes<[BWPort0,BWPort0156]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup41], (instregex "FNSTSW16r")>; + +def BWWriteResGroup42 : SchedWriteRes<[BWPort1,BWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup42], (instregex "CVTDQ2PDrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "CVTPD2DQrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "CVTPD2PSrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "CVTSD2SSrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "CVTSI642SDrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "CVTSI2SDrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "CVTSI2SSrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "CVTTPD2DQrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "IMUL(32|64)r")>; +def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPD2PIirr")>; +def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPI2PDirr")>; +def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPS2PIirr")>; +def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTTPD2PIirr")>; +def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTTPS2PIirr")>; +def: InstRW<[BWWriteResGroup42], (instregex "MUL(32|64)r")>; +def: InstRW<[BWWriteResGroup42], (instregex "MULX64rr")>; +def: InstRW<[BWWriteResGroup42], (instregex "VCVTDQ2PDrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "VCVTPD2DQrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "VCVTPD2PSrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "VCVTPS2PHrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "VCVTSD2SSrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI642SDrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI2SDrr")>; +def: 
InstRW<[BWWriteResGroup42], (instregex "VCVTSI2SSrr")>; +def: InstRW<[BWWriteResGroup42], (instregex "VCVTTPD2DQrr")>; + +def BWWriteResGroup42_16 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def: InstRW<[BWWriteResGroup42_16], (instregex "IMUL16r")>; +def: InstRW<[BWWriteResGroup42_16], (instregex "MUL16r")>; + +def BWWriteResGroup43 : SchedWriteRes<[BWPort0,BWPort4,BWPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup43], (instregex "FNSTSWm")>; + +def BWWriteResGroup44 : SchedWriteRes<[BWPort1,BWPort4,BWPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP16m")>; +def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP32m")>; +def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP64m")>; +def: InstRW<[BWWriteResGroup44], (instregex "IST_F16m")>; +def: InstRW<[BWWriteResGroup44], (instregex "IST_F32m")>; +def: InstRW<[BWWriteResGroup44], (instregex "IST_FP16m")>; +def: InstRW<[BWWriteResGroup44], (instregex "IST_FP32m")>; +def: InstRW<[BWWriteResGroup44], (instregex "IST_FP64m")>; +def: InstRW<[BWWriteResGroup44], (instregex "VCVTPS2PHYmr")>; +def: InstRW<[BWWriteResGroup44], (instregex "VCVTPS2PHmr")>; + +def BWWriteResGroup45 : SchedWriteRes<[BWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def: InstRW<[BWWriteResGroup45], (instregex "FNCLEX")>; + +def BWWriteResGroup46 : SchedWriteRes<[BWPort015,BWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[BWWriteResGroup46], (instregex "VZEROUPPER")>; + +def BWWriteResGroup47 : SchedWriteRes<[BWPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMADDUBSWrr64")>; +def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMADDWDirr")>; +def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHRSWrr64")>; +def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHUWirr")>; +def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHWirr")>; +def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULLWirr")>; +def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULUDQirr")>; +def: InstRW<[BWWriteResGroup47], (instregex "MMX_PSADBWirr")>; +def: InstRW<[BWWriteResGroup47], (instregex "MUL_FPrST0")>; +def: InstRW<[BWWriteResGroup47], (instregex "MUL_FST0r")>; +def: InstRW<[BWWriteResGroup47], (instregex "MUL_FrST0")>; +def: InstRW<[BWWriteResGroup47], (instregex "PCLMULQDQrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "PCMPGTQrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "PHMINPOSUWrr128")>; +def: InstRW<[BWWriteResGroup47], (instregex "PMADDUBSWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "PMADDWDrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "PMULDQrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "PMULHRSWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "PMULHUWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "PMULHWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "PMULLWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "PMULUDQrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "PSADBWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "RCPPSr")>; +def: InstRW<[BWWriteResGroup47], (instregex "RCPSSr")>; +def: InstRW<[BWWriteResGroup47], (instregex "RSQRTPSr")>; +def: InstRW<[BWWriteResGroup47], (instregex "RSQRTSSr")>; +def: InstRW<[BWWriteResGroup47], (instregex 
"VPCLMULQDQrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPCMPGTQYrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPCMPGTQrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPHMINPOSUWrr128")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMADDUBSWYrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMADDUBSWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMADDWDYrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMADDWDrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULDQYrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULDQrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULHRSWYrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULHRSWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULHUWYrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULHUWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULHWYrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULHWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULLWYrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULLWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULUDQYrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPMULUDQrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPSADBWYrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VPSADBWrr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VRCPPSr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VRCPSSr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VRSQRTPSr")>; +def: InstRW<[BWWriteResGroup47], (instregex "VRSQRTSSr")>; + +def BWWriteResGroup48 : SchedWriteRes<[BWPort01]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup48], + (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r", + "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>; + +def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup49], (instregex "LDDQUrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64rm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64to64rm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVQ64rm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOV(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOV64toPQIrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOV8rm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVAPDrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVAPSrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVDDUPrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVDI2PDIrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVDQArm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVDQUrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVNTDQArm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVQI2PQIrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVSDrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVSHDUPrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVSLDUPrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVSSrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm16")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm32")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm8")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVUPDrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVUPSrm")>; +def: InstRW<[BWWriteResGroup49], (instregex 
"MOVZX(16|32|64)rm16")>; +def: InstRW<[BWWriteResGroup49], (instregex "MOVZX(16|32|64)rm8")>; +def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHNTA")>; +def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT0")>; +def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT1")>; +def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT2")>; +def: InstRW<[BWWriteResGroup49], (instregex "VBROADCASTSSrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VLDDQUrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOV64toPQIrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVAPDrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVAPSrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVDDUPrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVDI2PDIrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVDQArm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVDQUrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVNTDQArm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVQI2PQIrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVSDrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVSHDUPrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVSLDUPrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVSSrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVUPDrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VMOVUPSrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VPBROADCASTDrm")>; +def: InstRW<[BWWriteResGroup49], (instregex "VPBROADCASTQrm")>; + +def BWWriteResGroup50 : SchedWriteRes<[BWPort1,BWPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[BWWriteResGroup50], (instregex "CVTSI642SSrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "HADDPDrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "HADDPSrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "HSUBPDrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "HSUBPSrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "VCVTSI642SSrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "VHADDPDYrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "VHADDPDrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "VHADDPSYrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "VHADDPSrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPDYrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPDrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPSYrr")>; +def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPSrr")>; + +def BWWriteResGroup51 : SchedWriteRes<[BWPort1,BWPort6,BWPort06]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup51], (instregex "STR(16|32|64)r")>; + +def BWWriteResGroup52 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup52], (instregex "MULX32rr")>; + +def BWWriteResGroup53 : SchedWriteRes<[BWPort0,BWPort4,BWPort237,BWPort15]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPDYmr")>; +def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPDmr")>; +def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPSYmr")>; +def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPSmr")>; +def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVDYmr")>; +def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVDmr")>; +def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVQYmr")>; +def: InstRW<[BWWriteResGroup53], 
(instregex "VPMASKMOVQmr")>; + +def BWWriteResGroup54 : SchedWriteRes<[BWPort6,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[BWWriteResGroup54], (instregex "PAUSE")>; + +def BWWriteResGroup55 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[BWWriteResGroup55], (instregex "XSETBV")>; + +def BWWriteResGroup56 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG(16|32|64)rr")>; +def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG8rr")>; + +def BWWriteResGroup57 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> { + let Latency = 5; + let NumMicroOps = 6; + let ResourceCycles = [1,1,4]; +} +def: InstRW<[BWWriteResGroup57], (instregex "PUSHF16")>; +def: InstRW<[BWWriteResGroup57], (instregex "PUSHF64")>; + +def BWWriteResGroup58 : SchedWriteRes<[BWPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup58], (instregex "LD_F32m")>; +def: InstRW<[BWWriteResGroup58], (instregex "LD_F64m")>; +def: InstRW<[BWWriteResGroup58], (instregex "LD_F80m")>; +def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTF128")>; +def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTI128")>; +def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTSDYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTSSYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VLDDQUYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VMOVAPDYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VMOVAPSYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VMOVDDUPYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VMOVDQAYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VMOVDQUYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VMOVNTDQAYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VMOVSHDUPYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VMOVSLDUPYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VMOVUPDYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VMOVUPSYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VPBROADCASTDYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "VPBROADCASTQYrm")>; +def: InstRW<[BWWriteResGroup58], (instregex "ROUNDPDr")>; +def: InstRW<[BWWriteResGroup58], (instregex "ROUNDPSr")>; +def: InstRW<[BWWriteResGroup58], (instregex "ROUNDSDr")>; +def: InstRW<[BWWriteResGroup58], (instregex "ROUNDSSr")>; +def: InstRW<[BWWriteResGroup58], (instregex "VROUNDPDr")>; +def: InstRW<[BWWriteResGroup58], (instregex "VROUNDPSr")>; +def: InstRW<[BWWriteResGroup58], (instregex "VROUNDSDr")>; +def: InstRW<[BWWriteResGroup58], (instregex "VROUNDSSr")>; +def: InstRW<[BWWriteResGroup58], (instregex "VROUNDYPDr")>; +def: InstRW<[BWWriteResGroup58], (instregex "VROUNDYPSr")>; + +def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup59], (instregex "CVTPS2PDrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "CVTSS2SDrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLDrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLQrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLWrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRADrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRAWrm")>; +def: InstRW<[BWWriteResGroup59], (instregex 
"MMX_PSRLDrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLQrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLWrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "VCVTPH2PSYrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "VCVTPH2PSrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "VCVTPS2PDrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "VCVTSS2SDrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "VPSLLVQrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "VPSRLVQrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "VTESTPDrm")>; +def: InstRW<[BWWriteResGroup59], (instregex "VTESTPSrm")>; + +def BWWriteResGroup60 : SchedWriteRes<[BWPort1,BWPort5]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup60], (instregex "VCVTDQ2PDYrr")>; +def: InstRW<[BWWriteResGroup60], (instregex "VCVTPD2DQYrr")>; +def: InstRW<[BWWriteResGroup60], (instregex "VCVTPD2PSYrr")>; +def: InstRW<[BWWriteResGroup60], (instregex "VCVTPS2PHYrr")>; +def: InstRW<[BWWriteResGroup60], (instregex "VCVTTPD2DQYrr")>; + +def BWWriteResGroup61 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup61], (instregex "ANDNPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "ANDNPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "ANDPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "ANDPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "INSERTPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MMX_PALIGNR64irm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MMX_PINSRWirmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "MMX_PSHUFBrm64")>; +def: InstRW<[BWWriteResGroup61], (instregex "MMX_PSHUFWmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHBWirm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHDQirm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHWDirm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLBWirm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLDQirm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLWDirm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MOVHPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MOVHPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MOVLPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "MOVLPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "ORPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "ORPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PACKSSDWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PACKSSWBrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PACKUSDWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PACKUSWBrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PALIGNRrmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "PBLENDWrmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "PINSRBrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PINSRDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PINSRQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PINSRWrmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXWDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXWQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex 
"PMOVZXBDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXWDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXWQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PSHUFBrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PSHUFDmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "PSHUFHWmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "PSHUFLWmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHBWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHQDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHWDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLBWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLQDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLWDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "SHUFPDrmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "SHUFPSrmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "UNPCKHPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "UNPCKHPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "UNPCKLPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "UNPCKLPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VANDNPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VANDNPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VANDPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VANDPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VINSERTPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VMOVHPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VMOVHPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VMOVLPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VMOVLPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VORPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VORPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPACKSSDWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPACKSSWBrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPACKUSDWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPACKUSWBrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPALIGNRrmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPBLENDWrmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPDmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPSmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPINSRBrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPINSRDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPINSRQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPINSRWrmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXWDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXWQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex 
"VPMOVZXDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXWDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXWQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFBrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFDmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFHWmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFLWmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHBWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHQDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHWDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLBWrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLQDQrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLWDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VSHUFPDrmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "VSHUFPSrmi")>; +def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKHPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKHPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKLPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKLPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VXORPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "VXORPSrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "XORPDrm")>; +def: InstRW<[BWWriteResGroup61], (instregex "XORPSrm")>; + +def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup62], (instregex "FARJMP64")>; +def: InstRW<[BWWriteResGroup62], (instregex "JMP(16|32|64)m")>; + +def BWWriteResGroup63 : SchedWriteRes<[BWPort23,BWPort06]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup63], (instregex "ADC(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "ADC8rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "ADCX(32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "ADOX(32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "BT(16|32|64)mi8")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVAE(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVB(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVE(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVG(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVGE(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVL(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVLE(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVNE(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVNO(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVNP(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVNS(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVO(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVP(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "CMOVS(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "RORX(32|64)mi")>; +def: InstRW<[BWWriteResGroup63], (instregex "SARX(32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "SBB(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "SBB8rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "SHLX(32|64)rm")>; +def: InstRW<[BWWriteResGroup63], (instregex "SHRX(32|64)rm")>; + 
+def BWWriteResGroup64 : SchedWriteRes<[BWPort23,BWPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup64], (instregex "ANDN(32|64)rm")>; +def: InstRW<[BWWriteResGroup64], (instregex "BLSI(32|64)rm")>; +def: InstRW<[BWWriteResGroup64], (instregex "BLSMSK(32|64)rm")>; +def: InstRW<[BWWriteResGroup64], (instregex "BLSR(32|64)rm")>; +def: InstRW<[BWWriteResGroup64], (instregex "BZHI(32|64)rm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSBrm64")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSDrm64")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSWrm64")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDDirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDQirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDSBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDSWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDUSBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDUSWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PAVGBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PAVGWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQDirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTDirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMAXSWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMAXUBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMINSWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMINUBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNBrm64")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNDrm64")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNWrm64")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBDirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBQirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBSBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBSWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBUSBirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBUSWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBWirm")>; +def: InstRW<[BWWriteResGroup64], (instregex "MOVBE(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PABSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PABSDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PABSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PADDBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PADDDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PADDQrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PADDSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PADDSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PADDUSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PADDUSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PADDWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PAVGBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PAVGWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQBrm")>; +def: 
InstRW<[BWWriteResGroup64], (instregex "PCMPEQDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQQrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMAXSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMAXSDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMAXSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMAXUBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMAXUDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMAXUWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMINSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMINSDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMINSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMINUBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMINUDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PMINUWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSIGNBrm128")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSIGNDrm128")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSIGNWrm128")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSUBBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSUBDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSUBQrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSUBSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSUBSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSUBUSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSUBUSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "PSUBWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPABSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPABSDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPABSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPADDBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPADDDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPADDQrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPADDSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPADDSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPADDUSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPADDUSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPADDWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPAVGBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPAVGWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQQrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMINSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMINSDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMINSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMINUBrm")>; +def: InstRW<[BWWriteResGroup64], 
(instregex "VPMINUDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPMINUWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNBrm128")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNDrm128")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNWrm128")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSUBBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSUBDrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSUBQrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSUBSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSUBSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSUBUSBrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSUBUSWrm")>; +def: InstRW<[BWWriteResGroup64], (instregex "VPSUBWrm")>; + +def BWWriteResGroup65 : SchedWriteRes<[BWPort23,BWPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup65], (instregex "BLENDPDrmi")>; +def: InstRW<[BWWriteResGroup65], (instregex "BLENDPSrmi")>; +def: InstRW<[BWWriteResGroup65], (instregex "MMX_PANDNirm")>; +def: InstRW<[BWWriteResGroup65], (instregex "MMX_PANDirm")>; +def: InstRW<[BWWriteResGroup65], (instregex "MMX_PORirm")>; +def: InstRW<[BWWriteResGroup65], (instregex "MMX_PXORirm")>; +def: InstRW<[BWWriteResGroup65], (instregex "PANDNrm")>; +def: InstRW<[BWWriteResGroup65], (instregex "PANDrm")>; +def: InstRW<[BWWriteResGroup65], (instregex "PORrm")>; +def: InstRW<[BWWriteResGroup65], (instregex "PXORrm")>; +def: InstRW<[BWWriteResGroup65], (instregex "VBLENDPDrmi")>; +def: InstRW<[BWWriteResGroup65], (instregex "VBLENDPSrmi")>; +def: InstRW<[BWWriteResGroup65], (instregex "VINSERTF128rm")>; +def: InstRW<[BWWriteResGroup65], (instregex "VINSERTI128rm")>; +def: InstRW<[BWWriteResGroup65], (instregex "VPANDNrm")>; +def: InstRW<[BWWriteResGroup65], (instregex "VPANDrm")>; +def: InstRW<[BWWriteResGroup65], (instregex "VPBLENDDrmi")>; +def: InstRW<[BWWriteResGroup65], (instregex "VPORrm")>; +def: InstRW<[BWWriteResGroup65], (instregex "VPXORrm")>; + +def BWWriteResGroup66 : SchedWriteRes<[BWPort23,BWPort0156]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup66], (instregex "ADD(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "ADD8rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "AND(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "AND8rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)mr")>; +def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "CMP8mi")>; +def: InstRW<[BWWriteResGroup66], (instregex "CMP8mr")>; +def: InstRW<[BWWriteResGroup66], (instregex "CMP8rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "OR(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "OR8rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)r(mr)?")>; +def: InstRW<[BWWriteResGroup66], (instregex "SUB(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "SUB8rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "TEST(16|32|64)mr")>; +def: InstRW<[BWWriteResGroup66], (instregex "TEST8mi")>; +def: InstRW<[BWWriteResGroup66], (instregex "TEST8mr")>; +def: InstRW<[BWWriteResGroup66], (instregex "XOR(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup66], (instregex "XOR8rm")>; + +def BWWriteResGroup67 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = 
[1,1,2]; +} +def: InstRW<[BWWriteResGroup67], (instregex "SHLD(16|32|64)rrCL")>; +def: InstRW<[BWWriteResGroup67], (instregex "SHRD(16|32|64)rrCL")>; + +def BWWriteResGroup68 : SchedWriteRes<[BWPort1,BWPort6,BWPort06,BWPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup68], (instregex "SLDT(16|32|64)r")>; + +def BWWriteResGroup69 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup69], (instregex "BTC(16|32|64)mi8")>; +def: InstRW<[BWWriteResGroup69], (instregex "BTR(16|32|64)mi8")>; +def: InstRW<[BWWriteResGroup69], (instregex "BTS(16|32|64)mi8")>; +def: InstRW<[BWWriteResGroup69], (instregex "SAR(16|32|64)m1")>; +def: InstRW<[BWWriteResGroup69], (instregex "SAR(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup69], (instregex "SAR8m1")>; +def: InstRW<[BWWriteResGroup69], (instregex "SAR8mi")>; +def: InstRW<[BWWriteResGroup69], (instregex "SHL(16|32|64)m1")>; +def: InstRW<[BWWriteResGroup69], (instregex "SHL(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup69], (instregex "SHL8m1")>; +def: InstRW<[BWWriteResGroup69], (instregex "SHL8mi")>; +def: InstRW<[BWWriteResGroup69], (instregex "SHR(16|32|64)m1")>; +def: InstRW<[BWWriteResGroup69], (instregex "SHR(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup69], (instregex "SHR8m1")>; +def: InstRW<[BWWriteResGroup69], (instregex "SHR8mi")>; + +def BWWriteResGroup70 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup70], (instregex "ADD(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup70], (instregex "ADD(16|32|64)mr")>; +def: InstRW<[BWWriteResGroup70], (instregex "ADD8mi")>; +def: InstRW<[BWWriteResGroup70], (instregex "ADD8mr")>; +def: InstRW<[BWWriteResGroup70], (instregex "AND(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup70], (instregex "AND(16|32|64)mr")>; +def: InstRW<[BWWriteResGroup70], (instregex "AND8mi")>; +def: InstRW<[BWWriteResGroup70], (instregex "AND8mr")>; +def: InstRW<[BWWriteResGroup70], (instregex "DEC(16|32|64)m")>; +def: InstRW<[BWWriteResGroup70], (instregex "DEC8m")>; +def: InstRW<[BWWriteResGroup70], (instregex "INC(16|32|64)m")>; +def: InstRW<[BWWriteResGroup70], (instregex "INC8m")>; +def: InstRW<[BWWriteResGroup70], (instregex "NEG(16|32|64)m")>; +def: InstRW<[BWWriteResGroup70], (instregex "NEG8m")>; +def: InstRW<[BWWriteResGroup70], (instregex "NOT(16|32|64)m")>; +def: InstRW<[BWWriteResGroup70], (instregex "NOT8m")>; +def: InstRW<[BWWriteResGroup70], (instregex "OR(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup70], (instregex "OR(16|32|64)mr")>; +def: InstRW<[BWWriteResGroup70], (instregex "OR8mi")>; +def: InstRW<[BWWriteResGroup70], (instregex "OR8mr")>; +def: InstRW<[BWWriteResGroup70], (instregex "POP(16|32|64)rmm")>; +def: InstRW<[BWWriteResGroup70], (instregex "PUSH(16|32|64)rmm")>; +def: InstRW<[BWWriteResGroup70], (instregex "SUB(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup70], (instregex "SUB(16|32|64)mr")>; +def: InstRW<[BWWriteResGroup70], (instregex "SUB8mi")>; +def: InstRW<[BWWriteResGroup70], (instregex "SUB8mr")>; +def: InstRW<[BWWriteResGroup70], (instregex "XOR(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup70], (instregex "XOR(16|32|64)mr")>; +def: InstRW<[BWWriteResGroup70], (instregex "XOR8mi")>; +def: InstRW<[BWWriteResGroup70], (instregex "XOR8mr")>; + +def BWWriteResGroup71 : SchedWriteRes<[BWPort6,BWPort0156]> { + 
let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,5]; +} +def: InstRW<[BWWriteResGroup71], (instregex "STD")>; + +def BWWriteResGroup72 : SchedWriteRes<[BWPort5]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup72], (instregex "AESDECLASTrr")>; +def: InstRW<[BWWriteResGroup72], (instregex "AESDECrr")>; +def: InstRW<[BWWriteResGroup72], (instregex "AESENCLASTrr")>; +def: InstRW<[BWWriteResGroup72], (instregex "AESENCrr")>; +def: InstRW<[BWWriteResGroup72], (instregex "VAESDECLASTrr")>; +def: InstRW<[BWWriteResGroup72], (instregex "VAESDECrr")>; +def: InstRW<[BWWriteResGroup72], (instregex "VAESENCLASTrr")>; +def: InstRW<[BWWriteResGroup72], (instregex "VAESENCrr")>; + +def BWWriteResGroup73 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup73], (instregex "VPSLLDYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VPSLLQYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VPSLLVQYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VPSLLWYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VPSRADYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VPSRAWYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VPSRLDYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VPSRLQYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VPSRLVQYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VPSRLWYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VTESTPDYrm")>; +def: InstRW<[BWWriteResGroup73], (instregex "VTESTPSYrm")>; + +def BWWriteResGroup74 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup74], (instregex "FCOM32m")>; +def: InstRW<[BWWriteResGroup74], (instregex "FCOM64m")>; +def: InstRW<[BWWriteResGroup74], (instregex "FCOMP32m")>; +def: InstRW<[BWWriteResGroup74], (instregex "FCOMP64m")>; + +def BWWriteResGroup75 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup75], (instregex "VANDNPDYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VANDNPSYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VANDPDYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VANDPSYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VORPDYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VORPSYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPACKSSDWYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPACKSSWBYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPACKUSDWYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPACKUSWBYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPALIGNRYrmi")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPBLENDWYrmi")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPDYmi")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPDYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPSYmi")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPSYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFBYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFDYmi")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFHWYmi")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFLWYmi")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHBWYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHDQYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHQDQYrm")>; +def: 
InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHWDYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLBWYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLDQYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLQDQYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLWDYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VSHUFPDYrmi")>; +def: InstRW<[BWWriteResGroup75], (instregex "VSHUFPSYrmi")>; +def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKHPDYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKHPSYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKLPDYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKLPSYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VXORPDYrm")>; +def: InstRW<[BWWriteResGroup75], (instregex "VXORPSYrm")>; + +def BWWriteResGroup76 : SchedWriteRes<[BWPort23,BWPort15]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup76], (instregex "VPABSBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPABSDYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPABSWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPADDBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPADDDYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPADDQYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPADDSBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPADDSWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPADDUSBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPADDUSWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPADDWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPAVGBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPAVGWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQDYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQQYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTDYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSDYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUDYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMINSBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMINSDYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMINSWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMINUBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMINUDYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPMINUWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNBYrm256")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNDYrm256")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNWYrm256")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPSUBBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPSUBDYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPSUBQYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPSUBSBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPSUBSWYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPSUBUSBYrm")>; +def: InstRW<[BWWriteResGroup76], (instregex "VPSUBUSWYrm")>; +def: InstRW<[BWWriteResGroup76], 
(instregex "VPSUBWYrm")>; + +def BWWriteResGroup77 : SchedWriteRes<[BWPort23,BWPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup77], (instregex "VBLENDPDYrmi")>; +def: InstRW<[BWWriteResGroup77], (instregex "VBLENDPSYrmi")>; +def: InstRW<[BWWriteResGroup77], (instregex "VPANDNYrm")>; +def: InstRW<[BWWriteResGroup77], (instregex "VPANDYrm")>; +def: InstRW<[BWWriteResGroup77], (instregex "VPBLENDDYrmi")>; +def: InstRW<[BWWriteResGroup77], (instregex "VPORYrm")>; +def: InstRW<[BWWriteResGroup77], (instregex "VPXORYrm")>; + +def BWWriteResGroup78 : SchedWriteRes<[BWPort0,BWPort5]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[BWWriteResGroup78], (instregex "MPSADBWrri")>; +def: InstRW<[BWWriteResGroup78], (instregex "VMPSADBWYrri")>; +def: InstRW<[BWWriteResGroup78], (instregex "VMPSADBWrri")>; + +def BWWriteResGroup79 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup79], (instregex "BLENDVPDrm0")>; +def: InstRW<[BWWriteResGroup79], (instregex "BLENDVPSrm0")>; +def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSDWirm")>; +def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSWBirm")>; +def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKUSWBirm")>; +def: InstRW<[BWWriteResGroup79], (instregex "PBLENDVBrm0")>; +def: InstRW<[BWWriteResGroup79], (instregex "VBLENDVPDrm")>; +def: InstRW<[BWWriteResGroup79], (instregex "VBLENDVPSrm")>; +def: InstRW<[BWWriteResGroup79], (instregex "VMASKMOVPDrm")>; +def: InstRW<[BWWriteResGroup79], (instregex "VMASKMOVPSrm")>; +def: InstRW<[BWWriteResGroup79], (instregex "VPBLENDVBrm")>; +def: InstRW<[BWWriteResGroup79], (instregex "VPMASKMOVDrm")>; +def: InstRW<[BWWriteResGroup79], (instregex "VPMASKMOVQrm")>; + +def BWWriteResGroup80 : SchedWriteRes<[BWPort23,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[BWWriteResGroup80], (instregex "LEAVE64")>; +def: InstRW<[BWWriteResGroup80], (instregex "SCASB")>; +def: InstRW<[BWWriteResGroup80], (instregex "SCASL")>; +def: InstRW<[BWWriteResGroup80], (instregex "SCASQ")>; +def: InstRW<[BWWriteResGroup80], (instregex "SCASW")>; + +def BWWriteResGroup81 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup81], (instregex "PSLLDrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "PSLLQrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "PSLLWrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "PSRADrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "PSRAWrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "PSRLDrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "PSRLQrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "PSRLWrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "PTESTrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "VPSLLDrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "VPSLLQrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "VPSLLWrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "VPSRADrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "VPSRAWrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "VPSRLDrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "VPSRLQrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "VPSRLWrm")>; +def: InstRW<[BWWriteResGroup81], (instregex "VPTESTrm")>; + +def BWWriteResGroup82 : 
SchedWriteRes<[BWPort0,BWPort01,BWPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup82], (instregex "FLDCW16m")>; + +def BWWriteResGroup83 : SchedWriteRes<[BWPort0,BWPort23,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup83], (instregex "LDMXCSR")>; +def: InstRW<[BWWriteResGroup83], (instregex "VLDMXCSR")>; + +def BWWriteResGroup84 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup84], (instregex "LRETQ")>; +def: InstRW<[BWWriteResGroup84], (instregex "RETQ")>; + +def BWWriteResGroup85 : SchedWriteRes<[BWPort23,BWPort06,BWPort15]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup85], (instregex "BEXTR(32|64)rm")>; + +def BWWriteResGroup86 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup86], (instregex "CMOVA(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup86], (instregex "CMOVBE(16|32|64)rm")>; + +def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[BWWriteResGroup87], (instregex "ROL(16|32|64)m1")>; +def: InstRW<[BWWriteResGroup87], (instregex "ROL(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup87], (instregex "ROL8m1")>; +def: InstRW<[BWWriteResGroup87], (instregex "ROL8mi")>; +def: InstRW<[BWWriteResGroup87], (instregex "ROR(16|32|64)m1")>; +def: InstRW<[BWWriteResGroup87], (instregex "ROR(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup87], (instregex "ROR8m1")>; +def: InstRW<[BWWriteResGroup87], (instregex "ROR8mi")>; + +def BWWriteResGroup88 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[BWWriteResGroup88], (instregex "XADD(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup88], (instregex "XADD8rm")>; + +def BWWriteResGroup89 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,1,1]; +} +def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m")>; +def: InstRW<[BWWriteResGroup89], (instregex "FARCALL64")>; + +def BWWriteResGroup90 : SchedWriteRes<[BWPort6,BWPort06,BWPort15,BWPort0156]> { + let Latency = 7; + let NumMicroOps = 7; + let ResourceCycles = [2,2,1,2]; +} +def: InstRW<[BWWriteResGroup90], (instregex "LOOP")>; + +def BWWriteResGroup91 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup91], (instregex "ADDPDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "ADDPSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "ADDSDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "ADDSSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "ADDSUBPDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "ADDSUBPSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "BSF(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup91], (instregex "BSR(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup91], (instregex "CMPPDrmi")>; +def: InstRW<[BWWriteResGroup91], (instregex "CMPPSrmi")>; +def: InstRW<[BWWriteResGroup91], (instregex "CMPSDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "CMPSSrm")>; +def: 
InstRW<[BWWriteResGroup91], (instregex "COMISDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "COMISSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "CVTDQ2PSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "CVTPS2DQrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "CVTTPS2DQrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "IMUL64m")>; +def: InstRW<[BWWriteResGroup91], (instregex "IMUL(32|64)rm(i8)?")>; +def: InstRW<[BWWriteResGroup91], (instregex "IMUL8m")>; +def: InstRW<[BWWriteResGroup91], (instregex "LZCNT(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)PDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)PSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)SDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)SSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)PDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)PSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)SDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)SSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPI2PSirm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPS2PIirm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTTPS2PIirm")>; +def: InstRW<[BWWriteResGroup91], (instregex "MUL64m")>; +def: InstRW<[BWWriteResGroup91], (instregex "MUL8m")>; +def: InstRW<[BWWriteResGroup91], (instregex "PDEP(32|64)rm")>; +def: InstRW<[BWWriteResGroup91], (instregex "PEXT(32|64)rm")>; +def: InstRW<[BWWriteResGroup91], (instregex "POPCNT(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup91], (instregex "SUBPDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "SUBPSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "SUBSDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "SUBSSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "TZCNT(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup91], (instregex "UCOMISDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "UCOMISSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VADDPDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VADDPSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VADDSDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VADDSSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VADDSUBPDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VADDSUBPSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VCMPPDrmi")>; +def: InstRW<[BWWriteResGroup91], (instregex "VCMPPSrmi")>; +def: InstRW<[BWWriteResGroup91], (instregex "VCMPSDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VCMPSSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VCOMISDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VCOMISSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VCVTDQ2PSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VCVTPS2DQrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VCVTTPS2DQrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)PDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)PSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)SDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)SSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)PDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)PSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)SDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)SSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VSUBPDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VSUBPSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex 
"VSUBSDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VSUBSSrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VUCOMISDrm")>; +def: InstRW<[BWWriteResGroup91], (instregex "VUCOMISSrm")>; + +def BWWriteResGroup91_16 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup91_16], (instregex "IMUL16rm(i8)?")>; + +def BWWriteResGroup91_16_2 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> { + let Latency = 8; + let NumMicroOps = 5; +} +def: InstRW<[BWWriteResGroup91_16_2], (instregex "IMUL16m")>; +def: InstRW<[BWWriteResGroup91_16_2], (instregex "MUL16m")>; + +def BWWriteResGroup91_32 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup91_32], (instregex "IMUL32m")>; +def: InstRW<[BWWriteResGroup91_32], (instregex "MUL32m")>; + +def BWWriteResGroup92 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBDYrm")>; +def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBQYrm")>; +def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBWYrm")>; +def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXDQYrm")>; +def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXWDYrm")>; +def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXWQYrm")>; +def: InstRW<[BWWriteResGroup92], (instregex "VPMOVZXWDYrm")>; + +def BWWriteResGroup93 : SchedWriteRes<[BWPort01,BWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup93], (instregex "MULPDrm")>; +def: InstRW<[BWWriteResGroup93], (instregex "MULPSrm")>; +def: InstRW<[BWWriteResGroup93], (instregex "MULSDrm")>; +def: InstRW<[BWWriteResGroup93], (instregex "MULSSrm")>; +def: InstRW<[BWWriteResGroup93], (instregex "VMULPDrm")>; +def: InstRW<[BWWriteResGroup93], (instregex "VMULPSrm")>; +def: InstRW<[BWWriteResGroup93], (instregex "VMULSDrm")>; +def: InstRW<[BWWriteResGroup93], (instregex "VMULSSrm")>; + +def BWWriteResGroup94 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup94], (instregex "VBLENDVPDYrm")>; +def: InstRW<[BWWriteResGroup94], (instregex "VBLENDVPSYrm")>; +def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPDYrm")>; +def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPSYrm")>; +def: InstRW<[BWWriteResGroup94], (instregex "VPBLENDVBYrm")>; +def: InstRW<[BWWriteResGroup94], (instregex "VPMASKMOVDYrm")>; +def: InstRW<[BWWriteResGroup94], (instregex "VPMASKMOVQYrm")>; + +def BWWriteResGroup95 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup95], (instregex "VPSLLVDrm")>; +def: InstRW<[BWWriteResGroup95], (instregex "VPSRAVDrm")>; +def: InstRW<[BWWriteResGroup95], (instregex "VPSRLVDrm")>; + +def BWWriteResGroup96 : SchedWriteRes<[BWPort5,BWPort23,BWPort15]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDSWrm64")>; +def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDWrm64")>; +def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDrm64")>; +def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBDrm64")>; +def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBSWrm64")>; +def: InstRW<[BWWriteResGroup96], 
(instregex "MMX_PHSUBWrm64")>; +def: InstRW<[BWWriteResGroup96], (instregex "PHADDDrm")>; +def: InstRW<[BWWriteResGroup96], (instregex "PHADDSWrm128")>; +def: InstRW<[BWWriteResGroup96], (instregex "PHADDWrm")>; +def: InstRW<[BWWriteResGroup96], (instregex "PHSUBDrm")>; +def: InstRW<[BWWriteResGroup96], (instregex "PHSUBSWrm128")>; +def: InstRW<[BWWriteResGroup96], (instregex "PHSUBWrm")>; +def: InstRW<[BWWriteResGroup96], (instregex "VPHADDDrm")>; +def: InstRW<[BWWriteResGroup96], (instregex "VPHADDSWrm128")>; +def: InstRW<[BWWriteResGroup96], (instregex "VPHADDWrm")>; +def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBDrm")>; +def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBSWrm128")>; +def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBWrm")>; + +def BWWriteResGroup97 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[BWWriteResGroup97], (instregex "RCL(16|32|64)m1")>; +def: InstRW<[BWWriteResGroup97], (instregex "RCL(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup97], (instregex "RCL8m1")>; +def: InstRW<[BWWriteResGroup97], (instregex "RCL8mi")>; +def: InstRW<[BWWriteResGroup97], (instregex "RCR(16|32|64)m1")>; +def: InstRW<[BWWriteResGroup97], (instregex "RCR(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup97], (instregex "RCR8m1")>; +def: InstRW<[BWWriteResGroup97], (instregex "RCR8mi")>; + +def BWWriteResGroup98 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,2,1]; +} +def: InstRW<[BWWriteResGroup98], (instregex "ROR(16|32|64)mCL")>; +def: InstRW<[BWWriteResGroup98], (instregex "ROR8mCL")>; + +def BWWriteResGroup99 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,3]; +} +def: InstRW<[BWWriteResGroup99], (instregex "ADC(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup99], (instregex "ADC8mi")>; +def: InstRW<[BWWriteResGroup99], (instregex "ADD8mi")>; +def: InstRW<[BWWriteResGroup99], (instregex "AND8mi")>; +def: InstRW<[BWWriteResGroup99], (instregex "OR8mi")>; +def: InstRW<[BWWriteResGroup99], (instregex "SUB8mi")>; +def: InstRW<[BWWriteResGroup99], (instregex "XCHG(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup99], (instregex "XCHG8rm")>; +def: InstRW<[BWWriteResGroup99], (instregex "XOR8mi")>; + +def BWWriteResGroup100 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def: InstRW<[BWWriteResGroup100], (instregex "ADC(16|32|64)mr")>; +def: InstRW<[BWWriteResGroup100], (instregex "ADC8mr")>; +def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG8rm")>; +def: InstRW<[BWWriteResGroup100], (instregex "ROL(16|32|64)mCL")>; +def: InstRW<[BWWriteResGroup100], (instregex "ROL8mCL")>; +def: InstRW<[BWWriteResGroup100], (instregex "SAR(16|32|64)mCL")>; +def: InstRW<[BWWriteResGroup100], (instregex "SAR8mCL")>; +def: InstRW<[BWWriteResGroup100], (instregex "SBB(16|32|64)mi")>; +def: InstRW<[BWWriteResGroup100], (instregex "SBB(16|32|64)mr")>; +def: InstRW<[BWWriteResGroup100], (instregex "SBB8mi")>; +def: InstRW<[BWWriteResGroup100], (instregex "SBB8mr")>; +def: InstRW<[BWWriteResGroup100], (instregex "SHL(16|32|64)mCL")>; +def: InstRW<[BWWriteResGroup100], (instregex "SHL8mCL")>; +def: InstRW<[BWWriteResGroup100], (instregex "SHR(16|32|64)mCL")>; +def: 
InstRW<[BWWriteResGroup100], (instregex "SHR8mCL")>; + +def BWWriteResGroup101 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup101], (instregex "ADD_F32m")>; +def: InstRW<[BWWriteResGroup101], (instregex "ADD_F64m")>; +def: InstRW<[BWWriteResGroup101], (instregex "ILD_F16m")>; +def: InstRW<[BWWriteResGroup101], (instregex "ILD_F32m")>; +def: InstRW<[BWWriteResGroup101], (instregex "ILD_F64m")>; +def: InstRW<[BWWriteResGroup101], (instregex "SUBR_F32m")>; +def: InstRW<[BWWriteResGroup101], (instregex "SUBR_F64m")>; +def: InstRW<[BWWriteResGroup101], (instregex "SUB_F32m")>; +def: InstRW<[BWWriteResGroup101], (instregex "SUB_F64m")>; +def: InstRW<[BWWriteResGroup101], (instregex "VADDPDYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VADDPSYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VADDSUBPDYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VADDSUBPSYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VCMPPDYrmi")>; +def: InstRW<[BWWriteResGroup101], (instregex "VCMPPSYrmi")>; +def: InstRW<[BWWriteResGroup101], (instregex "VCVTDQ2PSYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VCVTPS2DQYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VCVTTPS2DQYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VMAX(C?)PDYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VMAX(C?)PSYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VMIN(C?)PDYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VMIN(C?)PSYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VSUBPDYrm")>; +def: InstRW<[BWWriteResGroup101], (instregex "VSUBPSYrm")>; + +def BWWriteResGroup102 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup102], (instregex "VPERM2F128rm")>; +def: InstRW<[BWWriteResGroup102], (instregex "VPERM2I128rm")>; +def: InstRW<[BWWriteResGroup102], (instregex "VPERMDYrm")>; +def: InstRW<[BWWriteResGroup102], (instregex "VPERMPDYmi")>; +def: InstRW<[BWWriteResGroup102], (instregex "VPERMPSYrm")>; +def: InstRW<[BWWriteResGroup102], (instregex "VPERMQYmi")>; +def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBDYrm")>; +def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBQYrm")>; +def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBWYrm")>; +def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXDQYrm")>; +def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXWQYrm")>; + +def BWWriteResGroup103 : SchedWriteRes<[BWPort01,BWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup103], (instregex "VMULPDYrm")>; +def: InstRW<[BWWriteResGroup103], (instregex "VMULPSYrm")>; + +def BWWriteResGroup104 : SchedWriteRes<[BWPort0,BWPort1,BWPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup104], (instregex "DPPDrri")>; +def: InstRW<[BWWriteResGroup104], (instregex "VDPPDrri")>; + +def BWWriteResGroup105 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup105], (instregex "CVTSD2SI64rm")>; +def: InstRW<[BWWriteResGroup105], (instregex "CVTSD2SIrm")>; +def: InstRW<[BWWriteResGroup105], (instregex "CVTSS2SI64rm")>; +def: InstRW<[BWWriteResGroup105], (instregex "CVTSS2SIrm")>; +def: InstRW<[BWWriteResGroup105], (instregex "CVTTSD2SI64rm")>; +def: InstRW<[BWWriteResGroup105], 
(instregex "CVTTSD2SIrm")>; +def: InstRW<[BWWriteResGroup105], (instregex "CVTTSS2SIrm")>; +def: InstRW<[BWWriteResGroup105], (instregex "VCVTSD2SI64rm")>; +def: InstRW<[BWWriteResGroup105], (instregex "VCVTSD2SIrm")>; +def: InstRW<[BWWriteResGroup105], (instregex "VCVTSS2SI64rm")>; +def: InstRW<[BWWriteResGroup105], (instregex "VCVTSS2SIrm")>; +def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSD2SI64rm")>; +def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSD2SIrm")>; +def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSS2SI64rm")>; +def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSS2SIrm")>; + +def BWWriteResGroup106 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup106], (instregex "VCVTPS2PDYrm")>; + +def BWWriteResGroup107 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup107], (instregex "CVTDQ2PDrm")>; +def: InstRW<[BWWriteResGroup107], (instregex "CVTPD2DQrm")>; +def: InstRW<[BWWriteResGroup107], (instregex "CVTPD2PSrm")>; +def: InstRW<[BWWriteResGroup107], (instregex "CVTSD2SSrm")>; +def: InstRW<[BWWriteResGroup107], (instregex "CVTTPD2DQrm")>; +def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTPD2PIirm")>; +def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTPI2PDirm")>; +def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTTPD2PIirm")>; +def: InstRW<[BWWriteResGroup107], (instregex "MULX64rm")>; +def: InstRW<[BWWriteResGroup107], (instregex "VCVTDQ2PDrm")>; +def: InstRW<[BWWriteResGroup107], (instregex "VCVTSD2SSrm")>; + +def BWWriteResGroup108 : SchedWriteRes<[BWPort5,BWPort23,BWPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTBYrm")>; +def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTBrm")>; +def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTWYrm")>; +def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTWrm")>; + +def BWWriteResGroup109 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup109], (instregex "VPSLLVDYrm")>; +def: InstRW<[BWWriteResGroup109], (instregex "VPSRAVDYrm")>; +def: InstRW<[BWWriteResGroup109], (instregex "VPSRLVDYrm")>; + +def BWWriteResGroup110 : SchedWriteRes<[BWPort5,BWPort23,BWPort15]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup110], (instregex "VPHADDDYrm")>; +def: InstRW<[BWWriteResGroup110], (instregex "VPHADDSWrm256")>; +def: InstRW<[BWWriteResGroup110], (instregex "VPHADDWYrm")>; +def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBDYrm")>; +def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBSWrm256")>; +def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBWYrm")>; + +def BWWriteResGroup111 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort0156]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup111], (instregex "SHLD(16|32|64)mri8")>; +def: InstRW<[BWWriteResGroup111], (instregex "SHRD(16|32|64)mri8")>; + +def BWWriteResGroup112 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[BWWriteResGroup112], (instregex "RDRAND(16|32|64)r")>; + +def BWWriteResGroup113 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> 
{ + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm")>; +def: InstRW<[BWWriteResGroup113], (instregex "LSL(16|32|64)rm")>; + +def BWWriteResGroup114 : SchedWriteRes<[BWPort0]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup114], (instregex "PMULLDrr")>; +def: InstRW<[BWWriteResGroup114], (instregex "VPMULLDYrr")>; +def: InstRW<[BWWriteResGroup114], (instregex "VPMULLDrr")>; + +def BWWriteResGroup115 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMADDUBSWrm64")>; +def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMADDWDirm")>; +def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHRSWrm64")>; +def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHUWirm")>; +def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHWirm")>; +def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULLWirm")>; +def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULUDQirm")>; +def: InstRW<[BWWriteResGroup115], (instregex "MMX_PSADBWirm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PCLMULQDQrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PCMPGTQrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PHMINPOSUWrm128")>; +def: InstRW<[BWWriteResGroup115], (instregex "PMADDUBSWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PMADDWDrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PMULDQrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PMULHRSWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PMULHUWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PMULHWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PMULLWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PMULUDQrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "PSADBWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "RCPPSm")>; +def: InstRW<[BWWriteResGroup115], (instregex "RCPSSm")>; +def: InstRW<[BWWriteResGroup115], (instregex "RSQRTPSm")>; +def: InstRW<[BWWriteResGroup115], (instregex "RSQRTSSm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPCLMULQDQrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPCMPGTQrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPHMINPOSUWrm128")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPMADDUBSWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPMADDWDrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPMULDQrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPMULHRSWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPMULHUWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPMULHWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPMULLWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPMULUDQrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VPSADBWrm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VRCPPSm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VRCPSSm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VRSQRTPSm")>; +def: InstRW<[BWWriteResGroup115], (instregex "VRSQRTSSm")>; + +def BWWriteResGroup116 : SchedWriteRes<[BWPort01,BWPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup116], + (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m", + "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>; + +def BWWriteResGroup117 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 10; + let NumMicroOps = 
3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup117], (instregex "FICOM16m")>; +def: InstRW<[BWWriteResGroup117], (instregex "FICOM32m")>; +def: InstRW<[BWWriteResGroup117], (instregex "FICOMP16m")>; +def: InstRW<[BWWriteResGroup117], (instregex "FICOMP32m")>; + +def BWWriteResGroup118 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup118], (instregex "VPTESTYrm")>; + +def BWWriteResGroup119 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[BWWriteResGroup119], (instregex "HADDPDrm")>; +def: InstRW<[BWWriteResGroup119], (instregex "HADDPSrm")>; +def: InstRW<[BWWriteResGroup119], (instregex "HSUBPDrm")>; +def: InstRW<[BWWriteResGroup119], (instregex "HSUBPSrm")>; +def: InstRW<[BWWriteResGroup119], (instregex "VHADDPDrm")>; +def: InstRW<[BWWriteResGroup119], (instregex "VHADDPSrm")>; +def: InstRW<[BWWriteResGroup119], (instregex "VHSUBPDrm")>; +def: InstRW<[BWWriteResGroup119], (instregex "VHSUBPSrm")>; + +def BWWriteResGroup120 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup120], (instregex "CVTTSS2SI64rm")>; + +def BWWriteResGroup121 : SchedWriteRes<[BWPort1,BWPort23,BWPort06,BWPort0156]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup121], (instregex "MULX32rm")>; + +def BWWriteResGroup122 : SchedWriteRes<[BWPort0]> { + let Latency = 11; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup122], (instregex "DIVPSrr")>; +def: InstRW<[BWWriteResGroup122], (instregex "DIVSSrr")>; +def: InstRW<[BWWriteResGroup122], (instregex "VDIVPSrr")>; +def: InstRW<[BWWriteResGroup122], (instregex "VDIVSSrr")>; + +def BWWriteResGroup123 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup123], (instregex "MUL_F32m")>; +def: InstRW<[BWWriteResGroup123], (instregex "MUL_F64m")>; +def: InstRW<[BWWriteResGroup123], (instregex "VPCMPGTQYrm")>; +def: InstRW<[BWWriteResGroup123], (instregex "VPMADDUBSWYrm")>; +def: InstRW<[BWWriteResGroup123], (instregex "VPMADDWDYrm")>; +def: InstRW<[BWWriteResGroup123], (instregex "VPMULDQYrm")>; +def: InstRW<[BWWriteResGroup123], (instregex "VPMULHRSWYrm")>; +def: InstRW<[BWWriteResGroup123], (instregex "VPMULHUWYrm")>; +def: InstRW<[BWWriteResGroup123], (instregex "VPMULHWYrm")>; +def: InstRW<[BWWriteResGroup123], (instregex "VPMULLWYrm")>; +def: InstRW<[BWWriteResGroup123], (instregex "VPMULUDQYrm")>; +def: InstRW<[BWWriteResGroup123], (instregex "VPSADBWYrm")>; + +def BWWriteResGroup124 : SchedWriteRes<[BWPort01,BWPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup124], + (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>; + +def BWWriteResGroup125 : SchedWriteRes<[BWPort0]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[BWWriteResGroup125], (instregex "PCMPISTRIrr")>; +def: InstRW<[BWWriteResGroup125], (instregex "PCMPISTRM128rr")>; +def: InstRW<[BWWriteResGroup125], (instregex "VPCMPISTRIrr")>; +def: InstRW<[BWWriteResGroup125], (instregex "VPCMPISTRM128rr")>; + +def BWWriteResGroup126 : SchedWriteRes<[BWPort0,BWPort015]> { + let Latency = 11; + let 
NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup126], (instregex "VRCPPSYr")>; +def: InstRW<[BWWriteResGroup126], (instregex "VRSQRTPSYr")>; + +def BWWriteResGroup127 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup127], (instregex "ROUNDPDm")>; +def: InstRW<[BWWriteResGroup127], (instregex "ROUNDPSm")>; +def: InstRW<[BWWriteResGroup127], (instregex "ROUNDSDm")>; +def: InstRW<[BWWriteResGroup127], (instregex "ROUNDSSm")>; +def: InstRW<[BWWriteResGroup127], (instregex "VROUNDPDm")>; +def: InstRW<[BWWriteResGroup127], (instregex "VROUNDPSm")>; +def: InstRW<[BWWriteResGroup127], (instregex "VROUNDSDm")>; +def: InstRW<[BWWriteResGroup127], (instregex "VROUNDSSm")>; + +def BWWriteResGroup128 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup128], (instregex "VCVTDQ2PDYrm")>; + +def BWWriteResGroup129 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[BWWriteResGroup129], (instregex "VHADDPDYrm")>; +def: InstRW<[BWWriteResGroup129], (instregex "VHADDPSYrm")>; +def: InstRW<[BWWriteResGroup129], (instregex "VHSUBPDYrm")>; +def: InstRW<[BWWriteResGroup129], (instregex "VHSUBPSYrm")>; + +def BWWriteResGroup130 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 11; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,1,2]; +} +def: InstRW<[BWWriteResGroup130], (instregex "SHLD(16|32|64)mrCL")>; +def: InstRW<[BWWriteResGroup130], (instregex "SHRD(16|32|64)mrCL")>; + +def BWWriteResGroup131 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> { + let Latency = 11; + let NumMicroOps = 7; + let ResourceCycles = [2,2,3]; +} +def: InstRW<[BWWriteResGroup131], (instregex "RCL(16|32|64)rCL")>; +def: InstRW<[BWWriteResGroup131], (instregex "RCR(16|32|64)rCL")>; + +def BWWriteResGroup132 : SchedWriteRes<[BWPort1,BWPort06,BWPort15,BWPort0156]> { + let Latency = 11; + let NumMicroOps = 9; + let ResourceCycles = [1,4,1,3]; +} +def: InstRW<[BWWriteResGroup132], (instregex "RCL8rCL")>; + +def BWWriteResGroup133 : SchedWriteRes<[BWPort06,BWPort0156]> { + let Latency = 11; + let NumMicroOps = 11; + let ResourceCycles = [2,9]; +} +def: InstRW<[BWWriteResGroup133], (instregex "LOOPE")>; +def: InstRW<[BWWriteResGroup133], (instregex "LOOPNE")>; + +def BWWriteResGroup134 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup134], (instregex "AESDECLASTrm")>; +def: InstRW<[BWWriteResGroup134], (instregex "AESDECrm")>; +def: InstRW<[BWWriteResGroup134], (instregex "AESENCLASTrm")>; +def: InstRW<[BWWriteResGroup134], (instregex "AESENCrm")>; +def: InstRW<[BWWriteResGroup134], (instregex "VAESDECLASTrm")>; +def: InstRW<[BWWriteResGroup134], (instregex "VAESDECrm")>; +def: InstRW<[BWWriteResGroup134], (instregex "VAESENCLASTrm")>; +def: InstRW<[BWWriteResGroup134], (instregex "VAESENCrm")>; + +def BWWriteResGroup135 : SchedWriteRes<[BWPort1,BWPort23]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup135], (instregex "ADD_FI16m")>; +def: InstRW<[BWWriteResGroup135], (instregex "ADD_FI32m")>; +def: InstRW<[BWWriteResGroup135], (instregex "SUBR_FI16m")>; +def: InstRW<[BWWriteResGroup135], (instregex "SUBR_FI32m")>; +def: 
InstRW<[BWWriteResGroup135], (instregex "SUB_FI16m")>; +def: InstRW<[BWWriteResGroup135], (instregex "SUB_FI32m")>; +def: InstRW<[BWWriteResGroup135], (instregex "VROUNDYPDm")>; +def: InstRW<[BWWriteResGroup135], (instregex "VROUNDYPSm")>; + +def BWWriteResGroup136 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[BWWriteResGroup136], (instregex "MPSADBWrmi")>; +def: InstRW<[BWWriteResGroup136], (instregex "VMPSADBWrmi")>; + +def BWWriteResGroup137 : SchedWriteRes<[BWPort0]> { + let Latency = 13; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup137], (instregex "SQRTPSr")>; +def: InstRW<[BWWriteResGroup137], (instregex "SQRTSSr")>; + +def BWWriteResGroup138 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[BWWriteResGroup138], (instregex "VMPSADBWYrmi")>; + +def BWWriteResGroup139 : SchedWriteRes<[BWPort0]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup139], (instregex "DIVPDrr")>; +def: InstRW<[BWWriteResGroup139], (instregex "DIVSDrr")>; +def: InstRW<[BWWriteResGroup139], (instregex "VDIVPDrr")>; +def: InstRW<[BWWriteResGroup139], (instregex "VDIVSDrr")>; +def: InstRW<[BWWriteResGroup139], (instregex "VSQRTPSr")>; +def: InstRW<[BWWriteResGroup139], (instregex "VSQRTSSr")>; + +def BWWriteResGroup140 : SchedWriteRes<[BWPort5]> { + let Latency = 14; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[BWWriteResGroup140], (instregex "AESIMCrr")>; +def: InstRW<[BWWriteResGroup140], (instregex "VAESIMCrr")>; + +def BWWriteResGroup141 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI16m")>; +def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI32m")>; + +def BWWriteResGroup142 : SchedWriteRes<[BWPort0,BWPort1,BWPort5]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup142], (instregex "DPPSrri")>; +def: InstRW<[BWWriteResGroup142], (instregex "VDPPSYrri")>; +def: InstRW<[BWWriteResGroup142], (instregex "VDPPSrri")>; + +def BWWriteResGroup143 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[BWWriteResGroup143], (instregex "DPPDrmi")>; +def: InstRW<[BWWriteResGroup143], (instregex "VDPPDrmi")>; + +def BWWriteResGroup144 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> { + let Latency = 14; + let NumMicroOps = 8; + let ResourceCycles = [2,2,1,3]; +} +def: InstRW<[BWWriteResGroup144], (instregex "LAR(16|32|64)rr")>; + +def BWWriteResGroup145 : SchedWriteRes<[BWPort1,BWPort06,BWPort15,BWPort0156]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [2,3,1,4]; +} +def: InstRW<[BWWriteResGroup145], (instregex "RCR8rCL")>; + +def BWWriteResGroup146 : SchedWriteRes<[BWPort0,BWPort1,BWPort6,BWPort0156]> { + let Latency = 14; + let NumMicroOps = 12; + let ResourceCycles = [2,1,4,5]; +} +def: InstRW<[BWWriteResGroup146], (instregex "XCH_F")>; + +def BWWriteResGroup147 : SchedWriteRes<[BWPort0]> { + let Latency = 15; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FPrST0")>; +def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FST0r")>; +def: 
InstRW<[BWWriteResGroup147], (instregex "DIVR_FrST0")>; + +def BWWriteResGroup148 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 15; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup148], (instregex "PMULLDrm")>; +def: InstRW<[BWWriteResGroup148], (instregex "VPMULLDrm")>; + +def BWWriteResGroup149 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> { + let Latency = 15; + let NumMicroOps = 10; + let ResourceCycles = [1,1,1,4,1,2]; +} +def: InstRW<[BWWriteResGroup149], (instregex "RCL(16|32|64)mCL")>; +def: InstRW<[BWWriteResGroup149], (instregex "RCL8mCL")>; + +def BWWriteResGroup150 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 16; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup150], (instregex "DIVPSrm")>; +def: InstRW<[BWWriteResGroup150], (instregex "DIVSSrm")>; +def: InstRW<[BWWriteResGroup150], (instregex "VDIVPSrm")>; +def: InstRW<[BWWriteResGroup150], (instregex "VDIVSSrm")>; + +def BWWriteResGroup151 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 16; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup151], (instregex "VPMULLDYrm")>; + +def BWWriteResGroup152 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 16; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} +def: InstRW<[BWWriteResGroup152], (instregex "PCMPISTRIrm")>; +def: InstRW<[BWWriteResGroup152], (instregex "PCMPISTRM128rm")>; +def: InstRW<[BWWriteResGroup152], (instregex "VPCMPISTRIrm")>; +def: InstRW<[BWWriteResGroup152], (instregex "VPCMPISTRM128rm")>; + +def BWWriteResGroup153 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> { + let Latency = 16; + let NumMicroOps = 14; + let ResourceCycles = [1,1,1,4,2,5]; +} +def: InstRW<[BWWriteResGroup153], (instregex "CMPXCHG8B")>; + +def BWWriteResGroup154 : SchedWriteRes<[BWPort5]> { + let Latency = 16; + let NumMicroOps = 16; + let ResourceCycles = [16]; +} +def: InstRW<[BWWriteResGroup154], (instregex "VZEROALL")>; + +def BWWriteResGroup155 : SchedWriteRes<[BWPort0,BWPort015]> { + let Latency = 17; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup155], (instregex "VDIVPSYrr")>; + +def BWWriteResGroup156 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup156], (instregex "VRCPPSYm")>; +def: InstRW<[BWWriteResGroup156], (instregex "VRSQRTPSYm")>; + +def BWWriteResGroup157 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 18; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup157], (instregex "SQRTPSm")>; +def: InstRW<[BWWriteResGroup157], (instregex "SQRTSSm")>; + +def BWWriteResGroup158 : SchedWriteRes<[BWPort0,BWPort5,BWPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [4,3,1]; +} +def: InstRW<[BWWriteResGroup158], (instregex "PCMPESTRIrr")>; +def: InstRW<[BWWriteResGroup158], (instregex "VPCMPESTRIrr")>; + +def BWWriteResGroup159 : SchedWriteRes<[BWPort5,BWPort6,BWPort06,BWPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,5]; +} +def: InstRW<[BWWriteResGroup159], (instregex "CPUID")>; +def: InstRW<[BWWriteResGroup159], (instregex "RDTSC")>; + +def BWWriteResGroup160 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> { + let Latency = 18; + let NumMicroOps = 11; + let ResourceCycles = [2,1,1,3,1,3]; +} +def: 
InstRW<[BWWriteResGroup160], (instregex "RCR(16|32|64)mCL")>; +def: InstRW<[BWWriteResGroup160], (instregex "RCR8mCL")>; + +def BWWriteResGroup161 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 19; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup161], (instregex "DIVPDrm")>; +def: InstRW<[BWWriteResGroup161], (instregex "DIVSDrm")>; +def: InstRW<[BWWriteResGroup161], (instregex "VDIVPDrm")>; +def: InstRW<[BWWriteResGroup161], (instregex "VDIVSDrm")>; +def: InstRW<[BWWriteResGroup161], (instregex "VSQRTPSm")>; +def: InstRW<[BWWriteResGroup161], (instregex "VSQRTSSm")>; + +def BWWriteResGroup162 : SchedWriteRes<[BWPort5,BWPort23]> { + let Latency = 19; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup162], (instregex "AESIMCrm")>; +def: InstRW<[BWWriteResGroup162], (instregex "VAESIMCrm")>; + +def BWWriteResGroup163 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> { + let Latency = 19; + let NumMicroOps = 5; + let ResourceCycles = [2,1,1,1]; +} +def: InstRW<[BWWriteResGroup163], (instregex "DPPSrmi")>; +def: InstRW<[BWWriteResGroup163], (instregex "VDPPSrmi")>; + +def BWWriteResGroup164 : SchedWriteRes<[BWPort0,BWPort5,BWPort015,BWPort0156]> { + let Latency = 19; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[BWWriteResGroup164], (instregex "PCMPESTRM128rr")>; +def: InstRW<[BWWriteResGroup164], (instregex "VPCMPESTRM128rr")>; + +def BWWriteResGroup165 : SchedWriteRes<[BWPort0]> { + let Latency = 20; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup165], (instregex "DIV_FPrST0")>; +def: InstRW<[BWWriteResGroup165], (instregex "DIV_FST0r")>; +def: InstRW<[BWWriteResGroup165], (instregex "DIV_FrST0")>; +def: InstRW<[BWWriteResGroup165], (instregex "SQRTPDr")>; +def: InstRW<[BWWriteResGroup165], (instregex "SQRTSDr")>; + +def BWWriteResGroup166 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> { + let Latency = 20; + let NumMicroOps = 5; + let ResourceCycles = [2,1,1,1]; +} +def: InstRW<[BWWriteResGroup166], (instregex "VDPPSYrmi")>; + +def BWWriteResGroup167 : SchedWriteRes<[BWPort4,BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 20; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,1,2]; +} +def: InstRW<[BWWriteResGroup167], (instregex "INSB")>; +def: InstRW<[BWWriteResGroup167], (instregex "INSL")>; +def: InstRW<[BWWriteResGroup167], (instregex "INSW")>; + +def BWWriteResGroup168 : SchedWriteRes<[BWPort0]> { + let Latency = 21; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[BWWriteResGroup168], (instregex "VSQRTPDr")>; +def: InstRW<[BWWriteResGroup168], (instregex "VSQRTSDr")>; + +def BWWriteResGroup169 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 21; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup169], (instregex "DIV_F32m")>; +def: InstRW<[BWWriteResGroup169], (instregex "DIV_F64m")>; + +def BWWriteResGroup170 : SchedWriteRes<[BWPort0,BWPort015]> { + let Latency = 21; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup170], (instregex "VSQRTPSYr")>; + +def BWWriteResGroup171 : SchedWriteRes<[BWPort0,BWPort4,BWPort5,BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 21; + let NumMicroOps = 19; + let ResourceCycles = [2,1,4,1,1,4,6]; +} +def: InstRW<[BWWriteResGroup171], (instregex "CMPXCHG16B")>; + +def BWWriteResGroup172 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> { + let Latency = 22; + let NumMicroOps 
= 18; + let ResourceCycles = [1,1,16]; +} +def: InstRW<[BWWriteResGroup172], (instregex "POPF64")>; + +def BWWriteResGroup173 : SchedWriteRes<[BWPort0,BWPort015]> { + let Latency = 23; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup173], (instregex "VDIVPDYrr")>; + +def BWWriteResGroup174 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> { + let Latency = 23; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup174], (instregex "VDIVPSYrm")>; + +def BWWriteResGroup175 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort0156]> { + let Latency = 23; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[BWWriteResGroup175], (instregex "PCMPESTRIrm")>; +def: InstRW<[BWWriteResGroup175], (instregex "VPCMPESTRIrm")>; + +def BWWriteResGroup176 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> { + let Latency = 23; + let NumMicroOps = 19; + let ResourceCycles = [3,1,15]; +} +def: InstRW<[BWWriteResGroup176], (instregex "XRSTOR(64)?")>; + +def BWWriteResGroup177 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> { + let Latency = 24; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI16m")>; +def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI32m")>; + +def BWWriteResGroup178 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort015,BWPort0156]> { + let Latency = 24; + let NumMicroOps = 10; + let ResourceCycles = [4,3,1,1,1]; +} +def: InstRW<[BWWriteResGroup178], (instregex "PCMPESTRM128rm")>; +def: InstRW<[BWWriteResGroup178], (instregex "VPCMPESTRM128rm")>; + +def BWWriteResGroup179 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 25; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup179], (instregex "SQRTPDm")>; +def: InstRW<[BWWriteResGroup179], (instregex "SQRTSDm")>; + +def BWWriteResGroup180 : SchedWriteRes<[BWPort0,BWPort23]> { + let Latency = 26; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F32m")>; +def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F64m")>; +def: InstRW<[BWWriteResGroup180], (instregex "VSQRTPDm")>; +def: InstRW<[BWWriteResGroup180], (instregex "VSQRTSDm")>; + +def BWWriteResGroup181 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> { + let Latency = 27; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup181], (instregex "VSQRTPSYm")>; + +def BWWriteResGroup182 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> { + let Latency = 29; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI16m")>; +def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI32m")>; + +def BWWriteResGroup183 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> { + let Latency = 29; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup183], (instregex "VDIVPDYrm")>; + +def BWWriteResGroup183_1 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 22; + let NumMicroOps = 7; + let ResourceCycles = [1,3,2,1]; +} +def: InstRW<[BWWriteResGroup183_1], (instrs VGATHERQPDrm)>; + +def BWWriteResGroup183_2 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 23; + let NumMicroOps = 9; + let ResourceCycles = [1,3,4,1]; +} +def: InstRW<[BWWriteResGroup183_2], (instrs VGATHERQPDYrm)>; + +def BWWriteResGroup183_3 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 24; + let NumMicroOps = 9; + let ResourceCycles 
= [1,5,2,1]; +} +def: InstRW<[BWWriteResGroup183_3], (instrs VGATHERQPSYrm)>; + +def BWWriteResGroup183_4 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 25; + let NumMicroOps = 7; + let ResourceCycles = [1,3,2,1]; +} +def: InstRW<[BWWriteResGroup183_4], (instrs VGATHERDPDrm, + VGATHERDPSrm)>; + +def BWWriteResGroup183_5 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 26; + let NumMicroOps = 9; + let ResourceCycles = [1,5,2,1]; +} +def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPDYrm)>; + +def BWWriteResGroup183_6 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 26; + let NumMicroOps = 14; + let ResourceCycles = [1,4,8,1]; +} +def: InstRW<[BWWriteResGroup183_6], (instrs VGATHERDPSYrm)>; + +def BWWriteResGroup183_7 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> { + let Latency = 27; + let NumMicroOps = 9; + let ResourceCycles = [1,5,2,1]; +} +def: InstRW<[BWWriteResGroup183_7], (instrs VGATHERQPSrm)>; + +def BWWriteResGroup184 : SchedWriteRes<[BWPort0,BWPort5,BWPort015]> { + let Latency = 29; + let NumMicroOps = 11; + let ResourceCycles = [2,7,2]; +} +def: InstRW<[BWWriteResGroup184], (instregex "AESKEYGENASSIST128rr")>; +def: InstRW<[BWWriteResGroup184], (instregex "VAESKEYGENASSIST128rr")>; + +def BWWriteResGroup185 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> { + let Latency = 29; + let NumMicroOps = 27; + let ResourceCycles = [1,5,1,1,19]; +} +def: InstRW<[BWWriteResGroup185], (instregex "XSAVE64")>; + +def BWWriteResGroup186 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> { + let Latency = 30; + let NumMicroOps = 28; + let ResourceCycles = [1,6,1,1,19]; +} +def: InstRW<[BWWriteResGroup186], (instregex "XSAVE(OPT)?")>; + +def BWWriteResGroup187 : SchedWriteRes<[BWPort01,BWPort15,BWPort015,BWPort0156]> { + let Latency = 31; + let NumMicroOps = 31; + let ResourceCycles = [8,1,21,1]; +} +def: InstRW<[BWWriteResGroup187], (instregex "MMX_EMMS")>; + +def BWWriteResGroup188 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort015]> { + let Latency = 33; + let NumMicroOps = 11; + let ResourceCycles = [2,7,1,1]; +} +def: InstRW<[BWWriteResGroup188], (instregex "AESKEYGENASSIST128rm")>; +def: InstRW<[BWWriteResGroup188], (instregex "VAESKEYGENASSIST128rm")>; + +def BWWriteResGroup189 : SchedWriteRes<[BWPort0,BWPort015]> { + let Latency = 34; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[BWWriteResGroup189], (instregex "VSQRTPDYr")>; + +def BWWriteResGroup190 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156]> { + let Latency = 34; + let NumMicroOps = 8; + let ResourceCycles = [2,2,2,1,1]; +} +def: InstRW<[BWWriteResGroup190], (instregex "DIV(16|32|64)m")>; +def: InstRW<[BWWriteResGroup190], (instregex "DIV8m")>; + +def BWWriteResGroup191 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort06,BWPort0156]> { + let Latency = 34; + let NumMicroOps = 23; + let ResourceCycles = [1,5,3,4,10]; +} +def: InstRW<[BWWriteResGroup191], (instregex "IN(16|32)ri")>; +def: InstRW<[BWWriteResGroup191], (instregex "IN(16|32)rr")>; +def: InstRW<[BWWriteResGroup191], (instregex "IN8ri")>; +def: InstRW<[BWWriteResGroup191], (instregex "IN8rr")>; + +def BWWriteResGroup193 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156]> { + let Latency = 35; + let NumMicroOps = 8; + let ResourceCycles = [2,2,2,1,1]; +} +def: InstRW<[BWWriteResGroup193], (instregex "IDIV(16|32|64)m")>; +def: InstRW<[BWWriteResGroup193], (instregex "IDIV8m")>; + +def 
BWWriteResGroup194 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,2,1,4,10]; +} +def: InstRW<[BWWriteResGroup194], (instregex "OUT(16|32)ir")>; +def: InstRW<[BWWriteResGroup194], (instregex "OUT(16|32)rr")>; +def: InstRW<[BWWriteResGroup194], (instregex "OUT8ir")>; +def: InstRW<[BWWriteResGroup194], (instregex "OUT8rr")>; + +def BWWriteResGroup195 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> { + let Latency = 40; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[BWWriteResGroup195], (instregex "VSQRTPDYm")>; + +def BWWriteResGroup196 : SchedWriteRes<[BWPort5,BWPort0156]> { + let Latency = 42; + let NumMicroOps = 22; + let ResourceCycles = [2,20]; +} +def: InstRW<[BWWriteResGroup196], (instregex "RDTSCP")>; + +def BWWriteResGroup197 : SchedWriteRes<[BWPort0,BWPort01,BWPort23,BWPort05,BWPort06,BWPort015,BWPort0156]> { + let Latency = 60; + let NumMicroOps = 64; + let ResourceCycles = [2,2,8,1,10,2,39]; +} +def: InstRW<[BWWriteResGroup197], (instregex "FLDENVm")>; + +def BWWriteResGroup198 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> { + let Latency = 63; + let NumMicroOps = 88; + let ResourceCycles = [4,4,31,1,2,1,45]; +} +def: InstRW<[BWWriteResGroup198], (instregex "FXRSTOR64")>; + +def BWWriteResGroup199 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> { + let Latency = 63; + let NumMicroOps = 90; + let ResourceCycles = [4,2,33,1,2,1,47]; +} +def: InstRW<[BWWriteResGroup199], (instregex "FXRSTOR")>; + +def BWWriteResGroup200 : SchedWriteRes<[BWPort5,BWPort01,BWPort0156]> { + let Latency = 75; + let NumMicroOps = 15; + let ResourceCycles = [6,3,6]; +} +def: InstRW<[BWWriteResGroup200], (instregex "FNINIT")>; + +def BWWriteResGroup201 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156]> { + let Latency = 80; + let NumMicroOps = 32; + let ResourceCycles = [7,7,3,3,1,11]; +} +def: InstRW<[BWWriteResGroup201], (instregex "DIV(16|32|64)r")>; + +def BWWriteResGroup202 : SchedWriteRes<[BWPort0,BWPort1,BWPort4,BWPort5,BWPort6,BWPort237,BWPort06,BWPort0156]> { + let Latency = 115; + let NumMicroOps = 100; + let ResourceCycles = [9,9,11,8,1,11,21,30]; +} +def: InstRW<[BWWriteResGroup202], (instregex "FSTENVm")>; + +} // SchedModel + diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td index 03c8ccb53afe..46612554b1fa 100644 --- a/lib/Target/X86/X86SchedHaswell.td +++ b/lib/Target/X86/X86SchedHaswell.td @@ -17,14 +17,14 @@ def HaswellModel : SchedMachineModel { // instructions per cycle. let IssueWidth = 4; let MicroOpBufferSize = 192; // Based on the reorder buffer. - let LoadLatency = 4; + let LoadLatency = 5; let MispredictPenalty = 16; // Based on the LSD (loop-stream detector) queue size and benchmarking data. let LoopMicroOpBufferSize = 50; - // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow - // the scheduler to assign a default model to unrecognized opcodes. + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. let CompleteModel = 0; } @@ -70,9 +70,9 @@ def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4, // Integer division issued on port 0.
def HWDivider : ProcResource<1>; -// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4 +// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 // cycles after the memory operand. -def : ReadAdvance<ReadAfterLd, 4>; +def : ReadAdvance<ReadAfterLd, 5>; // Many SchedWrites are defined in pairs with and without a folded load. // Instructions with folded loads are usually micro-fused, so they only appear @@ -85,10 +85,10 @@ multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW, // Register variant is using a single cycle on ExePort. def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } - // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the + // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the // latency. def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> { - let Latency = !add(Lat, 4); + let Latency = !add(Lat, 5); } } @@ -99,7 +99,7 @@ def : WriteRes<WriteRMW, [HWPort4]>; // Store_addr on 237. // Store_data on 4. def : WriteRes<WriteStore, [HWPort237, HWPort4]>; -def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; } +def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 5; } def : WriteRes<WriteMove, [HWPort0156]>; def : WriteRes<WriteZero, []>; @@ -134,6 +134,7 @@ defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>; defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>; defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>; defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>; +defm : HWWriteResPair<WriteFMA, HWPort01, 5>; defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>; defm : HWWriteResPair<WriteFBlend, HWPort015, 1>; defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>; @@ -434,31 +435,7 @@ def : InstRW<[WriteALULd], (instregex "MOV16rm")>; // MOVSX, MOVZX. // r,m. -def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>; - -// CMOVcc. -// r,r. -def : InstRW<[Write2P0156_Lat2], - (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>; -// r,m. -def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], - (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>; - -// XCHG. -// r,r. -def WriteXCHG : SchedWriteRes<[HWPort0156]> { - let Latency = 2; - let ResourceCycles = [3]; -} - -def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>; - -// r,m. -def WriteXCHGrm : SchedWriteRes<[]> { - let Latency = 21; - let NumMicroOps = 8; -} -def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>; +def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm8")>; // XLAT. def WriteXLAT : SchedWriteRes<[]> { @@ -471,12 +448,6 @@ def : InstRW<[WriteXLAT], (instregex "XLAT")>; // m. def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>; -// PUSHF. -def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> { - let NumMicroOps = 4; -} -def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>; - // PUSHA. def WritePushA : SchedWriteRes<[]> { let NumMicroOps = 19; @@ -487,178 +458,14 @@ def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>; // m. def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>; -// POPF. -def WritePopF : SchedWriteRes<[]> { - let NumMicroOps = 9; -} -def : InstRW<[WritePopF], (instregex "POPF(16|32)")>; - // POPA. def WritePopA : SchedWriteRes<[]> { let NumMicroOps = 18; } def : InstRW<[WritePopA], (instregex "POPA(16|32)")>; -// LAHF SAHF. -def : InstRW<[WriteP06], (instregex "(S|L)AHF")>; - -// BSWAP. -// r32. -def WriteBSwap32 : SchedWriteRes<[HWPort15]>; -def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>; - -// r64. 
-def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> { - let NumMicroOps = 2; -} -def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>; - -// MOVBE. -// r16,m16 / r64,m64. -def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>; - -// r32, m32. -def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> { - let NumMicroOps = 2; -} -def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>; - -// m16,r16. -def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>; - -// m32,r32. -def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>; - -// m64,r64. -def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> { - let NumMicroOps = 4; -} -def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>; - //-- Arithmetic instructions --// -// ADD SUB. -// m,r/i. -def : InstRW<[Write2P0156_2P237_P4], - (instregex "(ADD|SUB)(8|16|32|64)m(r|i)", - "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>; - -// ADC SBB. -// r,r/i. -def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)", - "(ADC|SBB)(16|32|64)ri8", - "(ADC|SBB)64ri32", - "(ADC|SBB)(8|16|32|64)rr_REV")>; - -// r,m. -def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>; - -// m,r/i. -def : InstRW<[Write3P0156_2P237_P4], - (instregex "(ADC|SBB)(8|16|32|64)m(r|i)", - "(ADC|SBB)(16|32|64)mi8", - "(ADC|SBB)64mi32")>; - -// INC DEC NOT NEG. -// m. -def : InstRW<[WriteP0156_2P237_P4], - (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m", - "(INC|DEC)64(16|32)m")>; - -// MUL IMUL. -// r16. -def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> { - let Latency = 4; - let NumMicroOps = 4; -} -def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>; - -// m16. -def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { - let Latency = 8; - let NumMicroOps = 5; -} -def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>; - -// r32. -def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> { - let Latency = 4; - let NumMicroOps = 3; -} -def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>; - -// m32. -def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { - let Latency = 8; - let NumMicroOps = 4; -} -def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>; - -// r64. -def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> { - let Latency = 3; - let NumMicroOps = 2; -} -def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>; - -// m64. -def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { - let Latency = 7; - let NumMicroOps = 3; -} -def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>; - -// r16,r16. -def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> { - let Latency = 4; - let NumMicroOps = 2; -} -def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>; - -// r16,m16. -def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> { - let Latency = 8; - let NumMicroOps = 3; -} -def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>; - -// MULX. -// r32,r32,r32. -def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> { - let Latency = 4; - let NumMicroOps = 3; - let ResourceCycles = [1, 2]; -} -def : InstRW<[WriteMulX32], (instregex "MULX32rr")>; - -// r32,r32,m32. 
-def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> { - let Latency = 8; - let NumMicroOps = 4; - let ResourceCycles = [1, 2, 1]; -} -def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>; - -// r64,r64,r64. -def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> { - let Latency = 4; - let NumMicroOps = 2; -} -def : InstRW<[WriteMulX64], (instregex "MULX64rr")>; - -// r64,r64,m64. -def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> { - let Latency = 8; - let NumMicroOps = 3; -} -def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>; - // DIV. // r8. def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { @@ -667,27 +474,6 @@ def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { } def : InstRW<[WriteDiv8], (instregex "DIV8r")>; -// r16. -def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 23; - let NumMicroOps = 10; -} -def : InstRW<[WriteDiv16], (instregex "DIV16r")>; - -// r32. -def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 22; - let NumMicroOps = 10; -} -def : InstRW<[WriteDiv32], (instregex "DIV32r")>; - -// r64. -def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 32; - let NumMicroOps = 36; -} -def : InstRW<[WriteDiv64], (instregex "DIV64r")>; - // IDIV. // r8. def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { @@ -696,259 +482,23 @@ def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { } def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>; -// r16. -def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 23; - let NumMicroOps = 10; -} -def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>; - -// r32. -def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 22; - let NumMicroOps = 9; -} -def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>; - -// r64. -def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> { - let Latency = 39; - let NumMicroOps = 59; -} -def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>; - -//-- Logic instructions --// - -// AND OR XOR. -// m,r/i. -def : InstRW<[Write2P0156_2P237_P4], - (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)", - "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>; - -// SHR SHL SAR. -// m,i. -def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; -} -def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>; - -// r,cl. -def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>; - -// m,cl. -def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> { - let NumMicroOps = 6; - let ResourceCycles = [3, 2, 1]; -} -def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>; - -// ROR ROL. -// r,1. -def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>; - -// m,i. -def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { - let NumMicroOps = 5; - let ResourceCycles = [2, 2, 1]; -} -def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>; - -// r,cl. -def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>; - -// m,cl. -def WriteRotateRMWCL : SchedWriteRes<[]> { - let NumMicroOps = 6; -} -def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>; - -// RCR RCL. -// r,1. 
-def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> { - let Latency = 2; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; -} -def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>; - -// m,1. -def WriteRCm1 : SchedWriteRes<[]> { - let NumMicroOps = 6; -} -def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>; - -// r,i. -def WriteRCri : SchedWriteRes<[HWPort0156]> { - let Latency = 6; - let NumMicroOps = 8; -} -def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>; - -// m,i. -def WriteRCmi : SchedWriteRes<[]> { - let NumMicroOps = 11; -} -def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>; - -// SHRD SHLD. -// r,r,i. -def WriteShDrr : SchedWriteRes<[HWPort1]> { - let Latency = 3; -} -def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>; - -// m,r,i. -def WriteShDmr : SchedWriteRes<[]> { - let NumMicroOps = 5; -} -def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>; - -// r,r,cl. -def WriteShlDCL : SchedWriteRes<[HWPort0156]> { - let Latency = 3; - let NumMicroOps = 4; -} -def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>; - -// r,r,cl. -def WriteShrDCL : SchedWriteRes<[HWPort0156]> { - let Latency = 4; - let NumMicroOps = 4; -} -def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>; - -// m,r,cl. -def WriteShDmrCL : SchedWriteRes<[]> { - let NumMicroOps = 7; -} -def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>; - // BT. -// r,r/i. -def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>; - // m,r. def WriteBTmr : SchedWriteRes<[]> { let NumMicroOps = 10; } def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>; -// m,i. -def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>; - // BTR BTS BTC. -// r,r,i. -def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>; - // m,r. def WriteBTRSCmr : SchedWriteRes<[]> { let NumMicroOps = 11; } def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>; -// m,i. -def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>; - -// BSF BSR. -// r,r. -def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>; -// r,m. -def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>; - -// SETcc. -// r. -def : InstRW<[WriteShift], - (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>; -// m. -def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteSetCCm], - (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>; - -// CLD STD. -def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>; - -// LZCNT TZCNT. -// r,r. -def : InstRW<[WriteP1_Lat3], (instregex "(L|TZCNT)(16|32|64)rr")>; -// r,m. -def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|TZCNT)(16|32|64)rm")>; - -// ANDN. -// r,r. -def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>; -// r,m. -def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>; - -// BLSI BLSMSK BLSR. -// r,r. -def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>; -// r,m. -def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>; - -// BEXTR. -// r,r,r. -def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>; -// r,m,r. -def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>; - -// BZHI. -// r,r,r. -def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>; -// r,m,r. -def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>; - -// PDEP PEXT. -// r,r,r. 
-def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; -// r,m,r. -def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; - //-- Control transfer instructions --// -// J(E|R)CXZ. -def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> { - let NumMicroOps = 2; -} -def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>; - -// LOOP. -def WriteLOOP : SchedWriteRes<[]> { - let NumMicroOps = 7; -} -def : InstRW<[WriteLOOP], (instregex "LOOP")>; - -// LOOP(N)E -def WriteLOOPE : SchedWriteRes<[]> { - let NumMicroOps = 11; -} -def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>; - // CALL. -// r. -def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>; - -// m. -def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; -} -def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>; - -// RET. -def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> { - let NumMicroOps = 2; -} -def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>; - // i. def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> { let NumMicroOps = 4; @@ -977,12 +527,6 @@ def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>; // LODSD/Q. def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>; -// STOS. -def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>; - // MOVS. def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> { let Latency = 4; @@ -991,9 +535,6 @@ def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> { } def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>; -// SCAS. -def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>; - // CMPS. def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> { let Latency = 4; @@ -1002,57 +543,9 @@ def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> { } def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>; -//-- Synchronization instructions --// - -// XADD. -def WriteXADD : SchedWriteRes<[]> { - let NumMicroOps = 5; -} -def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>; - -// CMPXCHG. -def WriteCMPXCHG : SchedWriteRes<[]> { - let NumMicroOps = 6; -} -def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>; - -// CMPXCHG8B. -def WriteCMPXCHG8B : SchedWriteRes<[]> { - let NumMicroOps = 15; -} -def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>; - -// CMPXCHG16B. -def WriteCMPXCHG16B : SchedWriteRes<[]> { - let NumMicroOps = 22; -} -def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>; - //-- Other --// -// PAUSE. -def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> { - let NumMicroOps = 5; - let ResourceCycles = [1, 3]; -} -def : InstRW<[WritePAUSE], (instregex "PAUSE")>; - -// LEAVE. -def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>; - -// XGETBV. -def WriteXGETBV : SchedWriteRes<[]> { - let NumMicroOps = 8; -} -def : InstRW<[WriteXGETBV], (instregex "XGETBV")>; - -// RDTSC. -def WriteRDTSC : SchedWriteRes<[]> { - let NumMicroOps = 15; -} -def : InstRW<[WriteRDTSC], (instregex "RDTSC")>; - -// RDPMC. +// RDPMC.f def WriteRDPMC : SchedWriteRes<[]> { let NumMicroOps = 34; } @@ -1072,13 +565,6 @@ def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>; // m80. 
def : InstRW<[WriteP01], (instregex "LD_Frr")>; -def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> { - let Latency = 4; - let NumMicroOps = 4; - let ResourceCycles = [2, 2]; -} -def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>; - // FBLD. // m80. def WriteFBLD : SchedWriteRes<[]> { @@ -1091,84 +577,12 @@ def : InstRW<[WriteFBLD], (instregex "FBLDm")>; // r. def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>; -// m80. -def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> { - let NumMicroOps = 7; - let ResourceCycles = [3, 2, 2]; -} -def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>; - -// FBSTP. -// m80. -def WriteFBSTP : SchedWriteRes<[]> { - let NumMicroOps = 226; -} -def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>; - -// FXCHG. -def : InstRW<[WriteNop], (instregex "XCH_F")>; - -// FILD. -def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> { - let Latency = 6; - let NumMicroOps = 2; -} -def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>; - -// FIST(P) FISTTP. -def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> { - let Latency = 7; - let NumMicroOps = 3; -} -def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>; - // FLDZ. def : InstRW<[WriteP01], (instregex "LD_F0")>; -// FLD1. -def : InstRW<[Write2P01], (instregex "LD_F1")>; - // FLDPI FLDL2E etc. def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>; -// FCMOVcc. -def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> { - let Latency = 2; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; -} -def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>; - -// FNSTSW. -// AX. -def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> { - let NumMicroOps = 2; -} -def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>; - -// m16. -def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> { - let Latency = 6; - let NumMicroOps = 3; -} -def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>; - -// FLDCW. -def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> { - let Latency = 7; - let NumMicroOps = 3; -} -def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>; - -// FNSTCW. -def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> { - let NumMicroOps = 3; -} -def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>; - -// FINCSTP FDECSTP. -def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>; - // FFREE. def : InstRW<[WriteP01], (instregex "FFREE")>; @@ -1192,13 +606,6 @@ def : InstRW<[WriteP0], (instregex "ABS_F")>; // FCHS. def : InstRW<[WriteP0], (instregex "CHS_F")>; -// FCOM(P) FUCOM(P). -// r. -def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr", - "UCOM_FPr")>; -// m. -def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>; - // FCOMPP FUCOMPP. // r. def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>; @@ -1208,9 +615,6 @@ def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>; def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr", "UCOM_FIPr")>; -// FICOM(P). -def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>; - // FTST. def : InstRW<[WriteP1], (instregex "TST_F")>; @@ -1271,910 +675,3693 @@ def WriteFNINIT : SchedWriteRes<[]> { } def : InstRW<[WriteFNINIT], (instregex "FNINIT")>; -//=== Integer MMX and XMM Instructions ===// -//-- Move instructions --// +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. 
+//////////////////////////////////////////////////////////////////////////////// -// MOVD. -// r32/64 <- (x)mm. -def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr", - "VMOVPDI2DIrr", "MOVPDI2DIrr")>; +// HADD, HSUB PS/PD +// x,x / v,v,v. +def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} -// (x)mm <- r32/64. -def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr", - "VMOVDI2PDIrr", "MOVDI2PDIrr")>; +// x,m / v,v,m. +def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1, 2, 1]; +} -// MOVQ. -// r64 <- (x)mm. -def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>; +// PHADD|PHSUB (S) W/D. +// v <- v,v. +def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1, 2]; +} +// v <- v,m. +def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1, 2, 1]; +} -// (x)mm <- r64. -def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>; +//=== Floating Point XMM and YMM Instructions ===// -// (x)mm <- (x)mm. -def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>; +// Remaining instrs. -// (V)MOVDQA/U. -// x <- x. -def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr", - "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV", - "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>; +def HWWriteResGroup0 : SchedWriteRes<[HWPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup0], (instregex "LDDQUrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "MOVAPDrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "MOVAPSrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "MOVDQArm")>; +def: InstRW<[HWWriteResGroup0], (instregex "MOVDQUrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "MOVNTDQArm")>; +def: InstRW<[HWWriteResGroup0], (instregex "MOVSHDUPrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "MOVSLDUPrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "MOVUPDrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "MOVUPSrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VLDDQUrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPDrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPSrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQArm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQUrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VMOVNTDQArm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VMOVSHDUPrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VMOVSLDUPrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPDrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPSrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTDrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTQrm")>; +def: InstRW<[HWWriteResGroup0], (instregex "ROUNDPDr")>; +def: InstRW<[HWWriteResGroup0], (instregex "ROUNDPSr")>; +def: InstRW<[HWWriteResGroup0], (instregex "ROUNDSDr")>; +def: InstRW<[HWWriteResGroup0], (instregex "ROUNDSSr")>; +def: InstRW<[HWWriteResGroup0], (instregex "VROUNDPDr")>; +def: InstRW<[HWWriteResGroup0], (instregex "VROUNDPSr")>; +def: InstRW<[HWWriteResGroup0], (instregex "VROUNDSDr")>; +def: InstRW<[HWWriteResGroup0], (instregex "VROUNDSSr")>; +def: InstRW<[HWWriteResGroup0], (instregex "VROUNDYPDr")>; +def: 
InstRW<[HWWriteResGroup0], (instregex "VROUNDYPSr")>; + +def HWWriteResGroup0_1 : SchedWriteRes<[HWPort23]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F32m")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F64m")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F80m")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTF128")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTI128")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTSDYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTSSYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VLDDQUYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVAPDYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVAPSYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDDUPYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDQAYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDQUYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVNTDQAYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVSHDUPYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVSLDUPYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVUPDYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVUPSYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VPBROADCASTDYrm")>; +def: InstRW<[HWWriteResGroup0_1], (instregex "VPBROADCASTQYrm")>; + +def HWWriteResGroup0_2 : SchedWriteRes<[HWPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64rm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64to64rm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVQ64rm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOV(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOV64toPQIrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOV8rm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVDDUPrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVDI2PDIrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVQI2PQIrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSDrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSSrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm16")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm32")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm8")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVZX(16|32|64)rm16")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "MOVZX(16|32|64)rm8")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHNTA")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT0")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT1")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT2")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "VMOV64toPQIrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVDDUPrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVDI2PDIrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVQI2PQIrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVSDrm")>; +def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVSSrm")>; + +def HWWriteResGroup1 : SchedWriteRes<[HWPort4,HWPort237]> { + let Latency = 1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup1], (instregex "FBSTPm")>; +def: InstRW<[HWWriteResGroup1], (instregex 
"MMX_MOVD64from64rm")>; +def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64mr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVNTQmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVQ64mr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOV(16|32|64)mr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOV8mi")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOV8mr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVAPDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVAPSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVDQAmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVDQUmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVHPDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVHPSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVLPDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVLPSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVNTDQmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVNTI_64mr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVNTImr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVPDI2DImr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVPQI2QImr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVPQIto64mr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVSDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVSSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVUPDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "MOVUPSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "ST_FP32m")>; +def: InstRW<[HWWriteResGroup1], (instregex "ST_FP64m")>; +def: InstRW<[HWWriteResGroup1], (instregex "ST_FP80m")>; +def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTF128mr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTI128mr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDYmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSYmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAYmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUYmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQYmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDYmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSYmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVPDI2DImr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQI2QImr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQIto64mr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVSDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVSSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDYmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSYmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSmr")>; +def: InstRW<[HWWriteResGroup1], (instregex "VMPTRSTm")>; + +def 
HWWriteResGroup2 : SchedWriteRes<[HWPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64grr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PMOVMSKBrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDri")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQri")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWri")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADri")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWri")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDri")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQri")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWri")>; +def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MOVPDI2DIrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "MOVPQIto64rr")>; +def: InstRW<[HWWriteResGroup2], (instregex "PSLLDri")>; +def: InstRW<[HWWriteResGroup2], (instregex "PSLLQri")>; +def: InstRW<[HWWriteResGroup2], (instregex "PSLLWri")>; +def: InstRW<[HWWriteResGroup2], (instregex "PSRADri")>; +def: InstRW<[HWWriteResGroup2], (instregex "PSRAWri")>; +def: InstRW<[HWWriteResGroup2], (instregex "PSRLDri")>; +def: InstRW<[HWWriteResGroup2], (instregex "PSRLQri")>; +def: InstRW<[HWWriteResGroup2], (instregex "PSRLWri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VMOVPDI2DIrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "VMOVPQIto64rr")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDYri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQYri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQYrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWYri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRADYri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRADri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWYri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDYri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQYri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQYrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWYri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWri")>; +def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDYrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSYrr")>; +def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSrr")>; + +def HWWriteResGroup3 : SchedWriteRes<[HWPort1]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: 
InstRW<[HWWriteResGroup3], (instregex "COMP_FST0r")>; +def: InstRW<[HWWriteResGroup3], (instregex "COM_FST0r")>; +def: InstRW<[HWWriteResGroup3], (instregex "MMX_MASKMOVQ64")>; +def: InstRW<[HWWriteResGroup3], (instregex "UCOM_FPr")>; +def: InstRW<[HWWriteResGroup3], (instregex "UCOM_Fr")>; +def: InstRW<[HWWriteResGroup3], (instregex "VMASKMOVDQU")>; + +def HWWriteResGroup4 : SchedWriteRes<[HWPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup4], (instregex "ANDNPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "ANDNPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "ANDPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "ANDPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "INSERTPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64rr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64to64rr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVQ2DQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_PALIGNR64irr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFBrr64")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFWri")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHBWirr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHDQirr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHWDirr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLBWirr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLDQirr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLWDirr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOV64toPQIrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVAPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVAPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVDDUPrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVDI2PDIrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVHLPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVLHPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVSDrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVSHDUPrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVSLDUPrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVSSrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVUPDrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "MOVUPSrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "ORPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "ORPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PACKSSDWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PACKSSWBrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PACKUSDWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PACKUSWBrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PALIGNRrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "PBLENDWrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWDrr")>; +def:
InstRW<[HWWriteResGroup4], (instregex "PMOVZXWQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PSHUFBrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PSHUFDri")>; +def: InstRW<[HWWriteResGroup4], (instregex "PSHUFHWri")>; +def: InstRW<[HWWriteResGroup4], (instregex "PSHUFLWri")>; +def: InstRW<[HWWriteResGroup4], (instregex "PSLLDQri")>; +def: InstRW<[HWWriteResGroup4], (instregex "PSRLDQri")>; +def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHBWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHQDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHWDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLBWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLQDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLWDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "SHUFPDrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "SHUFPSrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VANDPDYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VANDPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VANDPSYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VANDPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VBROADCASTSSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VINSERTPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOV64toPQIrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDYrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSYrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVDI2PDIrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVHLPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVLHPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVSDrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVSSrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDYrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSYrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSrr(_REV)?")>; +def: InstRW<[HWWriteResGroup4], (instregex "VORPDYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VORPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VORPSYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VORPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBYrr")>; +def: 
InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRYrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWYrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDYri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWYri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWYri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQYri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQYri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQrr")>; +def: InstRW<[HWWriteResGroup4], (instregex 
"VPUNPCKLWDYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDYrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSYrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSrri")>; +def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VXORPDYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VXORPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VXORPSYrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "VXORPSrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "XORPDrr")>; +def: InstRW<[HWWriteResGroup4], (instregex "XORPSrr")>; + +def HWWriteResGroup5 : SchedWriteRes<[HWPort6]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup5], (instregex "JMP(16|32|64)r")>; -// MOVDQ2Q. -def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>; +def HWWriteResGroup6 : SchedWriteRes<[HWPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup6], (instregex "FINCSTP")>; +def: InstRW<[HWWriteResGroup6], (instregex "FNOP")>; -// MOVQ2DQ. -def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>; +def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8")>; +def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)ri8")>; +def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)ri8")>; +def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)ri8")>; +def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup7], (instregex "CDQ")>; +def: InstRW<[HWWriteResGroup7], (instregex "CQO")>; +def: InstRW<[HWWriteResGroup7], (instregex "JAE_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JAE_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JA_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JA_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JBE_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JBE_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JB_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JB_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JE_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JE_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JGE_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JGE_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JG_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JG_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JLE_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JLE_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JL_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JL_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JMP_1")>; +def: InstRW<[HWWriteResGroup7], 
(instregex "JMP_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JNE_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JNE_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JNO_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JNO_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JNP_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JNP_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JNS_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JNS_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JO_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JO_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JP_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JP_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "JS_1")>; +def: InstRW<[HWWriteResGroup7], (instregex "JS_4")>; +def: InstRW<[HWWriteResGroup7], (instregex "RORX(32|64)ri")>; +def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)r1")>; +def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup7], (instregex "SAR8r1")>; +def: InstRW<[HWWriteResGroup7], (instregex "SAR8ri")>; +def: InstRW<[HWWriteResGroup7], (instregex "SARX(32|64)rr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETAEr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETBr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETEr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETGEr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETGr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETLEr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETLr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETNEr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETNOr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETNPr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETNSr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETOr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETPr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SETSr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)r1")>; +def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup7], (instregex "SHL8r1")>; +def: InstRW<[HWWriteResGroup7], (instregex "SHL8ri")>; +def: InstRW<[HWWriteResGroup7], (instregex "SHLX(32|64)rr")>; +def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)r1")>; +def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup7], (instregex "SHR8r1")>; +def: InstRW<[HWWriteResGroup7], (instregex "SHR8ri")>; +def: InstRW<[HWWriteResGroup7], (instregex "SHRX(32|64)rr")>; + +def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup8], (instregex "ANDN(32|64)rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BLSI(32|64)rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BLSMSK(32|64)rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BLSR(32|64)rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "BZHI(32|64)rr")>; +def: InstRW<[HWWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSBrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSDrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSWrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDDirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDQirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex 
"MMX_PADDSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQDirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTDirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXUBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINUBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNBrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNDrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNWrr64")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBDirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBQirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSBirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBWirr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PABSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PABSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PABSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDUSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDUSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PADDWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PAVGBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PAVGWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXUBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXUDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMAXUWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINUBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINUDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PMINUWrr")>; +def: InstRW<[HWWriteResGroup8], 
(instregex "PSIGNBrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSIGNDrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSIGNWrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "PSUBWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPABSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDQYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPADDWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDYrr")>; +def: 
InstRW<[HWWriteResGroup8], (instregex "VPMAXUDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBYrr256")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDYrr256")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWYrr256")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWrr128")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWYrr")>; +def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWrr")>; + +def HWWriteResGroup9 : SchedWriteRes<[HWPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup9], (instregex "BLENDPDrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "BLENDPSrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVQ64rr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDNirr")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDirr")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_PORirr")>; +def: InstRW<[HWWriteResGroup9], (instregex "MMX_PXORirr")>; +def: InstRW<[HWWriteResGroup9], (instregex "MOVDQArr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "MOVPQI2QIrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "PANDNrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "PANDrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "PORrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "PXORrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDYrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSYrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSrri")>; +def: InstRW<[HWWriteResGroup9], (instregex 
"VMOVDQAYrr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVPQI2QIrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPANDNYrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPANDNrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPANDYrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPANDrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDYrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDrri")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPORYrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPORrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPXORYrr")>; +def: InstRW<[HWWriteResGroup9], (instregex "VPXORrr")>; + +def HWWriteResGroup10 : SchedWriteRes<[HWPort0156]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "ADD8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "ADD8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "ADD8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "AND8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "AND8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "AND8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "CBW")>; +def: InstRW<[HWWriteResGroup10], (instregex "CLC")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMC")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMP8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMP8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "CMP8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "CWDE")>; +def: InstRW<[HWWriteResGroup10], (instregex "DEC(16|32|64)r")>; +def: InstRW<[HWWriteResGroup10], (instregex "DEC8r")>; +def: InstRW<[HWWriteResGroup10], (instregex "INC(16|32|64)r")>; +def: InstRW<[HWWriteResGroup10], (instregex "INC8r")>; +def: InstRW<[HWWriteResGroup10], (instregex "LAHF")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOV8ri(_alt)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOV8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>; +def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>; +def: InstRW<[HWWriteResGroup10], (instregex "NEG(16|32|64)r")>; +def: InstRW<[HWWriteResGroup10], (instregex "NEG8r")>; +def: InstRW<[HWWriteResGroup10], (instregex "NOOP")>; +def: InstRW<[HWWriteResGroup10], (instregex "NOT(16|32|64)r")>; +def: InstRW<[HWWriteResGroup10], (instregex "NOT8r")>; +def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], 
(instregex "OR8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "OR8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "OR8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "SAHF")>; +def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m")>; +def: InstRW<[HWWriteResGroup10], (instregex "SIDT64m")>; +def: InstRW<[HWWriteResGroup10], (instregex "SLDT64m")>; +def: InstRW<[HWWriteResGroup10], (instregex "SMSW16m")>; +def: InstRW<[HWWriteResGroup10], (instregex "STC")>; +def: InstRW<[HWWriteResGroup10], (instregex "STRm")>; +def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "SUB8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "SUB8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "SUB8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup10], (instregex "SYSCALL")>; +def: InstRW<[HWWriteResGroup10], (instregex "TEST(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup10], (instregex "TEST8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "TEST8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "TEST8rr")>; +def: InstRW<[HWWriteResGroup10], (instregex "XCHG(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup10], (instregex "XOR8i8")>; +def: InstRW<[HWWriteResGroup10], (instregex "XOR8ri")>; +def: InstRW<[HWWriteResGroup10], (instregex "XOR8rr")>; + +def HWWriteResGroup11 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup11], (instregex "CVTPS2PDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLQrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLWrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRADrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRAWrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLDrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLQrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLWrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSrm")>; +def: InstRW<[HWWriteResGroup11], (instregex "VCVTPS2PDrm")>; + +def HWWriteResGroup11_1 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup11_1], (instregex "CVTSS2SDrm")>; +def: InstRW<[HWWriteResGroup11_1], (instregex "VCVTPH2PSYrm")>; +def: InstRW<[HWWriteResGroup11_1], (instregex "VCVTSS2SDrm")>; +def: InstRW<[HWWriteResGroup11_1], (instregex "VPSLLVQrm")>; +def: InstRW<[HWWriteResGroup11_1], (instregex "VPSRLVQrm")>; +def: InstRW<[HWWriteResGroup11_1], (instregex "VTESTPDrm")>; +def: InstRW<[HWWriteResGroup11_1], (instregex "VTESTPSrm")>; +def HWWriteResGroup11_2 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLDYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLQYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLVQYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLWYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRADYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRAWYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLDYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex 
"VPSRLQYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLVQYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLWYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex "VTESTPDYrm")>; +def: InstRW<[HWWriteResGroup11_2], (instregex "VTESTPSYrm")>; + +def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup12], (instregex "ADDSDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "ADDSSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "BSF(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup12], (instregex "BSR(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup12], (instregex "CMPSDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "CMPSSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "COMISDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "COMISSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "FCOM32m")>; +def: InstRW<[HWWriteResGroup12], (instregex "FCOM64m")>; +def: InstRW<[HWWriteResGroup12], (instregex "FCOMP32m")>; +def: InstRW<[HWWriteResGroup12], (instregex "FCOMP64m")>; +def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)m")>; +def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)rm(i8)?")>; +def: InstRW<[HWWriteResGroup12], (instregex "IMUL8m")>; +def: InstRW<[HWWriteResGroup12], (instregex "LZCNT(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup12], (instregex "MAX(C?)SDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "MAX(C?)SSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "MIN(C?)SDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "MIN(C?)SSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPI2PSirm")>; +def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPS2PIirm")>; +def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTTPS2PIirm")>; +def: InstRW<[HWWriteResGroup12], (instregex "MUL(16|32|64)m")>; +def: InstRW<[HWWriteResGroup12], (instregex "MUL8m")>; +def: InstRW<[HWWriteResGroup12], (instregex "PDEP(32|64)rm")>; +def: InstRW<[HWWriteResGroup12], (instregex "PEXT(32|64)rm")>; +def: InstRW<[HWWriteResGroup12], (instregex "POPCNT(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup12], (instregex "SUBSDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "SUBSSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "TZCNT(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup12], (instregex "UCOMISDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "UCOMISSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VADDSDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VADDSSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VCMPSDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VCMPSSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VCOMISDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VCOMISSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VMAX(C?)SDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VMAX(C?)SSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VMIN(C?)SDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VMIN(C?)SSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VSUBSDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VSUBSSrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VUCOMISDrm")>; +def: InstRW<[HWWriteResGroup12], (instregex "VUCOMISSrm")>; + +def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup13], (instregex "ANDNPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex 
"ANDNPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "ANDPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "ANDPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "INSERTPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "ORPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "ORPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PACKSSDWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PACKSSWBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PACKUSDWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PACKUSWBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PALIGNRrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PBLENDWrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PSHUFBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PSHUFDmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PSHUFHWmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PSHUFLWmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHQDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHWDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLQDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLWDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "SHUFPDrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "SHUFPSrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDNPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDNPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VANDPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VINSERTPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VORPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VORPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSDWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSWBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSDWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSWBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPALIGNRrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPBLENDWrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFBrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFDmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFHWmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFLWmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHQDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHWDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLBWrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLQDQrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLWDrm")>; 
+def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPDrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPSrmi")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VXORPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "VXORPSrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "XORPDrm")>; +def: InstRW<[HWWriteResGroup13], (instregex "XORPSrm")>; + +def HWWriteResGroup13_1 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup13_1], (instregex "VANDNPDYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VANDNPSYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VANDPDYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VANDPSYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VORPDYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VORPSYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKSSDWYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKSSWBYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKUSDWYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKUSWBYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPALIGNRYrmi")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPBLENDWYrmi")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPDYmi")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPDYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPSYmi")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPSYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBDYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBQYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXWQYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFBYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFDYmi")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFHWYmi")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFLWYmi")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHBWYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHDQYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHQDQYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHWDYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLBWYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLDQYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLQDQYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLWDYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VSHUFPDYrmi")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VSHUFPSYrmi")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKHPDYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKHPSYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKLPDYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKLPSYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VXORPDYrm")>; +def: InstRW<[HWWriteResGroup13_1], (instregex "VXORPSYrm")>; + +def HWWriteResGroup13_2 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PALIGNR64irm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PINSRWirmi")>; +def: 
InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFBrm64")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFWmi")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHBWirm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHDQirm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHWDirm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLBWirm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLDQirm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLWDirm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MOVHPDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MOVHPSrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MOVLPDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "MOVLPSrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRBrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRWrmi")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBWrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXDQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXWDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXWQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBWrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXDQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXWDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXWQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVHPDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVHPSrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVLPDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVLPSrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRBrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRWrmi")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBWrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXDQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXWDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXWQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBWrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXDQrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXWDrm")>; +def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXWQrm")>; + +def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup14], (instregex "FARJMP64")>; +def: InstRW<[HWWriteResGroup14], (instregex "JMP(16|32|64)m")>; -// PACKSSWB/DW. -// mm <- mm. 
-def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> { +def HWWriteResGroup15 : SchedWriteRes<[HWPort23,HWPort06]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup15], (instregex "BT(16|32|64)mi8")>; +def: InstRW<[HWWriteResGroup15], (instregex "RORX32mi")>; +def: InstRW<[HWWriteResGroup15], (instregex "RORX64mi")>; +def: InstRW<[HWWriteResGroup15], (instregex "SARX32rm")>; +def: InstRW<[HWWriteResGroup15], (instregex "SARX64rm")>; +def: InstRW<[HWWriteResGroup15], (instregex "SHLX32rm")>; +def: InstRW<[HWWriteResGroup15], (instregex "SHLX64rm")>; +def: InstRW<[HWWriteResGroup15], (instregex "SHRX32rm")>; +def: InstRW<[HWWriteResGroup15], (instregex "SHRX64rm")>; + +def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BLSI(32|64)rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BLSMSK(32|64)rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BLSR(32|64)rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "BZHI(32|64)rm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSBrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSDrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSWrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDDirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDQirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQDirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTDirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXUBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINUBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNBrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNDrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNWrm64")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBDirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBQirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSBirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBWirm")>; +def: InstRW<[HWWriteResGroup16], (instregex "MOVBE(16|32|64)rm")>; + +def HWWriteResGroup16_1 : SchedWriteRes<[HWPort23,HWPort15]> { + let Latency = 7; + let NumMicroOps = 2; + let 
ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup16_1], (instregex "PABSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PABSDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PABSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PADDBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PADDDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PADDQrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PADDSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PADDSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PADDUSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PADDUSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PADDWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PAVGBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PAVGWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQQrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNBrm128")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNDrm128")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNWrm128")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBQrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBUSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBUSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDQrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDUSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDUSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPAVGBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPAVGWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQBrm")>; +def: InstRW<[HWWriteResGroup16_1], 
(instregex "VPCMPEQDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQQrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNBrm128")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNDrm128")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNWrm128")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBDrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBQrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBUSBrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBUSWrm")>; +def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBWrm")>; + +def HWWriteResGroup16_2 : SchedWriteRes<[HWPort23,HWPort15]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSDYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDDYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDQYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDSBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDSWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDUSBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDUSWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPAVGBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPAVGWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQDYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQQYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTDYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSDYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUDYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUWYrm")>; 
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSDYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUDYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNBYrm256")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNDYrm256")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNWYrm256")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBDYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBQYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBSBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBSWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBUSBYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBUSWYrm")>; +def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBWYrm")>; + +def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup17], (instregex "BLENDPDrmi")>; +def: InstRW<[HWWriteResGroup17], (instregex "BLENDPSrmi")>; +def: InstRW<[HWWriteResGroup17], (instregex "PANDNrm")>; +def: InstRW<[HWWriteResGroup17], (instregex "PANDrm")>; +def: InstRW<[HWWriteResGroup17], (instregex "PORrm")>; +def: InstRW<[HWWriteResGroup17], (instregex "PXORrm")>; +def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPDrmi")>; +def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPSrmi")>; +def: InstRW<[HWWriteResGroup17], (instregex "VINSERTF128rm")>; +def: InstRW<[HWWriteResGroup17], (instregex "VINSERTI128rm")>; +def: InstRW<[HWWriteResGroup17], (instregex "VPANDNrm")>; +def: InstRW<[HWWriteResGroup17], (instregex "VPANDrm")>; +def: InstRW<[HWWriteResGroup17], (instregex "VPBLENDDrmi")>; +def: InstRW<[HWWriteResGroup17], (instregex "VPORrm")>; +def: InstRW<[HWWriteResGroup17], (instregex "VPXORrm")>; + +def HWWriteResGroup17_1 : SchedWriteRes<[HWPort23,HWPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PANDNirm")>; +def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PANDirm")>; +def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PORirm")>; +def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PXORirm")>; + +def HWWriteResGroup17_2 : SchedWriteRes<[HWPort23,HWPort015]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup17_2], (instregex "VBLENDPDYrmi")>; +def: InstRW<[HWWriteResGroup17_2], (instregex "VBLENDPSYrmi")>; +def: InstRW<[HWWriteResGroup17_2], (instregex "VPANDNYrm")>; +def: InstRW<[HWWriteResGroup17_2], (instregex "VPANDYrm")>; +def: InstRW<[HWWriteResGroup17_2], (instregex "VPBLENDDYrmi")>; +def: InstRW<[HWWriteResGroup17_2], (instregex "VPORYrm")>; +def: InstRW<[HWWriteResGroup17_2], (instregex "VPXORYrm")>; + +def HWWriteResGroup18 : SchedWriteRes<[HWPort23,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup18], (instregex "ADD(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "ADD8rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "AND(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "AND8rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)mi")>; +def: 
InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)mr")>; +def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "CMP8mi")>; +def: InstRW<[HWWriteResGroup18], (instregex "CMP8mr")>; +def: InstRW<[HWWriteResGroup18], (instregex "CMP8rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "OR(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "OR8rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)r(mr)?")>; +def: InstRW<[HWWriteResGroup18], (instregex "SUB(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "SUB8rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "TEST(16|32|64)mr")>; +def: InstRW<[HWWriteResGroup18], (instregex "TEST8mi")>; +def: InstRW<[HWWriteResGroup18], (instregex "TEST8mr")>; +def: InstRW<[HWWriteResGroup18], (instregex "XOR(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup18], (instregex "XOR8rm")>; + +def HWWriteResGroup19 : SchedWriteRes<[HWPort237,HWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup19], (instregex "SFENCE")>; + +def HWWriteResGroup20 : SchedWriteRes<[HWPort4,HWPort5,HWPort237]> { let Latency = 2; let NumMicroOps = 3; - let ResourceCycles = [3]; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup20], (instregex "EXTRACTPSmr")>; +def: InstRW<[HWWriteResGroup20], (instregex "PEXTRBmr")>; +def: InstRW<[HWWriteResGroup20], (instregex "PEXTRDmr")>; +def: InstRW<[HWWriteResGroup20], (instregex "PEXTRQmr")>; +def: InstRW<[HWWriteResGroup20], (instregex "PEXTRWmr")>; +def: InstRW<[HWWriteResGroup20], (instregex "STMXCSR")>; +def: InstRW<[HWWriteResGroup20], (instregex "VEXTRACTPSmr")>; +def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRBmr")>; +def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRDmr")>; +def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRQmr")>; +def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRWmr")>; +def: InstRW<[HWWriteResGroup20], (instregex "VSTMXCSR")>; + +def HWWriteResGroup21 : SchedWriteRes<[HWPort4,HWPort6,HWPort237]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr", - "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>; +def: InstRW<[HWWriteResGroup21], (instregex "FNSTCW16m")>; -// mm <- m64. 
-def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> { - let Latency = 4; +def HWWriteResGroup22 : SchedWriteRes<[HWPort4,HWPort237,HWPort06]> { + let Latency = 2; let NumMicroOps = 3; - let ResourceCycles = [1, 3]; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup22], (instregex "SETAEm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETBm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETEm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETGEm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETGm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETLEm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETLm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETNEm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETNOm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETNPm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETNSm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETOm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETPm")>; +def: InstRW<[HWWriteResGroup22], (instregex "SETSm")>; + +def HWWriteResGroup23 : SchedWriteRes<[HWPort4,HWPort237,HWPort15]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm", - "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>; +def: InstRW<[HWWriteResGroup23], (instregex "MOVBE(32|64)mr")>; -// VPMOVSX/ZX BW BD BQ DW DQ. -// y <- x. -def WriteVPMOVSX : SchedWriteRes<[HWPort5]> { - let Latency = 3; - let NumMicroOps = 1; +def HWWriteResGroup23_16 : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>; +def: InstRW<[HWWriteResGroup23_16], (instregex "MOVBE16mr")>; -// PBLENDW. 
-// x,x,i / v,v,v,i -def WritePBLENDWr : SchedWriteRes<[HWPort5]>; -def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>; +def HWWriteResGroup24 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)r(mr)?")>; +def: InstRW<[HWWriteResGroup24], (instregex "PUSH64i8")>; +def: InstRW<[HWWriteResGroup24], (instregex "STOSB")>; +def: InstRW<[HWWriteResGroup24], (instregex "STOSL")>; +def: InstRW<[HWWriteResGroup24], (instregex "STOSQ")>; +def: InstRW<[HWWriteResGroup24], (instregex "STOSW")>; -// x,m,i / v,v,m,i -def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> { +def HWWriteResGroup25 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup25], (instregex "BTC(16|32|64)mi8")>; +def: InstRW<[HWWriteResGroup25], (instregex "BTR(16|32|64)mi8")>; +def: InstRW<[HWWriteResGroup25], (instregex "BTS(16|32|64)mi8")>; +def: InstRW<[HWWriteResGroup25], (instregex "SAR(16|32|64)m1")>; +def: InstRW<[HWWriteResGroup25], (instregex "SAR(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup25], (instregex "SAR8m1")>; +def: InstRW<[HWWriteResGroup25], (instregex "SAR8mi")>; +def: InstRW<[HWWriteResGroup25], (instregex "SHL(16|32|64)m1")>; +def: InstRW<[HWWriteResGroup25], (instregex "SHL(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup25], (instregex "SHL8m1")>; +def: InstRW<[HWWriteResGroup25], (instregex "SHL8mi")>; +def: InstRW<[HWWriteResGroup25], (instregex "SHR(16|32|64)m1")>; +def: InstRW<[HWWriteResGroup25], (instregex "SHR(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup25], (instregex "SHR8m1")>; +def: InstRW<[HWWriteResGroup25], (instregex "SHR8mi")>; + +def HWWriteResGroup26 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup26], (instregex "ADD(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup26], (instregex "ADD(16|32|64)mr")>; +def: InstRW<[HWWriteResGroup26], (instregex "ADD8mi")>; +def: InstRW<[HWWriteResGroup26], (instregex "ADD8mr")>; +def: InstRW<[HWWriteResGroup26], (instregex "AND(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup26], (instregex "AND(16|32|64)mr")>; +def: InstRW<[HWWriteResGroup26], (instregex "AND8mi")>; +def: InstRW<[HWWriteResGroup26], (instregex "AND8mr")>; +def: InstRW<[HWWriteResGroup26], (instregex "DEC(16|32|64)m")>; +def: InstRW<[HWWriteResGroup26], (instregex "DEC8m")>; +def: InstRW<[HWWriteResGroup26], (instregex "INC(16|32|64)m")>; +def: InstRW<[HWWriteResGroup26], (instregex "INC8m")>; +def: InstRW<[HWWriteResGroup26], (instregex "NEG(16|32|64)m")>; +def: InstRW<[HWWriteResGroup26], (instregex "NEG8m")>; +def: InstRW<[HWWriteResGroup26], (instregex "NOT(16|32|64)m")>; +def: InstRW<[HWWriteResGroup26], (instregex "NOT8m")>; +def: InstRW<[HWWriteResGroup26], (instregex "OR(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup26], (instregex "OR(16|32|64)mr")>; +def: InstRW<[HWWriteResGroup26], (instregex "OR8mi")>; +def: InstRW<[HWWriteResGroup26], (instregex "OR8mr")>; +def: InstRW<[HWWriteResGroup26], (instregex "POP(16|32|64)rmm")>; +def: InstRW<[HWWriteResGroup26], (instregex "PUSH(16|32|64)rmm")>; +def: InstRW<[HWWriteResGroup26], (instregex "SUB(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup26], (instregex "SUB(16|32|64)mr")>; +def: InstRW<[HWWriteResGroup26], (instregex "SUB8mi")>; +def: InstRW<[HWWriteResGroup26], 
(instregex "SUB8mr")>; +def: InstRW<[HWWriteResGroup26], (instregex "XOR(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup26], (instregex "XOR(16|32|64)mr")>; +def: InstRW<[HWWriteResGroup26], (instregex "XOR8mi")>; +def: InstRW<[HWWriteResGroup26], (instregex "XOR8mr")>; + +def HWWriteResGroup27 : SchedWriteRes<[HWPort5]> { + let Latency = 2; let NumMicroOps = 2; - let Latency = 4; - let ResourceCycles = [1, 1]; + let ResourceCycles = [2]; } -def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>; - -// VPBLENDD. -// v,v,v,i. -def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>; -def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>; - -// v,v,m,i -def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> { +def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPDrr0")>; +def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPSrr0")>; +def: InstRW<[HWWriteResGroup27], (instregex "MMX_PINSRWirri")>; +def: InstRW<[HWWriteResGroup27], (instregex "PBLENDVBrr0")>; +def: InstRW<[HWWriteResGroup27], (instregex "PINSRBrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "PINSRDrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "PINSRQrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "PINSRWrri")>; +def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPDYrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPDrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPSYrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPSrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "VPBLENDVBYrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "VPBLENDVBrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "VPINSRBrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "VPINSRDrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "VPINSRQrr")>; +def: InstRW<[HWWriteResGroup27], (instregex "VPINSRWrri")>; + +def HWWriteResGroup28 : SchedWriteRes<[HWPort01]> { + let Latency = 2; let NumMicroOps = 2; - let Latency = 4; - let ResourceCycles = [1, 1]; + let ResourceCycles = [2]; } -def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>; +def: InstRW<[HWWriteResGroup28], (instregex "FDECSTP")>; -// MASKMOVQ. -def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> { - let Latency = 13; - let NumMicroOps = 4; - let ResourceCycles = [1, 1, 2]; +def HWWriteResGroup29 : SchedWriteRes<[HWPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; } -def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>; +def: InstRW<[HWWriteResGroup29], (instregex "ROL(16|32|64)r1")>; +def: InstRW<[HWWriteResGroup29], (instregex "ROL(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup29], (instregex "ROL8r1")>; +def: InstRW<[HWWriteResGroup29], (instregex "ROL8ri")>; +def: InstRW<[HWWriteResGroup29], (instregex "ROR(16|32|64)r1")>; +def: InstRW<[HWWriteResGroup29], (instregex "ROR(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup29], (instregex "ROR8r1")>; +def: InstRW<[HWWriteResGroup29], (instregex "ROR8ri")>; -// MASKMOVDQU. 
-def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> { - let Latency = 14; - let NumMicroOps = 10; - let ResourceCycles = [4, 2, 4]; +def HWWriteResGroup30 : SchedWriteRes<[HWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; } -def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>; +def: InstRW<[HWWriteResGroup30], (instregex "LFENCE")>; +def: InstRW<[HWWriteResGroup30], (instregex "MFENCE")>; +def: InstRW<[HWWriteResGroup30], (instregex "WAIT")>; +def: InstRW<[HWWriteResGroup30], (instregex "XGETBV")>; -// VPMASKMOV D/Q. -// v,v,m. -def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> { - let Latency = 4; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; +def HWWriteResGroup31 : SchedWriteRes<[HWPort0,HWPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup31], (instregex "CVTPS2PDrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "CVTSS2SDrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "EXTRACTPSrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "MMX_PEXTRWirri")>; +def: InstRW<[HWWriteResGroup31], (instregex "PEXTRBrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PEXTRDrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PEXTRQrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PEXTRWri")>; +def: InstRW<[HWWriteResGroup31], (instregex "PEXTRWrr_REV")>; +def: InstRW<[HWWriteResGroup31], (instregex "PSLLDrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PSLLQrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PSLLWrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PSRADrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PSRAWrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PSRLDrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PSRLQrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PSRLWrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "PTESTrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSYrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VCVTPS2PDrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VCVTSS2SDrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VEXTRACTPSrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRBrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRDrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRQrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRWri")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRWrr_REV")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPSLLDrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPSLLQrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPSLLWrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPSRADrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPSRAWrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPSRLDrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPSRLQrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPSRLWrr")>; +def: InstRW<[HWWriteResGroup31], (instregex "VPTESTrr")>; + +def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteVPMASKMOVr, ReadAfterLd], - (instregex "VPMASKMOV(D|Q)(Y?)rm")>; +def: InstRW<[HWWriteResGroup32], (instregex "CLFLUSH")>; -// m, v,v. 
-def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { - let Latency = 13; - let NumMicroOps = 4; - let ResourceCycles = [1, 1, 1, 1]; +def HWWriteResGroup33 : SchedWriteRes<[HWPort01,HWPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>; +def: InstRW<[HWWriteResGroup33], (instregex "MMX_MOVDQ2Qrr")>; -// PMOVMSKB. -def WritePMOVMSKB : SchedWriteRes<[HWPort0]> { - let Latency = 3; +def HWWriteResGroup34 : SchedWriteRes<[HWPort06,HWPort15]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>; +def: InstRW<[HWWriteResGroup34], (instregex "BEXTR(32|64)rr")>; +def: InstRW<[HWWriteResGroup34], (instregex "BSWAP(16|32|64)r")>; -// PEXTR B/W/D/Q. -// r32,x,i. -def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> { +def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> { let Latency = 2; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup35], (instregex "ADC8i8")>; +def: InstRW<[HWWriteResGroup35], (instregex "ADC8ri")>; +def: InstRW<[HWWriteResGroup35], (instregex "ADC8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVAE(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVB(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVE(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVG(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVGE(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVL(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVLE(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVNE(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVNO(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVNP(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVNS(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVO(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVP(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CMOVS(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup35], (instregex "CWD")>; +def: InstRW<[HWWriteResGroup35], (instregex "JRCXZ")>; +def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)rr(_REV)?")>; +def: InstRW<[HWWriteResGroup35], (instregex "SBB8i8")>; +def: InstRW<[HWWriteResGroup35], (instregex "SBB8ri")>; +def: InstRW<[HWWriteResGroup35], (instregex "SBB8rr(_REV)?")>; +def: InstRW<[HWWriteResGroup35], (instregex "SETAr")>; +def: InstRW<[HWWriteResGroup35], (instregex "SETBEr")>; + +def HWWriteResGroup36 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPDrm0")>; +def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPSrm0")>; +def: InstRW<[HWWriteResGroup36], (instregex "PBLENDVBrm0")>; +def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPDrm")>; +def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPSrm")>; +def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPDrm")>; +def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPSrm")>; +def: InstRW<[HWWriteResGroup36], (instregex "VPBLENDVBrm")>; 
+def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVDrm")>; +def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVQrm")>; + +def HWWriteResGroup36_1 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; } -def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>; +def: InstRW<[HWWriteResGroup36_1], (instregex "VBLENDVPDYrm")>; +def: InstRW<[HWWriteResGroup36_1], (instregex "VBLENDVPSYrm")>; +def: InstRW<[HWWriteResGroup36_1], (instregex "VMASKMOVPDYrm")>; +def: InstRW<[HWWriteResGroup36_1], (instregex "VMASKMOVPSYrm")>; +def: InstRW<[HWWriteResGroup36_1], (instregex "VPBLENDVBYrm")>; +def: InstRW<[HWWriteResGroup36_1], (instregex "VPMASKMOVDYrm")>; +def: InstRW<[HWWriteResGroup36_1], (instregex "VPMASKMOVQYrm")>; -// m8,x,i. -def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> { +def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 7; let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; + let ResourceCycles = [2,1]; } -def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>; +def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSDWirm")>; +def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSWBirm")>; +def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKUSWBirm")>; -// VPBROADCAST B/W. -// x, m8/16. -def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { - let Latency = 5; +def HWWriteResGroup37 : SchedWriteRes<[HWPort23,HWPort0156]> { + let Latency = 7; let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; + let ResourceCycles = [1,2]; } -def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd], - (instregex "VPBROADCAST(B|W)rm")>; +def: InstRW<[HWWriteResGroup37], (instregex "LEAVE64")>; +def: InstRW<[HWWriteResGroup37], (instregex "SCASB")>; +def: InstRW<[HWWriteResGroup37], (instregex "SCASL")>; +def: InstRW<[HWWriteResGroup37], (instregex "SCASQ")>; +def: InstRW<[HWWriteResGroup37], (instregex "SCASW")>; -// y, m8/16 -def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> { +def HWWriteResGroup38 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup38], (instregex "PSLLDrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "PSLLQrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "PSLLWrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "PSRADrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "PSRAWrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "PSRLDrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "PSRLQrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "PSRLWrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "PTESTrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "VPSLLDrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "VPSLLQrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "VPSLLWrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "VPSRADrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "VPSRAWrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "VPSRLDrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "VPSRLQrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "VPSRLWrm")>; +def: InstRW<[HWWriteResGroup38], (instregex "VPTESTrm")>; + +def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> { let Latency = 7; let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd], - (instregex "VPBROADCAST(B|W)Yrm")>; +def: 
InstRW<[HWWriteResGroup39], (instregex "FLDCW16m")>; -// VPGATHERDD. -// x. -def WriteVPGATHERDD128 : SchedWriteRes<[]> { - let NumMicroOps = 20; +def HWWriteResGroup40 : SchedWriteRes<[HWPort0,HWPort23,HWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>; +def: InstRW<[HWWriteResGroup40], (instregex "LDMXCSR")>; +def: InstRW<[HWWriteResGroup40], (instregex "VLDMXCSR")>; -// y. -def WriteVPGATHERDD256 : SchedWriteRes<[]> { - let NumMicroOps = 34; +def HWWriteResGroup41 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>; +def: InstRW<[HWWriteResGroup41], (instregex "LRETQ")>; +def: InstRW<[HWWriteResGroup41], (instregex "RETL")>; +def: InstRW<[HWWriteResGroup41], (instregex "RETQ")>; -// VPGATHERQD. -// x. -def WriteVPGATHERQD128 : SchedWriteRes<[]> { - let NumMicroOps = 15; +def HWWriteResGroup42 : SchedWriteRes<[HWPort23,HWPort06,HWPort15]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>; +def: InstRW<[HWWriteResGroup42], (instregex "BEXTR(32|64)rm")>; -// y. -def WriteVPGATHERQD256 : SchedWriteRes<[]> { - let NumMicroOps = 22; +def HWWriteResGroup43 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup43], (instregex "ADC(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "ADC8rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVAE(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVB(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVE(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVG(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVGE(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVL(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVLE(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVNE(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVNO(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVNP(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVNS(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVO(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVP(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "CMOVS(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "SBB(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup43], (instregex "SBB8rm")>; + +def HWWriteResGroup44 : SchedWriteRes<[HWPort4,HWPort6,HWPort237,HWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; } -def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>; +def: InstRW<[HWWriteResGroup44], (instregex "CALL(16|32|64)r")>; -// VPGATHERDQ. -// x. 
-def WriteVPGATHERDQ128 : SchedWriteRes<[]> { - let NumMicroOps = 12; +def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; } -def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>; +def: InstRW<[HWWriteResGroup45], (instregex "CALL64pcrel32")>; +def: InstRW<[HWWriteResGroup45], (instregex "SETAm")>; +def: InstRW<[HWWriteResGroup45], (instregex "SETBEm")>; -// y. -def WriteVPGATHERDQ256 : SchedWriteRes<[]> { - let NumMicroOps = 20; +def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[HWWriteResGroup46], (instregex "ROL(16|32|64)m1")>; +def: InstRW<[HWWriteResGroup46], (instregex "ROL(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup46], (instregex "ROL8m1")>; +def: InstRW<[HWWriteResGroup46], (instregex "ROL8mi")>; +def: InstRW<[HWWriteResGroup46], (instregex "ROR(16|32|64)m1")>; +def: InstRW<[HWWriteResGroup46], (instregex "ROR(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup46], (instregex "ROR8m1")>; +def: InstRW<[HWWriteResGroup46], (instregex "ROR8mi")>; + +def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; } -def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>; +def: InstRW<[HWWriteResGroup47], (instregex "XADD(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup47], (instregex "XADD8rm")>; -// VPGATHERQQ. -// x. -def WriteVPGATHERQQ128 : SchedWriteRes<[]> { - let NumMicroOps = 14; +def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,1,1]; } -def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>; +def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m")>; +def: InstRW<[HWWriteResGroup48], (instregex "FARCALL64")>; -// y. 
-def WriteVPGATHERQQ256 : SchedWriteRes<[]> { - let NumMicroOps = 22; +def HWWriteResGroup49 : SchedWriteRes<[HWPort0]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup49], (instregex "MOVMSKPDrr")>; +def: InstRW<[HWWriteResGroup49], (instregex "MOVMSKPSrr")>; +def: InstRW<[HWWriteResGroup49], (instregex "PMOVMSKBrr")>; +def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPDYrr")>; +def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPDrr")>; +def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPSYrr")>; +def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPSrr")>; +def: InstRW<[HWWriteResGroup49], (instregex "VPMOVMSKBYrr")>; +def: InstRW<[HWWriteResGroup49], (instregex "VPMOVMSKBrr")>; + +def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup50], (instregex "ADDPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "ADDPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "ADDSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "ADDSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "ADD_FPrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "ADD_FST0r")>; +def: InstRW<[HWWriteResGroup50], (instregex "ADD_FrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "BSF(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "BSR(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "CMPPDrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "CMPPSrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "CMPSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "CMPSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "COMISDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "COMISSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "CVTDQ2PSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "CVTPS2DQrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "CVTTPS2DQrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "IMUL64rr(i8)?")>; +def: InstRW<[HWWriteResGroup50], (instregex "IMUL8r")>; +def: InstRW<[HWWriteResGroup50], (instregex "LZCNT(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)PDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)PSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)SDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)SSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)PDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)PSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)SDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)SSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr")>; +def: InstRW<[HWWriteResGroup50], (instregex "MUL8r")>; +def: InstRW<[HWWriteResGroup50], (instregex "PDEP(32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "PEXT(32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "POPCNT(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "SHLD(16|32|64)rri8")>; +def: InstRW<[HWWriteResGroup50], (instregex "SHRD(16|32|64)rri8")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FPrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FST0r")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FrST0")>; +def: 
InstRW<[HWWriteResGroup50], (instregex "SUBSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUBSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUB_FPrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUB_FST0r")>; +def: InstRW<[HWWriteResGroup50], (instregex "SUB_FrST0")>; +def: InstRW<[HWWriteResGroup50], (instregex "TZCNT(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup50], (instregex "UCOMISDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "UCOMISSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDPDYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDPSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDYrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSYrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSrri")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCMPSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCOMISDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCOMISSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PDYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)SDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)SSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PDYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)SDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)SSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSYrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBSDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VSUBSSrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISDrr")>; +def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISSrr")>; + +def HWWriteResGroup50_16 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 3; + let NumMicroOps = 4; } -def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>; +def: InstRW<[HWWriteResGroup50_16], (instregex "IMUL16rr(i8)?")>; -//-- Arithmetic instructions --// +def HWWriteResGroup50_32 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency 
= 3; + let NumMicroOps = 3; +} +def: InstRW<[HWWriteResGroup50_32], (instregex "IMUL32rr(i8)?")>; -//////////////////////////////////////////////////////////////////////////////// -// Horizontal add/sub instructions. -//////////////////////////////////////////////////////////////////////////////// +def HWWriteResGroup51 : SchedWriteRes<[HWPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSDYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSSYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTF128rr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTI128rr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VINSERTF128rr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VINSERTI128rr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTDYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTQYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPERM2F128rr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPERM2I128rr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPERMDYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPERMPDYri")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPERMPSYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPERMQYri")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBDYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBQYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBWYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXDQYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWDYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWQYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBDYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBQYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBWYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXDQYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWDYrr")>; +def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWQYrr")>; + +def HWWriteResGroup52 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup52], (instregex "ADDPDrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "ADDPSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPDrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "CMPPDrmi")>; +def: InstRW<[HWWriteResGroup52], (instregex "CMPPSrmi")>; +def: InstRW<[HWWriteResGroup52], (instregex "CVTDQ2PSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "CVTPS2DQrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "CVTTPS2DQrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "MAX(C?)PDrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "MAX(C?)PSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "MIN(C?)PDrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "MIN(C?)PSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "SUBPDrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "SUBPSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VADDPDrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VADDPSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPDrm")>; 
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VCMPPDrmi")>; +def: InstRW<[HWWriteResGroup52], (instregex "VCMPPSrmi")>; +def: InstRW<[HWWriteResGroup52], (instregex "VCVTDQ2PSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VCVTPS2DQrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VCVTTPS2DQrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VMAX(C?)PDrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VMAX(C?)PSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VMIN(C?)PDrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VMIN(C?)PSrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VSUBPDrm")>; +def: InstRW<[HWWriteResGroup52], (instregex "VSUBPSrm")>; + +def HWWriteResGroup52_1 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup52_1], (instregex "ADD_F32m")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "ADD_F64m")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F16m")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F32m")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F64m")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "SUBR_F32m")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "SUBR_F64m")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "SUB_F32m")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "SUB_F64m")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VADDPDYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VADDPSYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VADDSUBPDYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VADDSUBPSYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VCMPPDYrmi")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VCMPPSYrmi")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTDQ2PSYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTPS2DQYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTTPS2DQYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VMAX(C?)PDYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VMAX(C?)PSYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VMIN(C?)PDYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VMIN(C?)PSYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VSUBPDYrm")>; +def: InstRW<[HWWriteResGroup52_1], (instregex "VSUBPSYrm")>; + +def HWWriteResGroup53 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup53], (instregex "VPERM2F128rm")>; +def: InstRW<[HWWriteResGroup53], (instregex "VPERM2I128rm")>; +def: InstRW<[HWWriteResGroup53], (instregex "VPERMDYrm")>; +def: InstRW<[HWWriteResGroup53], (instregex "VPERMPDYmi")>; +def: InstRW<[HWWriteResGroup53], (instregex "VPERMPSYrm")>; +def: InstRW<[HWWriteResGroup53], (instregex "VPERMQYmi")>; +def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBDYrm")>; +def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBQYrm")>; +def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBWYrm")>; +def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXDQYrm")>; +def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXWQYrm")>; + +def HWWriteResGroup53_1 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXBWYrm")>; +def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXDQYrm")>; +def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXWDYrm")>; 
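The latencies of the folded-load groups above line up with the register forms: the FP/vector arithmetic in HWWriteResGroup50 is modeled at 3 cycles, its 128-bit memory forms in HWWriteResGroup52 at 9, and the 256-bit Yrm forms in HWWriteResGroup52_1 at 10. Read as latency(rm) = latency(rr) + load latency, this implies the model folds roughly a 6-cycle load into the 128-bit memory forms and a 7-cycle load into the 256-bit ones; the same +6/+7 split shows up in the 1-cycle integer ops earlier in the block (HWWriteResGroup16_1 at latency 7 versus HWWriteResGroup16_2 at 8).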
+def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVZXWDYrm")>; -// HADD, HSUB PS/PD -// x,x / v,v,v. -def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> { - let Latency = 5; +def HWWriteResGroup54 : SchedWriteRes<[HWPort0156]> { + let Latency = 3; let NumMicroOps = 3; - let ResourceCycles = [1, 2]; + let ResourceCycles = [3]; } +def: InstRW<[HWWriteResGroup54], (instregex "XADD(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup54], (instregex "XADD8rr")>; +def: InstRW<[HWWriteResGroup54], (instregex "XCHG8rr")>; -// x,m / v,v,m. -def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1, 2, 1]; +def HWWriteResGroup55 : SchedWriteRes<[HWPort0,HWPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; } +def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDYrr")>; +def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDrr")>; +def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDYrr")>; +def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDrr")>; +def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDYrr")>; +def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDrr")>; -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> { +def HWWriteResGroup56 : SchedWriteRes<[HWPort5,HWPort15]> { let Latency = 3; let NumMicroOps = 3; - let ResourceCycles = [1, 2]; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDSWrr64")>; +def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDWrr64")>; +def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDrr64")>; +def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBDrr64")>; +def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBSWrr64")>; +def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBWrr64")>; +def: InstRW<[HWWriteResGroup56], (instregex "PHADDDrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "PHADDSWrr128")>; +def: InstRW<[HWWriteResGroup56], (instregex "PHADDWrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "PHSUBDrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "PHSUBSWrr128")>; +def: InstRW<[HWWriteResGroup56], (instregex "PHSUBWrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDYrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr128")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr256")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWYrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDYrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr128")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr256")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWYrr")>; +def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWrr")>; + +def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; } -// v <- v,m. 
-def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> { - let Latency = 6; +def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSDWirr")>; +def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSWBirr")>; +def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKUSWBirr")>; + +def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> { + let Latency = 3; let NumMicroOps = 3; - let ResourceCycles = [1, 2, 1]; + let ResourceCycles = [1,2]; } +def: InstRW<[HWWriteResGroup58], (instregex "CLD")>; -// PHADD|PHSUB (S) W/D. -// v <- v,v. -def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> { +def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> { let Latency = 3; let NumMicroOps = 3; - let ResourceCycles = [1, 2]; + let ResourceCycles = [1,2]; +} +def: InstRW<[HWWriteResGroup59], (instregex "CMOVA(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup59], (instregex "CMOVBE(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)r1")>; +def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup59], (instregex "RCL8r1")>; +def: InstRW<[HWWriteResGroup59], (instregex "RCL8ri")>; +def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)r1")>; +def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)ri")>; +def: InstRW<[HWWriteResGroup59], (instregex "RCR8r1")>; +def: InstRW<[HWWriteResGroup59], (instregex "RCR8ri")>; + +def HWWriteResGroup60 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup60], (instregex "ROL(16|32|64)rCL")>; +def: InstRW<[HWWriteResGroup60], (instregex "ROL8rCL")>; +def: InstRW<[HWWriteResGroup60], (instregex "ROR(16|32|64)rCL")>; +def: InstRW<[HWWriteResGroup60], (instregex "ROR8rCL")>; +def: InstRW<[HWWriteResGroup60], (instregex "SAR(16|32|64)rCL")>; +def: InstRW<[HWWriteResGroup60], (instregex "SAR8rCL")>; +def: InstRW<[HWWriteResGroup60], (instregex "SHL(16|32|64)rCL")>; +def: InstRW<[HWWriteResGroup60], (instregex "SHL8rCL")>; +def: InstRW<[HWWriteResGroup60], (instregex "SHR(16|32|64)rCL")>; +def: InstRW<[HWWriteResGroup60], (instregex "SHR8rCL")>; + +def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64", - "MMX_PHADDSWrr64", - "MMX_PHSUB(W|D)rr64", - "MMX_PHSUBSWrr64", - "(V?)PH(ADD|SUB)(W|D)(Y?)rr", - "(V?)PH(ADD|SUB)SWrr(256)?")>; +def: InstRW<[HWWriteResGroup61], (instregex "FNSTSWm")>; -// v <- v,m. 
-def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { - let Latency = 6; +def HWWriteResGroup62 : SchedWriteRes<[HWPort1,HWPort4,HWPort237]> { + let Latency = 4; let NumMicroOps = 3; - let ResourceCycles = [1, 2, 1]; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP16m")>; +def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP32m")>; +def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP64m")>; +def: InstRW<[HWWriteResGroup62], (instregex "IST_F16m")>; +def: InstRW<[HWWriteResGroup62], (instregex "IST_F32m")>; +def: InstRW<[HWWriteResGroup62], (instregex "IST_FP16m")>; +def: InstRW<[HWWriteResGroup62], (instregex "IST_FP32m")>; +def: InstRW<[HWWriteResGroup62], (instregex "IST_FP64m")>; + +def HWWriteResGroup63 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; } -def : InstRW<[WritePHADDSUBm, ReadAfterLd], - (instregex "MMX_PHADD(W?)rm64", - "MMX_PHADDSWrm64", - "MMX_PHSUB(W|D)rm64", - "MMX_PHSUBSWrm64", - "(V?)PH(ADD|SUB)(W|D)(Y?)rm", - "(V?)PH(ADD|SUB)SWrm(128|256)?")>; +def: InstRW<[HWWriteResGroup63], (instregex "VPSLLVDYrm")>; +def: InstRW<[HWWriteResGroup63], (instregex "VPSRAVDYrm")>; +def: InstRW<[HWWriteResGroup63], (instregex "VPSRLVDYrm")>; -// PCMPGTQ. -// v <- v,v. -def WritePCMPGTQr : SchedWriteRes<[HWPort0]> { - let Latency = 5; - let NumMicroOps = 1; +def HWWriteResGroup63_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; } -def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; +def: InstRW<[HWWriteResGroup63_1], (instregex "VPSLLVDrm")>; +def: InstRW<[HWWriteResGroup63_1], (instregex "VPSRAVDrm")>; +def: InstRW<[HWWriteResGroup63_1], (instregex "VPSRLVDrm")>; -// v <- v,m. -def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> { - let Latency = 5; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; +def HWWriteResGroup64 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; } -def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>; +def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDSWrm64")>; +def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDWrm64")>; +def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDrm64")>; +def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBDrm64")>; +def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBSWrm64")>; +def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBWrm64")>; -// PMULLD. -// x,x / y,y,y. -def WritePMULLDr : SchedWriteRes<[HWPort0]> { +def HWWriteResGroup64_1 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> { let Latency = 10; - let NumMicroOps = 2; - let ResourceCycles = [2]; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; } -def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>; +def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDDYrm")>; +def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDSWrm256")>; +def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDWYrm")>; +def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBDYrm")>; +def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBSWrm256")>; +def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBWYrm")>; -// x,m / y,y,m. 
-def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> { - let Latency = 10; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; +def HWWriteResGroup64_2 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDDrm")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDSWrm128")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDWrm")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBDrm")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBSWrm128")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBWrm")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDDrm")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDSWrm128")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDWrm")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBDrm")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBSWrm128")>; +def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBWrm")>; + +def HWWriteResGroup65 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; } -def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>; +def: InstRW<[HWWriteResGroup65], (instregex "CMOVA(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup65], (instregex "CMOVBE(16|32|64)rm")>; -//-- Logic instructions --// +def HWWriteResGroup66 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)m1")>; +def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup66], (instregex "RCL8m1")>; +def: InstRW<[HWWriteResGroup66], (instregex "RCL8mi")>; +def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)m1")>; +def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup66], (instregex "RCR8m1")>; +def: InstRW<[HWWriteResGroup66], (instregex "RCR8mi")>; + +def HWWriteResGroup67 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,1,2,1]; +} +def: InstRW<[HWWriteResGroup67], (instregex "ROR(16|32|64)mCL")>; +def: InstRW<[HWWriteResGroup67], (instregex "ROR8mCL")>; -// PTEST. -// v,v. 
-def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> { - let Latency = 2; +def HWWriteResGroup68 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,3]; +} +def: InstRW<[HWWriteResGroup68], (instregex "ADC(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup68], (instregex "ADC8mi")>; +def: InstRW<[HWWriteResGroup68], (instregex "ADD8mi")>; +def: InstRW<[HWWriteResGroup68], (instregex "AND8mi")>; +def: InstRW<[HWWriteResGroup68], (instregex "OR8mi")>; +def: InstRW<[HWWriteResGroup68], (instregex "SUB8mi")>; +def: InstRW<[HWWriteResGroup68], (instregex "XCHG(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup68], (instregex "XCHG8rm")>; +def: InstRW<[HWWriteResGroup68], (instregex "XOR8mi")>; + +def HWWriteResGroup69 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def: InstRW<[HWWriteResGroup69], (instregex "ADC(16|32|64)mr")>; +def: InstRW<[HWWriteResGroup69], (instregex "ADC8mr")>; +def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG8rm")>; +def: InstRW<[HWWriteResGroup69], (instregex "ROL(16|32|64)mCL")>; +def: InstRW<[HWWriteResGroup69], (instregex "ROL8mCL")>; +def: InstRW<[HWWriteResGroup69], (instregex "SAR(16|32|64)mCL")>; +def: InstRW<[HWWriteResGroup69], (instregex "SAR8mCL")>; +def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mi")>; +def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mr")>; +def: InstRW<[HWWriteResGroup69], (instregex "SBB8mi")>; +def: InstRW<[HWWriteResGroup69], (instregex "SBB8mr")>; +def: InstRW<[HWWriteResGroup69], (instregex "SHL(16|32|64)mCL")>; +def: InstRW<[HWWriteResGroup69], (instregex "SHL8mCL")>; +def: InstRW<[HWWriteResGroup69], (instregex "SHR(16|32|64)mCL")>; +def: InstRW<[HWWriteResGroup69], (instregex "SHR8mCL")>; + +def HWWriteResGroup70 : SchedWriteRes<[HWPort0,HWPort1]> { + let Latency = 4; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SI64rr")>; +def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SIrr")>; +def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SI64rr")>; +def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SIrr")>; +def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SI64rr")>; +def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SIrr")>; +def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SI64rr")>; +def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SIrr")>; +def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SI64rr")>; +def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SIrr")>; +def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SI64rr")>; +def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SIrr")>; +def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SI64rr")>; +def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SIrr")>; +def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SI64rr")>; +def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SIrr")>; + +def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup71], (instregex "VCVTPS2PDYrr")>; +def: InstRW<[HWWriteResGroup71], (instregex "VPSLLDYrr")>; +def: InstRW<[HWWriteResGroup71], (instregex "VPSLLQYrr")>; +def: InstRW<[HWWriteResGroup71], (instregex "VPSLLWYrr")>; +def: InstRW<[HWWriteResGroup71], (instregex "VPSRADYrr")>; 
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRAWYrr")>; +def: InstRW<[HWWriteResGroup71], (instregex "VPSRLDYrr")>; +def: InstRW<[HWWriteResGroup71], (instregex "VPSRLQYrr")>; +def: InstRW<[HWWriteResGroup71], (instregex "VPSRLWYrr")>; +def: InstRW<[HWWriteResGroup71], (instregex "VPTESTYrr")>; + +def HWWriteResGroup72 : SchedWriteRes<[HWPort0,HWPort0156]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>; +def: InstRW<[HWWriteResGroup72], (instregex "FNSTSW16r")>; -// v,m. -def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> { - let Latency = 6; - let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; +def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup73], (instregex "CVTDQ2PDrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2DQrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2PSrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "CVTSD2SSrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "CVTSI642SDrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SDrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SSrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "CVTTPD2DQrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPD2PIirr")>; +def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPI2PDirr")>; +def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPS2PIirr")>; +def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPD2PIirr")>; +def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPS2PIirr")>; +def: InstRW<[HWWriteResGroup73], (instregex "VCVTDQ2PDrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2DQrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2PSrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "VCVTPS2PHrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "VCVTSD2SSrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI642SDrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SDrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SSrr")>; +def: InstRW<[HWWriteResGroup73], (instregex "VCVTTPD2DQrr")>; + +def HWWriteResGroup74 : SchedWriteRes<[HWPort1,HWPort6]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rm")>; +def: InstRW<[HWWriteResGroup74], (instregex "IMUL64r")>; +def: InstRW<[HWWriteResGroup74], (instregex "MUL64r")>; +def: InstRW<[HWWriteResGroup74], (instregex "MULX64rr")>; -// PSLL,PSRL,PSRA W/D/Q. -// x,x / v,v,x. -def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; +def HWWriteResGroup74_16 : SchedWriteRes<[HWPort1, HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; } -def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>; +def: InstRW<[HWWriteResGroup74_16], (instregex "IMUL16r")>; +def: InstRW<[HWWriteResGroup74_16], (instregex "MUL16r")>; -// PSLL,PSRL DQ. 
-def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>; +def HWWriteResGroup74_32 : SchedWriteRes<[HWPort1,HWPort0156]> { + let Latency = 4; + let NumMicroOps = 3; +} +def: InstRW<[HWWriteResGroup74_32], (instregex "IMUL32r")>; +def: InstRW<[HWWriteResGroup74_32], (instregex "MUL32r")>; -//-- Other --// +def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup75], (instregex "FICOM16m")>; +def: InstRW<[HWWriteResGroup75], (instregex "FICOM32m")>; +def: InstRW<[HWWriteResGroup75], (instregex "FICOMP16m")>; +def: InstRW<[HWWriteResGroup75], (instregex "FICOMP32m")>; -// EMMS. -def WriteEMMS : SchedWriteRes<[]> { - let Latency = 13; - let NumMicroOps = 31; +def HWWriteResGroup76 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SI64rm")>; +def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SIrm")>; +def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SI64rm")>; +def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SIrm")>; +def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SI64rm")>; +def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SIrm")>; +def: InstRW<[HWWriteResGroup76], (instregex "CVTTSS2SIrm")>; +def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SI64rm")>; +def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SIrm")>; +def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SI64rm")>; +def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SIrm")>; +def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SI64rm")>; +def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SIrm")>; +def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SI64rm")>; +def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SIrm")>; + +def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>; +def: InstRW<[HWWriteResGroup77], (instregex "VCVTPS2PDYrm")>; -//=== Floating Point XMM and YMM Instructions ===// -//-- Move instructions --// +def HWWriteResGroup77_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup77_1], (instregex "VPTESTYrm")>; -// MOVMSKP S/D. -// r32 <- x. -def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> { - let Latency = 3; +def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>; +def: InstRW<[HWWriteResGroup78], (instregex "CVTDQ2PDrm")>; +def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2DQrm")>; +def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2PSrm")>; +def: InstRW<[HWWriteResGroup78], (instregex "CVTTPD2DQrm")>; +def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTPD2PIirm")>; +def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTTPD2PIirm")>; +def: InstRW<[HWWriteResGroup78], (instregex "VCVTDQ2PDrm")>; -// r32 <- y. 
-def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> { - let Latency = 2; +def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>; +def: InstRW<[HWWriteResGroup78_1], (instregex "CVTSD2SSrm")>; +def: InstRW<[HWWriteResGroup78_1], (instregex "MMX_CVTPI2PDirm")>; +def: InstRW<[HWWriteResGroup78_1], (instregex "VCVTSD2SSrm")>; -// VPERM2F128. -def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>; -def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>; +def HWWriteResGroup79 : SchedWriteRes<[HWPort1,HWPort6,HWPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup79], (instregex "MULX64rm")>; -// BLENDVP S/D. -def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>; -def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>; +def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBYrm")>; +def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBrm")>; +def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWYrm")>; +def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWrm")>; -// VBROADCASTF128. -def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>; +def HWWriteResGroup81 : SchedWriteRes<[HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def: InstRW<[HWWriteResGroup81], (instregex "FNCLEX")>; -// EXTRACTPS. -// r32,x,i. -def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> { - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; +def HWWriteResGroup82 : SchedWriteRes<[HWPort015,HWPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; } -def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>; +def: InstRW<[HWWriteResGroup82], (instregex "VZEROUPPER")>; -// m32,x,i. -def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> { +def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> { let Latency = 4; - let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; } -def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>; +def: InstRW<[HWWriteResGroup83], (instregex "LAR(16|32|64)rr")>; -// VEXTRACTF128. -// x,y,i. -def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>; +def HWWriteResGroup84 : SchedWriteRes<[HWPort0,HWPort4,HWPort237,HWPort15]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDYmr")>; +def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDmr")>; +def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSYmr")>; +def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSmr")>; +def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDYmr")>; +def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDmr")>; +def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQYmr")>; +def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQmr")>; + +def HWWriteResGroup85 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup85], (instregex "VCVTPS2PHmr")>; -// m128,y,i. 
-def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> { - let Latency = 4; +def HWWriteResGroup86 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort0156]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup86], (instregex "SHLD(16|32|64)mri8")>; +def: InstRW<[HWWriteResGroup86], (instregex "SHRD(16|32|64)mri8")>; + +def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[HWWriteResGroup87], (instregex "LAR(16|32|64)rm")>; +def: InstRW<[HWWriteResGroup87], (instregex "LSL(16|32|64)rm")>; + +def HWWriteResGroup88 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 6; + let ResourceCycles = [1,1,4]; +} +def: InstRW<[HWWriteResGroup88], (instregex "PUSHF16")>; +def: InstRW<[HWWriteResGroup88], (instregex "PUSHF64")>; + +def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDUBSWrr64")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDWDirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHRSWrr64")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHUWirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHWirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULLWirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULUDQirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MMX_PSADBWirr")>; +def: InstRW<[HWWriteResGroup89], (instregex "MUL_FPrST0")>; +def: InstRW<[HWWriteResGroup89], (instregex "MUL_FST0r")>; +def: InstRW<[HWWriteResGroup89], (instregex "MUL_FrST0")>; +def: InstRW<[HWWriteResGroup89], (instregex "PCMPGTQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PHMINPOSUWrr128")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMADDUBSWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMADDWDrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULDQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULHRSWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULHUWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULHWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULLWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PMULUDQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "PSADBWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "RCPPSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "RCPSSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "RSQRTPSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "RSQRTSSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPHMINPOSUWrr128")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWYrr")>; +def: 
InstRW<[HWWriteResGroup89], (instregex "VPMULHWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWYrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWrr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VRCPPSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VRCPSSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTPSr")>; +def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTSSr")>; + +def HWWriteResGroup90 : SchedWriteRes<[HWPort01]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup90], (instregex "MULPDrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "MULPSrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "MULSDrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "MULSSrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULPDYrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULPDrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULPSYrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULPSrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULSDrr")>; +def: InstRW<[HWWriteResGroup90], (instregex "VMULSSrr")>; +def: InstRW<[HWWriteResGroup90], + (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r", + "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>; + +def HWWriteResGroup91 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 10; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDUBSWrm64")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDWDirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHRSWrm64")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHUWirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHWirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULLWirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULUDQirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "MMX_PSADBWirm")>; +def: InstRW<[HWWriteResGroup91], (instregex "RCPSSm")>; +def: InstRW<[HWWriteResGroup91], (instregex "RSQRTSSm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VRCPSSm")>; +def: InstRW<[HWWriteResGroup91], (instregex "VRSQRTSSm")>; + +def HWWriteResGroup91_1 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 18; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>; +def: InstRW<[HWWriteResGroup91_1], (instregex "SQRTSSm")>; +def: InstRW<[HWWriteResGroup91_1], (instregex "VDIVSSrm")>; -// VINSERTF128. -// y,y,x,i. 
-def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>; +def HWWriteResGroup91_2 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup91_2], (instregex "PCMPGTQrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "PHMINPOSUWrm128")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "PMADDUBSWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "PMADDWDrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "PMULDQrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHRSWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHUWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "PMULLWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "PMULUDQrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "PSADBWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "RCPPSm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "RSQRTPSm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPCMPGTQrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPHMINPOSUWrm128")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPMADDUBSWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPMADDWDrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULDQrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHRSWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHUWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULLWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULUDQrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VPSADBWrm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VRCPPSm")>; +def: InstRW<[HWWriteResGroup91_2], (instregex "VRSQRTPSm")>; + +def HWWriteResGroup91_3 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F32m")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F64m")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "VPCMPGTQYrm")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "VPMADDUBSWYrm")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "VPMADDWDYrm")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULDQYrm")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHRSWYrm")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHUWYrm")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHWYrm")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULLWYrm")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULUDQYrm")>; +def: InstRW<[HWWriteResGroup91_3], (instregex "VPSADBWYrm")>; + +def HWWriteResGroup92 : SchedWriteRes<[HWPort01,HWPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup92], (instregex "MULPDrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "MULPSrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VMULPDrm")>; +def: InstRW<[HWWriteResGroup92], (instregex "VMULPSrm")>; +def: InstRW<[HWWriteResGroup92], + (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m")>; -// y,y,m128,i. 
-def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> { - let Latency = 4; +def HWWriteResGroup92_1 : SchedWriteRes<[HWPort01,HWPort23]> { + let Latency = 12; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteFShuffle256, ReadAfterLd], (instregex "VINSERTF128rm")>; +def: InstRW<[HWWriteResGroup92_1], (instregex "VMULPDYrm")>; +def: InstRW<[HWWriteResGroup92_1], (instregex "VMULPSYrm")>; +def: InstRW<[HWWriteResGroup92_1], + (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>; -// VMASKMOVP S/D. -// v,v,m. -def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> { - let Latency = 4; +def HWWriteResGroup92_2 : SchedWriteRes<[HWPort01,HWPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup92_2], (instregex "MULSDrm")>; +def: InstRW<[HWWriteResGroup92_2], (instregex "MULSSrm")>; +def: InstRW<[HWWriteResGroup92_2], (instregex "VMULSDrm")>; +def: InstRW<[HWWriteResGroup92_2], (instregex "VMULSSrm")>; +def: InstRW<[HWWriteResGroup92_2], + (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>; + +def HWWriteResGroup93 : SchedWriteRes<[HWPort1,HWPort5]> { + let Latency = 5; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [1,2]; +} +def: InstRW<[HWWriteResGroup93], (instregex "CVTSI642SSrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "HADDPDrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "HADDPSrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "HSUBPDrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "HSUBPSrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VCVTSI642SSrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDYrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSYrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDYrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSYrr")>; +def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSrr")>; + +def HWWriteResGroup94 : SchedWriteRes<[HWPort1,HWPort6,HWPort06]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>; +def: InstRW<[HWWriteResGroup94], (instregex "STR(16|32|64)r")>; -// m128,x,x. 
-def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { - let Latency = 13; +def HWWriteResGroup95 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[HWWriteResGroup95], (instregex "MULX32rr")>; + +def HWWriteResGroup96 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[HWWriteResGroup96], (instregex "HADDPDrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "HADDPSrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "HSUBPDrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "HSUBPSrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHADDPDrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHADDPSrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPDrm")>; +def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPSrm")>; + +def HWWriteResGroup96_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { + let Latency = 12; let NumMicroOps = 4; - let ResourceCycles = [1, 1, 1, 1]; + let ResourceCycles = [1,2,1]; } -def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>; +def: InstRW<[HWWriteResGroup96_1], (instregex "VHADDPDYrm")>; +def: InstRW<[HWWriteResGroup96_1], (instregex "VHADDPSYrm")>; +def: InstRW<[HWWriteResGroup96_1], (instregex "VHSUBPDYrm")>; +def: InstRW<[HWWriteResGroup96_1], (instregex "VHSUBPSYrm")>; -// m256,y,y. -def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> { - let Latency = 14; +def HWWriteResGroup97 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { + let Latency = 10; let NumMicroOps = 4; - let ResourceCycles = [1, 1, 1, 1]; + let ResourceCycles = [1,1,1,1]; } -def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>; +def: InstRW<[HWWriteResGroup97], (instregex "CVTTSS2SI64rm")>; -// VGATHERDPS. -// x. -def WriteVGATHERDPS128 : SchedWriteRes<[]> { - let NumMicroOps = 20; +def HWWriteResGroup98 : SchedWriteRes<[HWPort1,HWPort23,HWPort06,HWPort0156]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; } -def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>; +def: InstRW<[HWWriteResGroup98], (instregex "MULX32rm")>; -// y. -def WriteVGATHERDPS256 : SchedWriteRes<[]> { - let NumMicroOps = 34; +def HWWriteResGroup99 : SchedWriteRes<[HWPort6,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; } -def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>; +def: InstRW<[HWWriteResGroup99], (instregex "PAUSE")>; -// VGATHERQPS. -// x. -def WriteVGATHERQPS128 : SchedWriteRes<[]> { - let NumMicroOps = 15; +def HWWriteResGroup100 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; } -def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>; +def: InstRW<[HWWriteResGroup100], (instregex "XSETBV")>; -// y. -def WriteVGATHERQPS256 : SchedWriteRes<[]> { - let NumMicroOps = 22; +def HWWriteResGroup101 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; } -def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>; +def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG(16|32|64)rr")>; +def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG8rr")>; -// VGATHERDPD. -// x. 
-def WriteVGATHERDPD128 : SchedWriteRes<[]> { - let NumMicroOps = 12; +def HWWriteResGroup102 : SchedWriteRes<[HWPort1,HWPort5]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>; +def: InstRW<[HWWriteResGroup102], (instregex "VCVTDQ2PDYrr")>; +def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2DQYrr")>; +def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2PSYrr")>; +def: InstRW<[HWWriteResGroup102], (instregex "VCVTPS2PHYrr")>; +def: InstRW<[HWWriteResGroup102], (instregex "VCVTTPD2DQYrr")>; -// y. -def WriteVGATHERDPD256 : SchedWriteRes<[]> { - let NumMicroOps = 20; +def HWWriteResGroup103 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI16m")>; +def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI32m")>; +def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI16m")>; +def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI32m")>; +def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI16m")>; +def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI32m")>; +def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPDm")>; +def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPSm")>; + +def HWWriteResGroup103_1 : SchedWriteRes<[HWPort1,HWPort23]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDPDm")>; +def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDPSm")>; +def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDSDm")>; +def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDSSm")>; +def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDPDm")>; +def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDPSm")>; +def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDSDm")>; +def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDSSm")>; + +def HWWriteResGroup104 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>; +def: InstRW<[HWWriteResGroup104], (instregex "VCVTDQ2PDYrm")>; -// VGATHERQPD. -// x. -def WriteVGATHERQPD128 : SchedWriteRes<[]> { - let NumMicroOps = 14; +def HWWriteResGroup105 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; } -def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>; +def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL")>; +def: InstRW<[HWWriteResGroup105], (instregex "SHRD(16|32|64)rrCL")>; -// y. -def WriteVGATHERQPD256 : SchedWriteRes<[]> { - let NumMicroOps = 22; +def HWWriteResGroup106 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; } -def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>; +def: InstRW<[HWWriteResGroup106], (instregex "VCVTPS2PHYmr")>; -//-- Conversion instructions --// +def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[HWWriteResGroup107], (instregex "SLDT(16|32|64)r")>; -// CVTPD2PS. -// x,x. 
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>; +def HWWriteResGroup108 : SchedWriteRes<[HWPort6,HWPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,5]; +} +def: InstRW<[HWWriteResGroup108], (instregex "STD")>; -// x,m128. -def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>; +def HWWriteResGroup109 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 12; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,1,2]; +} +def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL")>; +def: InstRW<[HWWriteResGroup109], (instregex "SHRD(16|32|64)mrCL")>; -// x,y. -def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> { - let Latency = 5; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; +def HWWriteResGroup110 : SchedWriteRes<[HWPort5]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; } -def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "AESDECLASTrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "AESDECrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "AESENCLASTrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "AESENCrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "VAESDECLASTrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "VAESDECrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "VAESENCLASTrr")>; +def: InstRW<[HWWriteResGroup110], (instregex "VAESENCrr")>; -// x,m256. -def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { - let Latency = 9; +def HWWriteResGroup111 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 13; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup111], (instregex "AESDECLASTrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "AESDECrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "AESENCLASTrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "AESENCrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "VAESDECLASTrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "VAESDECrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "VAESENCLASTrm")>; +def: InstRW<[HWWriteResGroup111], (instregex "VAESENCrm")>; + +def HWWriteResGroup112 : SchedWriteRes<[HWPort0,HWPort5]> { + let Latency = 7; let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; + let ResourceCycles = [1,2]; } -def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>; +def: InstRW<[HWWriteResGroup112], (instregex "MPSADBWrri")>; +def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWYrri")>; +def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWrri")>; -// CVTSD2SS. -// x,x. -def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>; +def HWWriteResGroup113 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[HWWriteResGroup113], (instregex "MPSADBWrmi")>; +def: InstRW<[HWWriteResGroup113], (instregex "VMPSADBWrmi")>; -// x,m64. -def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>; +def HWWriteResGroup113_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[HWWriteResGroup113_1], (instregex "VMPSADBWYrmi")>; -// CVTPS2PD. -// x,x. 
-def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; +def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> { + let Latency = 7; + let NumMicroOps = 7; + let ResourceCycles = [2,2,1,2]; } -def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>; +def: InstRW<[HWWriteResGroup114], (instregex "LOOP")>; -// x,m64. -// y,m128. -def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> { - let Latency = 5; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; +def HWWriteResGroup115 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 15; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>; +def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI16m")>; +def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI32m")>; -// y,x. -def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> { - let Latency = 5; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; +def HWWriteResGroup116 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>; +def: InstRW<[HWWriteResGroup116], (instregex "DPPDrri")>; +def: InstRW<[HWWriteResGroup116], (instregex "VDPPDrri")>; -// CVTSS2SD. -// x,x. -def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> { - let Latency = 2; - let NumMicroOps = 2; - let ResourceCycles = [1, 1]; +def HWWriteResGroup117 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { + let Latency = 15; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; } -def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>; +def: InstRW<[HWWriteResGroup117], (instregex "DPPDrmi")>; +def: InstRW<[HWWriteResGroup117], (instregex "VDPPDrmi")>; -// x,m32. -def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> { - let Latency = 5; +def HWWriteResGroup118 : SchedWriteRes<[HWPort0]> { + let Latency = 10; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [2]; } -def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>; - -// CVTDQ2PD. -// x,x. -def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>; - -// y,x. -def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>; - -// CVT(T)PD2DQ. -// x,x. -def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>; -// x,m128. -def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>; -// x,y. -def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>; -// x,m256. -def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>; - -// CVT(T)PS2PI. -// mm,x. -def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>; - -// CVTPI2PD. -// x,mm. -def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>; - -// CVT(T)PD2PI. -// mm,x. -def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>; - -// CVSTSI2SS. -// x,r32. -def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>; - -// CVT(T)SS2SI. -// r32,x. -def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>; -// r32,m32. -def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>; - -// CVTSI2SD. -// x,r32/64. -def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SS(64)?rr")>; - -// CVTSD2SI. -// r32/64 -def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>; -// r32,m32. 
-def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>; - -// VCVTPS2PH. -// x,v,i. -def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>; -// m,v,i. -def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>; - -// VCVTPH2PS. -// v,x. -def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>; +def: InstRW<[HWWriteResGroup118], (instregex "PMULLDrr")>; +def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDYrr")>; +def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDrr")>; -//-- Arithmetic instructions --// +def HWWriteResGroup119 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 16; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup119], (instregex "PMULLDrm")>; +def: InstRW<[HWWriteResGroup119], (instregex "VPMULLDrm")>; -// HADD, HSUB PS/PD -// x,x / v,v,v. -def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> { - let Latency = 5; +def HWWriteResGroup119_1 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 17; let NumMicroOps = 3; - let ResourceCycles = [1, 2]; + let ResourceCycles = [2,1]; } -def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>; +def: InstRW<[HWWriteResGroup119_1], (instregex "VPMULLDYrm")>; -// x,m / v,v,m. -def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> { - let Latency = 9; - let NumMicroOps = 4; - let ResourceCycles = [1, 2, 1]; +def HWWriteResGroup120 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> { + let Latency = 16; + let NumMicroOps = 10; + let ResourceCycles = [1,1,1,4,1,2]; } -def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>; +def: InstRW<[HWWriteResGroup120], (instregex "RCL(16|32|64)mCL")>; +def: InstRW<[HWWriteResGroup120], (instregex "RCL8mCL")>; -// MULL SS/SD PS/PD. -// x,x / v,v,v. -def WriteMULr : SchedWriteRes<[HWPort01]> { - let Latency = 5; +def HWWriteResGroup121 : SchedWriteRes<[HWPort0]> { + let Latency = 11; + let NumMicroOps = 1; + let ResourceCycles = [1]; } -def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>; +def: InstRW<[HWWriteResGroup121], (instregex "DIVPSrr")>; +def: InstRW<[HWWriteResGroup121], (instregex "DIVSSrr")>; -// x,m / v,v,m. -def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> { - let Latency = 9; +def HWWriteResGroup122 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 17; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>; +def: InstRW<[HWWriteResGroup122], (instregex "DIVPSrm")>; -// VDIVPS. -// y,y,y. -def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 19; // 18-21 cycles. +def HWWriteResGroup122_1 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 16; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup122_1], (instregex "DIVSSrm")>; + +def HWWriteResGroup123 : SchedWriteRes<[HWPort0]> { + let Latency = 11; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [3]; } -def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>; +def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRIrr")>; +def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRM128rr")>; +def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRIrr")>; +def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRM128rr")>; -// y,y,m256. -def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 23; // 18-21 + 4 cycles. 
- let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; +def HWWriteResGroup124 : SchedWriteRes<[HWPort0,HWPort5]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; } -def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>; +def: InstRW<[HWWriteResGroup124], (instregex "PCLMULQDQrr")>; +def: InstRW<[HWWriteResGroup124], (instregex "VPCLMULQDQrr")>; -// VDIVPD. -// y,y,y. -def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 27; // 19-35 cycles. +def HWWriteResGroup125 : SchedWriteRes<[HWPort0,HWPort015]> { + let Latency = 11; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [2,1]; } -def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>; +def: InstRW<[HWWriteResGroup125], (instregex "VRCPPSYr")>; +def: InstRW<[HWWriteResGroup125], (instregex "VRSQRTPSYr")>; -// y,y,m256. -def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 31; // 19-35 + 4 cycles. +def HWWriteResGroup126 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 17; let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; + let ResourceCycles = [3,1]; } -def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>; +def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRIrm")>; +def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRM128rm")>; +def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRIrm")>; +def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRM128rm")>; -// VRCPPS. -// y,y. -def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 7; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; +def HWWriteResGroup127 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; } -def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>; +def: InstRW<[HWWriteResGroup127], (instregex "PCLMULQDQrm")>; +def: InstRW<[HWWriteResGroup127], (instregex "VPCLMULQDQrm")>; -// y,m256. -def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 11; +def HWWriteResGroup128 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> { + let Latency = 18; let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; + let ResourceCycles = [2,1,1]; } -def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>; +def: InstRW<[HWWriteResGroup128], (instregex "VRCPPSYm")>; +def: InstRW<[HWWriteResGroup128], (instregex "VRSQRTPSYm")>; -// ROUND SS/SD PS/PD. -// v,v,i. 
-def WriteROUNDr : SchedWriteRes<[HWPort1]> { - let Latency = 6; +def HWWriteResGroup129 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> { + let Latency = 11; + let NumMicroOps = 7; + let ResourceCycles = [2,2,3]; +} +def: InstRW<[HWWriteResGroup129], (instregex "RCL(16|32|64)rCL")>; +def: InstRW<[HWWriteResGroup129], (instregex "RCR(16|32|64)rCL")>; + +def HWWriteResGroup130 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> { + let Latency = 11; + let NumMicroOps = 9; + let ResourceCycles = [1,4,1,3]; +} +def: InstRW<[HWWriteResGroup130], (instregex "RCL8rCL")>; + +def HWWriteResGroup131 : SchedWriteRes<[HWPort06,HWPort0156]> { + let Latency = 11; + let NumMicroOps = 11; + let ResourceCycles = [2,9]; +} +def: InstRW<[HWWriteResGroup131], (instregex "LOOPE")>; +def: InstRW<[HWWriteResGroup131], (instregex "LOOPNE")>; + +def HWWriteResGroup132 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> { + let Latency = 17; + let NumMicroOps = 14; + let ResourceCycles = [1,1,1,4,2,5]; +} +def: InstRW<[HWWriteResGroup132], (instregex "CMPXCHG8B")>; + +def HWWriteResGroup133 : SchedWriteRes<[HWPort0]> { + let Latency = 13; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup133], (instregex "SQRTPSr")>; +def: InstRW<[HWWriteResGroup133], (instregex "SQRTSSr")>; +def: InstRW<[HWWriteResGroup133], (instregex "VDIVPSrr")>; +def: InstRW<[HWWriteResGroup133], (instregex "VDIVSSrr")>; + +def HWWriteResGroup134 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 19; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup134], (instregex "DIVSDrm")>; +def: InstRW<[HWWriteResGroup134], (instregex "SQRTPSm")>; +def: InstRW<[HWWriteResGroup134], (instregex "VDIVPSrm")>; +def: InstRW<[HWWriteResGroup134], (instregex "VSQRTSSm")>; + +def HWWriteResGroup135 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> { + let Latency = 19; + let NumMicroOps = 11; + let ResourceCycles = [2,1,1,3,1,3]; +} +def: InstRW<[HWWriteResGroup135], (instregex "RCR(16|32|64)mCL")>; +def: InstRW<[HWWriteResGroup135], (instregex "RCR8mCL")>; + +def HWWriteResGroup136 : SchedWriteRes<[HWPort0]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup136], (instregex "DIVPDrr")>; +def: InstRW<[HWWriteResGroup136], (instregex "DIVSDrr")>; +def: InstRW<[HWWriteResGroup136], (instregex "VSQRTPSr")>; +def: InstRW<[HWWriteResGroup136], (instregex "VSQRTSSr")>; + +def HWWriteResGroup137 : SchedWriteRes<[HWPort5]> { + let Latency = 14; let NumMicroOps = 2; let ResourceCycles = [2]; } -def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>; +def: InstRW<[HWWriteResGroup137], (instregex "AESIMCrr")>; +def: InstRW<[HWWriteResGroup137], (instregex "VAESIMCrr")>; -// v,m,i. -def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> { - let Latency = 10; +def HWWriteResGroup138 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 20; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup138], (instregex "DIVPDrm")>; +def: InstRW<[HWWriteResGroup138], (instregex "VSQRTPSm")>; + +def HWWriteResGroup139 : SchedWriteRes<[HWPort5,HWPort23]> { + let Latency = 20; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [2,1]; } -def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>; +def: InstRW<[HWWriteResGroup139], (instregex "AESIMCrm")>; +def: InstRW<[HWWriteResGroup139], (instregex "VAESIMCrm")>; -// DPPS. 
-// x,x,i / v,v,v,i. -def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> { +def HWWriteResGroup140 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> { let Latency = 14; let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[HWWriteResGroup140], (instregex "DPPSrri")>; +def: InstRW<[HWWriteResGroup140], (instregex "VDPPSYrri")>; +def: InstRW<[HWWriteResGroup140], (instregex "VDPPSrri")>; + +def HWWriteResGroup141 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { + let Latency = 20; + let NumMicroOps = 5; + let ResourceCycles = [2,1,1,1]; +} +def: InstRW<[HWWriteResGroup141], (instregex "DPPSrmi")>; +def: InstRW<[HWWriteResGroup141], (instregex "VDPPSrmi")>; + +def HWWriteResGroup141_1 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> { + let Latency = 21; + let NumMicroOps = 5; + let ResourceCycles = [2,1,1,1]; +} +def: InstRW<[HWWriteResGroup141_1], (instregex "VDPPSYrmi")>; + +def HWWriteResGroup142 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [2,3,1,4]; +} +def: InstRW<[HWWriteResGroup142], (instregex "RCR8rCL")>; + +def HWWriteResGroup143 : SchedWriteRes<[HWPort23,HWPort0156]> { + let Latency = 19; + let NumMicroOps = 15; + let ResourceCycles = [1,14]; +} +def: InstRW<[HWWriteResGroup143], (instregex "POPF16")>; + +def HWWriteResGroup144 : SchedWriteRes<[HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 21; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,1,2]; +} +def: InstRW<[HWWriteResGroup144], (instregex "INSB")>; +def: InstRW<[HWWriteResGroup144], (instregex "INSL")>; +def: InstRW<[HWWriteResGroup144], (instregex "INSW")>; + +def HWWriteResGroup145 : SchedWriteRes<[HWPort5]> { + let Latency = 16; + let NumMicroOps = 16; + let ResourceCycles = [16]; } -def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>; +def: InstRW<[HWWriteResGroup145], (instregex "VZEROALL")>; -// x,m,i / v,v,m,i. -def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> { +def HWWriteResGroup146 : SchedWriteRes<[HWPort0,HWPort4,HWPort5,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 22; + let NumMicroOps = 19; + let ResourceCycles = [2,1,4,1,1,4,6]; +} +def: InstRW<[HWWriteResGroup146], (instregex "CMPXCHG16B")>; + +def HWWriteResGroup147 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> { + let Latency = 17; + let NumMicroOps = 15; + let ResourceCycles = [2,1,2,4,2,4]; +} +def: InstRW<[HWWriteResGroup147], (instregex "XCH_F")>; + +def HWWriteResGroup148 : SchedWriteRes<[HWPort0,HWPort5,HWPort0156]> { let Latency = 18; - let NumMicroOps = 6; - let ResourceCycles = [2, 1, 1, 1, 1]; + let NumMicroOps = 8; + let ResourceCycles = [4,3,1]; } -def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>; +def: InstRW<[HWWriteResGroup148], (instregex "PCMPESTRIrr")>; +def: InstRW<[HWWriteResGroup148], (instregex "VPCMPESTRIrr")>; -// DPPD. -// x,x,i. -def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> { - let Latency = 9; - let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; +def HWWriteResGroup149 : SchedWriteRes<[HWPort5,HWPort6,HWPort06,HWPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,5]; } -def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>; +def: InstRW<[HWWriteResGroup149], (instregex "CPUID")>; +def: InstRW<[HWWriteResGroup149], (instregex "RDTSC")>; -// x,m,i. 
-def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> { - let Latency = 13; - let NumMicroOps = 4; - let ResourceCycles = [1, 1, 1, 1]; +def HWWriteResGroup150 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort0156]> { + let Latency = 24; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; } -def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>; +def: InstRW<[HWWriteResGroup150], (instregex "PCMPESTRIrm")>; +def: InstRW<[HWWriteResGroup150], (instregex "VPCMPESTRIrm")>; -// VFMADD. -// v,v,v. -def WriteFMADDr : SchedWriteRes<[HWPort01]> { - let Latency = 5; +def HWWriteResGroup151 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> { + let Latency = 23; + let NumMicroOps = 19; + let ResourceCycles = [3,1,15]; +} +def: InstRW<[HWWriteResGroup151], (instregex "XRSTOR(64)?")>; + +def HWWriteResGroup152 : SchedWriteRes<[HWPort0,HWPort5,HWPort015,HWPort0156]> { + let Latency = 19; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[HWWriteResGroup152], (instregex "PCMPESTRM128rr")>; +def: InstRW<[HWWriteResGroup152], (instregex "VPCMPESTRM128rr")>; + +def HWWriteResGroup153 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015,HWPort0156]> { + let Latency = 25; + let NumMicroOps = 10; + let ResourceCycles = [4,3,1,1,1]; +} +def: InstRW<[HWWriteResGroup153], (instregex "PCMPESTRM128rm")>; +def: InstRW<[HWWriteResGroup153], (instregex "VPCMPESTRM128rm")>; + +def HWWriteResGroup154 : SchedWriteRes<[HWPort0]> { + let Latency = 20; let NumMicroOps = 1; + let ResourceCycles = [1]; } -def : InstRW<[WriteFMADDr], - (instregex - // 3p forms. - "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?", - // 3s forms. - "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r", - // 4s/4s_int forms. - "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?", - // 4p forms. - "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>; - -// v,v,m. -def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> { - let Latency = 9; +def: InstRW<[HWWriteResGroup154], (instregex "DIV_FPrST0")>; +def: InstRW<[HWWriteResGroup154], (instregex "DIV_FST0r")>; +def: InstRW<[HWWriteResGroup154], (instregex "DIV_FrST0")>; +def: InstRW<[HWWriteResGroup154], (instregex "SQRTPDr")>; +def: InstRW<[HWWriteResGroup154], (instregex "SQRTSDr")>; +def: InstRW<[HWWriteResGroup154], (instregex "VDIVPDrr")>; +def: InstRW<[HWWriteResGroup154], (instregex "VDIVSDrr")>; + +def HWWriteResGroup155 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 27; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteFMADDm], - (instregex - // 3p forms. - "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?", - // 3s forms. - "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m", - // 4s/4s_int forms. - "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?", - // 4p forms. - "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>; +def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F32m")>; +def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F64m")>; +def: InstRW<[HWWriteResGroup155], (instregex "VSQRTPDm")>; -//-- Math instructions --// +def HWWriteResGroup155_1 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 26; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[HWWriteResGroup155_1], (instregex "SQRTPDm")>; +def: InstRW<[HWWriteResGroup155_1], (instregex "VDIVPDrm")>; +def: InstRW<[HWWriteResGroup155_1], (instregex "VSQRTSDm")>; -// VSQRTPS. -// y,y. 
-def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 19; - let NumMicroOps = 3; - let ResourceCycles = [2, 1]; +def HWWriteResGroup155_2 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 25; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>; +def: InstRW<[HWWriteResGroup155_2], (instregex "SQRTSDm")>; +def: InstRW<[HWWriteResGroup155_2], (instregex "VDIVSDrm")>; -// y,m256. -def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 23; - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; +def HWWriteResGroup156 : SchedWriteRes<[HWPort5,HWPort6,HWPort0156]> { + let Latency = 20; + let NumMicroOps = 10; + let ResourceCycles = [1,2,7]; } -def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>; +def: InstRW<[HWWriteResGroup156], (instregex "MWAITrr")>; -// VSQRTPD. -// y,y. -def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 28; +def HWWriteResGroup157 : SchedWriteRes<[HWPort0]> { + let Latency = 21; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup157], (instregex "VSQRTPDr")>; +def: InstRW<[HWWriteResGroup157], (instregex "VSQRTSDr")>; + +def HWWriteResGroup159 : SchedWriteRes<[HWPort0,HWPort015]> { + let Latency = 21; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [2,1]; } -def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>; +def: InstRW<[HWWriteResGroup159], (instregex "VDIVPSYrr")>; +def: InstRW<[HWWriteResGroup159], (instregex "VSQRTPSYr")>; -// y,m256. -def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 32; +def HWWriteResGroup160 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> { + let Latency = 28; let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; + let ResourceCycles = [2,1,1]; } -def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>; +def: InstRW<[HWWriteResGroup160], (instregex "VDIVPSYrm")>; +def: InstRW<[HWWriteResGroup160], (instregex "VSQRTPSYm")>; -// RSQRT SS/PS. -// x,x. -def WriteRSQRTr : SchedWriteRes<[HWPort0]> { - let Latency = 5; +def HWWriteResGroup161 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 30; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>; +def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI16m")>; +def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI32m")>; -// x,m128. -def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> { - let Latency = 9; +def HWWriteResGroup162 : SchedWriteRes<[HWPort0]> { + let Latency = 24; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FPrST0")>; +def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FST0r")>; +def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FrST0")>; + +def HWWriteResGroup163 : SchedWriteRes<[HWPort0,HWPort23]> { + let Latency = 31; let NumMicroOps = 2; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1,1]; } -def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>; +def: InstRW<[HWWriteResGroup163], (instregex "DIV_F32m")>; +def: InstRW<[HWWriteResGroup163], (instregex "DIV_F64m")>; -// RSQRTPS 256. -// y,y. 
-def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> { - let Latency = 7; +def HWWriteResGroup164 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> { + let Latency = 30; + let NumMicroOps = 27; + let ResourceCycles = [1,5,1,1,19]; +} +def: InstRW<[HWWriteResGroup164], (instregex "XSAVE64")>; + +def HWWriteResGroup165 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> { + let Latency = 31; + let NumMicroOps = 28; + let ResourceCycles = [1,6,1,1,19]; +} +def: InstRW<[HWWriteResGroup165], (instregex "XSAVE(OPT)?")>; + +def HWWriteResGroup166 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> { + let Latency = 34; let NumMicroOps = 3; - let ResourceCycles = [2, 1]; + let ResourceCycles = [1,1,1]; } -def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>; +def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI16m")>; +def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI32m")>; -// y,m256. -def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> { - let Latency = 11; - let NumMicroOps = 4; - let ResourceCycles = [2, 1, 1]; +def HWWriteResGroup167 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015]> { + let Latency = 34; + let NumMicroOps = 11; + let ResourceCycles = [2,7,1,1]; } -def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>; +def: InstRW<[HWWriteResGroup167], (instregex "AESKEYGENASSIST128rm")>; +def: InstRW<[HWWriteResGroup167], (instregex "VAESKEYGENASSIST128rm")>; -//-- Logic instructions --// +def HWWriteResGroup168 : SchedWriteRes<[HWPort0,HWPort5,HWPort015]> { + let Latency = 29; + let NumMicroOps = 11; + let ResourceCycles = [2,7,2]; +} +def: InstRW<[HWWriteResGroup168], (instregex "AESKEYGENASSIST128rr")>; +def: InstRW<[HWWriteResGroup168], (instregex "VAESKEYGENASSIST128rr")>; -// AND, ANDN, OR, XOR PS/PD. -// x,x / v,v,v. -def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>; -// x,m / v,v,m. -def : InstRW<[WriteP5Ld, ReadAfterLd], - (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>; +def HWWriteResGroup170 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort06,HWPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,3,4,10]; +} +def: InstRW<[HWWriteResGroup170], (instregex "IN(16|32)ri")>; +def: InstRW<[HWWriteResGroup170], (instregex "IN(16|32)rr")>; +def: InstRW<[HWWriteResGroup170], (instregex "IN8ri")>; +def: InstRW<[HWWriteResGroup170], (instregex "IN8rr")>; -//-- Other instructions --// +def HWWriteResGroup171 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> { + let Latency = 36; + let NumMicroOps = 23; + let ResourceCycles = [1,5,2,1,4,10]; +} +def: InstRW<[HWWriteResGroup171], (instregex "OUT(16|32)ir")>; +def: InstRW<[HWWriteResGroup171], (instregex "OUT(16|32)rr")>; +def: InstRW<[HWWriteResGroup171], (instregex "OUT8ir")>; +def: InstRW<[HWWriteResGroup171], (instregex "OUT8rr")>; + +def HWWriteResGroup172 : SchedWriteRes<[HWPort01,HWPort15,HWPort015,HWPort0156]> { + let Latency = 31; + let NumMicroOps = 31; + let ResourceCycles = [8,1,21,1]; +} +def: InstRW<[HWWriteResGroup172], (instregex "MMX_EMMS")>; + +def HWWriteResGroup173 : SchedWriteRes<[HWPort0,HWPort015]> { + let Latency = 35; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[HWWriteResGroup173], (instregex "VDIVPDYrr")>; +def: InstRW<[HWWriteResGroup173], (instregex "VSQRTPDYr")>; -// VZEROUPPER. 
-def WriteVZEROUPPER : SchedWriteRes<[]> { +def HWWriteResGroup174 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> { + let Latency = 42; let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[HWWriteResGroup174], (instregex "VDIVPDYrm")>; +def: InstRW<[HWWriteResGroup174], (instregex "VSQRTPDYm")>; + +def HWWriteResGroup175 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort15,HWPort0156]> { + let Latency = 41; + let NumMicroOps = 18; + let ResourceCycles = [1,1,2,3,1,1,1,8]; +} +def: InstRW<[HWWriteResGroup175], (instregex "VMCLEARm")>; + +def HWWriteResGroup176 : SchedWriteRes<[HWPort5,HWPort0156]> { + let Latency = 42; + let NumMicroOps = 22; + let ResourceCycles = [2,20]; +} +def: InstRW<[HWWriteResGroup176], (instregex "RDTSCP")>; + +def HWWriteResGroup177 : SchedWriteRes<[HWPort0,HWPort01,HWPort23,HWPort05,HWPort06,HWPort015,HWPort0156]> { + let Latency = 61; + let NumMicroOps = 64; + let ResourceCycles = [2,2,8,1,10,2,39]; +} +def: InstRW<[HWWriteResGroup177], (instregex "FLDENVm")>; +def: InstRW<[HWWriteResGroup177], (instregex "FLDENVm")>; + +def HWWriteResGroup178 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> { + let Latency = 64; + let NumMicroOps = 88; + let ResourceCycles = [4,4,31,1,2,1,45]; +} +def: InstRW<[HWWriteResGroup178], (instregex "FXRSTOR64")>; + +def HWWriteResGroup179 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> { + let Latency = 64; + let NumMicroOps = 90; + let ResourceCycles = [4,2,33,1,2,1,47]; +} +def: InstRW<[HWWriteResGroup179], (instregex "FXRSTOR")>; + +def HWWriteResGroup180 : SchedWriteRes<[HWPort5,HWPort01,HWPort0156]> { + let Latency = 75; + let NumMicroOps = 15; + let ResourceCycles = [6,3,6]; +} +def: InstRW<[HWWriteResGroup180], (instregex "FNINIT")>; + +def HWWriteResGroup181 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> { + let Latency = 98; + let NumMicroOps = 32; + let ResourceCycles = [7,7,3,3,1,11]; +} +def: InstRW<[HWWriteResGroup181], (instregex "DIV(16|32|64)r")>; + +def HWWriteResGroup182 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156]> { + let Latency = 112; + let NumMicroOps = 66; + let ResourceCycles = [4,2,4,8,14,34]; +} +def: InstRW<[HWWriteResGroup182], (instregex "IDIV(16|32|64)r")>; + +def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,HWPort237,HWPort06,HWPort0156]> { + let Latency = 115; + let NumMicroOps = 100; + let ResourceCycles = [9,9,11,8,1,11,21,30]; } -def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>; +def: InstRW<[HWWriteResGroup183], (instregex "FSTENVm")>; +def: InstRW<[HWWriteResGroup183], (instregex "FSTENVm")>; -// VZEROALL. -def WriteVZEROALL : SchedWriteRes<[]> { +def HWWriteResGroup184 : SchedWriteRes<[HWPort0, HWPort5, HWPort15, HWPort015, HWPort06, HWPort23]> { + let Latency = 26; let NumMicroOps = 12; + let ResourceCycles = [2,2,1,3,2,2]; } -def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>; +def: InstRW<[HWWriteResGroup184], (instrs VGATHERDPDrm, + VPGATHERDQrm, + VPGATHERDDrm)>; -// LDMXCSR. 
-def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> { - let Latency = 6; - let NumMicroOps = 3; - let ResourceCycles = [1, 1, 1]; +def HWWriteResGroup185 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 24; + let NumMicroOps = 22; + let ResourceCycles = [5,3,4,1,5,4]; } -def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>; +def: InstRW<[HWWriteResGroup185], (instrs VGATHERQPDYrm, + VPGATHERQQYrm)>; -// STMXCSR. -def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> { - let Latency = 7; - let NumMicroOps = 4; - let ResourceCycles = [1, 1, 1, 1]; +def HWWriteResGroup186 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 28; + let NumMicroOps = 22; + let ResourceCycles = [5,3,4,1,5,4]; +} +def: InstRW<[HWWriteResGroup186], (instrs VPGATHERQDYrm)>; + +def HWWriteResGroup187 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 25; + let NumMicroOps = 22; + let ResourceCycles = [5,3,4,1,5,4]; +} +def: InstRW<[HWWriteResGroup187], (instrs VPGATHERQDrm)>; + +def HWWriteResGroup188 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 27; + let NumMicroOps = 20; + let ResourceCycles = [3,3,4,1,5,4]; +} +def: InstRW<[HWWriteResGroup188], (instrs VGATHERDPDYrm, + VPGATHERDQYrm)>; + +def HWWriteResGroup189 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 27; + let NumMicroOps = 34; + let ResourceCycles = [5,3,8,1,9,8]; +} +def: InstRW<[HWWriteResGroup189], (instrs VGATHERDPSYrm, + VPGATHERDDYrm)>; + +def HWWriteResGroup190 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 23; + let NumMicroOps = 14; + let ResourceCycles = [3,3,2,1,3,2]; +} +def: InstRW<[HWWriteResGroup190], (instrs VGATHERQPDrm, + VPGATHERQQrm)>; + +def HWWriteResGroup191 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 28; + let NumMicroOps = 15; + let ResourceCycles = [3,3,2,1,4,2]; +} +def: InstRW<[HWWriteResGroup191], (instrs VGATHERQPSYrm)>; + +def HWWriteResGroup192 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> { + let Latency = 25; + let NumMicroOps = 15; + let ResourceCycles = [3,3,2,1,4,2]; } -def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>; +def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm, + VGATHERDPSrm)>; } // SchedModel diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td index b8ec5883152c..4466d30f14c7 100644 --- a/lib/Target/X86/X86SchedSandyBridge.td +++ b/lib/Target/X86/X86SchedSandyBridge.td @@ -24,8 +24,8 @@ def SandyBridgeModel : SchedMachineModel { // Based on the LSD (loop-stream detector) queue size. let LoopMicroOpBufferSize = 28; - // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow - // the scheduler to assign a default model to unrecognized opcodes. + // This flag is set to allow the scheduler to assign + // a default model to unrecognized opcodes. let CompleteModel = 0; } @@ -48,6 +48,7 @@ def SBPort23 : ProcResource<2>; def SBPort4 : ProcResource<1>; // Many micro-ops are capable of issuing on multiple ports. 
+def SBPort01 : ProcResGroup<[SBPort0, SBPort1]>; def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>; def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>; def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>; @@ -115,10 +116,10 @@ def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> { // Scalar and vector floating point. defm : SBWriteResPair<WriteFAdd, SBPort1, 3>; defm : SBWriteResPair<WriteFMul, SBPort0, 5>; -defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles. +defm : SBWriteResPair<WriteFDiv, SBPort0, 24>; defm : SBWriteResPair<WriteFRcp, SBPort0, 5>; defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>; -defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>; +defm : SBWriteResPair<WriteFSqrt, SBPort0, 14>; defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>; defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>; defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>; @@ -134,11 +135,11 @@ def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> { } // Vector integer operations. -defm : SBWriteResPair<WriteVecShift, SBPort05, 1>; -defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>; -defm : SBWriteResPair<WriteVecALU, SBPort15, 1>; +defm : SBWriteResPair<WriteVecShift, SBPort5, 1>; +defm : SBWriteResPair<WriteVecLogic, SBPort5, 1>; +defm : SBWriteResPair<WriteVecALU, SBPort1, 3>; defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>; -defm : SBWriteResPair<WriteShuffle, SBPort15, 1>; +defm : SBWriteResPair<WriteShuffle, SBPort5, 1>; defm : SBWriteResPair<WriteBlend, SBPort15, 1>; def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> { let Latency = 2; @@ -148,13 +149,15 @@ def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> { let Latency = 6; let ResourceCycles = [1, 1, 1]; } -def : WriteRes<WriteMPSAD, [SBPort0, SBPort1, SBPort5]> { - let Latency = 6; - let ResourceCycles = [1, 1, 1]; +def : WriteRes<WriteMPSAD, [SBPort0,SBPort15]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; } -def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> { - let Latency = 6; - let ResourceCycles = [1, 1, 1, 1]; +def : WriteRes<WriteMPSADLd, [SBPort0,SBPort23,SBPort15]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; } //////////////////////////////////////////////////////////////////////////////// @@ -204,13 +207,15 @@ def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> { } // Packed Compare Implicit Length Strings, Return Index -def : WriteRes<WritePCmpIStrI, [SBPort015]> { - let Latency = 3; +def : WriteRes<WritePCmpIStrI, [SBPort0]> { + let Latency = 11; + let NumMicroOps = 3; let ResourceCycles = [3]; } -def : WriteRes<WritePCmpIStrILd, [SBPort015, SBPort23]> { - let Latency = 3; - let ResourceCycles = [3, 1]; +def : WriteRes<WritePCmpIStrILd, [SBPort0,SBPort23]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; } // Packed Compare Explicit Length Strings, Return Index @@ -224,22 +229,26 @@ def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> { } // AES Instructions. 
-def : WriteRes<WriteAESDecEnc, [SBPort015]> { - let Latency = 8; - let ResourceCycles = [2]; +def : WriteRes<WriteAESDecEnc, [SBPort5,SBPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; } -def : WriteRes<WriteAESDecEncLd, [SBPort015, SBPort23]> { - let Latency = 8; - let ResourceCycles = [2, 1]; +def : WriteRes<WriteAESDecEncLd, [SBPort5,SBPort23,SBPort015]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; } -def : WriteRes<WriteAESIMC, [SBPort015]> { - let Latency = 8; +def : WriteRes<WriteAESIMC, [SBPort5]> { + let Latency = 12; + let NumMicroOps = 2; let ResourceCycles = [2]; } -def : WriteRes<WriteAESIMCLd, [SBPort015, SBPort23]> { - let Latency = 8; - let ResourceCycles = [2, 1]; +def : WriteRes<WriteAESIMCLd, [SBPort5,SBPort23]> { + let Latency = 18; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; } def : WriteRes<WriteAESKeyGen, [SBPort015]> { @@ -267,9 +276,2583 @@ def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; } def : WriteRes<WriteFence, [SBPort23, SBPort4]>; def : WriteRes<WriteNop, []>; -// AVX2 is not supported on that architecture, but we should define the basic +// AVX2/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>; defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>; defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>; +defm : SBWriteResPair<WriteFMA, SBPort01, 5>; + +// Remaining SNB instrs. + +def SBWriteResGroup0 : SchedWriteRes<[SBPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup0], (instregex "CVTSS2SDrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSLLDri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSLLQri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSLLWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSRADri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSRAWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSRLDri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSRLQri")>; +def: InstRW<[SBWriteResGroup0], (instregex "PSRLWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VCVTSS2SDrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPMOVMSKBrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSLLDri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSLLQri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSLLWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSRADri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSRAWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSRLDri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSRLQri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VPSRLWri")>; +def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDYrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSYrr")>; +def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSrr")>; + +def SBWriteResGroup1 : SchedWriteRes<[SBPort1]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup1], (instregex "COMP_FST0r")>; +def: InstRW<[SBWriteResGroup1], (instregex "COM_FST0r")>; +def: InstRW<[SBWriteResGroup1], (instregex "UCOM_FPr")>; +def: InstRW<[SBWriteResGroup1], (instregex "UCOM_Fr")>; + +def SBWriteResGroup2 : SchedWriteRes<[SBPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup2], (instregex "ANDNPDrr")>; +def: 
InstRW<[SBWriteResGroup2], (instregex "ANDNPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "ANDPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "ANDPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "FDECSTP")>; +def: InstRW<[SBWriteResGroup2], (instregex "FFREE")>; +def: InstRW<[SBWriteResGroup2], (instregex "FINCSTP")>; +def: InstRW<[SBWriteResGroup2], (instregex "FNOP")>; +def: InstRW<[SBWriteResGroup2], (instregex "INSERTPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "JAE_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JAE_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JA_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JA_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JBE_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JBE_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JB_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JB_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JE_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JE_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JGE_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JGE_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JG_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JG_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JLE_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JLE_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JL_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JL_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JMP64r")>; +def: InstRW<[SBWriteResGroup2], (instregex "JMP_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JMP_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JNE_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JNE_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JNO_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JNO_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JNP_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JNP_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JNS_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JNS_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JO_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JO_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JP_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JP_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "JS_1")>; +def: InstRW<[SBWriteResGroup2], (instregex "JS_4")>; +def: InstRW<[SBWriteResGroup2], (instregex "LD_Frr")>; +def: InstRW<[SBWriteResGroup2], (instregex "LOOP")>; +def: InstRW<[SBWriteResGroup2], (instregex "LOOPE")>; +def: InstRW<[SBWriteResGroup2], (instregex "LOOPNE")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOV64toPQIrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVAPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVAPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVDDUPrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVDI2PDIrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVHLPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVLHPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVSDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVSHDUPrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVSLDUPrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVSSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVUPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "MOVUPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "ORPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "ORPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "RETQ")>; +def: InstRW<[SBWriteResGroup2], 
(instregex "SHUFPDrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "SHUFPSrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "ST_FPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "ST_Frr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "UNPCKHPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "UNPCKHPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "UNPCKLPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "UNPCKLPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDNPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDNPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDNPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDNPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VEXTRACTF128rr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VINSERTF128rr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VINSERTPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVDDUPYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVDDUPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVDI2PDIrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVHLPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSHDUPYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSHDUPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSLDUPYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSLDUPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VORPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VORPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VORPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VORPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERM2F128rr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDYri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSYri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPDYrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPDrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPSYrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPSrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPSYrr")>;
+def: InstRW<[SBWriteResGroup2],
(instregex "VUNPCKHPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPDYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPSYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VXORPDYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VXORPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VXORPSYrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "VXORPSrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "XORPDrr")>; +def: InstRW<[SBWriteResGroup2], (instregex "XORPSrr")>; + +def SBWriteResGroup3 : SchedWriteRes<[SBPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup3], (instregex "LEA(16|32|64)(_32)?r")>; + +def SBWriteResGroup4 : SchedWriteRes<[SBPort05]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup4], (instregex "BLENDPDrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "BLENDPSrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)ri8")>; +def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTC(16|32|64)ri8")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTC(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTR(16|32|64)ri8")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTR(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTS(16|32|64)ri8")>; +def: InstRW<[SBWriteResGroup4], (instregex "BTS(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup4], (instregex "CDQ")>; +def: InstRW<[SBWriteResGroup4], (instregex "CQO")>; +def: InstRW<[SBWriteResGroup4], (instregex "LAHF")>; +def: InstRW<[SBWriteResGroup4], (instregex "SAHF")>; +def: InstRW<[SBWriteResGroup4], (instregex "SAR(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "SAR8ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETAEr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETBr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETEr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETGEr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETGr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETLEr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETLr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETNEr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETNOr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETNPr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETNSr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETOr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETPr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SETSr")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHL(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHL(16|32|64)r1")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHL8r1")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHL8ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHR(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "SHR8ri")>; +def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPDYrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPDrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPSYrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPSrri")>; +def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQAYrr")>; +def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQArr")>; +def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQUYrr")>; +def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQUrr")>; + +def 
SBWriteResGroup5 : SchedWriteRes<[SBPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSBrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSDrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSWrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PADDQirr")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PALIGNR64irr")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSHUFBrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNBrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNDrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNWrr64")>; +def: InstRW<[SBWriteResGroup5], (instregex "PABSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PABSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PABSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PACKSSDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PACKSSWBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PACKUSDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PACKUSWBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDUSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDUSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PADDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PALIGNRrri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PAVGBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PAVGWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PBLENDWrri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXUBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXUDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMAXUWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINUBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINUDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMINUWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXWQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex 
"PMOVZXWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXWQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSHUFBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSHUFDri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSHUFHWri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSHUFLWri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSIGNBrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSIGNDrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSIGNWrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSLLDQri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSRLDQri")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBUSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBUSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PSUBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHQDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLQDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPABSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPABSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPABSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPACKSSDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPACKSSWBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPACKUSDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPACKUSWBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDUSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDUSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPADDWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPALIGNRrri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPAVGBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPAVGWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPBLENDWrri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINSBrr")>; +def: InstRW<[SBWriteResGroup5], 
(instregex "VPMINSDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINUBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINUDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMINUWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXWQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXWQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFDri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFHWri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFLWri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNBrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNDrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNWrr128")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSLLDQri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSRLDQri")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBUSBrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBUSWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPSUBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHQDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHWDrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLBWrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLQDQrr")>; +def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLWDrr")>; + +def SBWriteResGroup6 : SchedWriteRes<[SBPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup6], (instregex "ADD(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "ADD(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "ADD8i8")>; +def: InstRW<[SBWriteResGroup6], (instregex "ADD8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "ADD8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "AND(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "AND(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "AND8i8")>; +def: InstRW<[SBWriteResGroup6], (instregex "AND8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "AND8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "CBW")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMC")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMP(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMP(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMP8i8")>; +def: InstRW<[SBWriteResGroup6], 
(instregex "CMP8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "CMP8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "CWDE")>; +def: InstRW<[SBWriteResGroup6], (instregex "DEC(16|32|64)r")>; +def: InstRW<[SBWriteResGroup6], (instregex "DEC8r")>; +def: InstRW<[SBWriteResGroup6], (instregex "INC(16|32|64)r")>; +def: InstRW<[SBWriteResGroup6], (instregex "INC8r")>; +def: InstRW<[SBWriteResGroup6], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MMX_MOVQ2DQrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOV(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOV8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOV8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVDQArr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVDQUrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVPQI2QIrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr16")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr32")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr8")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVZX(16|32|64)rr16")>; +def: InstRW<[SBWriteResGroup6], (instregex "MOVZX(16|32|64)rr8")>; +def: InstRW<[SBWriteResGroup6], (instregex "NEG(16|32|64)r")>; +def: InstRW<[SBWriteResGroup6], (instregex "NEG8r")>; +def: InstRW<[SBWriteResGroup6], (instregex "NOT(16|32|64)r")>; +def: InstRW<[SBWriteResGroup6], (instregex "NOT8r")>; +def: InstRW<[SBWriteResGroup6], (instregex "OR(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "OR(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "OR8i8")>; +def: InstRW<[SBWriteResGroup6], (instregex "OR8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "OR8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "PANDNrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "PANDrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "PORrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "PXORrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "STC")>; +def: InstRW<[SBWriteResGroup6], (instregex "SUB(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "SUB(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "SUB8i8")>; +def: InstRW<[SBWriteResGroup6], (instregex "SUB8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "SUB8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "TEST(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "TEST8i8")>; +def: InstRW<[SBWriteResGroup6], (instregex "TEST8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "TEST8rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VMOVPQI2QIrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VMOVZPQILo2PQIrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VPANDNrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VPANDrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VPORrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "VPXORrr")>; +def: InstRW<[SBWriteResGroup6], (instregex "XOR(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "XOR(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup6], (instregex "XOR8i8")>; +def: InstRW<[SBWriteResGroup6], (instregex "XOR8ri")>; +def: InstRW<[SBWriteResGroup6], (instregex "XOR8rr")>; + +def SBWriteResGroup7 : SchedWriteRes<[SBPort0]> { + let Latency = 2; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup7], (instregex "MOVMSKPDrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "MOVMSKPSrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "MOVPDI2DIrr")>; +def: InstRW<[SBWriteResGroup7], (instregex 
"MOVPQIto64rr")>; +def: InstRW<[SBWriteResGroup7], (instregex "PMOVMSKBrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPDYrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPDrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPSYrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPSrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVPDI2DIrr")>; +def: InstRW<[SBWriteResGroup7], (instregex "VMOVPQIto64rr")>; + +def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPDrr0")>; +def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPSrr0")>; +def: InstRW<[SBWriteResGroup9], (instregex "ROL(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup9], (instregex "ROL8ri")>; +def: InstRW<[SBWriteResGroup9], (instregex "ROR(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup9], (instregex "ROR8ri")>; +def: InstRW<[SBWriteResGroup9], (instregex "SETAr")>; +def: InstRW<[SBWriteResGroup9], (instregex "SETBEr")>; +def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPDYrr")>; +def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPDrr")>; +def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPSYrr")>; +def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPSrr")>; + +def SBWriteResGroup10 : SchedWriteRes<[SBPort15]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SBWriteResGroup10], (instregex "VPBLENDVBrr")>; + +def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SBWriteResGroup11], (instregex "SCASB")>; +def: InstRW<[SBWriteResGroup11], (instregex "SCASL")>; +def: InstRW<[SBWriteResGroup11], (instregex "SCASQ")>; +def: InstRW<[SBWriteResGroup11], (instregex "SCASW")>; + +def SBWriteResGroup12 : SchedWriteRes<[SBPort0,SBPort1]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup12], (instregex "COMISDrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "COMISSrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "UCOMISDrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "UCOMISSrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "VCOMISDrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "VCOMISSrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "VUCOMISDrr")>; +def: InstRW<[SBWriteResGroup12], (instregex "VUCOMISSrr")>; + +def SBWriteResGroup13 : SchedWriteRes<[SBPort0,SBPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup13], (instregex "CVTPS2PDrr")>; +def: InstRW<[SBWriteResGroup13], (instregex "PTESTrr")>; +def: InstRW<[SBWriteResGroup13], (instregex "VCVTPS2PDYrr")>; +def: InstRW<[SBWriteResGroup13], (instregex "VCVTPS2PDrr")>; +def: InstRW<[SBWriteResGroup13], (instregex "VPTESTYrr")>; +def: InstRW<[SBWriteResGroup13], (instregex "VPTESTrr")>; + +def SBWriteResGroup14 : SchedWriteRes<[SBPort0,SBPort15]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup14], (instregex "PSLLDrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSLLQrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSLLWrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSRADrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSRAWrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSRLDrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "PSRLQrr")>; +def: InstRW<[SBWriteResGroup14], (instregex 
"PSRLWrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSLLDrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSLLQrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSLLWrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSRADrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSRAWrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSRLDrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSRLQrr")>; +def: InstRW<[SBWriteResGroup14], (instregex "VPSRLWrr")>; + +def SBWriteResGroup15 : SchedWriteRes<[SBPort0,SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup15], (instregex "CWD")>; +def: InstRW<[SBWriteResGroup15], (instregex "FNSTSW16r")>; + +def SBWriteResGroup16 : SchedWriteRes<[SBPort1,SBPort05]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup16], (instregex "BSWAP(16|32|64)r")>; + +def SBWriteResGroup17 : SchedWriteRes<[SBPort5,SBPort15]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup17], (instregex "PINSRBrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "PINSRDrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "PINSRQrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "PINSRWrri")>; +def: InstRW<[SBWriteResGroup17], (instregex "VPINSRBrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "VPINSRDrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "VPINSRQrr")>; +def: InstRW<[SBWriteResGroup17], (instregex "VPINSRWrri")>; + +def SBWriteResGroup18 : SchedWriteRes<[SBPort5,SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup18], (instregex "JRCXZ")>; +def: InstRW<[SBWriteResGroup18], (instregex "MMX_MOVDQ2Qrr")>; + +def SBWriteResGroup19 : SchedWriteRes<[SBPort05,SBPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup19], (instregex "ADC(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup19], (instregex "ADC(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "ADC8ri")>; +def: InstRW<[SBWriteResGroup19], (instregex "ADC8rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVAE(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVB(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVE(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVG(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVGE(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVL(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVLE(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVNE(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVNO(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVNP(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVNS(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVO(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVP(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "CMOVS(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "SBB(16|32|64)ri")>; +def: InstRW<[SBWriteResGroup19], (instregex "SBB(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "SBB8ri")>; +def: InstRW<[SBWriteResGroup19], (instregex "SBB8rr")>; +def: InstRW<[SBWriteResGroup19], (instregex "SHLD(16|32|64)rri8")>; +def: InstRW<[SBWriteResGroup19], (instregex "SHRD(16|32|64)rri8")>; + +def 
SBWriteResGroup20 : SchedWriteRes<[SBPort0]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMADDUBSWrr64")>; +def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMULHRSWrr64")>; +def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMULUDQirr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMADDUBSWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMADDWDrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULDQrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULHRSWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULHUWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULHWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULLDrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULLWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PMULUDQrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "PSADBWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMADDUBSWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMADDWDrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULDQrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULHRSWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULHUWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULHWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULLDrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULLWrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPMULUDQrr")>; +def: InstRW<[SBWriteResGroup20], (instregex "VPSADBWrr")>; + +def SBWriteResGroup21 : SchedWriteRes<[SBPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup21], (instregex "ADDPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADDPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADDSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADDSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADDSUBPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADDSUBPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADD_FPrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADD_FST0r")>; +def: InstRW<[SBWriteResGroup21], (instregex "ADD_FrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "BSF(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup21], (instregex "BSR(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup21], (instregex "CMPPDrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "CMPPSrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "CMPSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "CMPSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "CRC32r(16|32|64)r8")>; +def: InstRW<[SBWriteResGroup21], (instregex "CRC32r(16|32|64)r64")>; +def: InstRW<[SBWriteResGroup21], (instregex "CVTDQ2PSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "CVTPS2DQrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "CVTTPS2DQrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)PDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)PSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)SDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)SSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)PDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)PSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)SDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)SSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPI2PSirr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPS2PIirr")>; +def: InstRW<[SBWriteResGroup21], (instregex 
"MMX_CVTTPS2PIirr")>; +def: InstRW<[SBWriteResGroup21], (instregex "MUL8r")>; +def: InstRW<[SBWriteResGroup21], (instregex "POPCNT(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup21], (instregex "PUSHFS64")>; +def: InstRW<[SBWriteResGroup21], (instregex "ROUNDPDr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ROUNDPSr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ROUNDSDr")>; +def: InstRW<[SBWriteResGroup21], (instregex "ROUNDSSr")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FPrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FST0r")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUBSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUB_FPrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUB_FST0r")>; +def: InstRW<[SBWriteResGroup21], (instregex "SUB_FrST0")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDPDYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDPSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPDYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPPDYrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPPDrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPPSYrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPPSrri")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCMPSSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTDQ2PSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTDQ2PSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTPS2DQYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTPS2DQrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTTPS2DQYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VCVTTPS2DQrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PDYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)SDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)SSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PDYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)SDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)SSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VROUNDPDr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VROUNDPSr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VROUNDSDr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VROUNDSSr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VROUNDYPDr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VROUNDYPSr")>; +def: InstRW<[SBWriteResGroup21], (instregex 
"VSUBPDYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VSUBPDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSYrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VSUBSDrr")>; +def: InstRW<[SBWriteResGroup21], (instregex "VSUBSSrr")>; + +def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup22], (instregex "EXTRACTPSrr")>; +def: InstRW<[SBWriteResGroup22], (instregex "VEXTRACTPSrr")>; + +def SBWriteResGroup23 : SchedWriteRes<[SBPort0,SBPort15]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup23], (instregex "PEXTRBrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "PEXTRDrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "PEXTRQrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "PEXTRWri")>; +def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRBrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRDrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRQrr")>; +def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRWri")>; + +def SBWriteResGroup23_2 : SchedWriteRes<[SBPort05]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SBWriteResGroup23_2], (instregex "ROL(16|32|64)rCL")>; +def: InstRW<[SBWriteResGroup23_2], (instregex "ROL8rCL")>; +def: InstRW<[SBWriteResGroup23_2], (instregex "ROR(16|32|64)rCL")>; +def: InstRW<[SBWriteResGroup23_2], (instregex "ROR8rCL")>; +def: InstRW<[SBWriteResGroup23_2], (instregex "SAR(16|32|64)rCL")>; +def: InstRW<[SBWriteResGroup23_2], (instregex "SAR8rCL")>; +def: InstRW<[SBWriteResGroup23_2], (instregex "SHL(16|32|64)rCL")>; +def: InstRW<[SBWriteResGroup23_2], (instregex "SHL8rCL")>; +def: InstRW<[SBWriteResGroup23_2], (instregex "SHR(16|32|64)rCL")>; +def: InstRW<[SBWriteResGroup23_2], (instregex "SHR8rCL")>; + +def SBWriteResGroup24 : SchedWriteRes<[SBPort15]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDSWrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDWrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBDrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBSWrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBWrr64")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHADDDrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHADDSWrr128")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHADDWrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHSUBDrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHSUBSWrr128")>; +def: InstRW<[SBWriteResGroup24], (instregex "PHSUBWrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHADDDrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHADDSWrr128")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHADDWrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBDrr")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBSWrr128")>; +def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBWrr")>; + +def SBWriteResGroup25 : SchedWriteRes<[SBPort015]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SBWriteResGroup25], (instregex "ADC8i8")>; +def: InstRW<[SBWriteResGroup25], (instregex "LEAVE64")>; +def: InstRW<[SBWriteResGroup25], (instregex "OUT32rr")>; +def: InstRW<[SBWriteResGroup25], 
(instregex "OUT8rr")>; +def: InstRW<[SBWriteResGroup25], (instregex "SBB8i8")>; +def: InstRW<[SBWriteResGroup25], (instregex "XADD(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup25], (instregex "XADD8rr")>; + +def SBWriteResGroup25_2 : SchedWriteRes<[SBPort5,SBPort05]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVBE_F")>; +def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVB_F")>; +def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVE_F")>; +def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNBE_F")>; +def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNB_F")>; +def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNE_F")>; +def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNP_F")>; +def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVP_F")>; + +def SBWriteResGroup26 : SchedWriteRes<[SBPort05,SBPort015]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup26], (instregex "CMOVA(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup26], (instregex "CMOVBE(16|32|64)rr")>; + +def SBWriteResGroup26_2 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup26_2], (instregex "COM_FIPr")>; +def: InstRW<[SBWriteResGroup26_2], (instregex "COM_FIr")>; +def: InstRW<[SBWriteResGroup26_2], (instregex "UCOM_FIPr")>; +def: InstRW<[SBWriteResGroup26_2], (instregex "UCOM_FIr")>; + +def SBWriteResGroup27 : SchedWriteRes<[SBPort0,SBPort1]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup27], (instregex "MUL(16|32|64)r")>; + +def SBWriteResGroup28 : SchedWriteRes<[SBPort1,SBPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup28], (instregex "CVTDQ2PDrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTPD2DQrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTPD2PSrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTSD2SSrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTSI642SDrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTSI2SDrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "CVTTPD2DQrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTPD2PIirr")>; +def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTPI2PDirr")>; +def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTTPD2PIirr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTDQ2PDYrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTDQ2PDrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2DQYrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2DQrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2PSYrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2PSrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTSD2SSrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTSI642SDrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTSI2SDrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTTPD2DQYrr")>; +def: InstRW<[SBWriteResGroup28], (instregex "VCVTTPD2DQrr")>; + +def SBWriteResGroup29 : SchedWriteRes<[SBPort1,SBPort015]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup29], (instregex "MOV64sr")>; + +def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup29_2], (instregex 
"OUT32ir")>; +def: InstRW<[SBWriteResGroup29_2], (instregex "OUT8ir")>; +def: InstRW<[SBWriteResGroup29_2], (instregex "PAUSE")>; + +def SBWriteResGroup29_3 : SchedWriteRes<[SBPort05,SBPort015]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} +def: InstRW<[SBWriteResGroup29_3], (instregex "SHLD(16|32|64)rrCL")>; +def: InstRW<[SBWriteResGroup29_3], (instregex "SHRD(16|32|64)rrCL")>; + +def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup30], (instregex "MULPDrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "MULPSrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "MULSDrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "MULSSrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "MUL_FPrST0")>; +def: InstRW<[SBWriteResGroup30], (instregex "MUL_FST0r")>; +def: InstRW<[SBWriteResGroup30], (instregex "MUL_FrST0")>; +def: InstRW<[SBWriteResGroup30], (instregex "PCMPGTQrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "PHMINPOSUWrr128")>; +def: InstRW<[SBWriteResGroup30], (instregex "RCPPSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "RCPSSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "RSQRTPSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "RSQRTSSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULPDYrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULPDrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULPSYrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULPSrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULSDrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VMULSSrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VPCMPGTQrr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VPHMINPOSUWrr128")>; +def: InstRW<[SBWriteResGroup30], (instregex "VRCPPSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VRCPSSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VRSQRTPSr")>; +def: InstRW<[SBWriteResGroup30], (instregex "VRSQRTSSr")>; + +def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup31], (instregex "MOV(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOV8rm")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm16")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm32")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm8")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOVZX(16|32|64)rm16")>; +def: InstRW<[SBWriteResGroup31], (instregex "MOVZX(16|32|64)rm8")>; +def: InstRW<[SBWriteResGroup31], (instregex "PREFETCH")>; + +def SBWriteResGroup32 : SchedWriteRes<[SBPort0,SBPort1]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup32], (instregex "CVTSD2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTSD2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTSS2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTSS2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTTSD2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTTSD2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTTSS2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "CVTTSS2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTSD2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTSD2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTSS2SI64rr")>; +def: InstRW<[SBWriteResGroup32], 
(instregex "VCVTSS2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSD2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSD2SIrr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSS2SI64rr")>; +def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSS2SIrr")>; + +def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup33], (instregex "MOV(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOV8mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVAPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVAPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVDQAmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVDQUmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVHPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVHPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVLPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVLPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVNTDQmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVNTI_64mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVNTImr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVNTPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVNTPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVPDI2DImr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVPQI2QImr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVPQIto64mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVSDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVSSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVUPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "MOVUPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "PUSH64i8")>; +def: InstRW<[SBWriteResGroup33], (instregex "PUSH(16|32|64)r")>; +def: InstRW<[SBWriteResGroup33], (instregex "VEXTRACTF128mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPDYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPSYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQAYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQAmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQUYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQUmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVHPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVHPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVLPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVLPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTDQYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTDQmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPDYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPSYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVPDI2DImr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVPQI2QImr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVPQIto64mr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVSDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVSSmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPDYmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPDmr")>; +def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPSYmr")>; +def: InstRW<[SBWriteResGroup33], 
(instregex "VMOVUPSmr")>; + +def SBWriteResGroup34 : SchedWriteRes<[SBPort0,SBPort15]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup34], (instregex "MPSADBWrri")>; +def: InstRW<[SBWriteResGroup34], (instregex "VMPSADBWrri")>; + +def SBWriteResGroup35 : SchedWriteRes<[SBPort1,SBPort5]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup35], (instregex "CLI")>; +def: InstRW<[SBWriteResGroup35], (instregex "CVTSI642SSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "CVTSI2SSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "HADDPDrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "HADDPSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "HSUBPDrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "HSUBPSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VCVTSI642SSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VCVTSI2SSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHADDPDYrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHADDPDrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHADDPSYrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHADDPSrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPDYrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPDrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPSYrr")>; +def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPSrr")>; + +def SBWriteResGroup35_2 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP16m")>; +def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP32m")>; +def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP64m")>; +def: InstRW<[SBWriteResGroup35_2], (instregex "PUSHGS64")>; + +def SBWriteResGroup36 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup36], (instregex "CALL64pcrel32")>; +def: InstRW<[SBWriteResGroup36], (instregex "CALL(16|32|64)r")>; +def: InstRW<[SBWriteResGroup36], (instregex "EXTRACTPSmr")>; +def: InstRW<[SBWriteResGroup36], (instregex "VEXTRACTPSmr")>; + +def SBWriteResGroup37 : SchedWriteRes<[SBPort4,SBPort01,SBPort23]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPDYmr")>; +def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPDmr")>; +def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPSYmr")>; +def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPSmr")>; + +def SBWriteResGroup38 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup38], (instregex "SETAEm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETBm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETEm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETGEm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETGm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETLEm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETLm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETNEm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETNOm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETNPm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETNSm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETOm")>; +def: InstRW<[SBWriteResGroup38], (instregex "SETPm")>; +def: 
InstRW<[SBWriteResGroup38], (instregex "SETSm")>; + +def SBWriteResGroup39 : SchedWriteRes<[SBPort4,SBPort23,SBPort15]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup39], (instregex "PEXTRBmr")>; +def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRBmr")>; +def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRDmr")>; +def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRWmr")>; + +def SBWriteResGroup40 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup40], (instregex "MOV8mi")>; +def: InstRW<[SBWriteResGroup40], (instregex "STOSB")>; +def: InstRW<[SBWriteResGroup40], (instregex "STOSL")>; +def: InstRW<[SBWriteResGroup40], (instregex "STOSQ")>; +def: InstRW<[SBWriteResGroup40], (instregex "STOSW")>; + +def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup41], (instregex "FNINIT")>; + +def SBWriteResGroup42 : SchedWriteRes<[SBPort05,SBPort015]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG(16|32|64)rr")>; +def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG8rr")>; + +def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup43], (instregex "SETAm")>; +def: InstRW<[SBWriteResGroup43], (instregex "SETBEm")>; + +def SBWriteResGroup44 : SchedWriteRes<[SBPort0,SBPort4,SBPort5,SBPort23]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup44], (instregex "LDMXCSR")>; +def: InstRW<[SBWriteResGroup44], (instregex "STMXCSR")>; +def: InstRW<[SBWriteResGroup44], (instregex "VLDMXCSR")>; +def: InstRW<[SBWriteResGroup44], (instregex "VSTMXCSR")>; + +def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup45], (instregex "PEXTRDmr")>; +def: InstRW<[SBWriteResGroup45], (instregex "PEXTRQmr")>; +def: InstRW<[SBWriteResGroup45], (instregex "VPEXTRQmr")>; +def: InstRW<[SBWriteResGroup45], (instregex "PUSHF16")>; +def: InstRW<[SBWriteResGroup45], (instregex "PUSHF64")>; + +def SBWriteResGroup46 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup46], (instregex "CLFLUSH")>; + +def SBWriteResGroup47 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SBWriteResGroup47], (instregex "FXRSTOR")>; + +def SBWriteResGroup48 : SchedWriteRes<[SBPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup48], (instregex "LDDQUrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOV64toPQIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVAPDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVAPSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVDDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVDI2PDIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVDQArm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVDQUrm")>; 
+def: InstRW<[SBWriteResGroup48], (instregex "MOVNTDQArm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVQI2PQIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVSDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVSHDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVSLDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVSSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVUPDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "MOVUPSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "POP(16|32|64)r")>; +def: InstRW<[SBWriteResGroup48], (instregex "VBROADCASTSSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VLDDQUYrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VLDDQUrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOV64toPQIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVAPDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVAPSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVDDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVDI2PDIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVDQArm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVDQUrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVNTDQArm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVQI2PQIrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVSDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVSHDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVSLDUPrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVSSrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVUPDrm")>; +def: InstRW<[SBWriteResGroup48], (instregex "VMOVUPSrm")>; + +def SBWriteResGroup49 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup49], (instregex "JMP(16|32|64)m")>; +def: InstRW<[SBWriteResGroup49], (instregex "MOV16sm")>; + +def SBWriteResGroup50 : SchedWriteRes<[SBPort23,SBPort05]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup50], (instregex "BT(16|32|64)mi8")>; + +def SBWriteResGroup51 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSBrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSDrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSWrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PALIGNR64irm")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSHUFBrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNBrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNDrm64")>; +def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNWrm64")>; + +def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup52], (instregex "ADD(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "ADD8rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "AND(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "AND8rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP8mi")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP8mr")>; +def: InstRW<[SBWriteResGroup52], (instregex "CMP8rm")>; +def: 
InstRW<[SBWriteResGroup52], (instregex "LODSL")>; +def: InstRW<[SBWriteResGroup52], (instregex "LODSQ")>; +def: InstRW<[SBWriteResGroup52], (instregex "OR(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "OR8rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "SUB(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "SUB8rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "XOR(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup52], (instregex "XOR8rm")>; + +def SBWriteResGroup53 : SchedWriteRes<[SBPort4,SBPort23]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup53], (instregex "ST_F32m")>; +def: InstRW<[SBWriteResGroup53], (instregex "ST_F64m")>; +def: InstRW<[SBWriteResGroup53], (instregex "ST_FP32m")>; +def: InstRW<[SBWriteResGroup53], (instregex "ST_FP64m")>; +def: InstRW<[SBWriteResGroup53], (instregex "ST_FP80m")>; + +def SBWriteResGroup54 : SchedWriteRes<[SBPort23]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSDYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSSYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVAPDYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVAPSYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVDDUPYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVDQAYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVDQUYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVSHDUPYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVSLDUPYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVUPDYrm")>; +def: InstRW<[SBWriteResGroup54], (instregex "VMOVUPSYrm")>; + +def SBWriteResGroup55 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup55], (instregex "CVTPS2PDrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "CVTSS2SDrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "VCVTPS2PDYrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "VCVTPS2PDrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "VCVTSS2SDrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "VTESTPDrm")>; +def: InstRW<[SBWriteResGroup55], (instregex "VTESTPSrm")>; + +def SBWriteResGroup56 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup56], (instregex "ANDNPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "ANDNPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "ANDPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "ANDPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "INSERTPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "MOVHPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "MOVHPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "MOVLPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "MOVLPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "ORPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "ORPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "SHUFPDrmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "SHUFPSrmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "UNPCKHPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "UNPCKHPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "UNPCKLPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "UNPCKLPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VANDNPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VANDNPSrm")>; 
+def: InstRW<[SBWriteResGroup56], (instregex "VANDPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VANDPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VBROADCASTF128")>; +def: InstRW<[SBWriteResGroup56], (instregex "VINSERTPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VMOVHPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VMOVHPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VMOVLPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VMOVLPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VORPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VORPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPDmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPSmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VSHUFPDrmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "VSHUFPSrmi")>; +def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKHPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKHPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKLPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKLPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VXORPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "VXORPSrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "XORPDrm")>; +def: InstRW<[SBWriteResGroup56], (instregex "XORPSrm")>; + +def SBWriteResGroup57 : SchedWriteRes<[SBPort5,SBPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup57], (instregex "AESDECLASTrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "AESDECrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "AESENCLASTrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "AESENCrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "VAESDECLASTrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "VAESDECrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "VAESENCLASTrr")>; +def: InstRW<[SBWriteResGroup57], (instregex "VAESENCrr")>; + +def SBWriteResGroup58 : SchedWriteRes<[SBPort23,SBPort05]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup58], (instregex "BLENDPDrmi")>; +def: InstRW<[SBWriteResGroup58], (instregex "BLENDPSrmi")>; +def: InstRW<[SBWriteResGroup58], (instregex "VBLENDPDrmi")>; +def: InstRW<[SBWriteResGroup58], (instregex "VBLENDPSrmi")>; +def: InstRW<[SBWriteResGroup58], (instregex "VINSERTF128rm")>; + +def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup59], (instregex "MMX_PADDQirm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PABSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PABSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PABSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PACKSSDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PACKSSWBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PACKUSDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PACKUSWBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDUSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex 
"PADDUSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PADDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PALIGNRrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PAVGBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PAVGWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PBLENDWrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PINSRBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PINSRDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PINSRQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PINSRWrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXUBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXUDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMAXUWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINUBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINUDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMINUWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXWQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXWQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSHUFBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSHUFDmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSHUFHWmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSHUFLWmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSIGNBrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSIGNDrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSIGNWrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBUSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBUSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PSUBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHQDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHWDrm")>; 
+def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLQDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPABSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPABSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPABSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPACKSSDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPACKSSWBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPACKUSDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPACKUSWBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDUSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDUSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPADDWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPALIGNRrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPAVGBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPAVGWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPBLENDWrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPINSRBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPINSRDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPINSRQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPINSRWrmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINSDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINUBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINUDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMINUWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXWQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXWQrm")>; +def: 
InstRW<[SBWriteResGroup59], (instregex "VPSHUFBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFDmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFHWmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFLWmi")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNBrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNDrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNWrm128")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBUSBrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBUSWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPSUBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHQDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHWDrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLBWrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLQDQrm")>; +def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLWDrm")>; + +def SBWriteResGroup60 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup60], (instregex "PANDNrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "PANDrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "PORrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "PXORrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "VPANDNrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "VPANDrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "VPORrm")>; +def: InstRW<[SBWriteResGroup60], (instregex "VPXORrm")>; + +def SBWriteResGroup61 : SchedWriteRes<[SBPort0,SBPort05]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup61], (instregex "VRCPPSYr")>; +def: InstRW<[SBWriteResGroup61], (instregex "VRSQRTPSYr")>; + +def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup62], (instregex "VERRm")>; +def: InstRW<[SBWriteResGroup62], (instregex "VERWm")>; + +def SBWriteResGroup63 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup63], (instregex "LODSB")>; +def: InstRW<[SBWriteResGroup63], (instregex "LODSW")>; + +def SBWriteResGroup64 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup64], (instregex "FARJMP64")>; + +def SBWriteResGroup65 : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup65], (instregex "ADC(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "ADC8rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVAE(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVB(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVE(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVG(16|32|64)rm")>; +def: 
InstRW<[SBWriteResGroup65], (instregex "CMOVGE(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVL(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVLE(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVNE(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVNO(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVNP(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVNS(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVO(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVP(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "CMOVS(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "SBB(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup65], (instregex "SBB8rm")>; + +def SBWriteResGroup66 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup66], (instregex "FNSTSWm")>; + +def SBWriteResGroup67 : SchedWriteRes<[SBPort1,SBPort5,SBPort015]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup67], (instregex "SLDT(16|32|64)r")>; +def: InstRW<[SBWriteResGroup67], (instregex "STR(16|32|64)r")>; + +def SBWriteResGroup68 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup68], (instregex "CALL(16|32|64)m")>; +def: InstRW<[SBWriteResGroup68], (instregex "FNSTCW16m")>; + +def SBWriteResGroup69 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup69], (instregex "BTC(16|32|64)mi8")>; +def: InstRW<[SBWriteResGroup69], (instregex "BTR(16|32|64)mi8")>; +def: InstRW<[SBWriteResGroup69], (instregex "BTS(16|32|64)mi8")>; +def: InstRW<[SBWriteResGroup69], (instregex "SAR(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup69], (instregex "SAR8mi")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHL(16|32|64)m1")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHL(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHL8m1")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHL8mi")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHR(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup69], (instregex "SHR8mi")>; + +def SBWriteResGroup70 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 7; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup70], (instregex "ADD(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "ADD(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "ADD8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "ADD8mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "AND(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "AND(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "AND8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "AND8mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "DEC(16|32|64)m")>; +def: InstRW<[SBWriteResGroup70], (instregex "DEC8m")>; +def: InstRW<[SBWriteResGroup70], (instregex "INC(16|32|64)m")>; +def: InstRW<[SBWriteResGroup70], (instregex "INC8m")>; +def: InstRW<[SBWriteResGroup70], (instregex "NEG(16|32|64)m")>; +def: InstRW<[SBWriteResGroup70], (instregex "NEG8m")>; +def: InstRW<[SBWriteResGroup70], (instregex "NOT(16|32|64)m")>; +def: InstRW<[SBWriteResGroup70], (instregex "NOT8m")>; +def: 
InstRW<[SBWriteResGroup70], (instregex "OR(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "OR(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "OR8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "OR8mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "SUB(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "SUB(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "SUB8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "SUB8mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "TEST(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "TEST8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "TEST8mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "XOR(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "XOR(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup70], (instregex "XOR8mi")>; +def: InstRW<[SBWriteResGroup70], (instregex "XOR8mr")>; + +def SBWriteResGroup71 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup71], (instregex "MMX_PMADDUBSWrm64")>; +def: InstRW<[SBWriteResGroup71], (instregex "MMX_PMULHRSWrm64")>; +def: InstRW<[SBWriteResGroup71], (instregex "VTESTPDYrm")>; +def: InstRW<[SBWriteResGroup71], (instregex "VTESTPSYrm")>; + +def SBWriteResGroup72 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup72], (instregex "BSF(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup72], (instregex "BSR(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup72], (instregex "CRC32r(16|32|64)m64")>; +def: InstRW<[SBWriteResGroup72], (instregex "CRC32r(16|32|64)m8")>; +def: InstRW<[SBWriteResGroup72], (instregex "FCOM32m")>; +def: InstRW<[SBWriteResGroup72], (instregex "FCOM64m")>; +def: InstRW<[SBWriteResGroup72], (instregex "FCOMP32m")>; +def: InstRW<[SBWriteResGroup72], (instregex "FCOMP64m")>; +def: InstRW<[SBWriteResGroup72], (instregex "MUL8m")>; + +def SBWriteResGroup73 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup73], (instregex "VANDNPDYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VANDNPSYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VANDPDYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VANDPSYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VORPDYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VORPSYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VPERM2F128rm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPDYmi")>; +def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPDYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPSYmi")>; +def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPSYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VSHUFPDYrmi")>; +def: InstRW<[SBWriteResGroup73], (instregex "VSHUFPSYrmi")>; +def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKHPDYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKHPSYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKLPDYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKLPSYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VXORPDYrm")>; +def: InstRW<[SBWriteResGroup73], (instregex "VXORPSYrm")>; + +def SBWriteResGroup74 : SchedWriteRes<[SBPort23,SBPort05]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup74], (instregex "VBLENDPDYrmi")>; +def: InstRW<[SBWriteResGroup74], 
(instregex "VBLENDPSYrmi")>; + +def SBWriteResGroup75 : SchedWriteRes<[SBPort23,SBPort05]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPDrm0")>; +def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPSrm0")>; +def: InstRW<[SBWriteResGroup75], (instregex "VBLENDVPDrm")>; +def: InstRW<[SBWriteResGroup75], (instregex "VBLENDVPSrm")>; +def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPDrm")>; +def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPSrm")>; + +def SBWriteResGroup76 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup76], (instregex "PBLENDVBrr0")>; +def: InstRW<[SBWriteResGroup76], (instregex "VPBLENDVBrm")>; + +def SBWriteResGroup77 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup77], (instregex "COMISDrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "COMISSrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "UCOMISDrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "UCOMISSrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "VCOMISDrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "VCOMISSrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "VUCOMISDrm")>; +def: InstRW<[SBWriteResGroup77], (instregex "VUCOMISSrm")>; + +def SBWriteResGroup78 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup78], (instregex "PTESTrm")>; +def: InstRW<[SBWriteResGroup78], (instregex "VPTESTrm")>; + +def SBWriteResGroup79 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup79], (instregex "PSLLDrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSLLQrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSLLWrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSRADrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSRAWrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSRLDrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSRLQrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "PSRLWrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSLLDrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSLLQrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSLLWrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSRADrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSRAWrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSRLDrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSRLQrm")>; +def: InstRW<[SBWriteResGroup79], (instregex "VPSRLWrm")>; + +def SBWriteResGroup80 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDSWrm64")>; +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDWrm64")>; +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDrm64")>; +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBDrm64")>; +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBSWrm64")>; +def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBWrm64")>; + +def SBWriteResGroup81 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG(16|32|64)rm")>; +def: 
InstRW<[SBWriteResGroup81], (instregex "CMPXCHG8rm")>; + +def SBWriteResGroup82 : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup82], (instregex "CMOVA(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup82], (instregex "CMOVBE(16|32|64)rm")>; + +def SBWriteResGroup83 : SchedWriteRes<[SBPort23,SBPort015]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[SBWriteResGroup83], (instregex "CMPSB")>; +def: InstRW<[SBWriteResGroup83], (instregex "CMPSL")>; +def: InstRW<[SBWriteResGroup83], (instregex "CMPSQ")>; +def: InstRW<[SBWriteResGroup83], (instregex "CMPSW")>; + +def SBWriteResGroup84 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,2]; +} +def: InstRW<[SBWriteResGroup84], (instregex "FLDCW16m")>; + +def SBWriteResGroup85 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,2]; +} +def: InstRW<[SBWriteResGroup85], (instregex "ROL(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup85], (instregex "ROL8mi")>; +def: InstRW<[SBWriteResGroup85], (instregex "ROR(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup85], (instregex "ROR8mi")>; + +def SBWriteResGroup86 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,2]; +} +def: InstRW<[SBWriteResGroup86], (instregex "MOVSB")>; +def: InstRW<[SBWriteResGroup86], (instregex "MOVSL")>; +def: InstRW<[SBWriteResGroup86], (instregex "MOVSQ")>; +def: InstRW<[SBWriteResGroup86], (instregex "MOVSW")>; +def: InstRW<[SBWriteResGroup86], (instregex "XADD(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup86], (instregex "XADD8rm")>; + +def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SBWriteResGroup87], (instregex "FARCALL64")>; + +def SBWriteResGroup88 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SBWriteResGroup88], (instregex "SHLD(16|32|64)mri8")>; +def: InstRW<[SBWriteResGroup88], (instregex "SHRD(16|32|64)mri8")>; + +def SBWriteResGroup89 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup89], (instregex "MMX_PMULUDQirm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMADDUBSWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMADDWDrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULDQrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULHRSWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULHUWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULHWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULLDrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULLWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PMULUDQrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "PSADBWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMADDUBSWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMADDWDrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULDQrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULHRSWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULHUWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULHWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULLDrm")>; 
+def: InstRW<[SBWriteResGroup89], (instregex "VPMULLWrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPMULUDQrm")>; +def: InstRW<[SBWriteResGroup89], (instregex "VPSADBWrm")>; + +def SBWriteResGroup90 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup90], (instregex "ADDPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ADDPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ADDSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ADDSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ADDSUBPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ADDSUBPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CMPPDrmi")>; +def: InstRW<[SBWriteResGroup90], (instregex "CMPPSrmi")>; +def: InstRW<[SBWriteResGroup90], (instregex "CMPSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CMPSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CVTDQ2PSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CVTPS2DQrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CVTSI642SDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CVTSI2SDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "CVTTPS2DQrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)PDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)PSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)SDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)SSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)PDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)PSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)SDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)SSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTPI2PSirm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTPS2PIirm")>; +def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTTPS2PIirm")>; +def: InstRW<[SBWriteResGroup90], (instregex "POPCNT(16|32|64)rm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ROUNDPDm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ROUNDPSm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ROUNDSDm")>; +def: InstRW<[SBWriteResGroup90], (instregex "ROUNDSSm")>; +def: InstRW<[SBWriteResGroup90], (instregex "SUBPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "SUBPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "SUBSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "SUBSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDSUBPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VADDSUBPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCMPPDrmi")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCMPPSrmi")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCMPSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCMPSSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCVTDQ2PSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCVTPS2DQrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCVTSI642SDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCVTSI2SDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VCVTTPS2DQrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)PDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)PSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)SDrm")>; +def: 
InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)SSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)PDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)PSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)SDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)SSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VROUNDPDm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VROUNDPSm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VROUNDSDm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VROUNDSSm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VSUBPDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VSUBPSrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VSUBSDrm")>; +def: InstRW<[SBWriteResGroup90], (instregex "VSUBSSrm")>; + +def SBWriteResGroup91 : SchedWriteRes<[SBPort23,SBPort05]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SBWriteResGroup91], (instregex "VBLENDVPDYrm")>; +def: InstRW<[SBWriteResGroup91], (instregex "VBLENDVPSYrm")>; +def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPDYrm")>; +def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPSYrm")>; + +def SBWriteResGroup92 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup92], (instregex "DPPDrri")>; +def: InstRW<[SBWriteResGroup92], (instregex "VDPPDrri")>; + +def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup93], (instregex "CVTSD2SI64rm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTSD2SIrm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTSS2SI64rm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTSS2SIrm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTTSD2SI64rm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTTSD2SIrm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTTSS2SI64rm")>; +def: InstRW<[SBWriteResGroup93], (instregex "CVTTSS2SIrm")>; +def: InstRW<[SBWriteResGroup93], (instregex "MUL(16|32|64)m")>; + +def SBWriteResGroup94 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup94], (instregex "VPTESTYrm")>; + +def SBWriteResGroup95 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup95], (instregex "LD_F32m")>; +def: InstRW<[SBWriteResGroup95], (instregex "LD_F64m")>; +def: InstRW<[SBWriteResGroup95], (instregex "LD_F80m")>; + +def SBWriteResGroup96 : SchedWriteRes<[SBPort23,SBPort15]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SBWriteResGroup96], (instregex "PHADDDrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "PHADDSWrm128")>; +def: InstRW<[SBWriteResGroup96], (instregex "PHADDWrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "PHSUBDrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "PHSUBSWrm128")>; +def: InstRW<[SBWriteResGroup96], (instregex "PHSUBWrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHADDDrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHADDSWrm128")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHADDWrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBDrm")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBSWrm128")>; +def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBWrm")>; 
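Each group in this patch follows the same TableGen pattern: a SchedWriteRes record names the Sandy Bridge ports an instruction occupies, its estimated Latency in cycles, the number of decoded micro-ops, and, positionally matching the port list, how many cycles it holds each port; a def: InstRW line then binds that record to opcodes by regular expression. A minimal sketch of the pattern follows, with a hypothetical group name and opcode regex used purely for illustration (they do not appear in the patch):

  // Hypothetical example of the pattern used throughout this file:
  // a 3-cycle, 2-uop write that occupies SBPort1 and SBPort5 for one
  // cycle each, bound to a (made-up) opcode by regex.
  def SBWriteResGroupExample : SchedWriteRes<[SBPort1,SBPort5]> {
    let Latency = 3;            // estimated result latency in cycles
    let NumMicroOps = 2;        // micro-ops decoded for the instruction
    let ResourceCycles = [1,1]; // cycles held on SBPort1 and SBPort5, respectively
  }
  def: InstRW<[SBWriteResGroupExample], (instregex "SOME_OPCODErr")>;

Throughput can be read off the same data: with ResourceCycles = [2,1], for instance, the first listed port is busy for two cycles per instruction, so roughly one such instruction can issue through that port every two cycles.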
+ +def SBWriteResGroup97 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup97], (instregex "IST_F16m")>; +def: InstRW<[SBWriteResGroup97], (instregex "IST_F32m")>; +def: InstRW<[SBWriteResGroup97], (instregex "IST_FP16m")>; +def: InstRW<[SBWriteResGroup97], (instregex "IST_FP32m")>; +def: InstRW<[SBWriteResGroup97], (instregex "IST_FP64m")>; + +def SBWriteResGroup97_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,2,3]; +} +def: InstRW<[SBWriteResGroup97_2], (instregex "ROL(16|32|64)mCL")>; +def: InstRW<[SBWriteResGroup97_2], (instregex "ROL8mCL")>; +def: InstRW<[SBWriteResGroup97_2], (instregex "ROR(16|32|64)mCL")>; +def: InstRW<[SBWriteResGroup97_2], (instregex "ROR8mCL")>; +def: InstRW<[SBWriteResGroup97_2], (instregex "SAR(16|32|64)mCL")>; +def: InstRW<[SBWriteResGroup97_2], (instregex "SAR8mCL")>; +def: InstRW<[SBWriteResGroup97_2], (instregex "SHL(16|32|64)mCL")>; +def: InstRW<[SBWriteResGroup97_2], (instregex "SHL8mCL")>; +def: InstRW<[SBWriteResGroup97_2], (instregex "SHR(16|32|64)mCL")>; +def: InstRW<[SBWriteResGroup97_2], (instregex "SHR8mCL")>; + +def SBWriteResGroup98 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,2,3]; +} +def: InstRW<[SBWriteResGroup98], (instregex "ADC(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup98], (instregex "ADC8mi")>; +def: InstRW<[SBWriteResGroup98], (instregex "SBB(16|32|64)mi")>; +def: InstRW<[SBWriteResGroup98], (instregex "SBB8mi")>; + +def SBWriteResGroup99 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,2,2,1]; +} +def: InstRW<[SBWriteResGroup99], (instregex "ADC(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup99], (instregex "ADC8mr")>; +def: InstRW<[SBWriteResGroup99], (instregex "SBB(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup99], (instregex "SBB8mr")>; + +def SBWriteResGroup100 : SchedWriteRes<[SBPort4,SBPort5,SBPort23,SBPort05,SBPort015]> { + let Latency = 9; + let NumMicroOps = 6; + let ResourceCycles = [1,1,2,1,1]; +} +def: InstRW<[SBWriteResGroup100], (instregex "BT(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup100], (instregex "BTC(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup100], (instregex "BTR(16|32|64)mr")>; +def: InstRW<[SBWriteResGroup100], (instregex "BTS(16|32|64)mr")>; + +def SBWriteResGroup101 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup101], (instregex "ADD_F32m")>; +def: InstRW<[SBWriteResGroup101], (instregex "ADD_F64m")>; +def: InstRW<[SBWriteResGroup101], (instregex "ILD_F16m")>; +def: InstRW<[SBWriteResGroup101], (instregex "ILD_F32m")>; +def: InstRW<[SBWriteResGroup101], (instregex "ILD_F64m")>; +def: InstRW<[SBWriteResGroup101], (instregex "SUBR_F32m")>; +def: InstRW<[SBWriteResGroup101], (instregex "SUBR_F64m")>; +def: InstRW<[SBWriteResGroup101], (instregex "SUB_F32m")>; +def: InstRW<[SBWriteResGroup101], (instregex "SUB_F64m")>; +def: InstRW<[SBWriteResGroup101], (instregex "VADDPDYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VADDPSYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VADDSUBPDYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VADDSUBPSYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VCMPPDYrmi")>; +def: InstRW<[SBWriteResGroup101], (instregex "VCMPPSYrmi")>; +def: 
InstRW<[SBWriteResGroup101], (instregex "VCVTDQ2PSYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VCVTPS2DQYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VCVTTPS2DQYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VMAX(C?)PDYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VMAX(C?)PSYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VMIN(C?)PDYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VMIN(C?)PSYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VROUNDYPDm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VROUNDYPSm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VSUBPDYrm")>; +def: InstRW<[SBWriteResGroup101], (instregex "VSUBPSYrm")>; + +def SBWriteResGroup102 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup102], (instregex "VCVTSD2SI64rm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTSD2SIrm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTSS2SI64rm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTSS2SIrm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSD2SI64rm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSD2SIrm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSS2SI64rm")>; +def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSS2SIrm")>; + +def SBWriteResGroup103 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup103], (instregex "CVTDQ2PDrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTPD2DQrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTPD2PSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTSD2SSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTSI642SSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTSI2SSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "CVTTPD2DQrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTPD2PIirm")>; +def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTPI2PDirm")>; +def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTTPD2PIirm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTDQ2PDYrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTDQ2PDrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTPD2DQrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTPD2PSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTSD2SSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTSI642SSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTSI2SSrm")>; +def: InstRW<[SBWriteResGroup103], (instregex "VCVTTPD2DQrm")>; + +def SBWriteResGroup103_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> { + let Latency = 10; + let NumMicroOps = 7; + let ResourceCycles = [1,2,3,1]; +} +def: InstRW<[SBWriteResGroup103_2], (instregex "SHLD(16|32|64)mrCL")>; +def: InstRW<[SBWriteResGroup103_2], (instregex "SHRD(16|32|64)mrCL")>; + +def SBWriteResGroup104 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup104], (instregex "MULPDrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "MULPSrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "MULSDrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "MULSSrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "PCMPGTQrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "PHMINPOSUWrm128")>; +def: InstRW<[SBWriteResGroup104], (instregex "RCPPSm")>; +def: 
InstRW<[SBWriteResGroup104], (instregex "RCPSSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "RSQRTPSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "RSQRTSSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VMULPDrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VMULPSrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VMULSDrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VMULSSrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VPCMPGTQrm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VPHMINPOSUWrm128")>; +def: InstRW<[SBWriteResGroup104], (instregex "VRCPPSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VRCPSSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VRSQRTPSm")>; +def: InstRW<[SBWriteResGroup104], (instregex "VRSQRTSSm")>; + +def SBWriteResGroup105 : SchedWriteRes<[SBPort0]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SBWriteResGroup105], (instregex "PCMPISTRIrr")>; +def: InstRW<[SBWriteResGroup105], (instregex "PCMPISTRM128rr")>; +def: InstRW<[SBWriteResGroup105], (instregex "VPCMPISTRIrr")>; +def: InstRW<[SBWriteResGroup105], (instregex "VPCMPISTRM128rr")>; + +def SBWriteResGroup106 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup106], (instregex "FICOM16m")>; +def: InstRW<[SBWriteResGroup106], (instregex "FICOM32m")>; +def: InstRW<[SBWriteResGroup106], (instregex "FICOMP16m")>; +def: InstRW<[SBWriteResGroup106], (instregex "FICOMP32m")>; + +def SBWriteResGroup107 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup107], (instregex "VCVTPD2DQYrm")>; +def: InstRW<[SBWriteResGroup107], (instregex "VCVTPD2PSYrm")>; +def: InstRW<[SBWriteResGroup107], (instregex "VCVTTPD2DQYrm")>; + +def SBWriteResGroup108 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SBWriteResGroup108], (instregex "MPSADBWrmi")>; +def: InstRW<[SBWriteResGroup108], (instregex "VMPSADBWrmi")>; + +def SBWriteResGroup109 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup109], (instregex "HADDPDrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "HADDPSrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "HSUBPDrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "HSUBPSrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "VHADDPDrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "VHADDPSrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "VHSUBPDrm")>; +def: InstRW<[SBWriteResGroup109], (instregex "VHSUBPSrm")>; + +def SBWriteResGroup110 : SchedWriteRes<[SBPort5]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SBWriteResGroup110], (instregex "AESIMCrr")>; +def: InstRW<[SBWriteResGroup110], (instregex "VAESIMCrr")>; + +def SBWriteResGroup111 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup111], (instregex "MUL_F32m")>; +def: InstRW<[SBWriteResGroup111], (instregex "MUL_F64m")>; +def: InstRW<[SBWriteResGroup111], (instregex "VMULPDYrm")>; +def: InstRW<[SBWriteResGroup111], (instregex "VMULPSYrm")>; + +def SBWriteResGroup112 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> { + let Latency = 12; + let NumMicroOps = 4; + 
let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup112], (instregex "DPPSrri")>; +def: InstRW<[SBWriteResGroup112], (instregex "VDPPSYrri")>; +def: InstRW<[SBWriteResGroup112], (instregex "VDPPSrri")>; + +def SBWriteResGroup113 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SBWriteResGroup113], (instregex "VHADDPDYrm")>; +def: InstRW<[SBWriteResGroup113], (instregex "VHADDPSYrm")>; +def: InstRW<[SBWriteResGroup113], (instregex "VHSUBPDYrm")>; +def: InstRW<[SBWriteResGroup113], (instregex "VHSUBPSYrm")>; + +def SBWriteResGroup114 : SchedWriteRes<[SBPort1,SBPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup114], (instregex "ADD_FI16m")>; +def: InstRW<[SBWriteResGroup114], (instregex "ADD_FI32m")>; +def: InstRW<[SBWriteResGroup114], (instregex "SUBR_FI16m")>; +def: InstRW<[SBWriteResGroup114], (instregex "SUBR_FI32m")>; +def: InstRW<[SBWriteResGroup114], (instregex "SUB_FI16m")>; +def: InstRW<[SBWriteResGroup114], (instregex "SUB_FI32m")>; + +def SBWriteResGroup115 : SchedWriteRes<[SBPort5,SBPort23,SBPort015]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup115], (instregex "AESDECLASTrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "AESDECrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "AESENCLASTrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "AESENCrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "VAESDECLASTrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "VAESDECrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "VAESENCLASTrm")>; +def: InstRW<[SBWriteResGroup115], (instregex "VAESENCrm")>; + +def SBWriteResGroup116 : SchedWriteRes<[SBPort0]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup116], (instregex "DIVPSrr")>; +def: InstRW<[SBWriteResGroup116], (instregex "DIVSSrr")>; +def: InstRW<[SBWriteResGroup116], (instregex "SQRTPSr")>; +def: InstRW<[SBWriteResGroup116], (instregex "SQRTSSr")>; +def: InstRW<[SBWriteResGroup116], (instregex "VDIVPSrr")>; +def: InstRW<[SBWriteResGroup116], (instregex "VDIVSSrr")>; +def: InstRW<[SBWriteResGroup116], (instregex "VSQRTPSr")>; + +def SBWriteResGroup117 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 14; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup117], (instregex "VSQRTSSm")>; + +def SBWriteResGroup118 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SBWriteResGroup118], (instregex "VRCPPSYm")>; +def: InstRW<[SBWriteResGroup118], (instregex "VRSQRTPSYm")>; + +def SBWriteResGroup119 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 15; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI16m")>; +def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI32m")>; + +def SBWriteResGroup120 : SchedWriteRes<[SBPort0,SBPort1,SBPort5,SBPort23]> { + let Latency = 15; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SBWriteResGroup120], (instregex "DPPDrmi")>; +def: InstRW<[SBWriteResGroup120], (instregex "VDPPDrmi")>; + +def SBWriteResGroup121 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 17; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} +def: InstRW<[SBWriteResGroup121], (instregex "PCMPISTRIrm")>; +def: 
InstRW<[SBWriteResGroup121], (instregex "PCMPISTRM128rm")>; +def: InstRW<[SBWriteResGroup121], (instregex "VPCMPISTRIrm")>; +def: InstRW<[SBWriteResGroup121], (instregex "VPCMPISTRM128rm")>; + +def SBWriteResGroup122 : SchedWriteRes<[SBPort5,SBPort23]> { + let Latency = 18; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup122], (instregex "AESIMCrm")>; +def: InstRW<[SBWriteResGroup122], (instregex "VAESIMCrm")>; + +def SBWriteResGroup123 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 20; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup123], (instregex "DIVPSrm")>; +def: InstRW<[SBWriteResGroup123], (instregex "DIVSSrm")>; +def: InstRW<[SBWriteResGroup123], (instregex "SQRTPSm")>; +def: InstRW<[SBWriteResGroup123], (instregex "SQRTSSm")>; +def: InstRW<[SBWriteResGroup123], (instregex "VDIVPSrm")>; +def: InstRW<[SBWriteResGroup123], (instregex "VDIVSSrm")>; +def: InstRW<[SBWriteResGroup123], (instregex "VSQRTPSm")>; + +def SBWriteResGroup124 : SchedWriteRes<[SBPort0]> { + let Latency = 21; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup124], (instregex "VSQRTSDr")>; + +def SBWriteResGroup125 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 21; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup125], (instregex "VSQRTSDm")>; + +def SBWriteResGroup126 : SchedWriteRes<[SBPort0]> { + let Latency = 22; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup126], (instregex "DIVPDrr")>; +def: InstRW<[SBWriteResGroup126], (instregex "DIVSDrr")>; +def: InstRW<[SBWriteResGroup126], (instregex "SQRTPDr")>; +def: InstRW<[SBWriteResGroup126], (instregex "SQRTSDr")>; +def: InstRW<[SBWriteResGroup126], (instregex "VDIVPDrr")>; +def: InstRW<[SBWriteResGroup126], (instregex "VDIVSDrr")>; +def: InstRW<[SBWriteResGroup126], (instregex "VSQRTPDr")>; + +def SBWriteResGroup127 : SchedWriteRes<[SBPort0]> { + let Latency = 24; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FPrST0")>; +def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FST0r")>; +def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FrST0")>; +def: InstRW<[SBWriteResGroup127], (instregex "DIV_FPrST0")>; +def: InstRW<[SBWriteResGroup127], (instregex "DIV_FST0r")>; +def: InstRW<[SBWriteResGroup127], (instregex "DIV_FrST0")>; + +def SBWriteResGroup128 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 28; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup128], (instregex "DIVPDrm")>; +def: InstRW<[SBWriteResGroup128], (instregex "DIVSDrm")>; +def: InstRW<[SBWriteResGroup128], (instregex "SQRTPDm")>; +def: InstRW<[SBWriteResGroup128], (instregex "SQRTSDm")>; +def: InstRW<[SBWriteResGroup128], (instregex "VDIVPDrm")>; +def: InstRW<[SBWriteResGroup128], (instregex "VDIVSDrm")>; +def: InstRW<[SBWriteResGroup128], (instregex "VSQRTPDm")>; + +def SBWriteResGroup129 : SchedWriteRes<[SBPort0,SBPort05]> { + let Latency = 29; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup129], (instregex "VDIVPSYrr")>; +def: InstRW<[SBWriteResGroup129], (instregex "VSQRTPSYr")>; + +def SBWriteResGroup130 : SchedWriteRes<[SBPort0,SBPort23]> { + let Latency = 31; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SBWriteResGroup130], (instregex "DIVR_F32m")>; +def: InstRW<[SBWriteResGroup130], (instregex "DIVR_F64m")>; +def: 
InstRW<[SBWriteResGroup130], (instregex "DIV_F32m")>; +def: InstRW<[SBWriteResGroup130], (instregex "DIV_F64m")>; + +def SBWriteResGroup131 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> { + let Latency = 34; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SBWriteResGroup131], (instregex "DIVR_FI16m")>; +def: InstRW<[SBWriteResGroup131], (instregex "DIVR_FI32m")>; +def: InstRW<[SBWriteResGroup131], (instregex "DIV_FI16m")>; +def: InstRW<[SBWriteResGroup131], (instregex "DIV_FI32m")>; + +def SBWriteResGroup132 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> { + let Latency = 36; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SBWriteResGroup132], (instregex "VDIVPSYrm")>; +def: InstRW<[SBWriteResGroup132], (instregex "VSQRTPSYm")>; + +def SBWriteResGroup133 : SchedWriteRes<[SBPort0,SBPort05]> { + let Latency = 45; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SBWriteResGroup133], (instregex "VDIVPDYrr")>; +def: InstRW<[SBWriteResGroup133], (instregex "VSQRTPDYr")>; + +def SBWriteResGroup134 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> { + let Latency = 52; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SBWriteResGroup134], (instregex "VDIVPDYrm")>; +def: InstRW<[SBWriteResGroup134], (instregex "VSQRTPDYm")>; + +def SBWriteResGroup135 : SchedWriteRes<[SBPort0]> { + let Latency = 114; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SBWriteResGroup135], (instregex "VSQRTSSr")>; + } // SchedModel diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td new file mode 100644 index 000000000000..9a417b2d3e82 --- /dev/null +++ b/lib/Target/X86/X86SchedSkylakeClient.td @@ -0,0 +1,3993 @@ +//=- X86SchedSkylakeClient.td - X86 Skylake Client Scheduling -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Skylake Client to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def SkylakeClientModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and Skylake can + // decode 6 instructions per cycle. + let IssueWidth = 6; + let MicroOpBufferSize = 224; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 14; + + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SkylakeClientModel in { + +// Skylake Client can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, and 6 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations.
+def SKLPort0 : ProcResource<1>; +def SKLPort1 : ProcResource<1>; +def SKLPort2 : ProcResource<1>; +def SKLPort3 : ProcResource<1>; +def SKLPort4 : ProcResource<1>; +def SKLPort5 : ProcResource<1>; +def SKLPort6 : ProcResource<1>; +def SKLPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports. +def SKLPort01 : ProcResGroup<[SKLPort0, SKLPort1]>; +def SKLPort23 : ProcResGroup<[SKLPort2, SKLPort3]>; +def SKLPort237 : ProcResGroup<[SKLPort2, SKLPort3, SKLPort7]>; +def SKLPort04 : ProcResGroup<[SKLPort0, SKLPort4]>; +def SKLPort05 : ProcResGroup<[SKLPort0, SKLPort5]>; +def SKLPort06 : ProcResGroup<[SKLPort0, SKLPort6]>; +def SKLPort15 : ProcResGroup<[SKLPort1, SKLPort5]>; +def SKLPort16 : ProcResGroup<[SKLPort1, SKLPort6]>; +def SKLPort56 : ProcResGroup<[SKLPort5, SKLPort6]>; +def SKLPort015 : ProcResGroup<[SKLPort0, SKLPort1, SKLPort5]>; +def SKLPort056 : ProcResGroup<[SKLPort0, SKLPort5, SKLPort6]>; +def SKLPort0156: ProcResGroup<[SKLPort0, SKLPort1, SKLPort5, SKLPort6]>; + +// 60 Entry Unified Scheduler +def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4, + SKLPort5, SKLPort6, SKLPort7]> { + let BufferSize=60; +} + +// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 5>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SKLWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [SKLPort23, ExePort]> { + let Latency = !add(Lat, 5); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [SKLPort4]>; + +// Arithmetic. +defm : SKLWriteResPair<WriteALU, SKLPort0156, 1>; // Simple integer ALU op. +defm : SKLWriteResPair<WriteIMul, SKLPort1, 3>; // Integer multiplication. +def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. +def SKLDivider : ProcResource<1>; // Integer division issued on port 0. +def : WriteRes<WriteIDiv, [SKLPort0, SKLDivider]> { // Integer division. + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [SKLPort23, SKLPort0, SKLDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads. + +// Integer shifts and rotates. +defm : SKLWriteResPair<WriteShift, SKLPort06, 1>; + +// Loads, stores, and moves, not folded with other operations. +def : WriteRes<WriteLoad, [SKLPort23]> { let Latency = 5; } +def : WriteRes<WriteStore, [SKLPort237, SKLPort4]>; +def : WriteRes<WriteMove, [SKLPort0156]>; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +def : WriteRes<WriteZero, []>; + +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. 
+defm : SKLWriteResPair<WriteJump, SKLPort06, 1>; + +// Floating point. This covers both scalar and vector operations. +defm : SKLWriteResPair<WriteFAdd, SKLPort1, 3>; // Floating point add/sub/compare. +defm : SKLWriteResPair<WriteFMul, SKLPort0, 5>; // Floating point multiplication. +defm : SKLWriteResPair<WriteFDiv, SKLPort0, 12>; // Floating point division (10-14 cycles). +defm : SKLWriteResPair<WriteFSqrt, SKLPort0, 15>; // Floating point square root. +defm : SKLWriteResPair<WriteFRcp, SKLPort0, 5>; // Floating point reciprocal estimate. +defm : SKLWriteResPair<WriteFRsqrt, SKLPort0, 5>; // Floating point reciprocal square root estimate. +defm : SKLWriteResPair<WriteFMA, SKLPort01, 4>; // Fused Multiply Add. +defm : SKLWriteResPair<WriteFShuffle, SKLPort5, 1>; // Floating point vector shuffles. +defm : SKLWriteResPair<WriteFBlend, SKLPort015, 1>; // Floating point vector blends. +def : WriteRes<WriteFVarBlend, [SKLPort5]> { // Fp vector variable blends. + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteFVarBlendLd, [SKLPort5, SKLPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +// FMA Scheduling helper class. +// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } + +// Vector integer operations. +defm : SKLWriteResPair<WriteVecALU, SKLPort15, 1>; // Vector integer ALU op, no logicals. +defm : SKLWriteResPair<WriteVecShift, SKLPort0, 1>; // Vector integer shifts. +defm : SKLWriteResPair<WriteVecIMul, SKLPort0, 5>; // Vector integer multiply. +defm : SKLWriteResPair<WriteShuffle, SKLPort5, 1>; // Vector shuffles. +defm : SKLWriteResPair<WriteBlend, SKLPort15, 1>; // Vector blends. + +def : WriteRes<WriteVarBlend, [SKLPort5]> { // Vector variable blends. + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVarBlendLd, [SKLPort5, SKLPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteMPSAD, [SKLPort0, SKLPort5]> { // Vector MPSAD. + let Latency = 6; + let ResourceCycles = [1, 2]; +} +def : WriteRes<WriteMPSADLd, [SKLPort23, SKLPort0, SKLPort5]> { + let Latency = 6; + let ResourceCycles = [1, 1, 2]; +} + +// Vector bitwise operations. +// These are often used on both floating point and integer vectors. +defm : SKLWriteResPair<WriteVecLogic, SKLPort015, 1>; // Vector and/or/xor. + +// Conversion between integer and float. +defm : SKLWriteResPair<WriteCvtF2I, SKLPort1, 3>; // Float -> Integer. +defm : SKLWriteResPair<WriteCvtI2F, SKLPort1, 4>; // Integer -> Float. +defm : SKLWriteResPair<WriteCvtF2F, SKLPort1, 3>; // Float -> Float size conversion. + +// String instructions. +// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SKLPort0]> { + let Latency = 10; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [SKLPort0, SKLPort23]> { + let Latency = 10; + let ResourceCycles = [3, 1]; +} +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [SKLPort0, SKLPort16, SKLPort5]> { + let Latency = 10; + let ResourceCycles = [3, 2, 4]; +} +def : WriteRes<WritePCmpEStrMLd, [SKLPort05, SKLPort16, SKLPort23]> { + let Latency = 10; + let ResourceCycles = [6, 2, 1]; +} +// Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [SKLPort0]> { + let Latency = 11; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [SKLPort0, SKLPort23]> { + let Latency = 11; + let ResourceCycles = [3, 1]; +} +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [SKLPort05, SKLPort16]> { + let Latency = 11; + let ResourceCycles = [6, 2]; +} +def : WriteRes<WritePCmpEStrILd, [SKLPort0, SKLPort16, SKLPort5, SKLPort23]> { + let Latency = 11; + let ResourceCycles = [3, 2, 2, 1]; +} + +// AES instructions. +def : WriteRes<WriteAESDecEnc, [SKLPort5]> { // Decryption, encryption. + let Latency = 7; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESDecEncLd, [SKLPort5, SKLPort23]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteAESIMC, [SKLPort5]> { // InvMixColumn. + let Latency = 14; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [SKLPort5, SKLPort23]> { + let Latency = 14; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteAESKeyGen, [SKLPort0, SKLPort5]> { // Key Generation. + let Latency = 10; + let ResourceCycles = [2, 8]; +} +def : WriteRes<WriteAESKeyGenLd, [SKLPort0, SKLPort5, SKLPort23]> { + let Latency = 10; + let ResourceCycles = [2, 7, 1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [SKLPort0, SKLPort5]> { + let Latency = 7; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteCLMulLd, [SKLPort0, SKLPort5, SKLPort23]> { + let Latency = 7; + let ResourceCycles = [2, 1, 1]; +} + +// Catch-all for expensive system instructions. +def : WriteRes<WriteSystem, [SKLPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite; + +// AVX2. +defm : SKLWriteResPair<WriteFShuffle256, SKLPort5, 3>; // Fp 256-bit width vector shuffles. +defm : SKLWriteResPair<WriteShuffle256, SKLPort5, 3>; // 256-bit width vector shuffles. +def : WriteRes<WriteVarVecShift, [SKLPort0, SKLPort5]> { // Variable vector shifts. + let Latency = 2; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteVarVecShiftLd, [SKLPort0, SKLPort5, SKLPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1, 1]; +} + +// Old microcoded instructions that nobody uses. +def : WriteRes<WriteMicrocoded, [SKLPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite; + +// Fence instructions. +def : WriteRes<WriteFence, [SKLPort23, SKLPort4]>; + +// Nop, not very useful except that it provides a model for nops! +def : WriteRes<WriteNop, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// +// HADD, HSUB PS/PD +// x,x / v,v,v. +def : WriteRes<WriteFHAdd, [SKLPort1]> { + let Latency = 3; +} + +// x,m / v,v,m. +def : WriteRes<WriteFHAddLd, [SKLPort1, SKLPort23]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +// PHADD|PHSUB (S) W/D. +// v <- v,v.
+def : WriteRes<WritePHAdd, [SKLPort15]>; + +// v <- v,m. +def : WriteRes<WritePHAddLd, [SKLPort15, SKLPort23]> { + let Latency = 5; + let ResourceCycles = [1, 1]; +} + +// Remaining instrs. + +def SKLWriteResGroup1 : SchedWriteRes<[SKLPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDSBirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDSWirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDUSBirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDUSWirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PAVGBirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PAVGWirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQBirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQDirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQWirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTBirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTDirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTWirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMAXSWirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMAXUBirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMINSWirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMINUBirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLDri")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLDrr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLQri")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLQrr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLWri")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLWrr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRADri")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRADrr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRAWri")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRAWrr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLDri")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLDrr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLQri")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLQrr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLWri")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLWrr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBSBirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBSWirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBUSBirr")>; +def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBUSWirr")>; + +def SKLWriteResGroup2 : SchedWriteRes<[SKLPort1]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup2], (instregex "MMX_MASKMOVQ64")>; + +def SKLWriteResGroup3 : SchedWriteRes<[SKLPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup3], (instregex "COMP_FST0r")>; +def: InstRW<[SKLWriteResGroup3], (instregex "COM_FST0r")>; +def: InstRW<[SKLWriteResGroup3], (instregex "INSERTPSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_MOVD64rr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_MOVD64to64rr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PALIGNR64irr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PSHUFBrr64")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PSHUFWri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>; +def: InstRW<[SKLWriteResGroup3], (instregex 
"MMX_PUNPCKHDQirr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MOV64toPQIrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MOVDDUPrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MOVDI2PDIrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MOVHLPSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MOVLHPSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MOVSDrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MOVSHDUPrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MOVSLDUPrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PACKSSDWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PACKSSWBrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PACKUSDWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PACKUSWBrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PALIGNRrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PBLENDWrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXWDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXWQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXWDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXWQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFBrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFDri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFHWri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFLWri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PSLLDQri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PSRLDQri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHBWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHQDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHWDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLBWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLQDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLWDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "SHUFPDrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "SHUFPSrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "UCOM_FPr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "UCOM_Fr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKHPDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKHPSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKLPDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKLPSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VBROADCASTSSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VINSERTPSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOV64toPQIrr")>; +def: 
InstRW<[SKLWriteResGroup3], (instregex "VMOVDDUPYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDDUPrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDI2PDIrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVHLPSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVLHPSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSHDUPYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSHDUPrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSLDUPYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSLDUPrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSDWYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSDWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSWBYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSWBrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSDWYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSDWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSWBYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSWBrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPALIGNRYrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPALIGNRrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPBLENDWYrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPBLENDWrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPBROADCASTDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPBROADCASTQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDYri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSYri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXWDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXWQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXWDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXWQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFBYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFBrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFDYri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFDri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFHWYri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFHWri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFLWYri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFLWri")>; +def: InstRW<[SKLWriteResGroup3], (instregex 
"VPSLLDQYri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSLLDQri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSRLDQYri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPSRLDQri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHBWYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHBWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHDQYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHQDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHWDYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHWDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLBWYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLBWrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLDQYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLQDQrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLWDYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLWDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPDYrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPDrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPSYrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPSrri")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPDYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPSYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPSrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPDYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPDrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPSYrr")>; +def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPSrr")>; + +def SKLWriteResGroup4 : SchedWriteRes<[SKLPort6]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup4], (instregex "JMP(16|32|64)r")>; + +def SKLWriteResGroup5 : SchedWriteRes<[SKLPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup5], (instregex "PABSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PABSDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PABSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PADDSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PADDSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PADDUSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PADDUSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PAVGBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PAVGWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQQrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUDrr")>; +def: 
InstRW<[SKLWriteResGroup5], (instregex "PMAXUWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMINSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMINSDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMINSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMINUBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMINUDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PMINUWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNBrr128")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNDrr128")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNWrr128")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSLLDri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSLLQri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSLLWri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSRADri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSRAWri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSRLDri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSRLQri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSRLWri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSUBSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSUBSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSUBUSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "PSUBUSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPABSBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPABSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPABSDYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPABSDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPABSWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPABSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQDYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQQYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQQrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTDYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSDYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSWYrr")>; +def: 
InstRW<[SKLWriteResGroup5], (instregex "VPMAXSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUDYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSDYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUDYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNBYrr256")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNBrr128")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNDYrr256")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNDrr128")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNWYrr256")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNWrr128")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLDYri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLDri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLQYri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLQri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVDYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVQYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVQrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLWYri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLWri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRADYri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRADri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAVDYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAVDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAWYri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAWri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLDYri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLDri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLQYri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLQri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVDYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVDrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVQYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVQrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLWYri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLWri")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSWrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSBYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSBrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSWYrr")>; +def: InstRW<[SKLWriteResGroup5], (instregex 
"VPSUBUSWrr")>; + +def SKLWriteResGroup6 : SchedWriteRes<[SKLPort05]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup6], (instregex "FINCSTP")>; +def: InstRW<[SKLWriteResGroup6], (instregex "FNOP")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSBrr64")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSDrr64")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSWrr64")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDBirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDDirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDQirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDWirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PANDNirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PANDirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PORirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNBrr64")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNDrr64")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNWrr64")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBBirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBDirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBQirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBWirr")>; +def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PXORirr")>; + +def SKLWriteResGroup7 : SchedWriteRes<[SKLPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup7], (instregex "ADC8rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup7], (instregex "ADCX(32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "ADOX(32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "BTC(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup7], (instregex "BTC(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "BTR(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup7], (instregex "BTR(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "BTS(16|32|64)ri8")>; +def: InstRW<[SKLWriteResGroup7], (instregex "BTS(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CDQ")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CLAC")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVAE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVB(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVG(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVGE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVL(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVLE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNO(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNP(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNS(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVO(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVP(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "CMOVS(16|32|64)rr")>; +def: 
InstRW<[SKLWriteResGroup7], (instregex "CQO")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JAE_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JAE_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JA_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JA_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JBE_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JBE_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JB_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JB_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JE_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JE_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JGE_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JGE_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JG_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JG_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JLE_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JLE_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JL_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JL_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JMP_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JMP_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JNE_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JNE_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JNO_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JNO_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JNP_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JNP_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JNS_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JNS_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JO_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JO_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JP_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JP_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JS_1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "JS_4")>; +def: InstRW<[SKLWriteResGroup7], (instregex "RORX(32|64)ri")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SAR(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SAR(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SAR8r1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SAR8ri")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SARX(32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SBB8rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETAEr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETBr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETEr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETGEr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETGr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETLEr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETLr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETNEr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETNOr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETNPr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETNSr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETOr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETPr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SETSr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SHL(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SHL(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SHL8r1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SHL8ri")>; 
+def: InstRW<[SKLWriteResGroup7], (instregex "SHLX(32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SHR(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SHR(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SHR8r1")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SHR8ri")>; +def: InstRW<[SKLWriteResGroup7], (instregex "SHRX(32|64)rr")>; +def: InstRW<[SKLWriteResGroup7], (instregex "STAC")>; + +def SKLWriteResGroup8 : SchedWriteRes<[SKLPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup8], (instregex "ANDN(32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BLSI(32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BLSMSK(32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BLSR(32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "BZHI(32|64)rr")>; +def: InstRW<[SKLWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>; + +def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup9], (instregex "ANDNPDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "ANDNPSrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "ANDPDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "ANDPSrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "BLENDPDrri")>; +def: InstRW<[SKLWriteResGroup9], (instregex "BLENDPSrri")>; +def: InstRW<[SKLWriteResGroup9], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPDrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPSrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQArr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "MOVPQI2QIrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "MOVSSrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "ORPDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "ORPSrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PADDBrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PADDDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PADDQrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PADDWrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PANDNrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PANDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PORrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PSUBBrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PSUBDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PSUBQrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PSUBWrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "PXORrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPDYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPSYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPSrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VANDPDYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VANDPDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VANDPSYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VANDPSrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPDYrri")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPDrri")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPSYrri")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPSrri")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDYrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex 
"VMOVAPDrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSYrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VMOVPQI2QIrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VMOVSSrr(_REV)?")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VORPDYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VORPDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VORPSYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VORPSrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPADDBYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPADDBrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPADDDYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPADDDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPADDQYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPADDQrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPADDWYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPADDWrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPANDNYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPANDNrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPANDYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPANDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPBLENDDYrri")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPBLENDDrri")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPORYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPORrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBBYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBBrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBDYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBQYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBQrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBWYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBWrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPXORYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VPXORrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VXORPDYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VXORPDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VXORPSYrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "VXORPSrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "XORPDrr")>; +def: InstRW<[SKLWriteResGroup9], (instregex "XORPSrr")>; + +def SKLWriteResGroup10 : SchedWriteRes<[SKLPort0156]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "ADD8i8")>; +def: InstRW<[SKLWriteResGroup10], (instregex "ADD8ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "ADD8rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "AND8i8")>; +def: InstRW<[SKLWriteResGroup10], (instregex "AND8ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "AND8rr(_REV)?")>; +def: 
InstRW<[SKLWriteResGroup10], (instregex "CBW")>; +def: InstRW<[SKLWriteResGroup10], (instregex "CLC")>; +def: InstRW<[SKLWriteResGroup10], (instregex "CMC")>; +def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "CMP8i8")>; +def: InstRW<[SKLWriteResGroup10], (instregex "CMP8ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "CMP8rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "CWDE")>; +def: InstRW<[SKLWriteResGroup10], (instregex "DEC(16|32|64)r")>; +def: InstRW<[SKLWriteResGroup10], (instregex "DEC8r")>; +def: InstRW<[SKLWriteResGroup10], (instregex "INC(16|32|64)r")>; +def: InstRW<[SKLWriteResGroup10], (instregex "INC8r")>; +def: InstRW<[SKLWriteResGroup10], (instregex "LAHF")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOV8ri(_alt)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOV8rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>; +def: InstRW<[SKLWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>; +def: InstRW<[SKLWriteResGroup10], (instregex "NEG(16|32|64)r")>; +def: InstRW<[SKLWriteResGroup10], (instregex "NEG8r")>; +def: InstRW<[SKLWriteResGroup10], (instregex "NOOP")>; +def: InstRW<[SKLWriteResGroup10], (instregex "NOT(16|32|64)r")>; +def: InstRW<[SKLWriteResGroup10], (instregex "NOT8r")>; +def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "OR8i8")>; +def: InstRW<[SKLWriteResGroup10], (instregex "OR8ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "OR8rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SAHF")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SGDT64m")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SIDT64m")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SLDT64m")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SMSW16m")>; +def: InstRW<[SKLWriteResGroup10], (instregex "STC")>; +def: InstRW<[SKLWriteResGroup10], (instregex "STRm")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SUB8i8")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SUB8ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SUB8rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "SYSCALL")>; +def: InstRW<[SKLWriteResGroup10], (instregex "TEST(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "TEST8i8")>; +def: InstRW<[SKLWriteResGroup10], (instregex "TEST8ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "TEST8rr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "XCHG(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV)?")>; +def: InstRW<[SKLWriteResGroup10], (instregex "XOR8i8")>; +def: InstRW<[SKLWriteResGroup10], (instregex "XOR8ri")>; +def: InstRW<[SKLWriteResGroup10], (instregex "XOR8rr(_REV)?")>; + +def SKLWriteResGroup11 : SchedWriteRes<[SKLPort4,SKLPort237]> { + let Latency = 1; + let NumMicroOps = 2; + let 
ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup11], (instregex "FBSTPm")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVD64mr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVNTQmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVQ64mr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOV(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOV8mi")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOV8mr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVAPDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVAPSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVDQAmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVDQUmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVHPDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVHPSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVLPDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVLPSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTDQmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTI_64mr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTImr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTPDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTPSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVPDI2DImr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVPQI2QImr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVPQIto64mr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVSDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVSSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVUPDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "MOVUPSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP32m")>; +def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP64m")>; +def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP80m")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VEXTRACTF128mr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VEXTRACTI128mr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPDYmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPSYmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQAYmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQAmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQUYmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQUmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVHPDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVHPSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVLPDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVLPSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTDQYmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTDQmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPDYmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPSYmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPDI2DImr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPQI2QImr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPQIto64mr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVSDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVSSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPDYmr")>; +def: 
InstRW<[SKLWriteResGroup11], (instregex "VMOVUPDmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPSYmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPSmr")>; +def: InstRW<[SKLWriteResGroup11], (instregex "VMPTRSTm")>; + +def SKLWriteResGroup12 : SchedWriteRes<[SKLPort0]> { + let Latency = 2; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup12], (instregex "COMISDrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "COMISSrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "MMX_MOVD64grr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PMOVMSKBrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "MOVMSKPDrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "MOVMSKPSrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "MOVPDI2DIrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "MOVPQIto64rr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "PMOVMSKBrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "UCOMISDrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "UCOMISSrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VCOMISDrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VCOMISSrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPDYrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPDrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPSYrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPSrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VMOVPDI2DIrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VMOVPQIto64rr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VPMOVMSKBYrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VPMOVMSKBrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPDYrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPDrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPSYrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPSrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VUCOMISDrr")>; +def: InstRW<[SKLWriteResGroup12], (instregex "VUCOMISSrr")>; + +def SKLWriteResGroup13 : SchedWriteRes<[SKLPort5]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>; +def: InstRW<[SKLWriteResGroup13], (instregex "MMX_PINSRWirri")>; +def: InstRW<[SKLWriteResGroup13], (instregex "PINSRBrr")>; +def: InstRW<[SKLWriteResGroup13], (instregex "PINSRDrr")>; +def: InstRW<[SKLWriteResGroup13], (instregex "PINSRQrr")>; +def: InstRW<[SKLWriteResGroup13], (instregex "PINSRWrri")>; +def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRBrr")>; +def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRDrr")>; +def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRQrr")>; +def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRWrri")>; + +def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup14], (instregex "FDECSTP")>; +def: InstRW<[SKLWriteResGroup14], (instregex "MMX_MOVDQ2Qrr")>; + +def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup15], (instregex "CMOVA(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup15], (instregex "CMOVBE(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup15], (instregex "ROL(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup15], (instregex "ROL(16|32|64)ri")>; +def: 
InstRW<[SKLWriteResGroup15], (instregex "ROL8r1")>; +def: InstRW<[SKLWriteResGroup15], (instregex "ROL8ri")>; +def: InstRW<[SKLWriteResGroup15], (instregex "ROR(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup15], (instregex "ROR(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup15], (instregex "ROR8r1")>; +def: InstRW<[SKLWriteResGroup15], (instregex "ROR8ri")>; +def: InstRW<[SKLWriteResGroup15], (instregex "SETAr")>; +def: InstRW<[SKLWriteResGroup15], (instregex "SETBEr")>; + +def SKLWriteResGroup16 : SchedWriteRes<[SKLPort015]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup16], (instregex "BLENDVPDrr0")>; +def: InstRW<[SKLWriteResGroup16], (instregex "BLENDVPSrr0")>; +def: InstRW<[SKLWriteResGroup16], (instregex "PBLENDVBrr0")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPDYrr")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPDrr")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPSYrr")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPSrr")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPBLENDVBYrr")>; +def: InstRW<[SKLWriteResGroup16], (instregex "VPBLENDVBrr")>; + +def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup17], (instregex "LFENCE")>; +def: InstRW<[SKLWriteResGroup17], (instregex "WAIT")>; +def: InstRW<[SKLWriteResGroup17], (instregex "XGETBV")>; + +def SKLWriteResGroup18 : SchedWriteRes<[SKLPort0,SKLPort237]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup18], (instregex "MMX_MASKMOVQ64")>; +def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVDQU")>; +def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPDYmr")>; +def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPDmr")>; +def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPSYmr")>; +def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPSmr")>; +def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVDYmr")>; +def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVDmr")>; +def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVQYmr")>; +def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVQmr")>; + +def SKLWriteResGroup19 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup19], (instregex "PSLLDrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "PSLLQrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "PSLLWrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "PSRADrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "PSRAWrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "PSRLDrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "PSRLQrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "PSRLWrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLDrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLQrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLWrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "VPSRADrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "VPSRAWrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLDrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLQrr")>; +def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLWrr")>; + +def SKLWriteResGroup20 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup20], (instregex 
"CLFLUSH")>; + +def SKLWriteResGroup21 : SchedWriteRes<[SKLPort237,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup21], (instregex "SFENCE")>; + +def SKLWriteResGroup22 : SchedWriteRes<[SKLPort06,SKLPort15]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup22], (instregex "BEXTR(32|64)rr")>; +def: InstRW<[SKLWriteResGroup22], (instregex "BSWAP(16|32|64)r")>; + +def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup23], (instregex "ADC8i8")>; +def: InstRW<[SKLWriteResGroup23], (instregex "ADC8ri")>; +def: InstRW<[SKLWriteResGroup23], (instregex "CWD")>; +def: InstRW<[SKLWriteResGroup23], (instregex "JRCXZ")>; +def: InstRW<[SKLWriteResGroup23], (instregex "SBB8i8")>; +def: InstRW<[SKLWriteResGroup23], (instregex "SBB8ri")>; + +def SKLWriteResGroup24 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup24], (instregex "EXTRACTPSmr")>; +def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRBmr")>; +def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRDmr")>; +def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRQmr")>; +def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRWmr")>; +def: InstRW<[SKLWriteResGroup24], (instregex "STMXCSR")>; +def: InstRW<[SKLWriteResGroup24], (instregex "VEXTRACTPSmr")>; +def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRBmr")>; +def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRDmr")>; +def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRQmr")>; +def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRWmr")>; +def: InstRW<[SKLWriteResGroup24], (instregex "VSTMXCSR")>; + +def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup25], (instregex "FNSTCW16m")>; + +def SKLWriteResGroup26 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup26], (instregex "SETAEm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETBm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETEm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETGEm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETGm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETLEm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETLm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETNEm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETNOm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETNPm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETNSm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETOm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETPm")>; +def: InstRW<[SKLWriteResGroup26], (instregex "SETSm")>; + +def SKLWriteResGroup27 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort15]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup27], (instregex "MOVBE(16|32|64)mr")>; + +def SKLWriteResGroup28 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> { + let Latency = 2; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)r(mr)?")>; +def: InstRW<[SKLWriteResGroup28], (instregex "PUSH64i8")>; +def: 
InstRW<[SKLWriteResGroup28], (instregex "STOSB")>; +def: InstRW<[SKLWriteResGroup28], (instregex "STOSL")>; +def: InstRW<[SKLWriteResGroup28], (instregex "STOSQ")>; +def: InstRW<[SKLWriteResGroup28], (instregex "STOSW")>; + +def SKLWriteResGroup29 : SchedWriteRes<[SKLPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup29], (instregex "BSF(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup29], (instregex "BSR(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup29], (instregex "IMUL64rr(i8)?")>; +def: InstRW<[SKLWriteResGroup29], (instregex "IMUL8r")>; +def: InstRW<[SKLWriteResGroup29], (instregex "LZCNT(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup29], (instregex "MUL8r")>; +def: InstRW<[SKLWriteResGroup29], (instregex "PDEP(32|64)rr")>; +def: InstRW<[SKLWriteResGroup29], (instregex "PEXT(32|64)rr")>; +def: InstRW<[SKLWriteResGroup29], (instregex "POPCNT(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup29], (instregex "SHLD(16|32|64)rri8")>; +def: InstRW<[SKLWriteResGroup29], (instregex "SHRD(16|32|64)rri8")>; +def: InstRW<[SKLWriteResGroup29], (instregex "TZCNT(16|32|64)rr")>; + +def SKLWriteResGroup29_16 : SchedWriteRes<[SKLPort1, SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup29_16], (instregex "IMUL16rr(i8)?")>; + +def SKLWriteResGroup29_32 : SchedWriteRes<[SKLPort1]> { + let Latency = 3; + let NumMicroOps = 1; +} +def: InstRW<[SKLWriteResGroup29_32], (instregex "IMUL32rr(i8)?")>; + +def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FPrST0")>; +def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FST0r")>; +def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FrST0")>; +def: InstRW<[SKLWriteResGroup30], (instregex "MMX_PSADBWirr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "PCMPGTQrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "PSADBWrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FPrST0")>; +def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FST0r")>; +def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FrST0")>; +def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FPrST0")>; +def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FST0r")>; +def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FrST0")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VBROADCASTSDYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VBROADCASTSSYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VEXTRACTF128rr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VEXTRACTI128rr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VINSERTF128rr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VINSERTI128rr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTBYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTBrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTDYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTQYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTWYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTWrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPCMPGTQYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPCMPGTQrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPERM2F128rr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPERM2I128rr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPERMDYrr")>; +def: InstRW<[SKLWriteResGroup30], 
(instregex "VPERMPDYri")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPERMPSYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPERMQYri")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBDYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBQYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBWYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXDQYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXWDYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXWQYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBDYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBQYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBWYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXDQYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXWDYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXWQYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPSADBWYrr")>; +def: InstRW<[SKLWriteResGroup30], (instregex "VPSADBWrr")>; + +def SKLWriteResGroup31 : SchedWriteRes<[SKLPort0,SKLPort5]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup31], (instregex "EXTRACTPSrr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "MMX_PEXTRWirri")>; +def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRBrr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRDrr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRQrr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRWri")>; +def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRWrr_REV")>; +def: InstRW<[SKLWriteResGroup31], (instregex "PTESTrr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "VEXTRACTPSrr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRBrr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRDrr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRQrr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRWri")>; +def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRWrr_REV")>; +def: InstRW<[SKLWriteResGroup31], (instregex "VPTESTYrr")>; +def: InstRW<[SKLWriteResGroup31], (instregex "VPTESTrr")>; + +def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup32], (instregex "FNSTSW16r")>; + +def SKLWriteResGroup33 : SchedWriteRes<[SKLPort06]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKLWriteResGroup33], (instregex "ROL(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup33], (instregex "ROL8rCL")>; +def: InstRW<[SKLWriteResGroup33], (instregex "ROR(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup33], (instregex "ROR8rCL")>; +def: InstRW<[SKLWriteResGroup33], (instregex "SAR(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup33], (instregex "SAR8rCL")>; +def: InstRW<[SKLWriteResGroup33], (instregex "SHL(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup33], (instregex "SHL8rCL")>; +def: InstRW<[SKLWriteResGroup33], (instregex "SHR(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup33], (instregex "SHR8rCL")>; + +def SKLWriteResGroup34 : SchedWriteRes<[SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKLWriteResGroup34], (instregex "XADD(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup34], (instregex "XADD8rr")>; +def: InstRW<[SKLWriteResGroup34], (instregex "XCHG8rr")>; + +def SKLWriteResGroup35 : SchedWriteRes<[SKLPort0,SKLPort5]> { + let Latency = 3; + let 
NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PHADDSWrr64")>; +def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PHSUBSWrr64")>; + +def SKLWriteResGroup36 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup36], (instregex "PHADDSWrr128")>; +def: InstRW<[SKLWriteResGroup36], (instregex "PHSUBSWrr128")>; +def: InstRW<[SKLWriteResGroup36], (instregex "VPHADDSWrr128")>; +def: InstRW<[SKLWriteResGroup36], (instregex "VPHADDSWrr256")>; +def: InstRW<[SKLWriteResGroup36], (instregex "VPHSUBSWrr128")>; +def: InstRW<[SKLWriteResGroup36], (instregex "VPHSUBSWrr256")>; + +def SKLWriteResGroup37 : SchedWriteRes<[SKLPort5,SKLPort05]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHADDWrr64")>; +def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHADDrr64")>; +def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHSUBDrr64")>; +def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHSUBWrr64")>; + +def SKLWriteResGroup38 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup38], (instregex "PHADDDrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "PHADDWrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "PHSUBDrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "PHSUBWrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDDYrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDDrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDWYrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDWrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBDYrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBDrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBWYrr")>; +def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBWrr")>; + +def SKLWriteResGroup39 : SchedWriteRes<[SKLPort5,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSDWirr")>; +def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSWBirr")>; +def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKUSWBirr")>; + +def SKLWriteResGroup40 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup40], (instregex "CLD")>; + +def SKLWriteResGroup41 : SchedWriteRes<[SKLPort237,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup41], (instregex "MFENCE")>; + +def SKLWriteResGroup42 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup42], (instregex "RCL(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup42], (instregex "RCL(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup42], (instregex "RCL8r1")>; +def: InstRW<[SKLWriteResGroup42], (instregex "RCL8ri")>; +def: InstRW<[SKLWriteResGroup42], (instregex "RCR(16|32|64)r1")>; +def: InstRW<[SKLWriteResGroup42], (instregex "RCR(16|32|64)ri")>; +def: InstRW<[SKLWriteResGroup42], (instregex "RCR8r1")>; +def: InstRW<[SKLWriteResGroup42], (instregex "RCR8ri")>; + +def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: 
InstRW<[SKLWriteResGroup43], (instregex "FNSTSWm")>; + +def SKLWriteResGroup44 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKLWriteResGroup44], (instregex "SETAm")>; +def: InstRW<[SKLWriteResGroup44], (instregex "SETBEm")>; + +def SKLWriteResGroup45 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup45], (instregex "CALL(16|32|64)r")>; + +def SKLWriteResGroup46 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup46], (instregex "CALL64pcrel32")>; + +def SKLWriteResGroup47 : SchedWriteRes<[SKLPort0]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup47], (instregex "AESDECLASTrr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "AESDECrr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "AESENCLASTrr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "AESENCrr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMADDUBSWrr64")>; +def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMADDWDirr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHRSWrr64")>; +def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHUWirr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHWirr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULLWirr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULUDQirr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FPrST0")>; +def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FST0r")>; +def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FrST0")>; +def: InstRW<[SKLWriteResGroup47], (instregex "RCPPSr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "RCPSSr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "RSQRTPSr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "RSQRTSSr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "VAESDECLASTrr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "VAESDECrr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "VAESENCLASTrr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "VAESENCrr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "VRCPPSYr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "VRCPPSr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "VRCPSSr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTPSYr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTPSr")>; +def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTSSr")>; + +def SKLWriteResGroup48 : SchedWriteRes<[SKLPort01]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup48], (instregex "ADDPDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "ADDPSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "ADDSDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "ADDSSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "ADDSUBPDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "ADDSUBPSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "MULPDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "MULPSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "MULSDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "MULSSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "SUBPDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "SUBPSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex 
"SUBSDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "SUBSSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VADDPDYrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VADDPDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VADDPSYrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VADDPSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VADDSDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VADDSSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPDYrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPSYrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VMULPDYrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VMULPDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VMULPSYrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VMULPSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VMULSDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VMULSSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPDYrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPSYrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPSrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VSUBSDrr")>; +def: InstRW<[SKLWriteResGroup48], (instregex "VSUBSSrr")>; +def: InstRW<[SKLWriteResGroup48], + (instregex + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r", + "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>; + +def SKLWriteResGroup49 : SchedWriteRes<[SKLPort015]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup49], (instregex "CMPPDrri")>; +def: InstRW<[SKLWriteResGroup49], (instregex "CMPPSrri")>; +def: InstRW<[SKLWriteResGroup49], (instregex "CMPSDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "CMPSSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "CVTDQ2PSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "CVTPS2DQrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "CVTTPS2DQrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)PDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)PSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)SDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)SSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)PDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)PSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)SDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)SSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "PHMINPOSUWrr128")>; +def: InstRW<[SKLWriteResGroup49], (instregex "PMADDUBSWrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "PMADDWDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "PMULDQrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "PMULHRSWrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "PMULHUWrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "PMULHWrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "PMULLWrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "PMULUDQrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPDYrri")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPDrri")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPSYrri")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPSrri")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCMPSDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCMPSSrr")>; +def: 
InstRW<[SKLWriteResGroup49], (instregex "VCVTDQ2PSYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCVTDQ2PSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCVTPS2DQYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCVTPS2DQrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCVTTPS2DQYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VCVTTPS2DQrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PDYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PSYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)SDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)SSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PDYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PSYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)SDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)SSrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPHMINPOSUWrr128")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDUBSWYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDUBSWrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDWDYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDWDrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULDQYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULDQrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHRSWYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHRSWrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHUWYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHUWrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHWYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHWrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULLWYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULLWrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULUDQYrr")>; +def: InstRW<[SKLWriteResGroup49], (instregex "VPMULUDQrr")>; + +def SKLWriteResGroup50 : SchedWriteRes<[SKLPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup50], (instregex "MPSADBWrri")>; +def: InstRW<[SKLWriteResGroup50], (instregex "VMPSADBWYrri")>; +def: InstRW<[SKLWriteResGroup50], (instregex "VMPSADBWrri")>; + +def SKLWriteResGroup51 : SchedWriteRes<[SKLPort1,SKLPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup51], (instregex "IMUL64r")>; +def: InstRW<[SKLWriteResGroup51], (instregex "MUL64r")>; +def: InstRW<[SKLWriteResGroup51], (instregex "MULX64rr")>; + +def SKLWriteResGroup51_16 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def: InstRW<[SKLWriteResGroup51_16], (instregex "IMUL16r")>; +def: InstRW<[SKLWriteResGroup51_16], (instregex "MUL16r")>; + +def SKLWriteResGroup52 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLDYrr")>; +def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLQYrr")>; +def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLWYrr")>; +def: InstRW<[SKLWriteResGroup52], (instregex "VPSRADYrr")>; +def: InstRW<[SKLWriteResGroup52], (instregex "VPSRAWYrr")>; +def: 
InstRW<[SKLWriteResGroup52], (instregex "VPSRLDYrr")>; +def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLQYrr")>; +def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLWYrr")>; + +def SKLWriteResGroup53 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP16m")>; +def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP32m")>; +def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP64m")>; +def: InstRW<[SKLWriteResGroup53], (instregex "IST_F16m")>; +def: InstRW<[SKLWriteResGroup53], (instregex "IST_F32m")>; +def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP16m")>; +def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP32m")>; +def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP64m")>; + +def SKLWriteResGroup54 : SchedWriteRes<[SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def: InstRW<[SKLWriteResGroup54], (instregex "FNCLEX")>; + +def SKLWriteResGroup55 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKLWriteResGroup55], (instregex "PAUSE")>; + +def SKLWriteResGroup56 : SchedWriteRes<[SKLPort015,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKLWriteResGroup56], (instregex "VZEROUPPER")>; + +def SKLWriteResGroup57 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKLWriteResGroup57], (instregex "LAR(16|32|64)rr")>; + +def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64rm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64to64rm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVQ64rm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOV(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOV64toPQIrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOV8rm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOVDDUPrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOVDI2PDIrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOVQI2PQIrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOVSDrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOVSSrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm16")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm32")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm8")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOVZX(16|32|64)rm16")>; +def: InstRW<[SKLWriteResGroup58], (instregex "MOVZX(16|32|64)rm8")>; +def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHNTA")>; +def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT0")>; +def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT1")>; +def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT2")>; +def: InstRW<[SKLWriteResGroup58], (instregex "VMOV64toPQIrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "VMOVDDUPrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "VMOVDI2PDIrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "VMOVQI2PQIrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "VMOVSDrm")>; +def: InstRW<[SKLWriteResGroup58], (instregex "VMOVSSrm")>; + +def SKLWriteResGroup59 : 
SchedWriteRes<[SKLPort0,SKLPort5]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup59], (instregex "CVTDQ2PDrr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "MMX_CVTPI2PDirr")>; +def: InstRW<[SKLWriteResGroup59], (instregex "VCVTDQ2PDrr")>; + +def SKLWriteResGroup60 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup60], (instregex "CVTPD2DQrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "CVTPD2PSrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "CVTPS2PDrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "CVTSD2SSrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI642SDrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI2SDrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI2SSrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "CVTSS2SDrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "CVTTPD2DQrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTPD2PIirr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTPS2PIirr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTTPD2PIirr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTTPS2PIirr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPD2DQrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPD2PSrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPH2PSrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPS2PDrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPS2PHrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSD2SSrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI642SDrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI2SDrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI2SSrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSS2SDrr")>; +def: InstRW<[SKLWriteResGroup60], (instregex "VCVTTPD2DQrr")>; + +def SKLWriteResGroup61 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup61], (instregex "STR(16|32|64)r")>; + +def SKLWriteResGroup62 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup62], (instregex "IMUL32r")>; +def: InstRW<[SKLWriteResGroup62], (instregex "MUL32r")>; +def: InstRW<[SKLWriteResGroup62], (instregex "MULX32rr")>; + +def SKLWriteResGroup63 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: InstRW<[SKLWriteResGroup63], (instregex "XSETBV")>; + +def SKLWriteResGroup64 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG(16|32|64)rr")>; +def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG8rr")>; + +def SKLWriteResGroup65 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> { + let Latency = 5; + let NumMicroOps = 6; + let ResourceCycles = [1,1,4]; +} +def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF16")>; +def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF64")>; + +def SKLWriteResGroup66 : SchedWriteRes<[SKLPort5]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup66], (instregex "PCLMULQDQrr")>; +def: InstRW<[SKLWriteResGroup66], (instregex "VPCLMULQDQrr")>; + +def 
SKLWriteResGroup67 : SchedWriteRes<[SKLPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup67], (instregex "LDDQUrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "MOVAPDrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "MOVAPSrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "MOVDQArm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "MOVDQUrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "MOVNTDQArm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "MOVSHDUPrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "MOVSLDUPrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "MOVUPDrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "MOVUPSrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VBROADCASTSSrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VLDDQUrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VMOVAPDrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VMOVAPSrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VMOVDQArm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VMOVDQUrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VMOVNTDQArm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VMOVSHDUPrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VMOVSLDUPrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VMOVUPDrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VMOVUPSrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VPBROADCASTDrm")>; +def: InstRW<[SKLWriteResGroup67], (instregex "VPBROADCASTQrm")>; + +def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup68], (instregex "MMX_CVTPI2PSirr")>; + +def SKLWriteResGroup69 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSBirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSWirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDUSBirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDUSWirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PAVGBirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PAVGWirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQBirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQDirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQWirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTBirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTDirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTWirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMAXSWirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMAXUBirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMINSWirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMINUBirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLDrm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLQrm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLWrm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRADrm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRAWrm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLDrm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLQrm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLWrm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBSBirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex 
"MMX_PSUBSWirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBUSBirm")>; +def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBUSWirm")>; + +def SKLWriteResGroup70 : SchedWriteRes<[SKLPort0,SKLPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup70], (instregex "CVTSD2SI64rr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "CVTSD2SIrr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "CVTSS2SI64rr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "CVTSS2SIrr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "CVTTSD2SI64rr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "CVTTSD2SIrr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSD2SI64rr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSD2SIrr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSS2SI64rr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSS2SIrr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "VCVTTSD2SI64rr")>; +def: InstRW<[SKLWriteResGroup70], (instregex "VCVTTSD2SIrr")>; + +def SKLWriteResGroup71 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PALIGNR64irm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PINSRWirmi")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PSHUFBrm64")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PSHUFWmi")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHBWirm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHDQirm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHWDirm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLBWirm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLDQirm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLWDirm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MOVHPDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MOVHPSrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MOVLPDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "MOVLPSrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PINSRBrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PINSRDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PINSRQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PINSRWrmi")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBWrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXDQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXWDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXWQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBWrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXDQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXWDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXWQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VMOVHPDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VMOVHPSrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VMOVLPDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VMOVLPSrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRBrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRQrm")>; +def: 
InstRW<[SKLWriteResGroup71], (instregex "VPINSRWrmi")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBWrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXDQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXWDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXWQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBWrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXDQrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXWDrm")>; +def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXWQrm")>; + +def SKLWriteResGroup72 : SchedWriteRes<[SKLPort6,SKLPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup72], (instregex "FARJMP64")>; +def: InstRW<[SKLWriteResGroup72], (instregex "JMP(16|32|64)m")>; + +def SKLWriteResGroup73 : SchedWriteRes<[SKLPort23,SKLPort05]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSBrm64")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSDrm64")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSWrm64")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDBirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDDirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDQirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDWirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PANDNirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PANDirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PORirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNBrm64")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNDrm64")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNWrm64")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBBirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBDirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBQirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBWirm")>; +def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PXORirm")>; + +def SKLWriteResGroup74 : SchedWriteRes<[SKLPort23,SKLPort06]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup74], (instregex "ADC(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "ADC8rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "ADCX(32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "ADOX(32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "BT(16|32|64)mi8")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVAE(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVB(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVE(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVG(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVGE(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVL(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVLE(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNE(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNO(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNP(16|32|64)rm")>; +def: 
InstRW<[SKLWriteResGroup74], (instregex "CMOVNS(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVO(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVP(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "CMOVS(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "RORX32mi")>; +def: InstRW<[SKLWriteResGroup74], (instregex "RORX64mi")>; +def: InstRW<[SKLWriteResGroup74], (instregex "SARX32rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "SARX64rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "SBB(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "SBB8rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "SHLX32rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "SHLX64rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "SHRX32rm")>; +def: InstRW<[SKLWriteResGroup74], (instregex "SHRX64rm")>; + +def SKLWriteResGroup75 : SchedWriteRes<[SKLPort23,SKLPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup75], (instregex "ANDN(32|64)rm")>; +def: InstRW<[SKLWriteResGroup75], (instregex "BLSI(32|64)rm")>; +def: InstRW<[SKLWriteResGroup75], (instregex "BLSMSK(32|64)rm")>; +def: InstRW<[SKLWriteResGroup75], (instregex "BLSR(32|64)rm")>; +def: InstRW<[SKLWriteResGroup75], (instregex "BZHI(32|64)rm")>; +def: InstRW<[SKLWriteResGroup75], (instregex "MOVBE(16|32|64)rm")>; + +def SKLWriteResGroup76 : SchedWriteRes<[SKLPort23,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup76], (instregex "ADD(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "ADD8rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "AND(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "AND8rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "CMP8mi")>; +def: InstRW<[SKLWriteResGroup76], (instregex "CMP8mr")>; +def: InstRW<[SKLWriteResGroup76], (instregex "CMP8rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "OR(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "OR8rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)r(mr)?")>; +def: InstRW<[SKLWriteResGroup76], (instregex "SUB(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "SUB8rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "TEST(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup76], (instregex "TEST8mi")>; +def: InstRW<[SKLWriteResGroup76], (instregex "TEST8mr")>; +def: InstRW<[SKLWriteResGroup76], (instregex "XOR(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup76], (instregex "XOR8rm")>; + +def SKLWriteResGroup77 : SchedWriteRes<[SKLPort5,SKLPort01]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup77], (instregex "HADDPDrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex "HADDPSrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex "HSUBPDrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex "HSUBPSrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPDYrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPDrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPSYrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPSrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPDYrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex 
"VHSUBPDrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPSYrr")>; +def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPSrr")>; + +def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup78], (instregex "CVTSI642SSrr")>; +def: InstRW<[SKLWriteResGroup78], (instregex "VCVTSI642SSrr")>; + +def SKLWriteResGroup79 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SKLWriteResGroup79], (instregex "SHLD(16|32|64)rrCL")>; +def: InstRW<[SKLWriteResGroup79], (instregex "SHRD(16|32|64)rrCL")>; + +def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup80], (instregex "SLDT(16|32|64)r")>; + +def SKLWriteResGroup81 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort015]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup81], (instregex "VCVTPS2PHmr")>; + +def SKLWriteResGroup82 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup82], (instregex "BTC(16|32|64)mi8")>; +def: InstRW<[SKLWriteResGroup82], (instregex "BTR(16|32|64)mi8")>; +def: InstRW<[SKLWriteResGroup82], (instregex "BTS(16|32|64)mi8")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SAR(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SAR(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SAR8m1")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SAR8mi")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SHL(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SHL(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SHL8m1")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SHL8mi")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SHR(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SHR(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SHR8m1")>; +def: InstRW<[SKLWriteResGroup82], (instregex "SHR8mi")>; + +def SKLWriteResGroup83 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup83], (instregex "ADD(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup83], (instregex "ADD(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup83], (instregex "ADD8mi")>; +def: InstRW<[SKLWriteResGroup83], (instregex "ADD8mr")>; +def: InstRW<[SKLWriteResGroup83], (instregex "AND(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup83], (instregex "AND(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup83], (instregex "AND8mi")>; +def: InstRW<[SKLWriteResGroup83], (instregex "AND8mr")>; +def: InstRW<[SKLWriteResGroup83], (instregex "DEC(16|32|64)m")>; +def: InstRW<[SKLWriteResGroup83], (instregex "DEC8m")>; +def: InstRW<[SKLWriteResGroup83], (instregex "INC(16|32|64)m")>; +def: InstRW<[SKLWriteResGroup83], (instregex "INC8m")>; +def: InstRW<[SKLWriteResGroup83], (instregex "NEG(16|32|64)m")>; +def: InstRW<[SKLWriteResGroup83], (instregex "NEG8m")>; +def: InstRW<[SKLWriteResGroup83], (instregex "NOT(16|32|64)m")>; +def: InstRW<[SKLWriteResGroup83], (instregex "NOT8m")>; +def: InstRW<[SKLWriteResGroup83], (instregex "OR(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup83], 
(instregex "OR(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup83], (instregex "OR8mi")>; +def: InstRW<[SKLWriteResGroup83], (instregex "OR8mr")>; +def: InstRW<[SKLWriteResGroup83], (instregex "POP(16|32|64)rmm")>; +def: InstRW<[SKLWriteResGroup83], (instregex "PUSH(16|32|64)rmm")>; +def: InstRW<[SKLWriteResGroup83], (instregex "SUB(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup83], (instregex "SUB(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup83], (instregex "SUB8mi")>; +def: InstRW<[SKLWriteResGroup83], (instregex "SUB8mr")>; +def: InstRW<[SKLWriteResGroup83], (instregex "XOR(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup83], (instregex "XOR(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup83], (instregex "XOR8mi")>; +def: InstRW<[SKLWriteResGroup83], (instregex "XOR8mr")>; + +def SKLWriteResGroup84 : SchedWriteRes<[SKLPort6,SKLPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,5]; +} +def: InstRW<[SKLWriteResGroup84], (instregex "STD")>; + +def SKLWriteResGroup85 : SchedWriteRes<[SKLPort23]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup85], (instregex "LD_F32m")>; +def: InstRW<[SKLWriteResGroup85], (instregex "LD_F64m")>; +def: InstRW<[SKLWriteResGroup85], (instregex "LD_F80m")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTF128")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTI128")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTSDYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTSSYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VLDDQUYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VMOVAPDYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VMOVAPSYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDDUPYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDQAYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDQUYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VMOVNTDQAYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VMOVSHDUPYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VMOVSLDUPYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VMOVUPDYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VMOVUPSYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VPBROADCASTDYrm")>; +def: InstRW<[SKLWriteResGroup85], (instregex "VPBROADCASTQYrm")>; + +def SKLWriteResGroup86 : SchedWriteRes<[SKLPort0,SKLPort5]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup86], (instregex "VCVTDQ2PDYrr")>; + +def SKLWriteResGroup87 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup87], (instregex "COMISDrm")>; +def: InstRW<[SKLWriteResGroup87], (instregex "COMISSrm")>; +def: InstRW<[SKLWriteResGroup87], (instregex "UCOMISDrm")>; +def: InstRW<[SKLWriteResGroup87], (instregex "UCOMISSrm")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VCOMISDrm")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VCOMISSrm")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VUCOMISDrm")>; +def: InstRW<[SKLWriteResGroup87], (instregex "VUCOMISSrm")>; + +def SKLWriteResGroup88 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup88], (instregex "INSERTPSrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PACKSSDWrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PACKSSWBrm")>; +def: 
InstRW<[SKLWriteResGroup88], (instregex "PACKUSDWrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PACKUSWBrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PALIGNRrmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PBLENDWrmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFBrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFDmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFHWmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFLWmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHBWrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHDQrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHQDQrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHWDrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLBWrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLDQrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLQDQrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLWDrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "SHUFPDrmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "SHUFPSrmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKHPDrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKHPSrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKLPDrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKLPSrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VINSERTPSrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPACKSSDWrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPACKSSWBrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPACKUSDWrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPACKUSWBrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPALIGNRrmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPBLENDWrmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPBROADCASTBrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPBROADCASTWrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPDmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPDrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPSmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPSrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFBrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFDmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFHWmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFLWmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHBWrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHDQrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHQDQrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHWDrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLBWrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLDQrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLQDQrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLWDrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VSHUFPDrmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VSHUFPSrmi")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKHPDrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKHPSrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKLPDrm")>; +def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKLPSrm")>; + +def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2DQYrr")>; +def: 
InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2PSYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPH2PSYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPS2PDYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPS2PHYrr")>; +def: InstRW<[SKLWriteResGroup89], (instregex "VCVTTPD2DQYrr")>; + +def SKLWriteResGroup90 : SchedWriteRes<[SKLPort01,SKLPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup90], (instregex "PABSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PABSDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PABSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PADDSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PADDSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PADDUSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PADDUSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PAVGBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PAVGWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQQrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMINSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMINSDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMINSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMINUBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMINUDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PMINUWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNBrm128")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNDrm128")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNWrm128")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSLLDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSLLQrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSLLWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSRADrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSRAWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSRLDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSRLQrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSRLWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSUBSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSUBSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSUBUSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "PSUBUSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPABSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPABSDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPABSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPADDSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPADDSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPADDUSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPADDUSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPAVGBrm")>; +def: 
InstRW<[SKLWriteResGroup90], (instregex "VPAVGWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQQrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNBrm128")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNDrm128")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNWrm128")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLQrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLVDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLVQrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSRADrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSRAVDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSRAWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLQrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLVDrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLVQrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBSWrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBUSBrm")>; +def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBUSWrm")>; + +def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup91], (instregex "ANDNPDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "ANDNPSrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "ANDPDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "ANDPSrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "BLENDPDrmi")>; +def: InstRW<[SKLWriteResGroup91], (instregex "BLENDPSrmi")>; +def: InstRW<[SKLWriteResGroup91], (instregex "ORPDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "ORPSrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PADDBrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PADDDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PADDQrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PADDWrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PANDNrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PANDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PORrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PSUBBrm")>; +def: 
InstRW<[SKLWriteResGroup91], (instregex "PSUBDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PSUBQrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PSUBWrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "PXORrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VANDNPDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VANDNPSrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VANDPDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VANDPSrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VBLENDPDrmi")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VBLENDPSrmi")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VINSERTF128rm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VINSERTI128rm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VMASKMOVPDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VMASKMOVPSrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VORPDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VORPSrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPADDBrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPADDDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPADDQrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPADDWrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPANDNrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPANDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPBLENDDrmi")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPMASKMOVDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPMASKMOVQrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPORrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBBrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBQrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBWrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VPXORrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VXORPDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "VXORPSrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "XORPDrm")>; +def: InstRW<[SKLWriteResGroup91], (instregex "XORPSrm")>; + +def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSDWirm")>; +def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSWBirm")>; +def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKUSWBirm")>; + +def SKLWriteResGroup93 : SchedWriteRes<[SKLPort23,SKLPort06]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup93], (instregex "CMOVA(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup93], (instregex "CMOVBE(16|32|64)rm")>; + +def SKLWriteResGroup94 : SchedWriteRes<[SKLPort23,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup94], (instregex "LEAVE64")>; +def: InstRW<[SKLWriteResGroup94], (instregex "SCASB")>; +def: InstRW<[SKLWriteResGroup94], (instregex "SCASL")>; +def: InstRW<[SKLWriteResGroup94], (instregex "SCASQ")>; +def: InstRW<[SKLWriteResGroup94], (instregex "SCASW")>; + +def SKLWriteResGroup95 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup95], (instregex "CVTTSS2SI64rr")>; +def: InstRW<[SKLWriteResGroup95], (instregex "CVTTSS2SIrr")>; +def: InstRW<[SKLWriteResGroup95], (instregex "VCVTTSS2SI64rr")>; +def: InstRW<[SKLWriteResGroup95], 
(instregex "VCVTTSS2SIrr")>; + +def SKLWriteResGroup96 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup96], (instregex "FLDCW16m")>; + +def SKLWriteResGroup97 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup97], (instregex "LDMXCSR")>; +def: InstRW<[SKLWriteResGroup97], (instregex "VLDMXCSR")>; + +def SKLWriteResGroup98 : SchedWriteRes<[SKLPort6,SKLPort23,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup98], (instregex "LRETQ")>; +def: InstRW<[SKLWriteResGroup98], (instregex "RETQ")>; + +def SKLWriteResGroup99 : SchedWriteRes<[SKLPort23,SKLPort06,SKLPort15]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup99], (instregex "BEXTR(32|64)rm")>; + +def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup100], (instregex "ROL(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup100], (instregex "ROL(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup100], (instregex "ROL8m1")>; +def: InstRW<[SKLWriteResGroup100], (instregex "ROL8mi")>; +def: InstRW<[SKLWriteResGroup100], (instregex "ROR(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup100], (instregex "ROR(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup100], (instregex "ROR8m1")>; +def: InstRW<[SKLWriteResGroup100], (instregex "ROR8mi")>; + +def SKLWriteResGroup101 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup101], (instregex "XADD(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup101], (instregex "XADD8rm")>; + +def SKLWriteResGroup102 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m")>; +def: InstRW<[SKLWriteResGroup102], (instregex "FARCALL64")>; + +def SKLWriteResGroup103 : SchedWriteRes<[SKLPort6,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 7; + let NumMicroOps = 7; + let ResourceCycles = [1,3,1,2]; +} +def: InstRW<[SKLWriteResGroup103], (instregex "LOOP")>; + +def SKLWriteResGroup104 : SchedWriteRes<[SKLPort0]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup104], (instregex "AESIMCrr")>; +def: InstRW<[SKLWriteResGroup104], (instregex "VAESIMCrr")>; + +def SKLWriteResGroup105 : SchedWriteRes<[SKLPort015]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKLWriteResGroup105], (instregex "PMULLDrr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDPDr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDPSr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDSDr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDSSr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "VPMULLDYrr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "VPMULLDrr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDPDr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDPSr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDSDr")>; +def: InstRW<[SKLWriteResGroup105], 
(instregex "VROUNDSSr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDYPDr")>; +def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDYPSr")>; + +def SKLWriteResGroup106 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup106], (instregex "VTESTPDrm")>; +def: InstRW<[SKLWriteResGroup106], (instregex "VTESTPSrm")>; + +def SKLWriteResGroup107 : SchedWriteRes<[SKLPort1,SKLPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup107], (instregex "BSF(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup107], (instregex "BSR(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup107], (instregex "IMUL64m")>; +def: InstRW<[SKLWriteResGroup107], (instregex "IMUL(32|64)rm(i8)?")>; +def: InstRW<[SKLWriteResGroup107], (instregex "IMUL8m")>; +def: InstRW<[SKLWriteResGroup107], (instregex "LZCNT(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup107], (instregex "MUL(16|32|64)m")>; +def: InstRW<[SKLWriteResGroup107], (instregex "MUL8m")>; +def: InstRW<[SKLWriteResGroup107], (instregex "PDEP(32|64)rm")>; +def: InstRW<[SKLWriteResGroup107], (instregex "PEXT(32|64)rm")>; +def: InstRW<[SKLWriteResGroup107], (instregex "POPCNT(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup107], (instregex "TZCNT(16|32|64)rm")>; + +def SKLWriteResGroup107_16 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup107_16], (instregex "IMUL16rm(i8)?")>; + +def SKLWriteResGroup107_16_2 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> { + let Latency = 3; + let NumMicroOps = 5; +} +def: InstRW<[SKLWriteResGroup107_16_2], (instregex "IMUL16m")>; +def: InstRW<[SKLWriteResGroup107_16_2], (instregex "MUL16m")>; + +def SKLWriteResGroup107_32 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup107_32], (instregex "IMUL32m")>; +def: InstRW<[SKLWriteResGroup107_32], (instregex "MUL32m")>; + +def SKLWriteResGroup108 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup108], (instregex "FCOM32m")>; +def: InstRW<[SKLWriteResGroup108], (instregex "FCOM64m")>; +def: InstRW<[SKLWriteResGroup108], (instregex "FCOMP32m")>; +def: InstRW<[SKLWriteResGroup108], (instregex "FCOMP64m")>; +def: InstRW<[SKLWriteResGroup108], (instregex "MMX_PSADBWirm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPACKSSDWYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPACKSSWBYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPACKUSDWYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPACKUSWBYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPALIGNRYrmi")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPBLENDWYrmi")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPBROADCASTBYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPBROADCASTWYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPDYmi")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPDYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPSYmi")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPSYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXBDYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXBQYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex 
"VPMOVSXWQYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFBYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFDYmi")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFHWYmi")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFLWYmi")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHBWYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHDQYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHQDQYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHWDYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLBWYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLDQYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLQDQYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLWDYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VSHUFPDYrmi")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VSHUFPSYrmi")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKHPDYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKHPSYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKLPDYrm")>; +def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKLPSYrm")>; + +def SKLWriteResGroup109 : SchedWriteRes<[SKLPort01,SKLPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup109], (instregex "VPABSBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPABSDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPABSWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPADDSBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPADDSWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPADDUSBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPADDUSWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPAVGBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPAVGWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQQYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNBYrm256")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNDYrm256")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNWYrm256")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLQYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLVDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLVQYrm")>; +def: 
InstRW<[SKLWriteResGroup109], (instregex "VPSLLWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSRADYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSRAVDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSRAWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLQYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLVDYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLVQYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBSBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBSWYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBUSBYrm")>; +def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBUSWYrm")>; + +def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup110], (instregex "VANDNPDYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VANDNPSYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VANDPDYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VANDPSYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VBLENDPDYrmi")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VBLENDPSYrmi")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VMASKMOVPDYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VMASKMOVPSYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VORPDYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VORPSYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPADDBYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPADDDYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPADDQYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPADDWYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPANDNYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPANDYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPBLENDDYrmi")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPMASKMOVDYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPMASKMOVQYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPORYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBBYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBDYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBQYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBWYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VPXORYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VXORPDYrm")>; +def: InstRW<[SKLWriteResGroup110], (instregex "VXORPSYrm")>; + +def SKLWriteResGroup111 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup111], (instregex "BLENDVPDrm0")>; +def: InstRW<[SKLWriteResGroup111], (instregex "BLENDVPSrm0")>; +def: InstRW<[SKLWriteResGroup111], (instregex "PBLENDVBrm0")>; +def: InstRW<[SKLWriteResGroup111], (instregex "VBLENDVPDrm")>; +def: InstRW<[SKLWriteResGroup111], (instregex "VBLENDVPSrm")>; +def: InstRW<[SKLWriteResGroup111], (instregex "VPBLENDVBYrm")>; +def: InstRW<[SKLWriteResGroup111], (instregex "VPBLENDVBrm")>; + +def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PHADDSWrm64")>; +def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PHSUBSWrm64")>; + +def 
SKLWriteResGroup113 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort05]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHADDWrm64")>; +def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHADDrm64")>; +def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHSUBDrm64")>; +def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHSUBWrm64")>; + +def SKLWriteResGroup114 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort015]> { + let Latency = 8; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup114], (instregex "VCVTPS2PHYmr")>; + +def SKLWriteResGroup115 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKLWriteResGroup115], (instregex "ROR(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup115], (instregex "ROR8mCL")>; + +def SKLWriteResGroup116 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 8; + let NumMicroOps = 5; + let ResourceCycles = [1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup116], (instregex "RCL(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup116], (instregex "RCL(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup116], (instregex "RCL8m1")>; +def: InstRW<[SKLWriteResGroup116], (instregex "RCL8mi")>; +def: InstRW<[SKLWriteResGroup116], (instregex "RCR(16|32|64)m1")>; +def: InstRW<[SKLWriteResGroup116], (instregex "RCR(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup116], (instregex "RCR8m1")>; +def: InstRW<[SKLWriteResGroup116], (instregex "RCR8mi")>; + +def SKLWriteResGroup117 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,3]; +} +def: InstRW<[SKLWriteResGroup117], (instregex "ROL(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup117], (instregex "ROL8mCL")>; +def: InstRW<[SKLWriteResGroup117], (instregex "SAR(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup117], (instregex "SAR8mCL")>; +def: InstRW<[SKLWriteResGroup117], (instregex "SHL(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup117], (instregex "SHL8mCL")>; +def: InstRW<[SKLWriteResGroup117], (instregex "SHR(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup117], (instregex "SHR8mCL")>; + +def SKLWriteResGroup118 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,3]; +} +def: InstRW<[SKLWriteResGroup118], (instregex "ADC(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup118], (instregex "ADC8mi")>; + +def SKLWriteResGroup119 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 8; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def: InstRW<[SKLWriteResGroup119], (instregex "ADC(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup119], (instregex "ADC8mr")>; +def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG8rm")>; +def: InstRW<[SKLWriteResGroup119], (instregex "SBB(16|32|64)mi")>; +def: InstRW<[SKLWriteResGroup119], (instregex "SBB(16|32|64)mr")>; +def: InstRW<[SKLWriteResGroup119], (instregex "SBB8mi")>; +def: InstRW<[SKLWriteResGroup119], (instregex "SBB8mr")>; + +def SKLWriteResGroup120 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup120], (instregex "MMX_CVTPI2PSirm")>; +def: InstRW<[SKLWriteResGroup120], (instregex 
"MMX_PMADDUBSWrm64")>; +def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMADDWDirm")>; +def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHRSWrm64")>; +def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHUWirm")>; +def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHWirm")>; +def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULLWirm")>; +def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULUDQirm")>; +def: InstRW<[SKLWriteResGroup120], (instregex "RCPSSm")>; +def: InstRW<[SKLWriteResGroup120], (instregex "RSQRTSSm")>; +def: InstRW<[SKLWriteResGroup120], (instregex "VRCPSSm")>; +def: InstRW<[SKLWriteResGroup120], (instregex "VRSQRTSSm")>; +def: InstRW<[SKLWriteResGroup120], (instregex "VTESTPDYrm")>; +def: InstRW<[SKLWriteResGroup120], (instregex "VTESTPSYrm")>; + +def SKLWriteResGroup121 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup121], (instregex "PCMPGTQrm")>; +def: InstRW<[SKLWriteResGroup121], (instregex "PSADBWrm")>; +def: InstRW<[SKLWriteResGroup121], (instregex "VPCMPGTQrm")>; +def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXBWYrm")>; +def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXDQYrm")>; +def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXWDYrm")>; +def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVZXWDYrm")>; +def: InstRW<[SKLWriteResGroup121], (instregex "VPSADBWrm")>; + +def SKLWriteResGroup122 : SchedWriteRes<[SKLPort01,SKLPort23]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup122], (instregex "ADDSDrm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "ADDSSrm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "MULSDrm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "MULSSrm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "SUBSDrm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "SUBSSrm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "VADDSDrm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "VADDSSrm")>; +def: InstRW<[SKLWriteResGroup122], + (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>; +def: InstRW<[SKLWriteResGroup122], (instregex "VMULSDrm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "VMULSSrm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "VSUBSDrm")>; +def: InstRW<[SKLWriteResGroup122], (instregex "VSUBSSrm")>; + +def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup123], (instregex "CMPSDrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "CMPSSrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "CVTPS2PDrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "MAX(C?)SDrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "MAX(C?)SSrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "MIN(C?)SDrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "MIN(C?)SSrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVTPS2PIirm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVTTPS2PIirm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "VCMPSDrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "VCMPSSrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "VCVTPH2PSrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "VCVTPS2PDrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "VMAX(C?)SDrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "VMAX(C?)SSrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex 
"VMIN(C?)SDrm")>; +def: InstRW<[SKLWriteResGroup123], (instregex "VMIN(C?)SSrm")>; + +def SKLWriteResGroup124 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup124], (instregex "DPPDrri")>; +def: InstRW<[SKLWriteResGroup124], (instregex "VDPPDrri")>; + +def SKLWriteResGroup125 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup125], (instregex "VBLENDVPDYrm")>; +def: InstRW<[SKLWriteResGroup125], (instregex "VBLENDVPSYrm")>; + +def SKLWriteResGroup126 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup126], (instregex "PTESTrm")>; +def: InstRW<[SKLWriteResGroup126], (instregex "VPTESTrm")>; + +def SKLWriteResGroup127 : SchedWriteRes<[SKLPort1,SKLPort5,SKLPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup127], (instregex "MULX64rm")>; + +def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup128], (instregex "PHADDSWrm128")>; +def: InstRW<[SKLWriteResGroup128], (instregex "PHSUBSWrm128")>; +def: InstRW<[SKLWriteResGroup128], (instregex "VPHADDSWrm128")>; +def: InstRW<[SKLWriteResGroup128], (instregex "VPHSUBSWrm128")>; + +def SKLWriteResGroup129 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup129], (instregex "PHADDDrm")>; +def: InstRW<[SKLWriteResGroup129], (instregex "PHADDWrm")>; +def: InstRW<[SKLWriteResGroup129], (instregex "PHSUBDrm")>; +def: InstRW<[SKLWriteResGroup129], (instregex "PHSUBWrm")>; +def: InstRW<[SKLWriteResGroup129], (instregex "VPHADDDrm")>; +def: InstRW<[SKLWriteResGroup129], (instregex "VPHADDWrm")>; +def: InstRW<[SKLWriteResGroup129], (instregex "VPHSUBDrm")>; +def: InstRW<[SKLWriteResGroup129], (instregex "VPHSUBWrm")>; + +def SKLWriteResGroup130 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup130], (instregex "SHLD(16|32|64)mri8")>; +def: InstRW<[SKLWriteResGroup130], (instregex "SHRD(16|32|64)mri8")>; + +def SKLWriteResGroup131 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKLWriteResGroup131], (instregex "LAR(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup131], (instregex "LSL(16|32|64)rm")>; + +def SKLWriteResGroup132 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup132], (instregex "AESDECLASTrm")>; +def: InstRW<[SKLWriteResGroup132], (instregex "AESDECrm")>; +def: InstRW<[SKLWriteResGroup132], (instregex "AESENCLASTrm")>; +def: InstRW<[SKLWriteResGroup132], (instregex "AESENCrm")>; +def: InstRW<[SKLWriteResGroup132], (instregex "RCPPSm")>; +def: InstRW<[SKLWriteResGroup132], (instregex "RSQRTPSm")>; +def: InstRW<[SKLWriteResGroup132], (instregex "VAESDECLASTrm")>; +def: InstRW<[SKLWriteResGroup132], (instregex "VAESDECrm")>; +def: InstRW<[SKLWriteResGroup132], (instregex "VAESENCLASTrm")>; +def: InstRW<[SKLWriteResGroup132], (instregex 
"VAESENCrm")>; +def: InstRW<[SKLWriteResGroup132], (instregex "VRCPPSm")>; +def: InstRW<[SKLWriteResGroup132], (instregex "VRSQRTPSm")>; + +def SKLWriteResGroup133 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup133], (instregex "ADD_F32m")>; +def: InstRW<[SKLWriteResGroup133], (instregex "ADD_F64m")>; +def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F16m")>; +def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F32m")>; +def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F64m")>; +def: InstRW<[SKLWriteResGroup133], (instregex "SUBR_F32m")>; +def: InstRW<[SKLWriteResGroup133], (instregex "SUBR_F64m")>; +def: InstRW<[SKLWriteResGroup133], (instregex "SUB_F32m")>; +def: InstRW<[SKLWriteResGroup133], (instregex "SUB_F64m")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPCMPGTQYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPERM2F128rm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPERM2I128rm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPERMDYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPERMPDYmi")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPERMPSYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPERMQYmi")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBDYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBQYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBWYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXDQYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXWQYrm")>; +def: InstRW<[SKLWriteResGroup133], (instregex "VPSADBWYrm")>; + +def SKLWriteResGroup134 : SchedWriteRes<[SKLPort01,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup134], (instregex "ADDPDrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "ADDPSrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "ADDSUBPDrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "ADDSUBPSrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "MULPDrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "MULPSrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "SUBPDrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "SUBPSrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "VADDPDrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "VADDPSrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "VADDSUBPDrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "VADDSUBPSrm")>; +def: InstRW<[SKLWriteResGroup134], + (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m")>; +def: InstRW<[SKLWriteResGroup134], (instregex "VMULPDrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "VMULPSrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "VSUBPDrm")>; +def: InstRW<[SKLWriteResGroup134], (instregex "VSUBPSrm")>; + +def SKLWriteResGroup135 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup135], (instregex "CMPPDrmi")>; +def: InstRW<[SKLWriteResGroup135], (instregex "CMPPSrmi")>; +def: InstRW<[SKLWriteResGroup135], (instregex "CVTDQ2PSrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "CVTPS2DQrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "CVTSS2SDrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "CVTTPS2DQrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "MAX(C?)PDrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "MAX(C?)PSrm")>; +def: 
InstRW<[SKLWriteResGroup135], (instregex "MIN(C?)PDrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "MIN(C?)PSrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "PHMINPOSUWrm128")>; +def: InstRW<[SKLWriteResGroup135], (instregex "PMADDUBSWrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "PMADDWDrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "PMULDQrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "PMULHRSWrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "PMULHUWrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "PMULHWrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "PMULLWrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "PMULUDQrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VCMPPDrmi")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VCMPPSrmi")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VCVTDQ2PSrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VCVTPH2PSYrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VCVTPS2DQrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VCVTSS2SDrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VCVTTPS2DQrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VMAX(C?)PDrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VMAX(C?)PSrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VMIN(C?)PDrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VMIN(C?)PSrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VPHMINPOSUWrm128")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VPMADDUBSWrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VPMADDWDrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VPMULDQrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHRSWrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHUWrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHWrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VPMULLWrm")>; +def: InstRW<[SKLWriteResGroup135], (instregex "VPMULUDQrm")>; + +def SKLWriteResGroup136 : SchedWriteRes<[SKLPort0]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKLWriteResGroup136], (instregex "PCMPISTRIrr")>; +def: InstRW<[SKLWriteResGroup136], (instregex "PCMPISTRM128rr")>; +def: InstRW<[SKLWriteResGroup136], (instregex "VPCMPISTRIrr")>; +def: InstRW<[SKLWriteResGroup136], (instregex "VPCMPISTRM128rr")>; + +def SKLWriteResGroup137 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup137], (instregex "MPSADBWrmi")>; +def: InstRW<[SKLWriteResGroup137], (instregex "VMPSADBWrmi")>; + +def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup138], (instregex "MMX_CVTPI2PDirm")>; +def: InstRW<[SKLWriteResGroup138], (instregex "VPTESTYrm")>; + +def SKLWriteResGroup139 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup139], (instregex "CVTSD2SSrm")>; +def: InstRW<[SKLWriteResGroup139], (instregex "VCVTSD2SSrm")>; + +def SKLWriteResGroup140 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup140], (instregex "VPHADDSWrm256")>; +def: InstRW<[SKLWriteResGroup140], (instregex "VPHSUBSWrm256")>; + +def SKLWriteResGroup141 : 
SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup141], (instregex "VPHADDDYrm")>; +def: InstRW<[SKLWriteResGroup141], (instregex "VPHADDWYrm")>; +def: InstRW<[SKLWriteResGroup141], (instregex "VPHSUBDYrm")>; +def: InstRW<[SKLWriteResGroup141], (instregex "VPHSUBWYrm")>; + +def SKLWriteResGroup142 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort06,SKLPort0156]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup142], (instregex "MULX32rm")>; + +def SKLWriteResGroup143 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 10; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,3]; +} +def: InstRW<[SKLWriteResGroup143], (instregex "ADD8mi")>; +def: InstRW<[SKLWriteResGroup143], (instregex "AND8mi")>; +def: InstRW<[SKLWriteResGroup143], (instregex "OR8mi")>; +def: InstRW<[SKLWriteResGroup143], (instregex "SUB8mi")>; +def: InstRW<[SKLWriteResGroup143], (instregex "XCHG(16|32|64)rm")>; +def: InstRW<[SKLWriteResGroup143], (instregex "XCHG8rm")>; +def: InstRW<[SKLWriteResGroup143], (instregex "XOR8mi")>; + +def SKLWriteResGroup144 : SchedWriteRes<[SKLPort05,SKLPort0156]> { + let Latency = 10; + let NumMicroOps = 10; + let ResourceCycles = [9,1]; +} +def: InstRW<[SKLWriteResGroup144], (instregex "MMX_EMMS")>; + +def SKLWriteResGroup145 : SchedWriteRes<[SKLPort0]> { + let Latency = 11; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup145], (instregex "DIVPSrr")>; +def: InstRW<[SKLWriteResGroup145], (instregex "DIVSSrr")>; +def: InstRW<[SKLWriteResGroup145], (instregex "VDIVPSYrr")>; +def: InstRW<[SKLWriteResGroup145], (instregex "VDIVPSrr")>; +def: InstRW<[SKLWriteResGroup145], (instregex "VDIVSSrr")>; + +def SKLWriteResGroup146 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F32m")>; +def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F64m")>; +def: InstRW<[SKLWriteResGroup146], (instregex "VRCPPSYm")>; +def: InstRW<[SKLWriteResGroup146], (instregex "VRSQRTPSYm")>; + +def SKLWriteResGroup147 : SchedWriteRes<[SKLPort01,SKLPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup147], (instregex "VADDPDYrm")>; +def: InstRW<[SKLWriteResGroup147], (instregex "VADDPSYrm")>; +def: InstRW<[SKLWriteResGroup147], (instregex "VADDSUBPDYrm")>; +def: InstRW<[SKLWriteResGroup147], (instregex "VADDSUBPSYrm")>; +def: InstRW<[SKLWriteResGroup147], + (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>; +def: InstRW<[SKLWriteResGroup147], (instregex "VMULPDYrm")>; +def: InstRW<[SKLWriteResGroup147], (instregex "VMULPSYrm")>; +def: InstRW<[SKLWriteResGroup147], (instregex "VSUBPDYrm")>; +def: InstRW<[SKLWriteResGroup147], (instregex "VSUBPSYrm")>; + +def SKLWriteResGroup148 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup148], (instregex "VCMPPDYrmi")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VCMPPSYrmi")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VCVTDQ2PSYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VCVTPS2DQYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VCVTPS2PDYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VCVTTPS2DQYrm")>; +def: 
InstRW<[SKLWriteResGroup148], (instregex "VMAX(C?)PDYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VMAX(C?)PSYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VMIN(C?)PDYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VMIN(C?)PSYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VPMADDUBSWYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VPMADDWDYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VPMULDQYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHRSWYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHUWYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHWYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VPMULLWYrm")>; +def: InstRW<[SKLWriteResGroup148], (instregex "VPMULUDQYrm")>; + +def SKLWriteResGroup149 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup149], (instregex "FICOM16m")>; +def: InstRW<[SKLWriteResGroup149], (instregex "FICOM32m")>; +def: InstRW<[SKLWriteResGroup149], (instregex "FICOMP16m")>; +def: InstRW<[SKLWriteResGroup149], (instregex "FICOMP32m")>; +def: InstRW<[SKLWriteResGroup149], (instregex "VMPSADBWYrmi")>; + +def SKLWriteResGroup150 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup150], (instregex "CVTDQ2PDrm")>; +def: InstRW<[SKLWriteResGroup150], (instregex "VCVTDQ2PDrm")>; + +def SKLWriteResGroup151 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort015]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup151], (instregex "CVTSD2SI64rm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "CVTSD2SIrm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "CVTSS2SI64rm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "CVTSS2SIrm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSD2SI64rm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSD2SIrm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSS2SIrm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSD2SI64rm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSD2SIrm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSS2SI64rm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSS2SIrm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSD2SI64rm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSD2SIrm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSS2SI64rm")>; +def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSS2SIrm")>; + +def SKLWriteResGroup152 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2DQrm")>; +def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2PSrm")>; +def: InstRW<[SKLWriteResGroup152], (instregex "CVTTPD2DQrm")>; +def: InstRW<[SKLWriteResGroup152], (instregex "MMX_CVTPD2PIirm")>; +def: InstRW<[SKLWriteResGroup152], (instregex "MMX_CVTTPD2PIirm")>; + +def SKLWriteResGroup153 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 11; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def: InstRW<[SKLWriteResGroup153], (instregex "SHLD(16|32|64)mrCL")>; +def: InstRW<[SKLWriteResGroup153], (instregex "SHRD(16|32|64)mrCL")>; + +def SKLWriteResGroup154 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> { + let Latency = 11; + let 
NumMicroOps = 7; + let ResourceCycles = [2,3,2]; +} +def: InstRW<[SKLWriteResGroup154], (instregex "RCL(16|32|64)rCL")>; +def: InstRW<[SKLWriteResGroup154], (instregex "RCR(16|32|64)rCL")>; + +def SKLWriteResGroup155 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 11; + let NumMicroOps = 9; + let ResourceCycles = [1,5,1,2]; +} +def: InstRW<[SKLWriteResGroup155], (instregex "RCL8rCL")>; + +def SKLWriteResGroup156 : SchedWriteRes<[SKLPort06,SKLPort0156]> { + let Latency = 11; + let NumMicroOps = 11; + let ResourceCycles = [2,9]; +} +def: InstRW<[SKLWriteResGroup156], (instregex "LOOPE")>; +def: InstRW<[SKLWriteResGroup156], (instregex "LOOPNE")>; + +def SKLWriteResGroup157 : SchedWriteRes<[SKLPort0]> { + let Latency = 12; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTPSYr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTPSr")>; +def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTSSr")>; + +def SKLWriteResGroup158 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup158], (instregex "PCLMULQDQrm")>; +def: InstRW<[SKLWriteResGroup158], (instregex "VPCLMULQDQrm")>; + +def SKLWriteResGroup159 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup159], (instregex "HADDPDrm")>; +def: InstRW<[SKLWriteResGroup159], (instregex "HADDPSrm")>; +def: InstRW<[SKLWriteResGroup159], (instregex "HSUBPDrm")>; +def: InstRW<[SKLWriteResGroup159], (instregex "HSUBPSrm")>; +def: InstRW<[SKLWriteResGroup159], (instregex "VHADDPDrm")>; +def: InstRW<[SKLWriteResGroup159], (instregex "VHADDPSrm")>; +def: InstRW<[SKLWriteResGroup159], (instregex "VHSUBPDrm")>; +def: InstRW<[SKLWriteResGroup159], (instregex "VHSUBPSrm")>; + +def SKLWriteResGroup160 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKLWriteResGroup160], (instregex "CVTTSS2SI64rm")>; + +def SKLWriteResGroup161 : SchedWriteRes<[SKLPort0]> { + let Latency = 13; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup161], (instregex "SQRTPSr")>; +def: InstRW<[SKLWriteResGroup161], (instregex "SQRTSSr")>; + +def SKLWriteResGroup162 : SchedWriteRes<[SKLPort5,SKLPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup162], (instregex "ADD_FI16m")>; +def: InstRW<[SKLWriteResGroup162], (instregex "ADD_FI32m")>; +def: InstRW<[SKLWriteResGroup162], (instregex "SUBR_FI16m")>; +def: InstRW<[SKLWriteResGroup162], (instregex "SUBR_FI32m")>; +def: InstRW<[SKLWriteResGroup162], (instregex "SUB_FI16m")>; +def: InstRW<[SKLWriteResGroup162], (instregex "SUB_FI32m")>; + +def SKLWriteResGroup163 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup163], (instregex "VCVTDQ2PDYrm")>; + +def SKLWriteResGroup164 : SchedWriteRes<[SKLPort5,SKLPort015]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKLWriteResGroup164], (instregex "DPPSrri")>; +def: InstRW<[SKLWriteResGroup164], (instregex "VDPPSYrri")>; +def: InstRW<[SKLWriteResGroup164], (instregex "VDPPSrri")>; + +def SKLWriteResGroup165 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> { + let Latency 
= 13; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKLWriteResGroup165], (instregex "VHADDPDYrm")>; +def: InstRW<[SKLWriteResGroup165], (instregex "VHADDPSYrm")>; +def: InstRW<[SKLWriteResGroup165], (instregex "VHSUBPDYrm")>; +def: InstRW<[SKLWriteResGroup165], (instregex "VHSUBPSYrm")>; + +def SKLWriteResGroup166 : SchedWriteRes<[SKLPort0]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup166], (instregex "DIVPDrr")>; +def: InstRW<[SKLWriteResGroup166], (instregex "DIVSDrr")>; +def: InstRW<[SKLWriteResGroup166], (instregex "VDIVPDYrr")>; +def: InstRW<[SKLWriteResGroup166], (instregex "VDIVPDrr")>; +def: InstRW<[SKLWriteResGroup166], (instregex "VDIVSDrr")>; + +def SKLWriteResGroup167 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKLWriteResGroup167], (instregex "AESIMCrm")>; +def: InstRW<[SKLWriteResGroup167], (instregex "VAESIMCrm")>; + +def SKLWriteResGroup168 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup168], (instregex "PMULLDrm")>; +def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDPDm")>; +def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDPSm")>; +def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDSDm")>; +def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDSSm")>; +def: InstRW<[SKLWriteResGroup168], (instregex "VPMULLDrm")>; +def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDPDm")>; +def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDPSm")>; +def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDSDm")>; +def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDSSm")>; + +def SKLWriteResGroup169 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI16m")>; +def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI32m")>; + +def SKLWriteResGroup170 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [2,4,1,3]; +} +def: InstRW<[SKLWriteResGroup170], (instregex "RCR8rCL")>; + +def SKLWriteResGroup171 : SchedWriteRes<[SKLPort0]> { + let Latency = 15; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FPrST0")>; +def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FST0r")>; +def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FrST0")>; + +def SKLWriteResGroup172 : SchedWriteRes<[SKLPort23,SKLPort015]> { + let Latency = 15; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKLWriteResGroup172], (instregex "VPMULLDYrm")>; +def: InstRW<[SKLWriteResGroup172], (instregex "VROUNDYPDm")>; +def: InstRW<[SKLWriteResGroup172], (instregex "VROUNDYPSm")>; + +def SKLWriteResGroup173 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 15; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKLWriteResGroup173], (instregex "DPPDrmi")>; +def: InstRW<[SKLWriteResGroup173], (instregex "VDPPDrmi")>; + +def SKLWriteResGroup174 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 15; + let NumMicroOps = 10; + let ResourceCycles = [1,1,1,5,1,1]; +} +def: InstRW<[SKLWriteResGroup174], (instregex "RCL(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup174], (instregex "RCL8mCL")>; + +def 
SKLWriteResGroup175 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 16; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup175], (instregex "DIVSSrm")>; +def: InstRW<[SKLWriteResGroup175], (instregex "VDIVSSrm")>; + +def SKLWriteResGroup176 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 16; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} +def: InstRW<[SKLWriteResGroup176], (instregex "PCMPISTRIrm")>; +def: InstRW<[SKLWriteResGroup176], (instregex "PCMPISTRM128rm")>; +def: InstRW<[SKLWriteResGroup176], (instregex "VPCMPISTRIrm")>; +def: InstRW<[SKLWriteResGroup176], (instregex "VPCMPISTRM128rm")>; + +def SKLWriteResGroup177 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 16; + let NumMicroOps = 14; + let ResourceCycles = [1,1,1,4,2,5]; +} +def: InstRW<[SKLWriteResGroup177], (instregex "CMPXCHG8B")>; + +def SKLWriteResGroup178 : SchedWriteRes<[SKLPort0156]> { + let Latency = 16; + let NumMicroOps = 16; + let ResourceCycles = [16]; +} +def: InstRW<[SKLWriteResGroup178], (instregex "VZEROALL")>; + +def SKLWriteResGroup179 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 17; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup179], (instregex "DIVPSrm")>; +def: InstRW<[SKLWriteResGroup179], (instregex "VDIVPSrm")>; +def: InstRW<[SKLWriteResGroup179], (instregex "VSQRTSSm")>; + +def SKLWriteResGroup180 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> { + let Latency = 17; + let NumMicroOps = 15; + let ResourceCycles = [2,1,2,4,2,4]; +} +def: InstRW<[SKLWriteResGroup180], (instregex "XCH_F")>; + +def SKLWriteResGroup181 : SchedWriteRes<[SKLPort0]> { + let Latency = 18; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTPDYr")>; +def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTPDr")>; +def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTSDr")>; + +def SKLWriteResGroup182 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 18; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup182], (instregex "SQRTSSm")>; +def: InstRW<[SKLWriteResGroup182], (instregex "VDIVPSYrm")>; +def: InstRW<[SKLWriteResGroup182], (instregex "VSQRTPSm")>; + +def SKLWriteResGroup183 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [4,3,1]; +} +def: InstRW<[SKLWriteResGroup183], (instregex "PCMPESTRIrr")>; +def: InstRW<[SKLWriteResGroup183], (instregex "VPCMPESTRIrr")>; + +def SKLWriteResGroup184 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,5]; +} +def: InstRW<[SKLWriteResGroup184], (instregex "CPUID")>; +def: InstRW<[SKLWriteResGroup184], (instregex "RDTSC")>; + +def SKLWriteResGroup185 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 18; + let NumMicroOps = 11; + let ResourceCycles = [2,1,1,4,1,2]; +} +def: InstRW<[SKLWriteResGroup185], (instregex "RCR(16|32|64)mCL")>; +def: InstRW<[SKLWriteResGroup185], (instregex "RCR8mCL")>; + +def SKLWriteResGroup186 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 19; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup186], (instregex "DIVSDrm")>; +def: InstRW<[SKLWriteResGroup186], (instregex "SQRTPSm")>; +def: InstRW<[SKLWriteResGroup186], (instregex "VDIVSDrm")>; 
+def: InstRW<[SKLWriteResGroup186], (instregex "VSQRTPSYm")>; + +def SKLWriteResGroup187 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 19; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKLWriteResGroup187], (instregex "DPPSrmi")>; +def: InstRW<[SKLWriteResGroup187], (instregex "VDPPSrmi")>; + +def SKLWriteResGroup188 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015,SKLPort0156]> { + let Latency = 19; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[SKLWriteResGroup188], (instregex "PCMPESTRM128rr")>; +def: InstRW<[SKLWriteResGroup188], (instregex "VPCMPESTRM128rr")>; + +def SKLWriteResGroup189 : SchedWriteRes<[SKLPort0]> { + let Latency = 20; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FPrST0")>; +def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FST0r")>; +def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FrST0")>; +def: InstRW<[SKLWriteResGroup189], (instregex "SQRTPDr")>; +def: InstRW<[SKLWriteResGroup189], (instregex "SQRTSDr")>; + +def SKLWriteResGroup190 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 20; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup190], (instregex "DIVPDrm")>; +def: InstRW<[SKLWriteResGroup190], (instregex "VDIVPDrm")>; + +def SKLWriteResGroup191 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 20; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKLWriteResGroup191], (instregex "VDPPSYrmi")>; + +def SKLWriteResGroup192 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 20; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,1,2]; +} +def: InstRW<[SKLWriteResGroup192], (instregex "INSB")>; +def: InstRW<[SKLWriteResGroup192], (instregex "INSL")>; +def: InstRW<[SKLWriteResGroup192], (instregex "INSW")>; + +def SKLWriteResGroup193 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort0156]> { + let Latency = 20; + let NumMicroOps = 10; + let ResourceCycles = [1,2,7]; +} +def: InstRW<[SKLWriteResGroup193], (instregex "MWAITrr")>; + +def SKLWriteResGroup194 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015]> { + let Latency = 20; + let NumMicroOps = 11; + let ResourceCycles = [3,6,2]; +} +def: InstRW<[SKLWriteResGroup194], (instregex "AESKEYGENASSIST128rr")>; +def: InstRW<[SKLWriteResGroup194], (instregex "VAESKEYGENASSIST128rr")>; + +def SKLWriteResGroup195 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 21; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup195], (instregex "VDIVPDYrm")>; + +def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 22; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F32m")>; +def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F64m")>; + +def SKLWriteResGroup196_1 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> { + let Latency = 22; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKLWriteResGroup196_1], (instrs VGATHERDPSrm, + VGATHERDPDrm, + VGATHERQPDrm, + VGATHERQPSrm, + VPGATHERDDrm, + VPGATHERDQrm, + VPGATHERQDrm, + VPGATHERQQrm)>; + +def SKLWriteResGroup196_2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> { + let Latency = 25; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKLWriteResGroup196_2], (instrs VGATHERDPSYrm, + VGATHERQPDYrm, + VGATHERQPSYrm, + 
VPGATHERDDYrm, + VPGATHERDQYrm, + VPGATHERQDYrm, + VPGATHERQQYrm, + VGATHERDPDYrm)>; + +def SKLWriteResGroup197 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 23; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup197], (instregex "VSQRTSDm")>; + +def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 23; + let NumMicroOps = 19; + let ResourceCycles = [2,1,4,1,1,4,6]; +} +def: InstRW<[SKLWriteResGroup198], (instregex "CMPXCHG16B")>; + +def SKLWriteResGroup199 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 24; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup199], (instregex "VSQRTPDm")>; + +def SKLWriteResGroup200 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> { + let Latency = 24; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[SKLWriteResGroup200], (instregex "PCMPESTRIrm")>; +def: InstRW<[SKLWriteResGroup200], (instregex "VPCMPESTRIrm")>; + +def SKLWriteResGroup201 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 25; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup201], (instregex "SQRTSDm")>; +def: InstRW<[SKLWriteResGroup201], (instregex "VSQRTPDYm")>; + +def SKLWriteResGroup202 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 25; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI16m")>; +def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI32m")>; + +def SKLWriteResGroup203 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015,SKLPort0156]> { + let Latency = 25; + let NumMicroOps = 10; + let ResourceCycles = [4,3,1,1,1]; +} +def: InstRW<[SKLWriteResGroup203], (instregex "PCMPESTRM128rm")>; +def: InstRW<[SKLWriteResGroup203], (instregex "VPCMPESTRM128rm")>; + +def SKLWriteResGroup204 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015]> { + let Latency = 25; + let NumMicroOps = 11; + let ResourceCycles = [3,6,1,1]; +} +def: InstRW<[SKLWriteResGroup204], (instregex "AESKEYGENASSIST128rm")>; +def: InstRW<[SKLWriteResGroup204], (instregex "VAESKEYGENASSIST128rm")>; + +def SKLWriteResGroup205 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 26; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup205], (instregex "SQRTPDm")>; + +def SKLWriteResGroup206 : SchedWriteRes<[SKLPort0,SKLPort23]> { + let Latency = 27; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F32m")>; +def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F64m")>; + +def SKLWriteResGroup207 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> { + let Latency = 28; + let NumMicroOps = 8; + let ResourceCycles = [2,4,1,1]; +} +def: InstRW<[SKLWriteResGroup207], (instregex "IDIV(16|32|64)m")>; +def: InstRW<[SKLWriteResGroup207], (instregex "IDIV8m")>; + +def SKLWriteResGroup208 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> { + let Latency = 30; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI16m")>; +def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI32m")>; + +def SKLWriteResGroup209 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort06,SKLPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,3,4,10]; +} +def: InstRW<[SKLWriteResGroup209], (instregex "IN(16|32)ri")>; +def: 
InstRW<[SKLWriteResGroup209], (instregex "IN(16|32)rr")>; +def: InstRW<[SKLWriteResGroup209], (instregex "IN8ri")>; +def: InstRW<[SKLWriteResGroup209], (instregex "IN8rr")>; + +def SKLWriteResGroup210 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,2,1,4,10]; +} +def: InstRW<[SKLWriteResGroup210], (instregex "OUT(16|32)ir")>; +def: InstRW<[SKLWriteResGroup210], (instregex "OUT(16|32)rr")>; +def: InstRW<[SKLWriteResGroup210], (instregex "OUT8ir")>; +def: InstRW<[SKLWriteResGroup210], (instregex "OUT8rr")>; + +def SKLWriteResGroup211 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> { + let Latency = 37; + let NumMicroOps = 31; + let ResourceCycles = [1,8,1,21]; +} +def: InstRW<[SKLWriteResGroup211], (instregex "XRSTOR(64)?")>; + +def SKLWriteResGroup212 : SchedWriteRes<[SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort15,SKLPort0156]> { + let Latency = 40; + let NumMicroOps = 18; + let ResourceCycles = [1,1,2,3,1,1,1,8]; +} +def: InstRW<[SKLWriteResGroup212], (instregex "VMCLEARm")>; + +def SKLWriteResGroup213 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 41; + let NumMicroOps = 39; + let ResourceCycles = [1,10,1,1,26]; +} +def: InstRW<[SKLWriteResGroup213], (instregex "XSAVE64")>; + +def SKLWriteResGroup214 : SchedWriteRes<[SKLPort5,SKLPort0156]> { + let Latency = 42; + let NumMicroOps = 22; + let ResourceCycles = [2,20]; +} +def: InstRW<[SKLWriteResGroup214], (instregex "RDTSCP")>; + +def SKLWriteResGroup215 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 42; + let NumMicroOps = 40; + let ResourceCycles = [1,11,1,1,26]; +} +def: InstRW<[SKLWriteResGroup215], (instregex "^XSAVE$", "XSAVEC", "XSAVES")>; + +def SKLWriteResGroup216 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> { + let Latency = 46; + let NumMicroOps = 44; + let ResourceCycles = [1,11,1,1,30]; +} +def: InstRW<[SKLWriteResGroup216], (instregex "XSAVEOPT")>; + +def SKLWriteResGroup217 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05,SKLPort06,SKLPort0156]> { + let Latency = 62; + let NumMicroOps = 64; + let ResourceCycles = [2,8,5,10,39]; +} +def: InstRW<[SKLWriteResGroup217], (instregex "FLDENVm")>; + +def SKLWriteResGroup218 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 63; + let NumMicroOps = 88; + let ResourceCycles = [4,4,31,1,2,1,45]; +} +def: InstRW<[SKLWriteResGroup218], (instregex "FXRSTOR64")>; + +def SKLWriteResGroup219 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> { + let Latency = 63; + let NumMicroOps = 90; + let ResourceCycles = [4,2,33,1,2,1,47]; +} +def: InstRW<[SKLWriteResGroup219], (instregex "FXRSTOR")>; + +def SKLWriteResGroup220 : SchedWriteRes<[SKLPort5,SKLPort05,SKLPort0156]> { + let Latency = 75; + let NumMicroOps = 15; + let ResourceCycles = [6,3,6]; +} +def: InstRW<[SKLWriteResGroup220], (instregex "FNINIT")>; + +def SKLWriteResGroup221 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> { + let Latency = 76; + let NumMicroOps = 32; + let ResourceCycles = [7,2,8,3,1,11]; +} +def: InstRW<[SKLWriteResGroup221], (instregex "DIV(16|32|64)r")>; + +def SKLWriteResGroup222 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> { + let Latency = 102; + let NumMicroOps = 66;
+ let ResourceCycles = [4,2,4,8,14,34]; +} +def: InstRW<[SKLWriteResGroup222], (instregex "IDIV(16|32|64)r")>; + +def SKLWriteResGroup223 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort237,SKLPort06,SKLPort0156]> { + let Latency = 106; + let NumMicroOps = 100; + let ResourceCycles = [9,1,11,16,1,11,21,30]; +} +def: InstRW<[SKLWriteResGroup223], (instregex "FSTENVm")>; + +} // SchedModel diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td new file mode 100755 index 000000000000..439a2ffa36a4 --- /dev/null +++ b/lib/Target/X86/X86SchedSkylakeServer.td @@ -0,0 +1,6500 @@ +//=- X86SchedSkylakeServer.td - X86 Skylake Server Scheduling -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Skylake Server to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +def SkylakeServerModel : SchedMachineModel { + // All x86 instructions are modeled as a single micro-op, and Skylake can + // decode 6 instructions per cycle. + let IssueWidth = 6; + let MicroOpBufferSize = 224; // Based on the reorder buffer. + let LoadLatency = 5; + let MispredictPenalty = 14; + + // Based on the LSD (loop-stream detector) queue size and benchmarking data. + let LoopMicroOpBufferSize = 50; + + // This flag is set to allow the scheduler to assign a default model to + // unrecognized opcodes. + let CompleteModel = 0; +} + +let SchedModel = SkylakeServerModel in { + +// Skylake Server can issue micro-ops to 8 different ports in one cycle. + +// Ports 0, 1, 5, and 6 handle all computation. +// Port 4 gets the data half of stores. Store data can be available later than +// the store address, but since we don't model the latency of stores, we can +// ignore that. +// Ports 2 and 3 are identical. They handle loads and the address half of +// stores. Port 7 can handle address calculations. +def SKXPort0 : ProcResource<1>; +def SKXPort1 : ProcResource<1>; +def SKXPort2 : ProcResource<1>; +def SKXPort3 : ProcResource<1>; +def SKXPort4 : ProcResource<1>; +def SKXPort5 : ProcResource<1>; +def SKXPort6 : ProcResource<1>; +def SKXPort7 : ProcResource<1>; + +// Many micro-ops are capable of issuing on multiple ports.
+def SKXPort01 : ProcResGroup<[SKXPort0, SKXPort1]>; +def SKXPort23 : ProcResGroup<[SKXPort2, SKXPort3]>; +def SKXPort237 : ProcResGroup<[SKXPort2, SKXPort3, SKXPort7]>; +def SKXPort04 : ProcResGroup<[SKXPort0, SKXPort4]>; +def SKXPort05 : ProcResGroup<[SKXPort0, SKXPort5]>; +def SKXPort06 : ProcResGroup<[SKXPort0, SKXPort6]>; +def SKXPort15 : ProcResGroup<[SKXPort1, SKXPort5]>; +def SKXPort16 : ProcResGroup<[SKXPort1, SKXPort6]>; +def SKXPort56 : ProcResGroup<[SKXPort5, SKXPort6]>; +def SKXPort015 : ProcResGroup<[SKXPort0, SKXPort1, SKXPort5]>; +def SKXPort056 : ProcResGroup<[SKXPort0, SKXPort5, SKXPort6]>; +def SKXPort0156: ProcResGroup<[SKXPort0, SKXPort1, SKXPort5, SKXPort6]>; + +// 60 Entry Unified Scheduler +def SKXPortAny : ProcResGroup<[SKXPort0, SKXPort1, SKXPort2, SKXPort3, SKXPort4, + SKXPort5, SKXPort6, SKXPort7]> { + let BufferSize=60; +} + +// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5 +// cycles after the memory operand. +def : ReadAdvance<ReadAfterLd, 5>; + +// Many SchedWrites are defined in pairs with and without a folded load. +// Instructions with folded loads are usually micro-fused, so they only appear +// as two micro-ops when queued in the reservation station. +// This multiclass defines the resource usage for variants with and without +// folded loads. +multiclass SKXWriteResPair<X86FoldableSchedWrite SchedRW, + ProcResourceKind ExePort, + int Lat> { + // Register variant is using a single cycle on ExePort. + def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } + + // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the + // latency. + def : WriteRes<SchedRW.Folded, [SKXPort23, ExePort]> { + let Latency = !add(Lat, 5); + } +} + +// A folded store needs a cycle on port 4 for the store data, but it does not +// need an extra port 2/3 cycle to recompute the address. +def : WriteRes<WriteRMW, [SKXPort4]>; + +// Arithmetic. +defm : SKXWriteResPair<WriteALU, SKXPort0156, 1>; // Simple integer ALU op. +defm : SKXWriteResPair<WriteIMul, SKXPort1, 3>; // Integer multiplication. +def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part. +def SKXDivider : ProcResource<1>; // Integer division issued on port 0. +def : WriteRes<WriteIDiv, [SKXPort0, SKXDivider]> { // Integer division. + let Latency = 25; + let ResourceCycles = [1, 10]; +} +def : WriteRes<WriteIDivLd, [SKXPort23, SKXPort0, SKXDivider]> { + let Latency = 29; + let ResourceCycles = [1, 1, 10]; +} + +def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads. + +// Integer shifts and rotates. +defm : SKXWriteResPair<WriteShift, SKXPort06, 1>; + +// Loads, stores, and moves, not folded with other operations. +def : WriteRes<WriteLoad, [SKXPort23]> { let Latency = 5; } +def : WriteRes<WriteStore, [SKXPort237, SKXPort4]>; +def : WriteRes<WriteMove, [SKXPort0156]>; + +// Idioms that clear a register, like xorps %xmm0, %xmm0. +// These can often bypass execution ports completely. +def : WriteRes<WriteZero, []>; + +// Branches don't produce values, so they have no latency, but they still +// consume resources. Indirect branches can fold loads. +defm : SKXWriteResPair<WriteJump, SKXPort06, 1>; + +// Floating point. This covers both scalar and vector operations. +defm : SKXWriteResPair<WriteFAdd, SKXPort1, 3>; // Floating point add/sub/compare. +defm : SKXWriteResPair<WriteFMul, SKXPort0, 5>; // Floating point multiplication. +defm : SKXWriteResPair<WriteFDiv, SKXPort0, 12>; // 10-14 cycles. // Floating point division. 
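To make the pairing above concrete, here is a minimal sketch (not part of the patch) of what one SKXWriteResPair instantiation expands to, assuming the usual X86Schedule.td convention that the folded-load counterpart of WriteALU is WriteALULd:

// Sketch only: roughly what "defm : SKXWriteResPair<WriteALU, SKXPort0156, 1>;"
// produces, assuming WriteALU.Folded is WriteALULd (per X86Schedule.td).
def : WriteRes<WriteALU,   [SKXPort0156]>            { let Latency = 1; }
def : WriteRes<WriteALULd, [SKXPort23, SKXPort0156]> { let Latency = 6; } // +5 for the load, plus a port 2/3 uop
// Instruction patterns that use the folded form typically also list
// ReadAfterLd, so their register operands pick up the
// ReadAdvance<ReadAfterLd, 5> defined above.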
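Separately, a hedged sketch of how a SchedMachineModel such as the SkylakeServerModel defined above is typically attached to a CPU definition. ProcessorModel is the generic class from Target.td; the CPU name and feature list below are illustrative placeholders, not taken from this patch:

// Illustrative only; the in-tree X86.td definition carries a much longer feature list.
def : ProcessorModel<"skylake-avx512", SkylakeServerModel,
                     [FeatureAVX512, FeatureCDI, FeatureDQI, FeatureBWI, FeatureVLX]>;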
+defm : SKXWriteResPair<WriteFSqrt, SKXPort0, 15>; // Floating point square root. +defm : SKXWriteResPair<WriteFRcp, SKXPort0, 5>; // Floating point reciprocal estimate. +defm : SKXWriteResPair<WriteFRsqrt, SKXPort0, 5>; // Floating point reciprocal square root estimate. +defm : SKXWriteResPair<WriteFMA, SKXPort015, 4>; // Fused Multiply Add. +defm : SKXWriteResPair<WriteFShuffle, SKXPort5, 1>; // Floating point vector shuffles. +defm : SKXWriteResPair<WriteFBlend, SKXPort015, 1>; // Floating point vector blends. +def : WriteRes<WriteFVarBlend, [SKXPort5]> { // Fp vector variable blends. + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteFVarBlendLd, [SKXPort5, SKXPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +// FMA Scheduling helper class. +// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; } + +// Vector integer operations. +defm : SKXWriteResPair<WriteVecALU, SKXPort15, 1>; // Vector integer ALU op, no logicals. +defm : SKXWriteResPair<WriteVecShift, SKXPort0, 1>; // Vector integer shifts. +defm : SKXWriteResPair<WriteVecIMul, SKXPort0, 5>; // Vector integer multiply. +defm : SKXWriteResPair<WriteShuffle, SKXPort5, 1>; // Vector shuffles. +defm : SKXWriteResPair<WriteBlend, SKXPort15, 1>; // Vector blends. + +def : WriteRes<WriteVarBlend, [SKXPort5]> { // Vector variable blends. + let Latency = 2; + let ResourceCycles = [2]; +} +def : WriteRes<WriteVarBlendLd, [SKXPort5, SKXPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1]; +} + +def : WriteRes<WriteMPSAD, [SKXPort0, SKXPort5]> { // Vector MPSAD. + let Latency = 6; + let ResourceCycles = [1, 2]; +} +def : WriteRes<WriteMPSADLd, [SKXPort23, SKXPort0, SKXPort5]> { + let Latency = 6; + let ResourceCycles = [1, 1, 2]; +} + +// Vector bitwise operations. +// These are often used on both floating point and integer vectors. +defm : SKXWriteResPair<WriteVecLogic, SKXPort015, 1>; // Vector and/or/xor. + +// Conversion between integer and float. +defm : SKXWriteResPair<WriteCvtF2I, SKXPort1, 3>; // Float -> Integer. +defm : SKXWriteResPair<WriteCvtI2F, SKXPort1, 4>; // Integer -> Float. +defm : SKXWriteResPair<WriteCvtF2F, SKXPort1, 3>; // Float -> Float size conversion. + +// Strings instructions. +// Packed Compare Implicit Length Strings, Return Mask +// String instructions. +def : WriteRes<WritePCmpIStrM, [SKXPort0]> { + let Latency = 10; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrMLd, [SKXPort0, SKXPort23]> { + let Latency = 10; + let ResourceCycles = [3, 1]; +} +// Packed Compare Explicit Length Strings, Return Mask +def : WriteRes<WritePCmpEStrM, [SKXPort0, SKXPort16, SKXPort5]> { + let Latency = 10; + let ResourceCycles = [3, 2, 4]; +} +def : WriteRes<WritePCmpEStrMLd, [SKXPort05, SKXPort16, SKXPort23]> { + let Latency = 10; + let ResourceCycles = [6, 2, 1]; +} + // Packed Compare Implicit Length Strings, Return Index +def : WriteRes<WritePCmpIStrI, [SKXPort0]> { + let Latency = 11; + let ResourceCycles = [3]; +} +def : WriteRes<WritePCmpIStrILd, [SKXPort0, SKXPort23]> { + let Latency = 11; + let ResourceCycles = [3, 1]; +} +// Packed Compare Explicit Length Strings, Return Index +def : WriteRes<WritePCmpEStrI, [SKXPort05, SKXPort16]> { + let Latency = 11; + let ResourceCycles = [6, 2]; +} +def : WriteRes<WritePCmpEStrILd, [SKXPort0, SKXPort16, SKXPort5, SKXPort23]> { + let Latency = 11; + let ResourceCycles = [3, 2, 2, 1]; +} + +// AES instructions. +def : WriteRes<WriteAESDecEnc, [SKXPort5]> { // Decryption, encryption. 
+ let Latency = 7; + let ResourceCycles = [1]; +} +def : WriteRes<WriteAESDecEncLd, [SKXPort5, SKXPort23]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} +def : WriteRes<WriteAESIMC, [SKXPort5]> { // InvMixColumn. + let Latency = 14; + let ResourceCycles = [2]; +} +def : WriteRes<WriteAESIMCLd, [SKXPort5, SKXPort23]> { + let Latency = 14; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteAESKeyGen, [SKXPort0, SKXPort5]> { // Key Generation. + let Latency = 10; + let ResourceCycles = [2, 8]; +} +def : WriteRes<WriteAESKeyGenLd, [SKXPort0, SKXPort5, SKXPort23]> { + let Latency = 10; + let ResourceCycles = [2, 7, 1]; +} + +// Carry-less multiplication instructions. +def : WriteRes<WriteCLMul, [SKXPort0, SKXPort5]> { + let Latency = 7; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteCLMulLd, [SKXPort0, SKXPort5, SKXPort23]> { + let Latency = 7; + let ResourceCycles = [2, 1, 1]; +} + +// Catch-all for expensive system instructions. +def : WriteRes<WriteSystem, [SKXPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite; + +// AVX2. +defm : SKXWriteResPair<WriteFShuffle256, SKXPort5, 3>; // Fp 256-bit width vector shuffles. +defm : SKXWriteResPair<WriteShuffle256, SKXPort5, 3>; // 256-bit width vector shuffles. +def : WriteRes<WriteVarVecShift, [SKXPort0, SKXPort5]> { // Variable vector shifts. + let Latency = 2; + let ResourceCycles = [2, 1]; +} +def : WriteRes<WriteVarVecShiftLd, [SKXPort0, SKXPort5, SKXPort23]> { + let Latency = 6; + let ResourceCycles = [2, 1, 1]; +} + +// Old microcoded instructions that nobody uses. +def : WriteRes<WriteMicrocoded, [SKXPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite; + +// Fence instructions. +def : WriteRes<WriteFence, [SKXPort23, SKXPort4]>; + +// Nop, not very useful except that it provides a model for nops! +def : WriteRes<WriteNop, []>; + +//////////////////////////////////////////////////////////////////////////////// +// Horizontal add/sub instructions. +//////////////////////////////////////////////////////////////////////////////// +// HADD, HSUB PS/PD +// x,x / v,v,v. +def : WriteRes<WriteFHAdd, [SKXPort1]> { + let Latency = 3; +} + +// x,m / v,v,m. +def : WriteRes<WriteFHAddLd, [SKXPort1, SKXPort23]> { + let Latency = 7; + let ResourceCycles = [1, 1]; +} + +// PHADD|PHSUB (S) W/D. +// v <- v,v. +def : WriteRes<WritePHAdd, [SKXPort15]>; + +// v <- v,m. +def : WriteRes<WritePHAddLd, [SKXPort15, SKXPort23]> { + let Latency = 5; + let ResourceCycles = [1, 1]; +} + +// Remaining instrs.
+ +def SKXWriteResGroup1 : SchedWriteRes<[SKXPort0]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup1], (instregex "KANDBrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KANDDrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KANDNBrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KANDNDrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KANDNQrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KANDNWrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KANDQrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KANDWrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KMOVBkk")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KMOVDkk")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KMOVQkk")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KMOVWkk")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KNOTBrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KNOTDrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KNOTQrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KNOTWrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KORBrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KORDrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KORQrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KORWrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KXNORBrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KXNORDrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KXNORQrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KXNORWrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KXORBrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KXORDrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KXORQrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "KXORWrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDSBirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDSWirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDUSBirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDUSWirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PAVGBirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PAVGWirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQBirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQDirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQWirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTBirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTDirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTWirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMAXSWirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMAXUBirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMINSWirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMINUBirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLDri")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLDrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLQri")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLQrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLWri")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLWrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRADri")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRADrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRAWri")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRAWrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLDri")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLDrr")>; +def: 
InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLQri")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLQrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLWri")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLWrr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBSBirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBSWirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBUSBirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBUSWirr")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup2 : SchedWriteRes<[SKXPort1]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup2], (instregex "MMX_MASKMOVQ64")>; + +def SKXWriteResGroup3 : SchedWriteRes<[SKXPort5]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup3], (instregex "COMP_FST0r")>; +def: InstRW<[SKXWriteResGroup3], (instregex "COM_FST0r")>; +def: InstRW<[SKXWriteResGroup3], (instregex "INSERTPSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "KMOVBkr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "KMOVDkr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "KMOVQkr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "KMOVWkr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_MOVD64rr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_MOVD64to64rr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PALIGNR64irr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PSHUFBrr64")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PSHUFWri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MOV64toPQIrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MOVDDUPrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MOVDI2PDIrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MOVHLPSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MOVLHPSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MOVSDrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MOVSHDUPrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MOVSLDUPrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup3], (instregex 
"PACKSSDWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PACKSSWBrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PACKUSDWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PACKUSWBrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PALIGNRrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PBLENDWrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXWDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXWQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXWDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXWQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFBrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFDri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFHWri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFLWri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PSLLDQri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PSRLDQri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHBWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHQDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHWDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLBWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLQDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLWDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "SHUFPDrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "SHUFPSrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "UCOM_FPr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "UCOM_Fr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKHPDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKHPSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKLPDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKLPSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VBROADCASTI32X2Z128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VBROADCASTSSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VINSERTPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VINSERTPSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOV64toPQIZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOV64toPQIrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDI2PDIZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDI2PDIrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVHLPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVHLPSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVLHPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], 
(instregex "VMOVLHPSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSSZrr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRYrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPBLENDWYrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPBLENDWrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPBROADCASTDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPBROADCASTQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDYri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ128rr(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSYri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXWDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXWQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXWDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXWQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDYri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWYri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWYri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQYri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex 
"VPSLLDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQYri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDYrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZ256rri(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSYrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSrri")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSYrr")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSrr")>; + +def SKXWriteResGroup4 : SchedWriteRes<[SKXPort6]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup4], (instregex "JMP(16|32|64)r")>; + +def SKXWriteResGroup5 : SchedWriteRes<[SKXPort01]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup5], (instregex "PABSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PABSDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PABSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PADDSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PADDSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PADDUSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PADDUSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PAVGBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PAVGWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQQrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMAXSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMAX(C?)SDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMAXSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUWrr")>; +def: 
InstRW<[SKXWriteResGroup5], (instregex "PMINSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMIN(C?)SDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMINSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMINUBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMINUDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PMINUWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNBrr128")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNDrr128")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNWrr128")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSLLDri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSLLQri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSLLWri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSRADri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSRAWri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSRLDri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSRLQri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSRLWri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSUBSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSUBSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSUBUSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "PSUBUSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBrr")>; +def: 
InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQDYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQQYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQQrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTDYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex 
"VPMAXUDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex 
"VPRORDZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNBYrr256")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNBrr128")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNDYrr256")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNDrr128")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNWYrr256")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNWrr128")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDYri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQYri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWYri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZ128ri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZ256ri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADYri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], 
(instregex "VPSRAQZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWYri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZ128ri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZ256ri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDYri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQYri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZ128r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWYri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZ128ri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZ256ri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWri")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBrr")>; +def: 
InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWYrr")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWrr")>; + +def SKXWriteResGroup6 : SchedWriteRes<[SKXPort05]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup6], (instregex "FINCSTP")>; +def: InstRW<[SKXWriteResGroup6], (instregex "FNOP")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSBrr64")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSDrr64")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSWrr64")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDBirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDDirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDQirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDWirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PANDNirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PANDirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PORirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNBrr64")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNDrr64")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNWrr64")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBBirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBDirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBQirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBWirr")>; +def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PXORirr")>; + +def SKXWriteResGroup7 : SchedWriteRes<[SKXPort06]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup7], (instregex "ADC8rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup7], (instregex "ADCX(32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "ADOX(32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)ri8")>; +def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "BTC(16|32|64)ri8")>; +def: InstRW<[SKXWriteResGroup7], (instregex "BTC(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "BTR(16|32|64)ri8")>; +def: InstRW<[SKXWriteResGroup7], (instregex "BTR(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "BTS(16|32|64)ri8")>; +def: InstRW<[SKXWriteResGroup7], (instregex "BTS(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CDQ")>; +def: 
InstRW<[SKXWriteResGroup7], (instregex "CLAC")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVAE(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVB(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVE(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVG(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVGE(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVL(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVLE(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNE(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNO(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNP(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNS(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVO(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVP(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CMOVS(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "CQO")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JAE_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JAE_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JA_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JA_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JBE_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JBE_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JB_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JB_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JE_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JE_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JGE_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JGE_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JG_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JG_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JLE_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JLE_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JL_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JL_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JMP_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JMP_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JNE_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JNE_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JNO_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JNO_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JNP_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JNP_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JNS_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JNS_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JO_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JO_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JP_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JP_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JS_1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "JS_4")>; +def: InstRW<[SKXWriteResGroup7], (instregex "RORX(32|64)ri")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SAR(16|32|64)r1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SAR(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SAR8r1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SAR8ri")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SARX(32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SBB8rr(_REV)?")>; +def: 
InstRW<[SKXWriteResGroup7], (instregex "SETAEr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETBr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETEr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETGEr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETGr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETLEr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETLr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETNEr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETNOr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETNPr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETNSr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETOr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETPr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SETSr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SHL(16|32|64)r1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SHL(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SHL8r1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SHL8ri")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SHLX(32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SHR(16|32|64)r1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SHR(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SHR8r1")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SHR8ri")>; +def: InstRW<[SKXWriteResGroup7], (instregex "SHRX(32|64)rr")>; +def: InstRW<[SKXWriteResGroup7], (instregex "STAC")>; + +def SKXWriteResGroup8 : SchedWriteRes<[SKXPort15]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup8], (instregex "ANDN(32|64)rr")>; +def: InstRW<[SKXWriteResGroup8], (instregex "BLSI(32|64)rr")>; +def: InstRW<[SKXWriteResGroup8], (instregex "BLSMSK(32|64)rr")>; +def: InstRW<[SKXWriteResGroup8], (instregex "BLSR(32|64)rr")>; +def: InstRW<[SKXWriteResGroup8], (instregex "BZHI(32|64)rr")>; +def: InstRW<[SKXWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>; + +def SKXWriteResGroup9 : SchedWriteRes<[SKXPort015]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup9], (instregex "ANDNPDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "ANDNPSrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "ANDPDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "ANDPSrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "BLENDPDrri")>; +def: InstRW<[SKXWriteResGroup9], (instregex "BLENDPSrri")>; +def: InstRW<[SKXWriteResGroup9], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPDrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPSrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQArr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "MOVPQI2QIrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "MOVSSrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "ORPDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "ORPSrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PADDBrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PADDDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PADDQrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PADDWrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PANDNrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PANDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PORrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PSUBBrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex 
"PSUBDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PSUBQrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PSUBWrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "PXORrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPDYrri")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPDrri")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPSYrri")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPSrri")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDYrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ128rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ256rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZrr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSYrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z128rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z256rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Zrr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z128rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Zrr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex 
"VMOVDQAYrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z128rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z256rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Zrr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z128rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z256rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Zrr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z128rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z256rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Zrr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z128rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z256rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Zrr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVPQI(2Q|Lo2PQ)IZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVPQI2QIrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVSSrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ128rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ256rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZrr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ128rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ256rr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZrr(b?)(k?)(z?)(_REV)?")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VORPDYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VORPDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VORPSYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VORPSrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex 
"VPADDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPANDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDDYrri")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDDrri")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPORYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPORrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], 
(instregex "VPSUBBrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPXORYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VPXORrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSYrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "XORPDrr")>; +def: InstRW<[SKXWriteResGroup9], (instregex "XORPSrr")>; + +def SKXWriteResGroup10 : SchedWriteRes<[SKXPort0156]> { + let Latency = 1; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "ADD8i8")>; +def: InstRW<[SKXWriteResGroup10], (instregex "ADD8ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "ADD8rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "AND8i8")>; +def: InstRW<[SKXWriteResGroup10], (instregex "AND8ri")>; +def: InstRW<[SKXWriteResGroup10], 
(instregex "AND8rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "CBW")>; +def: InstRW<[SKXWriteResGroup10], (instregex "CLC")>; +def: InstRW<[SKXWriteResGroup10], (instregex "CMC")>; +def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "CMP8i8")>; +def: InstRW<[SKXWriteResGroup10], (instregex "CMP8ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "CMP8rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "CWDE")>; +def: InstRW<[SKXWriteResGroup10], (instregex "DEC(16|32|64)r")>; +def: InstRW<[SKXWriteResGroup10], (instregex "DEC8r")>; +def: InstRW<[SKXWriteResGroup10], (instregex "INC(16|32|64)r")>; +def: InstRW<[SKXWriteResGroup10], (instregex "INC8r")>; +def: InstRW<[SKXWriteResGroup10], (instregex "LAHF")>; +def: InstRW<[SKXWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "MOV8ri(_alt)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "MOV8rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>; +def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>; +def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>; +def: InstRW<[SKXWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>; +def: InstRW<[SKXWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>; +def: InstRW<[SKXWriteResGroup10], (instregex "NEG(16|32|64)r")>; +def: InstRW<[SKXWriteResGroup10], (instregex "NEG8r")>; +def: InstRW<[SKXWriteResGroup10], (instregex "NOOP")>; +def: InstRW<[SKXWriteResGroup10], (instregex "NOT(16|32|64)r")>; +def: InstRW<[SKXWriteResGroup10], (instregex "NOT8r")>; +def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "OR8i8")>; +def: InstRW<[SKXWriteResGroup10], (instregex "OR8ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "OR8rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SAHF")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SGDT64m")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SIDT64m")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SLDT64m")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SMSW16m")>; +def: InstRW<[SKXWriteResGroup10], (instregex "STC")>; +def: InstRW<[SKXWriteResGroup10], (instregex "STRm")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SUB8i8")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SUB8ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SUB8rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "SYSCALL")>; +def: InstRW<[SKXWriteResGroup10], (instregex "TEST(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup10], (instregex "TEST8i8")>; +def: InstRW<[SKXWriteResGroup10], (instregex "TEST8ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "TEST8rr")>; +def: InstRW<[SKXWriteResGroup10], (instregex "XCHG(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV)?")>; +def: InstRW<[SKXWriteResGroup10], (instregex "XOR8i8")>; +def: InstRW<[SKXWriteResGroup10], (instregex "XOR8ri")>; +def: InstRW<[SKXWriteResGroup10], (instregex "XOR8rr(_REV)?")>; + +def SKXWriteResGroup11 : SchedWriteRes<[SKXPort4,SKXPort237]> { + let Latency = 
1; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup11], (instregex "FBSTPm")>; +def: InstRW<[SKXWriteResGroup11], (instregex "KMOVBmk")>; +def: InstRW<[SKXWriteResGroup11], (instregex "KMOVDmk")>; +def: InstRW<[SKXWriteResGroup11], (instregex "KMOVQmk")>; +def: InstRW<[SKXWriteResGroup11], (instregex "KMOVWmk")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVD64mr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVNTQmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVQ64mr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOV(16|32|64)mr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOV8mi")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOV8mr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVAPDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVAPSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVDQAmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVDQUmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVHPDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVHPSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVLPDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVLPSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTDQmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTI_64mr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTImr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTPDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTPSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVPDI2DImr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVPQI2QImr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVPQIto64mr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVSDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVSSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVUPDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "MOVUPSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP32m")>; +def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP64m")>; +def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP80m")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF128mr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x4Z256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x4Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x8Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x2Z256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x2Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x4Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI128mr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x4Z256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x4Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x8Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x2Z256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x2Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x4Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDYmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], 
(instregex "VMOVAPDZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSYmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Z128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Z256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Z128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Z256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQAYmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQAmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Z128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Z256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Z128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Z256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Z128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Z256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU8Z128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU8Z256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQUYmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQUmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPDZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPSZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPDZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPSZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQYmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDYmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSYmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex 
"VMOVNTPSZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPDI2DIZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPDI2DImr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQI(2QI|to64)Zmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQI2QImr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQIto64mr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSDZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSSZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDYmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSYmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSmr")>; +def: InstRW<[SKXWriteResGroup11], (instregex "VMPTRSTm")>; + +def SKXWriteResGroup12 : SchedWriteRes<[SKXPort0]> { + let Latency = 2; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup12], (instregex "COMISDrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "COMISSrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "MMX_MOVD64from64rr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "MMX_MOVD64grr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "MMX_PMOVMSKBrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "MOVMSKPDrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "MOVMSKPSrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "MOVPDI2DIrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "MOVPQIto64rr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "PMOVMSKBrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "UCOMISDrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "UCOMISSrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISDZrr(b?)")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISDrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISSZrr(b?)")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISSrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPDYrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPDrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPSYrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPSrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPDI2DIZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPDI2DIrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPQIto64Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPQIto64rr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VPMOVMSKBYrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VPMOVMSKBrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPDYrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPDrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPSYrr")>; +def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPSrr")>; +def: 
InstRW<[SKXWriteResGroup12], (instregex "VUCOMISDZrr(b?)")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISDrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISSZrr(b?)")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISSrr")>;
+
+def SKXWriteResGroup13 : SchedWriteRes<[SKXPort5]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "MMX_PINSRWirri")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "PINSRBrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "PINSRDrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "PINSRQrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "PINSRWrri")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRBrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRDrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRQrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRWrri")>;
+
+def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup14], (instregex "FDECSTP")>;
+def: InstRW<[SKXWriteResGroup14], (instregex "MMX_MOVDQ2Qrr")>;
+
+def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup15], (instregex "CMOVA(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "CMOVBE(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROL(16|32|64)r1")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROL(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROL8r1")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROL8ri")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROR(16|32|64)r1")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROR(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROR8r1")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROR8ri")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "SETAr")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "SETBEr")>;
+
+def SKXWriteResGroup16 : SchedWriteRes<[SKXPort015]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup16], (instregex "BLENDVPDrr0")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "BLENDVPSrr0")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "PBLENDVBrr0")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPDYrr")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPDrr")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPSYrr")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPSrr")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VPBLENDVBYrr")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VPBLENDVBrr")>;
+
+def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup17], (instregex "LFENCE")>;
+def: InstRW<[SKXWriteResGroup17], (instregex "WAIT")>;
+def: InstRW<[SKXWriteResGroup17], (instregex "XGETBV")>;
+
+def SKXWriteResGroup18 : SchedWriteRes<[SKXPort0,SKXPort237]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup18], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVDQU")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPDYmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPSYmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPSmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVDYmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVDmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVQYmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVQmr")>;
+
+def SKXWriteResGroup19 : SchedWriteRes<[SKXPort5,SKXPort01]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup19], (instregex "PSLLDrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSLLQrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSLLWrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSRADrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSRAWrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSRLDrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSRLQrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSRLWrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLDrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLQrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLWrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRADZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRADrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAWrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLDrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLQrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLWrr")>;
+
+def SKXWriteResGroup20 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup20], (instregex "CLFLUSH")>;
+
+def SKXWriteResGroup21 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup21], (instregex "SFENCE")>;
+
+def SKXWriteResGroup22 : SchedWriteRes<[SKXPort06,SKXPort15]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup22], (instregex "BEXTR(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup22], (instregex "BSWAP(16|32|64)r")>;
+
+def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup23], (instregex "ADC8i8")>;
+def: InstRW<[SKXWriteResGroup23], (instregex "ADC8ri")>;
+def: InstRW<[SKXWriteResGroup23], (instregex "CWD")>;
+def: InstRW<[SKXWriteResGroup23], (instregex "JRCXZ")>;
+def: InstRW<[SKXWriteResGroup23], (instregex "SBB8i8")>;
+def: InstRW<[SKXWriteResGroup23], (instregex "SBB8ri")>;
+
+def SKXWriteResGroup24 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup24], (instregex "EXTRACTPSmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRBmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRDmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRQmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRWmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "STMXCSR")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VEXTRACTPSZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VEXTRACTPSmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRBmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRDmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRQZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRQmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRWZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRWmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VSTMXCSR")>;
+
+def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup25], (instregex "FNSTCW16m")>;
+
+def SKXWriteResGroup26 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup26], (instregex "SETAEm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETBm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETEm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETGEm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETGm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETLEm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETLm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETNEm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETNOm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETNPm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETNSm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETOm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETPm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETSm")>;
+
+def SKXWriteResGroup27 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup27], (instregex "MOVBE(16|32|64)mr")>;
+
+def SKXWriteResGroup28 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> {
+  let Latency = 2;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)r(mr)?")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "PUSH64i8")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "STOSB")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "STOSL")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "STOSQ")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "STOSW")>;
+
+def SKXWriteResGroup29 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> {
+  let Latency = 2;
+  let NumMicroOps = 5;
+  let ResourceCycles = [2,2,1];
+}
+def: InstRW<[SKXWriteResGroup29], (instregex "VMOVDQU8Zmr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup30 : SchedWriteRes<[SKXPort0]> {
+  let Latency = 3;
+  let NumMicroOps = 1;
+  let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup30],
(instregex "KADDBrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KADDDrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KADDQrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KADDWrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KMOVBrk")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KMOVDrk")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KMOVQrk")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KMOVWrk")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTBrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTDrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTQrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTWrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KTESTBrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KTESTDrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KTESTQrr")>; +def: InstRW<[SKXWriteResGroup30], (instregex "KTESTWrr")>; + +def SKXWriteResGroup31 : SchedWriteRes<[SKXPort1]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup31], (instregex "BSF(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup31], (instregex "BSR(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup31], (instregex "IMUL64rr(i8)?")>; +def: InstRW<[SKXWriteResGroup31], (instregex "IMUL8r")>; +def: InstRW<[SKXWriteResGroup31], (instregex "LZCNT(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup31], (instregex "MUL8r")>; +def: InstRW<[SKXWriteResGroup31], (instregex "PDEP(32|64)rr")>; +def: InstRW<[SKXWriteResGroup31], (instregex "PEXT(32|64)rr")>; +def: InstRW<[SKXWriteResGroup31], (instregex "POPCNT(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup31], (instregex "SHLD(16|32|64)rri8")>; +def: InstRW<[SKXWriteResGroup31], (instregex "SHRD(16|32|64)rri8")>; +def: InstRW<[SKXWriteResGroup31], (instregex "TZCNT(16|32|64)rr")>; + +def SKXWriteResGroup31_16 : SchedWriteRes<[SKXPort1, SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup31_16], (instregex "IMUL16rr(i8)?")>; + +def SKXWriteResGroup31_32 : SchedWriteRes<[SKXPort1]> { + let Latency = 3; + let NumMicroOps = 1; +} +def: InstRW<[SKXWriteResGroup31_32], (instregex "IMUL32rr(i8)?")>; + +def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> { + let Latency = 3; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FPrST0")>; +def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FST0r")>; +def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FrST0")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLBri")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLDri")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLQri")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLWri")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRBri")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRDri")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRQri")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRWri")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKBWrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKDQrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKWDrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "MMX_PSADBWirr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "PCMPGTQrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "PSADBWrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FPrST0")>; +def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FST0r")>; +def: 
InstRW<[SKXWriteResGroup32], (instregex "SUBR_FrST0")>; +def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FPrST0")>; +def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FST0r")>; +def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FrST0")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTF32X2Z256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTF32X2Zr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTI32X2Z256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTI32X2Zr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSDZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSSZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF128rr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x4Z256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x4Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x8Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x2Z256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x2Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x4Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI128rr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x4Z256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x4Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x8Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x2Z256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x2Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x4Zrr(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSSDrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSSSrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF128rr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x4Z256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x4Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x8Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x2Z256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x2Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x4Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI128rr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x4Z256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x4Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x8Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x2Z256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x2Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x4Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex 
"VPBROADCASTWrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZ256rri(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERM2F128rr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERM2I128rr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2D128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2D256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Drr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PD128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PD256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PDrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PS128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PS256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PSrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Q128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Q256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Qrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDYri")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQYri")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZ256r(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2D128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2D256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Drr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PD128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PD256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PDrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PS128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PS256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PSrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Q128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Q256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Qrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex 
"VPMAXSQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZ128rr(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWYrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWrr")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF32X4Z256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF32X4Zrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF64X2Z256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex 
"VSHUFF64X2Zrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI32X4Z256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI32X4Zrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI64X2Z256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI64X2Zrri(b?)(k?)(z?)")>; + +def SKXWriteResGroup33 : SchedWriteRes<[SKXPort0,SKXPort5]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup33], (instregex "EXTRACTPSrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "MMX_PEXTRWirri")>; +def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRBrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRDrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRQrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRWri")>; +def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRWrr_REV")>; +def: InstRW<[SKXWriteResGroup33], (instregex "PTESTrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VEXTRACTPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VEXTRACTPSrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRBrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRDrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRQrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWZrr(_REV)?")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWri")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWrr_REV")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPTESTYrr")>; +def: InstRW<[SKXWriteResGroup33], (instregex "VPTESTrr")>; + +def SKXWriteResGroup34 : SchedWriteRes<[SKXPort0,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup34], (instregex "FNSTSW16r")>; + +def SKXWriteResGroup35 : SchedWriteRes<[SKXPort06]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKXWriteResGroup35], (instregex "ROL(16|32|64)rCL")>; +def: InstRW<[SKXWriteResGroup35], (instregex "ROL8rCL")>; +def: InstRW<[SKXWriteResGroup35], (instregex "ROR(16|32|64)rCL")>; +def: InstRW<[SKXWriteResGroup35], (instregex "ROR8rCL")>; +def: InstRW<[SKXWriteResGroup35], (instregex "SAR(16|32|64)rCL")>; +def: InstRW<[SKXWriteResGroup35], (instregex "SAR8rCL")>; +def: InstRW<[SKXWriteResGroup35], (instregex "SHL(16|32|64)rCL")>; +def: InstRW<[SKXWriteResGroup35], (instregex "SHL8rCL")>; +def: InstRW<[SKXWriteResGroup35], (instregex "SHR(16|32|64)rCL")>; +def: InstRW<[SKXWriteResGroup35], (instregex "SHR8rCL")>; + +def SKXWriteResGroup36 : SchedWriteRes<[SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKXWriteResGroup36], (instregex "XADD(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup36], (instregex "XADD8rr")>; +def: InstRW<[SKXWriteResGroup36], (instregex "XCHG8rr")>; + +def SKXWriteResGroup37 : SchedWriteRes<[SKXPort0,SKXPort5]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PHADDSWrr64")>; +def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PHSUBSWrr64")>; + +def SKXWriteResGroup38 : SchedWriteRes<[SKXPort5,SKXPort01]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: 
InstRW<[SKXWriteResGroup38], (instregex "PHADDSWrr128")>; +def: InstRW<[SKXWriteResGroup38], (instregex "PHSUBSWrr128")>; +def: InstRW<[SKXWriteResGroup38], (instregex "VPHADDSWrr128")>; +def: InstRW<[SKXWriteResGroup38], (instregex "VPHADDSWrr256")>; +def: InstRW<[SKXWriteResGroup38], (instregex "VPHSUBSWrr128")>; +def: InstRW<[SKXWriteResGroup38], (instregex "VPHSUBSWrr256")>; + +def SKXWriteResGroup39 : SchedWriteRes<[SKXPort5,SKXPort05]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHADDWrr64")>; +def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHADDrr64")>; +def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHSUBDrr64")>; +def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHSUBWrr64")>; + +def SKXWriteResGroup40 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup40], (instregex "PHADDDrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "PHADDWrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "PHSUBDrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "PHSUBWrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDDYrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDDrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDWYrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDWrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBDYrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBDrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBWYrr")>; +def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBWrr")>; + +def SKXWriteResGroup41 : SchedWriteRes<[SKXPort5,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSDWirr")>; +def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSWBirr")>; +def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKUSWBirr")>; + +def SKXWriteResGroup42 : SchedWriteRes<[SKXPort6,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup42], (instregex "CLD")>; + +def SKXWriteResGroup43 : SchedWriteRes<[SKXPort237,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup43], (instregex "MFENCE")>; + +def SKXWriteResGroup44 : SchedWriteRes<[SKXPort06,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup44], (instregex "RCL(16|32|64)r1")>; +def: InstRW<[SKXWriteResGroup44], (instregex "RCL(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup44], (instregex "RCL8r1")>; +def: InstRW<[SKXWriteResGroup44], (instregex "RCL8ri")>; +def: InstRW<[SKXWriteResGroup44], (instregex "RCR(16|32|64)r1")>; +def: InstRW<[SKXWriteResGroup44], (instregex "RCR(16|32|64)ri")>; +def: InstRW<[SKXWriteResGroup44], (instregex "RCR8r1")>; +def: InstRW<[SKXWriteResGroup44], (instregex "RCR8ri")>; + +def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> { + let Latency = 3; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup45], (instregex "FNSTSWm")>; + +def SKXWriteResGroup46 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKXWriteResGroup46], (instregex "SETAm")>; +def: InstRW<[SKXWriteResGroup46], (instregex "SETBEm")>; + +def 
SKXWriteResGroup47 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup47], (instregex "CALL(16|32|64)r")>; + +def SKXWriteResGroup48 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 3; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup48], (instregex "CALL64pcrel32")>; + +def SKXWriteResGroup49 : SchedWriteRes<[SKXPort0]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup49], (instregex "AESDECLASTrr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "AESDECrr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "AESENCLASTrr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "AESENCrr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMADDUBSWrr64")>; +def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMADDWDirr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHRSWrr64")>; +def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHUWirr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHWirr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULLWirr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULUDQirr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FPrST0")>; +def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FST0r")>; +def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FrST0")>; +def: InstRW<[SKXWriteResGroup49], (instregex "RCPPSr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "RCPSSr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "RSQRTPSr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "RSQRTSSr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VAESDECLASTrr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VAESDECrr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VAESENCLASTrr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VAESENCrr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PDZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PDZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PSZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PSZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14SDrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14SSrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRCPPSYr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRCPPSr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRCPSSr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PDZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PDZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PSZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PSZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14SDrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14SSrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTPSYr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTPSr")>; +def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTSSr")>; + +def SKXWriteResGroup50 : SchedWriteRes<[SKXPort015]> { + let Latency = 4; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup50], (instregex "ADDPDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "ADDPSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex 
"ADDSDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "ADDSSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "ADDSUBPDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "ADDSUBPSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "CMPPDrri")>; +def: InstRW<[SKXWriteResGroup50], (instregex "CMPPSrri")>; +def: InstRW<[SKXWriteResGroup50], (instregex "CMPSDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "CMPSSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "CVTDQ2PSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "CVTPS2DQrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "CVTTPS2DQrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)PDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)PSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)SDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)SSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)PDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)PSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)SDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)SSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MULPDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MULPSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MULSDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "MULSSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "PHMINPOSUWrr128")>; +def: InstRW<[SKXWriteResGroup50], (instregex "PMADDUBSWrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "PMADDWDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "PMULDQrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "PMULHRSWrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "PMULHUWrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "PMULHWrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "PMULLWrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "PMULUDQrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "SUBPDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "SUBPSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "SUBSDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "SUBSSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPDYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPSYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPDYrri")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPDrri")>; +def: InstRW<[SKXWriteResGroup50], (instregex 
"VCMPPSYrri")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPSrri")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCMPSDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCMPSSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex 
"VFIXUPIMMPDZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMSDrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMSSrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], + (instregex + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Yr", + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z128r(b?)(k?)(z?)", + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z256r(b?)(k?)(z?)", + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Zr(b?)(k?)(z?)", + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)r", + "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)Zr(b?)(_Int)?(k?)(z?)", + "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPSDr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPSSr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTSDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTSSZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SDZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SSZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex 
"VMIN(C?)PDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SDZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SSZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULSDZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULSDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULSSZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VMULSSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPHMINPOSUWrr128")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZ256rr(b?)(k?)(z?)")>; 
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VRANGESDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VRANGESSZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZ256rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZrri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCESDZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCESSZ128rri(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFSDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFSSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex 
"VSUBPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSYrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSDZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSDrr")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSSZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSSrr")>; + +def SKXWriteResGroup51 : SchedWriteRes<[SKXPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup51], (instregex "MPSADBWrri")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VMPSADBWYrri")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VMPSADBWrri")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex 
"VPMOVSQBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup52 : SchedWriteRes<[SKXPort1,SKXPort5]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup52], (instregex "IMUL(32|64)r")>; +def: InstRW<[SKXWriteResGroup52], (instregex "MUL(32|64)r")>; +def: InstRW<[SKXWriteResGroup52], (instregex "MULX64rr")>; + +def SKXWriteResGroup52_16 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 4; + let NumMicroOps = 4; +} +def: InstRW<[SKXWriteResGroup52_16], (instregex "IMUL16r")>; +def: InstRW<[SKXWriteResGroup52_16], (instregex "MUL16r")>; + +def SKXWriteResGroup53 : SchedWriteRes<[SKXPort5,SKXPort01]> { + let Latency = 4; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDYrr")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQYrr")>; +def: InstRW<[SKXWriteResGroup53], (instregex 
"VPSLLQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWYrr")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADYrr")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWYrr")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDYrr")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQYrr")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWYrr")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWZrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup54 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> { + let Latency = 4; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP16m")>; +def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP32m")>; +def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP64m")>; +def: InstRW<[SKXWriteResGroup54], (instregex "IST_F16m")>; +def: InstRW<[SKXWriteResGroup54], (instregex "IST_F32m")>; +def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP16m")>; +def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP32m")>; +def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP64m")>; +def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZmr(b?)(k?)(z?)")>; + +def SKXWriteResGroup55 : SchedWriteRes<[SKXPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [4]; +} +def: InstRW<[SKXWriteResGroup55], (instregex "FNCLEX")>; + +def SKXWriteResGroup56 : SchedWriteRes<[SKXPort015,SKXPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKXWriteResGroup56], (instregex "VZEROUPPER")>; + +def SKXWriteResGroup57 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort0156]> { + let Latency = 4; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKXWriteResGroup57], (instregex "LAR(16|32|64)rr")>; + +def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> { + let Latency = 5; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64from64rm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64rm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64to64rm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVQ64rm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOV(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOV64toPQIrm")>; +def: 
InstRW<[SKXWriteResGroup58], (instregex "MOV8rm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOVDDUPrm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOVDI2PDIrm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOVQI2PQIrm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOVSDrm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOVSSrm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm16")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm32")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm8")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOVZX(16|32|64)rm16")>; +def: InstRW<[SKXWriteResGroup58], (instregex "MOVZX(16|32|64)rm8")>; +def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHNTA")>; +def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT0")>; +def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT1")>; +def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT2")>; +def: InstRW<[SKXWriteResGroup58], (instregex "VMOV64toPQIrm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "VMOVDDUPrm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "VMOVDI2PDIrm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "VMOVQI2PQIrm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "VMOVSDrm")>; +def: InstRW<[SKXWriteResGroup58], (instregex "VMOVSSrm")>; + +def SKXWriteResGroup59 : SchedWriteRes<[SKXPort015]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup59], (instregex "VCVTSD2SSZrr(b?)(_Int)?(k?)(z?)")>; + +def SKXWriteResGroup60 : SchedWriteRes<[SKXPort0,SKXPort5]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup60], (instregex "CVTDQ2PDrr")>; +def: InstRW<[SKXWriteResGroup60], (instregex "MMX_CVTPI2PDirr")>; +def: InstRW<[SKXWriteResGroup60], (instregex "VCVTDQ2PDrr")>; + +def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup61], (instregex "CVTPD2DQrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "CVTPD2PSrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "CVTPS2PDrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "CVTSD2SSrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI642SDrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI2SDrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI2SSrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "CVTSS2SDrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "CVTTPD2DQrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTPD2PIirr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTPS2PIirr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTTPD2PIirr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTTPS2PIirr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTDQ2PDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2DQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2DQrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2PSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2PSrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2UDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPH2PSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPH2PSrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], 
(instregex "VCVTPS2PDrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PHZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PHrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2QQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2UQQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTQQ2PSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSD2SSrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI642SDrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SDrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SSrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI642SDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSS2SDZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSS2SDrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2DQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2DQrr")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2UDQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPS2QQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPS2UQQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUDQ2PDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUQQ2PSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI2SDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI2SSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI642SDZrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup62 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup62], (instregex "VPCONFLICTQZ128rr(b?)(k?)(z?)")>; + +def SKXWriteResGroup63 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup63], (instregex "STR(16|32|64)r")>; + +def SKXWriteResGroup64 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup64], (instregex "MULX32rr")>; + +def SKXWriteResGroup65 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort015]> { + let Latency = 5; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZmr(b?)(k?)(z?)")>; + +def SKXWriteResGroup66 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> { + let Latency = 5; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZ128mr(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZmr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZ128mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZ256mr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZmr(b?)(k?)(z?)")>; + +def SKXWriteResGroup67 : SchedWriteRes<[SKXPort06,SKXPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [1,4]; +} +def: 
InstRW<[SKXWriteResGroup67], (instregex "XSETBV")>; + +def SKXWriteResGroup68 : SchedWriteRes<[SKXPort06,SKXPort0156]> { + let Latency = 5; + let NumMicroOps = 5; + let ResourceCycles = [2,3]; +} +def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG(16|32|64)rr")>; +def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG8rr")>; + +def SKXWriteResGroup69 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> { + let Latency = 5; + let NumMicroOps = 6; + let ResourceCycles = [1,1,4]; +} +def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF16")>; +def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF64")>; + +def SKXWriteResGroup70 : SchedWriteRes<[SKXPort5]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup70], (instregex "PCLMULQDQrr")>; +def: InstRW<[SKXWriteResGroup70], (instregex "VPCLMULQDQrr")>; + +def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> { + let Latency = 6; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup71], (instregex "LDDQUrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "MOVAPDrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "MOVAPSrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "MOVDQArm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "MOVDQUrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "MOVNTDQArm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "MOVSHDUPrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "MOVSLDUPrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "MOVUPDrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "MOVUPSrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VBROADCASTSSrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VLDDQUrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VMOVAPDrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VMOVAPSrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VMOVDQArm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VMOVDQUrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VMOVNTDQArm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VMOVSHDUPrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VMOVSLDUPrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VMOVUPDrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VMOVUPSrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VPBROADCASTDrm")>; +def: InstRW<[SKXWriteResGroup71], (instregex "VPBROADCASTQrm")>; + +def SKXWriteResGroup72 : SchedWriteRes<[SKXPort0]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def: InstRW<[SKXWriteResGroup72], (instregex "MMX_CVTPI2PSirr")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex 
"VPCOMPRESSQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup73 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSBirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSWirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDUSBirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDUSWirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PAVGBirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PAVGWirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQBirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQDirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQWirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTBirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTDirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTWirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMAXSWirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMAXUBirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMINSWirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMINUBirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLDrm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLQrm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLWrm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRADrm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRAWrm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLDrm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLQrm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLWrm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBSBirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBSWirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBUSBirm")>; +def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBUSWirm")>; + +def SKXWriteResGroup74 : SchedWriteRes<[SKXPort0,SKXPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup74], (instregex "CVTSD2SI64rr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "CVTSD2SIrr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "CVTSS2SI64rr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "CVTSS2SIrr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "CVTTSD2SI64rr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "CVTTSD2SIrr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SI64Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SI64rr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SIZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SIrr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2USI64Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2USIZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SI64Zrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SI64rr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SIZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SIrr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2USIZrr(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SI64Zrr(b?)")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SI64rr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SIZrr(b?)")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SIrr")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2USI64Zrr(b?)")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2USIZrr(b?)")>; +def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSS2USIZrr(b?)")>; + +def SKXWriteResGroup75 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PALIGNR64irm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PINSRWirmi")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PSHUFBrm64")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PSHUFWmi")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHBWirm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHDQirm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHWDirm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLBWirm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLDQirm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLWDirm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MOVHPDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MOVHPSrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MOVLPDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "MOVLPSrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PINSRBrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PINSRDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PINSRQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PINSRWrmi")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBWrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXDQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXWDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXWQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBWrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXDQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXWDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXWQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPSrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPSrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRBrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRWZrm(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup75], (instregex "VPINSRWrmi")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBWrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXDQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXWDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXWQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBWrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXDQrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXWDrm")>; +def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXWQrm")>; + +def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup76], (instregex "FARJMP64")>; +def: InstRW<[SKXWriteResGroup76], (instregex "JMP(16|32|64)m")>; + +def SKXWriteResGroup77 : SchedWriteRes<[SKXPort23,SKXPort05]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSBrm64")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSDrm64")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSWrm64")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDBirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDDirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDQirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDWirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PANDNirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PANDirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PORirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNBrm64")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNDrm64")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNWrm64")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBBirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBDirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBQirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBWirm")>; +def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PXORirm")>; + +def SKXWriteResGroup78 : SchedWriteRes<[SKXPort23,SKXPort06]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup78], (instregex "ADC(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "ADC8rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "ADCX(32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "ADOX(32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "BT(16|32|64)mi8")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVAE(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVB(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVE(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVG(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVGE(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVL(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVLE(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNE(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNO(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNP(16|32|64)rm")>; +def: 
InstRW<[SKXWriteResGroup78], (instregex "CMOVNS(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVO(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVP(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "CMOVS(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "RORX(32|64)mi")>; +def: InstRW<[SKXWriteResGroup78], (instregex "SARX(32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "SBB(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "SBB8rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "SHLX(32|64)rm")>; +def: InstRW<[SKXWriteResGroup78], (instregex "SHRX(32|64)rm")>; + +def SKXWriteResGroup79 : SchedWriteRes<[SKXPort23,SKXPort15]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup79], (instregex "ANDN(32|64)rm")>; +def: InstRW<[SKXWriteResGroup79], (instregex "BLSI(32|64)rm")>; +def: InstRW<[SKXWriteResGroup79], (instregex "BLSMSK(32|64)rm")>; +def: InstRW<[SKXWriteResGroup79], (instregex "BLSR(32|64)rm")>; +def: InstRW<[SKXWriteResGroup79], (instregex "BZHI(32|64)rm")>; +def: InstRW<[SKXWriteResGroup79], (instregex "MOVBE(16|32|64)rm")>; + +def SKXWriteResGroup80 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup80], (instregex "VMOV(64to|QI2)PQIZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup80], (instregex "VMOVDI2PDIZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup81 : SchedWriteRes<[SKXPort23,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup81], (instregex "ADD(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "ADD8rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "AND(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "AND8rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)mi")>; +def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)mr")>; +def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "CMP8mi")>; +def: InstRW<[SKXWriteResGroup81], (instregex "CMP8mr")>; +def: InstRW<[SKXWriteResGroup81], (instregex "CMP8rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "OR(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "OR8rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)r(mr)?")>; +def: InstRW<[SKXWriteResGroup81], (instregex "SUB(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "SUB8rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "TEST(16|32|64)mr")>; +def: InstRW<[SKXWriteResGroup81], (instregex "TEST8mi")>; +def: InstRW<[SKXWriteResGroup81], (instregex "TEST8mr")>; +def: InstRW<[SKXWriteResGroup81], (instregex "XOR(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup81], (instregex "XOR8rm")>; + +def SKXWriteResGroup82 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 6; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup82], (instregex "CVTSI642SSrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "HADDPDrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "HADDPSrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "HSUBPDrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "HSUBPSrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "VCVTSI642SSrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "VCVTSI642SSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup82], (instregex 
"VCVTUSI642SSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPDYrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPDrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPSYrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPSrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPDYrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPDrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPSYrr")>; +def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPSrr")>; + +def SKXWriteResGroup83 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,2,1]; +} +def: InstRW<[SKXWriteResGroup83], (instregex "SHLD(16|32|64)rrCL")>; +def: InstRW<[SKXWriteResGroup83], (instregex "SHRD(16|32|64)rrCL")>; + +def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup84], (instregex "SLDT(16|32|64)r")>; + +def SKXWriteResGroup85 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup85], (instregex "VCVTPS2PHmr")>; + +def SKXWriteResGroup86 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup86], (instregex "BTC(16|32|64)mi8")>; +def: InstRW<[SKXWriteResGroup86], (instregex "BTR(16|32|64)mi8")>; +def: InstRW<[SKXWriteResGroup86], (instregex "BTS(16|32|64)mi8")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SAR(16|32|64)m1")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SAR(16|32|64)mi")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SAR8m1")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SAR8mi")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SHL(16|32|64)m1")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SHL(16|32|64)mi")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SHL8m1")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SHL8mi")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SHR(16|32|64)m1")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SHR(16|32|64)mi")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SHR8m1")>; +def: InstRW<[SKXWriteResGroup86], (instregex "SHR8mi")>; + +def SKXWriteResGroup87 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup87], (instregex "ADD(16|32|64)mi")>; +def: InstRW<[SKXWriteResGroup87], (instregex "ADD(16|32|64)mr")>; +def: InstRW<[SKXWriteResGroup87], (instregex "ADD8mi")>; +def: InstRW<[SKXWriteResGroup87], (instregex "ADD8mr")>; +def: InstRW<[SKXWriteResGroup87], (instregex "AND(16|32|64)mi")>; +def: InstRW<[SKXWriteResGroup87], (instregex "AND(16|32|64)mr")>; +def: InstRW<[SKXWriteResGroup87], (instregex "AND8mi")>; +def: InstRW<[SKXWriteResGroup87], (instregex "AND8mr")>; +def: InstRW<[SKXWriteResGroup87], (instregex "DEC(16|32|64)m")>; +def: InstRW<[SKXWriteResGroup87], (instregex "DEC8m")>; +def: InstRW<[SKXWriteResGroup87], (instregex "INC(16|32|64)m")>; +def: InstRW<[SKXWriteResGroup87], (instregex "INC8m")>; +def: InstRW<[SKXWriteResGroup87], (instregex "NEG(16|32|64)m")>; +def: InstRW<[SKXWriteResGroup87], (instregex "NEG8m")>; +def: InstRW<[SKXWriteResGroup87], (instregex "NOT(16|32|64)m")>; +def: InstRW<[SKXWriteResGroup87], (instregex 
"NOT8m")>; +def: InstRW<[SKXWriteResGroup87], (instregex "OR(16|32|64)mi")>; +def: InstRW<[SKXWriteResGroup87], (instregex "OR(16|32|64)mr")>; +def: InstRW<[SKXWriteResGroup87], (instregex "OR8mi")>; +def: InstRW<[SKXWriteResGroup87], (instregex "OR8mr")>; +def: InstRW<[SKXWriteResGroup87], (instregex "POP(16|32|64)rmm")>; +def: InstRW<[SKXWriteResGroup87], (instregex "PUSH(16|32|64)rmm")>; +def: InstRW<[SKXWriteResGroup87], (instregex "SUB(16|32|64)mi")>; +def: InstRW<[SKXWriteResGroup87], (instregex "SUB(16|32|64)mr")>; +def: InstRW<[SKXWriteResGroup87], (instregex "SUB8mi")>; +def: InstRW<[SKXWriteResGroup87], (instregex "SUB8mr")>; +def: InstRW<[SKXWriteResGroup87], (instregex "XOR(16|32|64)mi")>; +def: InstRW<[SKXWriteResGroup87], (instregex "XOR(16|32|64)mr")>; +def: InstRW<[SKXWriteResGroup87], (instregex "XOR8mi")>; +def: InstRW<[SKXWriteResGroup87], (instregex "XOR8mr")>; + +def SKXWriteResGroup88 : SchedWriteRes<[SKXPort6,SKXPort0156]> { + let Latency = 6; + let NumMicroOps = 6; + let ResourceCycles = [1,5]; +} +def: InstRW<[SKXWriteResGroup88], (instregex "STD")>; + +def SKXWriteResGroup89 : SchedWriteRes<[SKXPort23]> { + let Latency = 7; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup89], (instregex "LD_F32m")>; +def: InstRW<[SKXWriteResGroup89], (instregex "LD_F64m")>; +def: InstRW<[SKXWriteResGroup89], (instregex "LD_F80m")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTF128")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTI128")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTSDYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTSSYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VLDDQUYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVAPDYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVAPSYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDDUPYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDQAYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDQUYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVNTDQAYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVNTDQAZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVSHDUPYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVSLDUPYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVUPDYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VMOVUPSYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VPBROADCASTDYrm")>; +def: InstRW<[SKXWriteResGroup89], (instregex "VPBROADCASTQYrm")>; + +def SKXWriteResGroup90 : SchedWriteRes<[SKXPort0,SKXPort5]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup90], (instregex "VCVTDQ2PDYrr")>; + +def SKXWriteResGroup91 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup91], (instregex "COMISDrm")>; +def: InstRW<[SKXWriteResGroup91], (instregex "COMISSrm")>; +def: InstRW<[SKXWriteResGroup91], (instregex "UCOMISDrm")>; +def: InstRW<[SKXWriteResGroup91], (instregex "UCOMISSrm")>; +def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISDrm")>; +def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISSrm")>; +def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup91], (instregex 
"VUCOMISDrm")>; +def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISSrm")>; + +def SKXWriteResGroup92 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup92], (instregex "INSERTPSrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PACKSSDWrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PACKSSWBrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PACKUSDWrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PACKUSWBrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PALIGNRrmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PBLENDWrmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFBrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFDmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFHWmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFLWmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHBWrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHDQrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHQDQrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHWDrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLBWrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLDQrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLQDQrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLWDrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "SHUFPDrmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "SHUFPSrmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKHPDrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKHPSrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKLPDrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKLPSrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VINSERTPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VINSERTPSrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSDWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSDWrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSWBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSWBrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSDWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSDWrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSWBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSWBrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPALIGNRZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPALIGNRrmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPBLENDWrmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTBZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTBrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTWZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTWrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex 
"VPERMILPSZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFBrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFDZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFDmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFHWZ128mi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFHWmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFLWZ128mi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFLWmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPSLLDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPSRLDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHBWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHBWrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHDQrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHQDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHQDQrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHWDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHWDrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLBWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLBWrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLDQrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLQDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLQDQrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLWDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLWDrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPDZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPDrmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPSZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPSrmi")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPDrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPSrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPDrm")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPSrm")>; + +def SKXWriteResGroup93 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQYrr")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSYrr")>; +def: 
InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2UDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2UDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSYrr")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDYrr")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHYrr")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2QQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2QQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2UQQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2UQQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTQQ2PSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTQQ2PSZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQYrr")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2UDQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2UDQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2QQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2QQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2UQQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2UQQZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUDQ2PDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUDQ2PDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUQQ2PSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUQQ2PSZrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup94 : SchedWriteRes<[SKXPort01,SKXPort23]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup94], (instregex "PABSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PABSDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PABSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PADDSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PADDSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PADDUSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PADDUSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PAVGBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PAVGWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQQrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTWrm")>; +def: 
InstRW<[SKXWriteResGroup94], (instregex "PMAXSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMAX(C?)SDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMAXSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMINSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMIN(C?)SDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMINSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMINUBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMINUDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PMINUWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNBrm128")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNDrm128")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNWrm128")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSLLDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSLLQrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSLLWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSRADrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSRAWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSRLDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSRLQrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSRLWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSUBSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSUBSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSUBUSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "PSUBUSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPABSBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPABSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPABSDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPABSDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPABSQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPABSWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPABSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQQrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex 
"VPMAX(C?)SDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAX(C?)SDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMIN(C?)SDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMIN(C?)SDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPROLDZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPROLQZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPROLVDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPROLVQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPRORDZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPRORQZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPRORVDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPRORVQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNBrm128")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNDrm128")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNWrm128")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVQrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWZ128mi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex 
"VPSRAQZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWZ128mi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQZ128m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVDrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVQrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWZ128mi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSWrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSBrm")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSWrm")>; + +def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 7; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup95], (instregex "ANDNPDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "ANDNPSrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "ANDPDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "ANDPSrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "BLENDPDrmi")>; +def: InstRW<[SKXWriteResGroup95], (instregex "BLENDPSrmi")>; +def: InstRW<[SKXWriteResGroup95], (instregex "ORPDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "ORPSrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PADDBrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PADDDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PADDQrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PADDWrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PANDNrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PANDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PORrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PSUBBrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PSUBDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PSUBQrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PSUBWrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "PXORrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex 
"VANDNPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPSrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VANDPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VANDPDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VANDPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VANDPSrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDPDrmi")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDPSrmi")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VBROADCASTI32X2Z128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VBROADCASTSSZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VINSERTF128rm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VINSERTI128rm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMASKMOVPDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMASKMOVPSrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVAPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVAPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDDUPZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQA32Z128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQA64Z128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU16Z128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU32Z128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU64Z128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU8Z128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVNTDQAZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVSHDUPZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVSLDUPZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVUPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VMOVUPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VORPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VORPDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VORPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VORPSrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPADDBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPADDBrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPADDDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPADDDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPADDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPADDQrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPADDWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPADDWrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPANDDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPANDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPANDrm")>; +def: 
InstRW<[SKXWriteResGroup95], (instregex "VPBLENDDrmi")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPBROADCASTDZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPBROADCASTQZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPMASKMOVDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPMASKMOVQrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPORDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPORQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPORrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBBrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBQrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBWrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPTERNLOGDZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPTERNLOGQZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPXORDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPXORQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VPXORrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VXORPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VXORPDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VXORPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup95], (instregex "VXORPSrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "XORPDrm")>; +def: InstRW<[SKXWriteResGroup95], (instregex "XORPSrm")>; + +def SKXWriteResGroup96 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSDWirm")>; +def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSWBirm")>; +def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKUSWBirm")>; + +def SKXWriteResGroup97 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2Wrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2W128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2W256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2Wrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup98 : SchedWriteRes<[SKXPort23,SKXPort06]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup98], (instregex "CMOVA(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup98], (instregex "CMOVBE(16|32|64)rm")>; + +def SKXWriteResGroup99 : SchedWriteRes<[SKXPort23,SKXPort0156]> { + let Latency = 7; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: 
InstRW<[SKXWriteResGroup99], (instregex "LEAVE64")>;
+def: InstRW<[SKXWriteResGroup99], (instregex "SCASB")>;
+def: InstRW<[SKXWriteResGroup99], (instregex "SCASL")>;
+def: InstRW<[SKXWriteResGroup99], (instregex "SCASQ")>;
+def: InstRW<[SKXWriteResGroup99], (instregex "SCASW")>;
+
+def SKXWriteResGroup100 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup100], (instregex "CVTTSS2SI64rr")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "CVTTSS2SIrr")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTSS2USI64Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SI64Zrr(b?)")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SI64rr")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SIZrr(b?)")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SIrr")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2USI64Zrr(b?)")>;
+
+def SKXWriteResGroup101 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup101], (instregex "FLDCW16m")>;
+
+def SKXWriteResGroup102 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort0156]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup102], (instregex "LDMXCSR")>;
+def: InstRW<[SKXWriteResGroup102], (instregex "VLDMXCSR")>;
+
+def SKXWriteResGroup103 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort0156]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOVBkm")>;
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOVDkm")>;
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOVQkm")>;
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOVWkm")>;
+
+def SKXWriteResGroup104 : SchedWriteRes<[SKXPort6,SKXPort23,SKXPort0156]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup104], (instregex "LRETQ")>;
+def: InstRW<[SKXWriteResGroup104], (instregex "RETQ")>;
+
+def SKXWriteResGroup105 : SchedWriteRes<[SKXPort23,SKXPort06,SKXPort15]> {
+  let Latency = 7;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup105], (instregex "BEXTR(32|64)rm")>;
+
+def SKXWriteResGroup106 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
+  let Latency = 7;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZmr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup107 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
+  let Latency = 7;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL8m1")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL8mi")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROR(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROR(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROR8m1")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROR8mi")>;
+
+def SKXWriteResGroup108 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
+  let Latency = 7;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup108], (instregex "XADD(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup108], (instregex "XADD8rm")>;
+
+def SKXWriteResGroup109 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
+  let Latency = 7;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[SKXWriteResGroup109], (instregex "FARCALL64")>;
+
+def SKXWriteResGroup110 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
+  let Latency = 7;
+  let NumMicroOps = 7;
+  let ResourceCycles = [1,2,2,2];
+}
+def: InstRW<[SKXWriteResGroup110], (instrs VPSCATTERDQZ128mr,
+                                           VPSCATTERQQZ128mr,
+                                           VSCATTERDPDZ128mr,
+                                           VSCATTERQPDZ128mr)>;
+
+def SKXWriteResGroup111 : SchedWriteRes<[SKXPort6,SKXPort06,SKXPort15,SKXPort0156]> {
+  let Latency = 7;
+  let NumMicroOps = 7;
+  let ResourceCycles = [1,3,1,2];
+}
+def: InstRW<[SKXWriteResGroup111], (instregex "LOOP")>;
+
+def SKXWriteResGroup112 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
+  let Latency = 7;
+  let NumMicroOps = 11;
+  let ResourceCycles = [1,4,4,2];
+}
+def: InstRW<[SKXWriteResGroup112], (instrs VPSCATTERDQZ256mr,
+                                           VPSCATTERQQZ256mr,
+                                           VSCATTERDPDZ256mr,
+                                           VSCATTERQPDZ256mr)>;
+
+def SKXWriteResGroup113 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
+  let Latency = 7;
+  let NumMicroOps = 19;
+  let ResourceCycles = [1,8,8,2];
+}
+def: InstRW<[SKXWriteResGroup113], (instrs VPSCATTERDQZmr,
+                                           VPSCATTERQQZmr,
+                                           VSCATTERDPDZmr,
+                                           VSCATTERQPDZmr)>;
+
+def SKXWriteResGroup114 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+  let Latency = 7;
+  let NumMicroOps = 36;
+  let ResourceCycles = [1,16,1,16,2];
+}
+def: InstRW<[SKXWriteResGroup114], (instrs VSCATTERDPSZmr)>;
+
+def SKXWriteResGroup115 : SchedWriteRes<[SKXPort0]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup115], (instregex "AESIMCrr")>;
+def: InstRW<[SKXWriteResGroup115], (instregex "VAESIMCrr")>;
+
+def SKXWriteResGroup116 : SchedWriteRes<[SKXPort015]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+  let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup116], (instregex "PMULLDrr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDPDr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDPSr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDSDr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDSSr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDYrr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDrr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALESDr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALESSr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDPDr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDPSr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDSDr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDSSr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDYPDr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDYPSr")>;
+
+def SKXWriteResGroup117 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup117], (instregex "VTESTPDrm")>;
+def: InstRW<[SKXWriteResGroup117], (instregex "VTESTPSrm")>;
+
+def SKXWriteResGroup118 : SchedWriteRes<[SKXPort1,SKXPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup118], (instregex "BSF(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "BSR(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "IMUL64m")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "IMUL(32|64)rm(i8)?")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "IMUL8m")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "LZCNT(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "MUL(16|32|64)m")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "MUL8m")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "PDEP(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "PEXT(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "POPCNT(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "TZCNT(16|32|64)rm")>;
+
+def SKXWriteResGroup118_16_1 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup118_16_1], (instregex "IMUL16rm(i8)?")>;
+
+def SKXWriteResGroup118_16_2 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+}
+def: InstRW<[SKXWriteResGroup118_16_2], (instregex "IMUL16m")>;
+def: InstRW<[SKXWriteResGroup118_16_2], (instregex "MUL16m")>;
+
+def SKXWriteResGroup118_32 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 3;
+  let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup118_32], (instregex "IMUL32m")>;
+def: InstRW<[SKXWriteResGroup118_32], (instregex "MUL32m")>;
+
+def SKXWriteResGroup119 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOM32m")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOM64m")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOMP32m")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOMP64m")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "MMX_PSADBWirm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VFPCLASSSDrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRYrmi")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRZ256rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRZrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPBLENDWYrmi")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBZm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWZm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDYmi")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSYmi")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXBDYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXBQYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXWQYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDYmi")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWYmi")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWZ256mi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWZmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWYmi")>; +def: InstRW<[SKXWriteResGroup119], (instregex 
"VPSHUFLWZ256mi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWZmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSLLDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSLLDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSRLDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPSRLDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDYrmi")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSYrmi")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSYrm")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSZrm(b?)(k?)(z?)")>; + +def 
SKXWriteResGroup120 : SchedWriteRes<[SKXPort01,SKXPort23]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQQYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBZrm(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPROLDZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPROLDZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPROLQZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPROLQZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPRORDZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPRORDZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPRORQZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPRORQZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNBYrm256")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNDYrm256")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNWYrm256")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], 
(instregex "VPSLLQYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZ256mi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZ256mi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex 
"VPSRLQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZ256mi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWYrm")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDPDYrmi")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDPSYrmi")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X2Z256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X2Zm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X4Z256rm(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X4rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X8rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X2Z128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X2rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X4rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X2Z256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X2Zm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X4Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X4rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X8rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X2Z128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X2rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X4rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSDZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSDZm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSSZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSSZm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x4Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x4Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x8Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x2Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x2Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x4Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x4Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x4Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x8Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x2Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x2Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x4Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMASKMOVPDYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMASKMOVPSYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDDUPZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDDUPZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA32Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA32Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA64Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA64Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU16Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU16Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU32Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], 
(instregex "VMOVDQU32Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU64Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU64Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU8Z256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU8Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVNTDQAZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSHDUPZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSHDUPZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSLDUPZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSLDUPZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VORPDYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VORPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VORPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VORPSYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VORPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VORPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPANDDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPANDDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPANDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPANDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPANDYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDDYrmi")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], 
(instregex "VPBLENDMQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTDZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTDZm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTQZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTQZm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPMASKMOVDYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPMASKMOVQYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPORDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPORDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPORQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPORQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPORYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBWYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGDZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGDZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGQZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGQZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPXORDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPXORDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPXORQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPXORQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VPXORYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSYrm")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup122 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 8; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup122], (instregex "BLENDVPDrm0")>; +def: InstRW<[SKXWriteResGroup122], (instregex "BLENDVPSrm0")>; +def: InstRW<[SKXWriteResGroup122], (instregex "PBLENDVBrm0")>; +def: InstRW<[SKXWriteResGroup122], (instregex "VBLENDVPDrm")>; +def: InstRW<[SKXWriteResGroup122], (instregex "VBLENDVPSrm")>; +def: InstRW<[SKXWriteResGroup122], (instregex "VPBLENDVBYrm")>; +def: InstRW<[SKXWriteResGroup122], (instregex "VPBLENDVBrm")>; + +def SKXWriteResGroup123 : 
SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PHADDSWrm64")>;
+def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PHSUBSWrm64")>;
+
+def SKXWriteResGroup124 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort05]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+  let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHADDWrm64")>;
+def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHADDrm64")>;
+def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHSUBDrm64")>;
+def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHSUBWrm64")>;
+
+def SKXWriteResGroup125 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> {
+  let Latency = 8;
+  let NumMicroOps = 4;
+  let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup125], (instregex "VCVTPS2PHYmr")>;
+
+def SKXWriteResGroup126 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,3];
+}
+def: InstRW<[SKXWriteResGroup126], (instregex "ROR(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup126], (instregex "ROR8mCL")>;
+
+def SKXWriteResGroup127 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+  let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL8m1")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL8mi")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCR(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCR(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCR8m1")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCR8mi")>;
+
+def SKXWriteResGroup128 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
+  let Latency = 8;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[SKXWriteResGroup128], (instregex "ROL(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "ROL8mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SAR(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SAR8mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SHL(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SHL8mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SHR(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SHR8mCL")>;
+
+def SKXWriteResGroup129 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
+  let Latency = 8;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[SKXWriteResGroup129], (instregex "ADC(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup129], (instregex "ADC8mi")>;
+
+def SKXWriteResGroup130 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+  let Latency = 8;
+  let NumMicroOps = 6;
+  let ResourceCycles = [1,1,1,2,1];
+}
+def: InstRW<[SKXWriteResGroup130], (instregex "ADC(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "ADC8mr")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG8rm")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "SBB(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "SBB(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "SBB8mi")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "SBB8mr")>;
+
+def SKXWriteResGroup131 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+  let Latency = 8;
+  let NumMicroOps = 8;
+  let ResourceCycles = [1,2,1,2,2];
+}
+def: InstRW<[SKXWriteResGroup131], (instrs VPSCATTERQDZ128mr,
+                                           VPSCATTERQDZ256mr,
+                                           VSCATTERQPSZ128mr,
+                                           VSCATTERQPSZ256mr)>;
+
+def SKXWriteResGroup132 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+  let Latency = 8;
+  let NumMicroOps = 12;
+  let ResourceCycles = [1,4,1,4,2];
+}
+def: InstRW<[SKXWriteResGroup132], (instrs VPSCATTERDDZ128mr,
+                                           VSCATTERDPSZ128mr)>;
+
+def SKXWriteResGroup133 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+  let Latency = 8;
+  let NumMicroOps = 20;
+  let ResourceCycles = [1,8,1,8,2];
+}
+def: InstRW<[SKXWriteResGroup133], (instrs VPSCATTERDDZ256mr,
+                                           VSCATTERDPSZ256mr)>;
+
+def SKXWriteResGroup134 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+  let Latency = 8;
+  let NumMicroOps = 36;
+  let ResourceCycles = [1,16,1,16,2];
+}
+def: InstRW<[SKXWriteResGroup134], (instrs VPSCATTERDDZmr)>;
+
+def SKXWriteResGroup135 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_CVTPI2PSirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMADDUBSWrm64")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMADDWDirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHRSWrm64")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHUWirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHWirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULLWirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULUDQirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "RCPSSm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "RSQRTSSm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "VRCPSSm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "VRSQRTSSm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "VTESTPDYrm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "VTESTPSYrm")>;
+
+def SKXWriteResGroup136 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+  let Latency = 9;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup136], (instregex "PCMPGTQrm")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "PSADBWrm")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNQZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VCMPPDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VCMPPSZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VCMPSDZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VCMPSSZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VDBPSADBWZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VFPCLASSSSrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPBZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPDZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex
"VPCMPGTBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTQrm")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPQZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUBZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUDZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUQZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUWZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPWZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2D128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2PD128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2PS128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2Q128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2D128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2PD128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2PS128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2Q128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMAXSQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMAXUQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMINSQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMINUQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBWYrm")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXDQYrm")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWDYrm")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWDYrm")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPSADBWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPSADBWrm")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMBZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex 
"VPTESTNMDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMWZ128rm(b?)(k?)(z?)")>; + +def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup137], (instregex "ADDSDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "ADDSSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "CMPSDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "CMPSSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "CVTPS2PDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "MAX(C?)SDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "MAX(C?)SSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "MIN(C?)SDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "MIN(C?)SSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVTPS2PIirm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVTTPS2PIirm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "MULSDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "MULSSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "SUBSDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "SUBSSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VADDSDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VADDSSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VCMPSDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VCMPSSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VCVTPH2PSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VCVTPS2PDrm")>; +def: InstRW<[SKXWriteResGroup137], + (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VMAX(C?)SDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VMAX(C?)SSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VMIN(C?)SDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VMIN(C?)SSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VMULSDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VMULSSrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VSUBSDrm")>; +def: InstRW<[SKXWriteResGroup137], (instregex "VSUBSSrm")>; + +def SKXWriteResGroup138 : SchedWriteRes<[SKXPort0,SKXPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup138], (instregex "VRCP14PDZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup138], (instregex "VRCP14PSZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup138], (instregex "VRSQRT14PDZr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup138], (instregex "VRSQRT14PSZr(b?)(k?)(z?)")>; + +def SKXWriteResGroup139 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup139], (instregex "DPPDrri")>; +def: InstRW<[SKXWriteResGroup139], (instregex "VDPPDrri")>; + +def SKXWriteResGroup140 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup140], (instregex "VBLENDVPDYrm")>; +def: InstRW<[SKXWriteResGroup140], (instregex "VBLENDVPSYrm")>; + +def SKXWriteResGroup141 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup141], (instregex "PTESTrm")>; +def: InstRW<[SKXWriteResGroup141], (instregex "VPTESTrm")>; + +def SKXWriteResGroup142 : 
SchedWriteRes<[SKXPort1,SKXPort5,SKXPort23]> { + let Latency = 9; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup142], (instregex "MULX64rm")>; + +def SKXWriteResGroup143 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup143], (instregex "PHADDSWrm128")>; +def: InstRW<[SKXWriteResGroup143], (instregex "PHSUBSWrm128")>; +def: InstRW<[SKXWriteResGroup143], (instregex "VPHADDSWrm128")>; +def: InstRW<[SKXWriteResGroup143], (instregex "VPHSUBSWrm128")>; + +def SKXWriteResGroup144 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup144], (instregex "PHADDDrm")>; +def: InstRW<[SKXWriteResGroup144], (instregex "PHADDWrm")>; +def: InstRW<[SKXWriteResGroup144], (instregex "PHSUBDrm")>; +def: InstRW<[SKXWriteResGroup144], (instregex "PHSUBWrm")>; +def: InstRW<[SKXWriteResGroup144], (instregex "VPHADDDrm")>; +def: InstRW<[SKXWriteResGroup144], (instregex "VPHADDWrm")>; +def: InstRW<[SKXWriteResGroup144], (instregex "VPHSUBDrm")>; +def: InstRW<[SKXWriteResGroup144], (instregex "VPHSUBWrm")>; + +def SKXWriteResGroup145 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort0156]> { + let Latency = 9; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup145], (instregex "SHLD(16|32|64)mri8")>; +def: InstRW<[SKXWriteResGroup145], (instregex "SHRD(16|32|64)mri8")>; + +def SKXWriteResGroup146 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> { + let Latency = 9; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup146], (instregex "LAR(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup146], (instregex "LSL(16|32|64)rm")>; + +def SKXWriteResGroup147 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup147], (instregex "AESDECLASTrm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "AESDECrm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "AESENCLASTrm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "AESENCrm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "RCPPSm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "RSQRTPSm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VAESDECLASTrm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VAESDECrm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VAESENCLASTrm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VAESENCrm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14PDZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14PSZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14SDrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14SSrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VRCPPSm")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14PDZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14PSZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14SDrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14SSrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRTPSm")>; + +def SKXWriteResGroup148 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup148], 
(instregex "ADD_F32m")>; +def: InstRW<[SKXWriteResGroup148], (instregex "ADD_F64m")>; +def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F16m")>; +def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F32m")>; +def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F64m")>; +def: InstRW<[SKXWriteResGroup148], (instregex "SUBR_F32m")>; +def: InstRW<[SKXWriteResGroup148], (instregex "SUBR_F64m")>; +def: InstRW<[SKXWriteResGroup148], (instregex "SUB_F32m")>; +def: InstRW<[SKXWriteResGroup148], (instregex "SUB_F64m")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNDZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNDZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNQZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNQZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPDZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPDZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPSZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPSZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VDBPSADBWZ256rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VDBPSADBWZrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPBZ256rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPBZrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPDZ256rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPDZrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQYrm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPQZ256rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPQZrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUBZ256rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUBZrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUDZ256rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUDZrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUQZ256rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUQZrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], 
(instregex "VPCMPUWZ256rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUWZrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPWZ256rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPWZrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERM2F128rm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERM2I128rm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDYrm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2D256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Drm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PD256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PDrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PS256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PSrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Q256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Qrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDYmi")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSYrm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQYmi")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZ256m(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2D256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Drm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PD256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PDrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PS256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PSrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Q256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Qrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXSQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXSQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXUQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXUQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMINSQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMINSQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMINUQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMINUQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex 
"VPMOVSXBDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDYrm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQYrm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWYrm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQYrm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQYrm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWYrm")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMBZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMBZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMWZrm(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup148], (instregex "VSHUFF32X4Z256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF32X4Zrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF64X2Z256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF64X2Zrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI32X4Z256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI32X4Zrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI64X2Z256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI64X2Zrm(b?)i(k?)(z?)")>; + +def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 10; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup149], (instregex "ADDPDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "ADDPSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "ADDSUBPDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "ADDSUBPSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "CMPPDrmi")>; +def: InstRW<[SKXWriteResGroup149], (instregex "CMPPSrmi")>; +def: InstRW<[SKXWriteResGroup149], (instregex "CVTDQ2PSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "CVTPS2DQrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "CVTSS2SDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "CVTTPS2DQrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "MAX(C?)PDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "MAX(C?)PSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "MIN(C?)PDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "MIN(C?)PSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "MULPDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "MULPSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "PHMINPOSUWrm128")>; +def: InstRW<[SKXWriteResGroup149], (instregex "PMADDUBSWrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "PMADDWDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "PMULDQrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "PMULHRSWrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "PMULHUWrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "PMULHWrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "PMULLWrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "PMULUDQrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "SUBPDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "SUBPSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VADDPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VADDPDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VADDPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VADDPSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VADDSDZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VADDSSZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VADDSUBPDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VADDSUBPSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCMPPDrmi")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCMPPSrmi")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPD2QQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPD2UQQZ128rm(b?)(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup149], (instregex "VCVTPH2PSYrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPH2PSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2DQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2DQrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2PDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2QQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2UDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2UQQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTQQ2PDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTQQ2PSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTSS2SDZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTSS2SDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPD2QQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPD2UQQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2DQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2DQrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2QQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2UDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2UQQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUDQ2PDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUDQ2PSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUQQ2PDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUQQ2PSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMPDZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMPSZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMSDrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMSSrmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], + (instregex + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z128m(b?)(k?)(z?)", + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m", + "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)Zm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPPDZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPPSZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPSDm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPSSm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTPDZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTPSZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTSDZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTSSZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)SDZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)SSZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], 
(instregex "VMIN(C?)PDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)SDZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)SSZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMULPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMULPDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMULPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMULPSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMULSDZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VMULSSZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPHMINPOSUWrm128")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPLZCNTDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPLZCNTQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDUBSWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDUBSWrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDWDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDWDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULDQrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHRSWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHRSWrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHUWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHUWrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHWrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULLWZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULLWrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULUDQZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VPMULUDQrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VRANGEPDZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VRANGEPSZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VRANGESDZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VRANGESSZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCEPDZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCEPSZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCESDZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCESSZ128rmi(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFSDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFSSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPDrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPSrm")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VSUBSDZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup149], (instregex "VSUBSSZrm(_Int)?(k?)(z?)")>; + +def SKXWriteResGroup150 : 
SchedWriteRes<[SKXPort0]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKXWriteResGroup150], (instregex "PCMPISTRIrr")>; +def: InstRW<[SKXWriteResGroup150], (instregex "PCMPISTRM128rr")>; +def: InstRW<[SKXWriteResGroup150], (instregex "VPCMPISTRIrr")>; +def: InstRW<[SKXWriteResGroup150], (instregex "VPCMPISTRM128rr")>; + +def SKXWriteResGroup151 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup151], (instregex "MPSADBWrmi")>; +def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup151], (instregex "VMPSADBWrmi")>; +def: InstRW<[SKXWriteResGroup151], (instregex "VPEXPANDDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup151], (instregex "VPEXPANDQZ128rm(b?)(k?)(z?)")>; + +def SKXWriteResGroup152 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup152], (instregex "MMX_CVTPI2PDirm")>; +def: InstRW<[SKXWriteResGroup152], (instregex "VPTESTYrm")>; + +def SKXWriteResGroup153 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 10; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup153], (instregex "CVTSD2SSrm")>; +def: InstRW<[SKXWriteResGroup153], (instregex "VCVTSD2SSrm")>; + +def SKXWriteResGroup154 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup154], (instregex "VPHADDSWrm256")>; +def: InstRW<[SKXWriteResGroup154], (instregex "VPHSUBSWrm256")>; + +def SKXWriteResGroup155 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup155], (instregex "VPHADDDYrm")>; +def: InstRW<[SKXWriteResGroup155], (instregex "VPHADDWYrm")>; +def: InstRW<[SKXWriteResGroup155], (instregex "VPHSUBDYrm")>; +def: InstRW<[SKXWriteResGroup155], (instregex "VPHSUBWYrm")>; + +def SKXWriteResGroup156 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort06,SKXPort0156]> { + let Latency = 10; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup156], (instregex "MULX32rm")>; + +def SKXWriteResGroup157 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 10; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,3]; +} +def: InstRW<[SKXWriteResGroup157], (instregex "ADD8mi")>; +def: InstRW<[SKXWriteResGroup157], (instregex "AND8mi")>; +def: InstRW<[SKXWriteResGroup157], (instregex "OR8mi")>; +def: InstRW<[SKXWriteResGroup157], (instregex "SUB8mi")>; +def: InstRW<[SKXWriteResGroup157], (instregex "XCHG(16|32|64)rm")>; +def: InstRW<[SKXWriteResGroup157], (instregex "XCHG8rm")>; +def: InstRW<[SKXWriteResGroup157], (instregex "XOR8mi")>; + +def SKXWriteResGroup158 : SchedWriteRes<[SKXPort05,SKXPort0156]> { + let Latency = 10; + let NumMicroOps = 10; + let ResourceCycles = [9,1]; +} +def: InstRW<[SKXWriteResGroup158], (instregex "MMX_EMMS")>; + +def SKXWriteResGroup159 : SchedWriteRes<[SKXPort0]> { + let Latency = 11; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup159], (instregex "DIVPSrr")>; +def: InstRW<[SKXWriteResGroup159], (instregex "DIVSSrr")>; +def: 
InstRW<[SKXWriteResGroup159], (instregex "VDIVPSYrr")>; +def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSrr")>; +def: InstRW<[SKXWriteResGroup159], (instregex "VDIVSSZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup159], (instregex "VDIVSSrr")>; + +def SKXWriteResGroup160 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F32m")>; +def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F64m")>; +def: InstRW<[SKXWriteResGroup160], (instregex "VRCP14PDZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup160], (instregex "VRCP14PSZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup160], (instregex "VRCPPSYm")>; +def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRT14PDZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRT14PSZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRTPSYm")>; + +def SKXWriteResGroup161 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VADDSUBPDYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VADDSUBPSYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCMPPDYrmi")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCMPPSYrmi")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2QQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2QQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2UQQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2UQQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPH2PSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPH2PSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2QQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex 
"VCVTPS2UQQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2QQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2QQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2UQQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2UQQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2QQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UQQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPDZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPDZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPSZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPSZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], + (instregex + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym", + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z256m(b?)(k?)(z?)", + "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Zm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPDZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPDm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPSZ256m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPSm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPDZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPDZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPSZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPSZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex 
"VMIN(C?)PDYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPDZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPDZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPSZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPSZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPDZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPDZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPSZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPSZrm(b?)i(k?)(z?)")>; +def: 
InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSYrm")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup162 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup162], (instregex "FICOM16m")>; +def: InstRW<[SKXWriteResGroup162], (instregex "FICOM32m")>; +def: InstRW<[SKXWriteResGroup162], (instregex "FICOMP16m")>; +def: InstRW<[SKXWriteResGroup162], (instregex "FICOMP32m")>; +def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup162], (instregex "VMPSADBWYrmi")>; +def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDQZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup163 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup163], (instregex "VCVTSD2SSZrm(_Int)?(k?)(z?)")>; + +def SKXWriteResGroup164 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup164], (instregex "CVTDQ2PDrm")>; +def: InstRW<[SKXWriteResGroup164], (instregex "VCVTDQ2PDrm")>; + +def SKXWriteResGroup165 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup165], (instregex "CVTSD2SI64rm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "CVTSD2SIrm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "CVTSS2SI64rm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "CVTSS2SIrm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSD2SI64rm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSD2SIrm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSS2SIrm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SI64Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SI64rm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SIZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SIrm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2USI64Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SI64Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SI64rm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SIZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], 
(instregex "VCVTSS2SIrm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2USIZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SI64Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SI64rm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SIZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SIrm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2USI64Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SI64Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SI64rm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SIZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SIrm")>; +def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2USIZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup166 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 11; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2DQrm")>; +def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2PSrm")>; +def: InstRW<[SKXWriteResGroup166], (instregex "CVTTPD2DQrm")>; +def: InstRW<[SKXWriteResGroup166], (instregex "MMX_CVTPD2PIirm")>; +def: InstRW<[SKXWriteResGroup166], (instregex "MMX_CVTTPD2PIirm")>; + +def SKXWriteResGroup167 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 11; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup167], (instregex "VPCONFLICTQZ128rm(b?)(k?)(z?)")>; + +def SKXWriteResGroup168 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 11; + let NumMicroOps = 6; + let ResourceCycles = [1,1,1,2,1]; +} +def: InstRW<[SKXWriteResGroup168], (instregex "SHLD(16|32|64)mrCL")>; +def: InstRW<[SKXWriteResGroup168], (instregex "SHRD(16|32|64)mrCL")>; + +def SKXWriteResGroup169 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> { + let Latency = 11; + let NumMicroOps = 7; + let ResourceCycles = [2,3,2]; +} +def: InstRW<[SKXWriteResGroup169], (instregex "RCL(16|32|64)rCL")>; +def: InstRW<[SKXWriteResGroup169], (instregex "RCR(16|32|64)rCL")>; + +def SKXWriteResGroup170 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 11; + let NumMicroOps = 9; + let ResourceCycles = [1,5,1,2]; +} +def: InstRW<[SKXWriteResGroup170], (instregex "RCL8rCL")>; + +def SKXWriteResGroup171 : SchedWriteRes<[SKXPort06,SKXPort0156]> { + let Latency = 11; + let NumMicroOps = 11; + let ResourceCycles = [2,9]; +} +def: InstRW<[SKXWriteResGroup171], (instregex "LOOPE")>; +def: InstRW<[SKXWriteResGroup171], (instregex "LOOPNE")>; + +def SKXWriteResGroup172 : SchedWriteRes<[SKXPort0]> { + let Latency = 12; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup172], (instregex "SQRTPSr")>; +def: InstRW<[SKXWriteResGroup172], (instregex "SQRTSSr")>; +def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSYr")>; +def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSr")>; +def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTSSZr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTSSr")>; + +def SKXWriteResGroup173 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup173], (instregex 
"PCLMULQDQrm")>; +def: InstRW<[SKXWriteResGroup173], (instregex "VPCLMULQDQrm")>; + +def SKXWriteResGroup174 : SchedWriteRes<[SKXPort015]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [3]; +} +def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup175 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup175], (instregex "VPERMWZ128rm(b?)(k?)(z?)")>; + +def SKXWriteResGroup176 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup176], (instregex "VCVTSD2USIZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup176], (instregex "VCVTSS2USI64Zrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup176], (instregex "VCVTTSD2USIZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup176], (instregex "VCVTTSS2USI64Zrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup177 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 12; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup177], (instregex "VCVTPS2QQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup177], (instregex "VCVTPS2UQQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup177], (instregex "VCVTTPS2QQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup177], (instregex "VCVTTPS2UQQZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup178 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup178], (instregex "HADDPDrm")>; +def: InstRW<[SKXWriteResGroup178], (instregex "HADDPSrm")>; +def: InstRW<[SKXWriteResGroup178], (instregex "HSUBPDrm")>; +def: InstRW<[SKXWriteResGroup178], (instregex "HSUBPSrm")>; +def: InstRW<[SKXWriteResGroup178], (instregex "VHADDPDrm")>; +def: InstRW<[SKXWriteResGroup178], (instregex "VHADDPSrm")>; +def: InstRW<[SKXWriteResGroup178], (instregex "VHSUBPDrm")>; +def: InstRW<[SKXWriteResGroup178], (instregex "VHSUBPSrm")>; + +def SKXWriteResGroup179 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 12; + let NumMicroOps = 4; + let ResourceCycles = [1,1,1,1]; +} +def: InstRW<[SKXWriteResGroup179], (instregex "CVTTSS2SI64rm")>; + +def SKXWriteResGroup180 : SchedWriteRes<[SKXPort5,SKXPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup180], (instregex "ADD_FI16m")>; +def: InstRW<[SKXWriteResGroup180], (instregex "ADD_FI32m")>; +def: InstRW<[SKXWriteResGroup180], (instregex "SUBR_FI16m")>; +def: InstRW<[SKXWriteResGroup180], (instregex "SUBR_FI32m")>; +def: InstRW<[SKXWriteResGroup180], (instregex "SUB_FI16m")>; +def: InstRW<[SKXWriteResGroup180], (instregex "SUB_FI32m")>; +def: InstRW<[SKXWriteResGroup180], (instregex "VPERMWZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup180], (instregex "VPERMWZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup181 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 13; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup181], (instregex "VCVTDQ2PDYrm")>; + +def SKXWriteResGroup182 : SchedWriteRes<[SKXPort5,SKXPort015]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: 
InstRW<[SKXWriteResGroup182], (instregex "DPPSrri")>; +def: InstRW<[SKXWriteResGroup182], (instregex "VDPPSYrri")>; +def: InstRW<[SKXWriteResGroup182], (instregex "VDPPSrri")>; + +def SKXWriteResGroup183 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 13; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup183], (instregex "VHADDPDYrm")>; +def: InstRW<[SKXWriteResGroup183], (instregex "VHADDPSYrm")>; +def: InstRW<[SKXWriteResGroup183], (instregex "VHSUBPDYrm")>; +def: InstRW<[SKXWriteResGroup183], (instregex "VHSUBPSYrm")>; +def: InstRW<[SKXWriteResGroup183], (instregex "VPERMI2W128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup183], (instregex "VPERMT2W128rm(b?)(k?)(z?)")>; + +def SKXWriteResGroup184 : SchedWriteRes<[SKXPort0]> { + let Latency = 14; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup184], (instregex "DIVPDrr")>; +def: InstRW<[SKXWriteResGroup184], (instregex "DIVSDrr")>; +def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDYrr")>; +def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDrr")>; +def: InstRW<[SKXWriteResGroup184], (instregex "VDIVSDZrr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup184], (instregex "VDIVSDrr")>; + +def SKXWriteResGroup185 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup185], (instregex "AESIMCrm")>; +def: InstRW<[SKXWriteResGroup185], (instregex "VAESIMCrm")>; + +def SKXWriteResGroup186 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup186], (instregex "PMULLDrm")>; +def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDPDm")>; +def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDPSm")>; +def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDSDm")>; +def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDSSm")>; +def: InstRW<[SKXWriteResGroup186], (instregex "VPMULLDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup186], (instregex "VPMULLDrm")>; +def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALEPDZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALEPSZ128rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALESDm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALESSm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDPDm")>; +def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDPSm")>; +def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDSDm")>; +def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDSSm")>; + +def SKXWriteResGroup187 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI16m")>; +def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI32m")>; + +def SKXWriteResGroup188 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 14; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2DQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2PSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2UDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup188], (instregex 
"VCVTQQ2PSZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup188], (instregex "VCVTTPD2DQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup188], (instregex "VCVTTPD2UDQZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup188], (instregex "VCVTUQQ2PSZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup189 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 14; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2W256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2Wrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup189], (instregex "VPERMT2W256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup189], (instregex "VPERMT2Wrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup190 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 14; + let NumMicroOps = 10; + let ResourceCycles = [2,4,1,3]; +} +def: InstRW<[SKXWriteResGroup190], (instregex "RCR8rCL")>; + +def SKXWriteResGroup191 : SchedWriteRes<[SKXPort0]> { + let Latency = 15; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FPrST0")>; +def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FST0r")>; +def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FrST0")>; + +def SKXWriteResGroup192 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 15; + let NumMicroOps = 3; + let ResourceCycles = [1,2]; +} +def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDYrm")>; +def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDZrm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPDZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPDZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPSZ256rm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPSZrm(b?)i(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup192], (instregex "VROUNDYPDm")>; +def: InstRW<[SKXWriteResGroup192], (instregex "VROUNDYPSm")>; + +def SKXWriteResGroup193 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 15; + let NumMicroOps = 4; + let ResourceCycles = [1,1,2]; +} +def: InstRW<[SKXWriteResGroup193], (instregex "DPPDrmi")>; +def: InstRW<[SKXWriteResGroup193], (instregex "VDPPDrmi")>; + +def SKXWriteResGroup194 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort01,SKXPort23,SKXPort015]> { + let Latency = 15; + let NumMicroOps = 8; + let ResourceCycles = [1,2,2,1,2]; +} +def: InstRW<[SKXWriteResGroup194], (instregex "VPCONFLICTDZ128rm(b?)(k?)(z?)")>; + +def SKXWriteResGroup195 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 15; + let NumMicroOps = 10; + let ResourceCycles = [1,1,1,5,1,1]; +} +def: InstRW<[SKXWriteResGroup195], (instregex "RCL(16|32|64)mCL")>; +def: InstRW<[SKXWriteResGroup195], (instregex "RCL8mCL")>; + +def SKXWriteResGroup196 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 16; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup196], (instregex "DIVSSrm")>; +def: InstRW<[SKXWriteResGroup196], (instregex "VDIVSSrm")>; + +def SKXWriteResGroup197 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 16; + let NumMicroOps = 4; + let ResourceCycles = [3,1]; +} +def: InstRW<[SKXWriteResGroup197], (instregex "PCMPISTRIrm")>; +def: InstRW<[SKXWriteResGroup197], (instregex "PCMPISTRM128rm")>; +def: InstRW<[SKXWriteResGroup197], (instregex 
"VPCMPISTRIrm")>; +def: InstRW<[SKXWriteResGroup197], (instregex "VPCMPISTRM128rm")>; + +def SKXWriteResGroup198 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> { + let Latency = 16; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup198], (instregex "VRCP14PDZm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup198], (instregex "VRCP14PSZm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup198], (instregex "VRSQRT14PDZm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup198], (instregex "VRSQRT14PSZm(b?)(k?)(z?)")>; + +def SKXWriteResGroup199 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 16; + let NumMicroOps = 14; + let ResourceCycles = [1,1,1,4,2,5]; +} +def: InstRW<[SKXWriteResGroup199], (instregex "CMPXCHG8B")>; + +def SKXWriteResGroup200 : SchedWriteRes<[SKXPort0156]> { + let Latency = 16; + let NumMicroOps = 16; + let ResourceCycles = [16]; +} +def: InstRW<[SKXWriteResGroup200], (instregex "VZEROALL")>; + +def SKXWriteResGroup201 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 17; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup201], (instregex "DIVPSrm")>; +def: InstRW<[SKXWriteResGroup201], (instregex "SQRTSSm")>; +def: InstRW<[SKXWriteResGroup201], (instregex "VDIVPSZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup201], (instregex "VDIVPSrm")>; +def: InstRW<[SKXWriteResGroup201], (instregex "VDIVSSZrm(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup201], (instregex "VSQRTSSm")>; + +def SKXWriteResGroup202 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> { + let Latency = 17; + let NumMicroOps = 15; + let ResourceCycles = [2,1,2,4,2,4]; +} +def: InstRW<[SKXWriteResGroup202], (instregex "XCH_F")>; + +def SKXWriteResGroup203 : SchedWriteRes<[SKXPort0]> { + let Latency = 18; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup203], (instregex "SQRTPDr")>; +def: InstRW<[SKXWriteResGroup203], (instregex "SQRTSDr")>; +def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDYr")>; +def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDZ128r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDZ256r(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDr")>; +def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTSDZr(b?)(_Int)?(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTSDr")>; + +def SKXWriteResGroup204 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 18; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup204], (instregex "SQRTPSm")>; +def: InstRW<[SKXWriteResGroup204], (instregex "VDIVPSYrm")>; +def: InstRW<[SKXWriteResGroup204], (instregex "VDIVPSZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTPSZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTPSm")>; +def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTSSZm(_Int)?(k?)(z?)")>; + +def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 18; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKXWriteResGroup205], (instregex "VPMULLQZ128rm(b?)(k?)(z?)")>; + +def SKXWriteResGroup206 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [4,3,1]; +} +def: InstRW<[SKXWriteResGroup206], (instregex "PCMPESTRIrr")>; +def: InstRW<[SKXWriteResGroup206], (instregex "VPCMPESTRIrr")>; + +def SKXWriteResGroup207 : 
SchedWriteRes<[SKXPort5,SKXPort6,SKXPort06,SKXPort0156]> { + let Latency = 18; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,5]; +} +def: InstRW<[SKXWriteResGroup207], (instregex "CPUID")>; +def: InstRW<[SKXWriteResGroup207], (instregex "RDTSC")>; + +def SKXWriteResGroup208 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 18; + let NumMicroOps = 11; + let ResourceCycles = [2,1,1,4,1,2]; +} +def: InstRW<[SKXWriteResGroup208], (instregex "RCR(16|32|64)mCL")>; +def: InstRW<[SKXWriteResGroup208], (instregex "RCR8mCL")>; + +def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 19; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup209], (instregex "DIVSDrm")>; +def: InstRW<[SKXWriteResGroup209], (instregex "VDIVSDrm")>; +def: InstRW<[SKXWriteResGroup209], (instregex "VSQRTPSYm")>; +def: InstRW<[SKXWriteResGroup209], (instregex "VSQRTPSZ256m(b?)(k?)(z?)")>; + +def SKXWriteResGroup210 : SchedWriteRes<[SKXPort0,SKXPort015]> { + let Latency = 19; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup210], (instregex "VSQRTPSZr(b?)(k?)(z?)")>; + +def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort015]> { + let Latency = 19; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup212 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 19; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKXWriteResGroup212], (instregex "DPPSrmi")>; +def: InstRW<[SKXWriteResGroup212], (instregex "VDPPSrmi")>; + +def SKXWriteResGroup213 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015,SKXPort0156]> { + let Latency = 19; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[SKXWriteResGroup213], (instregex "PCMPESTRM128rr")>; +def: InstRW<[SKXWriteResGroup213], (instregex "VPCMPESTRM128rr")>; + +def SKXWriteResGroup214 : SchedWriteRes<[]> { + let Latency = 20; + let NumMicroOps = 0; +} +def: InstRW<[SKXWriteResGroup214], (instrs VGATHERDPSZ128rm, + VGATHERQPSZrm, + VPGATHERDDZ128rm)>; + +def SKXWriteResGroup215 : SchedWriteRes<[SKXPort0]> { + let Latency = 20; + let NumMicroOps = 1; + let ResourceCycles = [1]; +} +def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FPrST0")>; +def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FST0r")>; +def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FrST0")>; + +def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 20; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup216], (instregex "DIVPDrm")>; +def: InstRW<[SKXWriteResGroup216], (instregex "VDIVPDZ128rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup216], (instregex "VDIVPDrm")>; +def: InstRW<[SKXWriteResGroup216], (instregex "VDIVSDZrm(_Int)?(k?)(z?)")>; + +def SKXWriteResGroup217 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 20; + let NumMicroOps = 5; + let ResourceCycles = [1,1,3]; +} +def: InstRW<[SKXWriteResGroup217], (instregex "VDPPSYrmi")>; + +def SKXWriteResGroup218 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 20; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup218], (instrs VGATHERQPSZ128rm, + VGATHERQPSZ256rm, + VPGATHERQDZ128rm, + VPGATHERQDZ256rm)>; + +def SKXWriteResGroup219 : 
SchedWriteRes<[SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 20; + let NumMicroOps = 8; + let ResourceCycles = [1,1,1,1,1,1,2]; +} +def: InstRW<[SKXWriteResGroup219], (instregex "INSB")>; +def: InstRW<[SKXWriteResGroup219], (instregex "INSL")>; +def: InstRW<[SKXWriteResGroup219], (instregex "INSW")>; + +def SKXWriteResGroup220 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort0156]> { + let Latency = 20; + let NumMicroOps = 10; + let ResourceCycles = [1,2,7]; +} +def: InstRW<[SKXWriteResGroup220], (instregex "MWAITrr")>; + +def SKXWriteResGroup221 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015]> { + let Latency = 20; + let NumMicroOps = 11; + let ResourceCycles = [3,6,2]; +} +def: InstRW<[SKXWriteResGroup221], (instregex "AESKEYGENASSIST128rr")>; +def: InstRW<[SKXWriteResGroup221], (instregex "VAESKEYGENASSIST128rr")>; + +def SKXWriteResGroup222 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 21; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup222], (instregex "VDIVPDYrm")>; +def: InstRW<[SKXWriteResGroup222], (instregex "VDIVPDZ256rm(b?)(k?)(z?)")>; + +def SKXWriteResGroup223 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 22; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F32m")>; +def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F64m")>; + +def SKXWriteResGroup224 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 22; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup224], (instrs VGATHERDPDZ128rm, + VGATHERQPDZ128rm, + VPGATHERDQZ128rm, + VPGATHERQQZ128rm)>; + +def SKXWriteResGroup224_2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> { + let Latency = 22; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup224_2], (instrs VGATHERDPSrm, + VGATHERDPDrm, + VGATHERQPDrm, + VGATHERQPSrm, + VPGATHERDDrm, + VPGATHERDQrm, + VPGATHERQDrm, + VPGATHERQQrm)>; + +def SKXWriteResGroup224_3 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> { + let Latency = 25; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup224_3], (instrs VGATHERDPSYrm, + VGATHERQPDYrm, + VGATHERQPSYrm, + VPGATHERDDYrm, + VPGATHERDQYrm, + VPGATHERQDYrm, + VPGATHERQQYrm, + VGATHERDPDYrm)>; + +def SKXWriteResGroup225 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> { + let Latency = 22; + let NumMicroOps = 14; + let ResourceCycles = [5,5,4]; +} +def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTDZ128rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTQZ256rr(b?)(k?)(z?)")>; + +def SKXWriteResGroup226 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 23; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup226], (instregex "SQRTSDm")>; +def: InstRW<[SKXWriteResGroup226], (instregex "VSQRTSDm")>; + +def SKXWriteResGroup227 : SchedWriteRes<[SKXPort0,SKXPort015]> { + let Latency = 23; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup227], (instregex "VDIVPDZrr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup227], (instregex "VDIVPSZrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup228 : 
SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 23; + let NumMicroOps = 19; + let ResourceCycles = [2,1,4,1,1,4,6]; +} +def: InstRW<[SKXWriteResGroup228], (instregex "CMPXCHG16B")>; + +def SKXWriteResGroup229 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 24; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup229], (instregex "SQRTPDm")>; +def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTPDZ128m(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTPDm")>; +def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTSDZm(_Int)?(k?)(z?)")>; + +def SKXWriteResGroup230 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> { + let Latency = 24; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup230], (instregex "VDIVPSZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup231 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort0156]> { + let Latency = 24; + let NumMicroOps = 9; + let ResourceCycles = [4,3,1,1]; +} +def: InstRW<[SKXWriteResGroup231], (instregex "PCMPESTRIrm")>; +def: InstRW<[SKXWriteResGroup231], (instregex "VPCMPESTRIrm")>; + +def SKXWriteResGroup232 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 25; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup232], (instregex "VSQRTPDYm")>; +def: InstRW<[SKXWriteResGroup232], (instregex "VSQRTPDZ256m(b?)(k?)(z?)")>; + +def SKXWriteResGroup233 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 25; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI16m")>; +def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI32m")>; + +def SKXWriteResGroup234 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 25; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup234], (instrs VGATHERDPDZ256rm, + VGATHERQPDZ256rm, + VPGATHERDQZ256rm, + VPGATHERQDZrm, + VPGATHERQQZ256rm)>; + +def SKXWriteResGroup235 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 25; + let NumMicroOps = 10; + let ResourceCycles = [4,3,1,1,1]; +} +def: InstRW<[SKXWriteResGroup235], (instregex "PCMPESTRM128rm")>; +def: InstRW<[SKXWriteResGroup235], (instregex "VPCMPESTRM128rm")>; + +def SKXWriteResGroup236 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015]> { + let Latency = 25; + let NumMicroOps = 11; + let ResourceCycles = [3,6,1,1]; +} +def: InstRW<[SKXWriteResGroup236], (instregex "AESKEYGENASSIST128rm")>; +def: InstRW<[SKXWriteResGroup236], (instregex "VAESKEYGENASSIST128rm")>; + +def SKXWriteResGroup237 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> { + let Latency = 26; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup237], (instregex "VSQRTPSZm(b?)(k?)(z?)")>; + +def SKXWriteResGroup238 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 26; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup238], (instrs VGATHERDPDZrm, + VGATHERQPDZrm, + VPGATHERDQZrm, + VPGATHERQQZrm)>; + +def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> { + let Latency = 27; + let NumMicroOps = 2; + let ResourceCycles = [1,1]; +} +def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F32m")>; +def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F64m")>; + +def SKXWriteResGroup240 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> 
{ + let Latency = 27; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup240], (instrs VGATHERDPSZ256rm, + VPGATHERDDZ256rm)>; + +def SKXWriteResGroup241 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort0156]> { + let Latency = 28; + let NumMicroOps = 8; + let ResourceCycles = [2,4,1,1]; +} +def: InstRW<[SKXWriteResGroup241], (instregex "IDIV(16|32|64)m")>; +def: InstRW<[SKXWriteResGroup241], (instregex "IDIV8m")>; + +def SKXWriteResGroup242 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> { + let Latency = 29; + let NumMicroOps = 15; + let ResourceCycles = [5,5,1,4]; +} +def: InstRW<[SKXWriteResGroup242], (instregex "VPCONFLICTQZ256rm(b?)(k?)(z?)")>; + +def SKXWriteResGroup243 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> { + let Latency = 30; + let NumMicroOps = 3; + let ResourceCycles = [1,1,1]; +} +def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI16m")>; +def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI32m")>; + +def SKXWriteResGroup244 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> { + let Latency = 30; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup244], (instregex "VDIVPDZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> { + let Latency = 30; + let NumMicroOps = 5; + let ResourceCycles = [1,2,1,1]; +} +def: InstRW<[SKXWriteResGroup245], (instrs VGATHERDPSZrm, + VPGATHERDDZrm)>; + +def SKXWriteResGroup246 : SchedWriteRes<[SKXPort0,SKXPort015]> { + let Latency = 31; + let NumMicroOps = 3; + let ResourceCycles = [2,1]; +} +def: InstRW<[SKXWriteResGroup246], (instregex "VSQRTPDZr(b?)(k?)(z?)")>; + +def SKXWriteResGroup247 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort06,SKXPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,3,4,10]; +} +def: InstRW<[SKXWriteResGroup247], (instregex "IN(16|32)ri")>; +def: InstRW<[SKXWriteResGroup247], (instregex "IN(16|32)rr")>; +def: InstRW<[SKXWriteResGroup247], (instregex "IN8ri")>; +def: InstRW<[SKXWriteResGroup247], (instregex "IN8rr")>; + +def SKXWriteResGroup248 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 35; + let NumMicroOps = 23; + let ResourceCycles = [1,5,2,1,4,10]; +} +def: InstRW<[SKXWriteResGroup248], (instregex "OUT(16|32)ir")>; +def: InstRW<[SKXWriteResGroup248], (instregex "OUT(16|32)rr")>; +def: InstRW<[SKXWriteResGroup248], (instregex "OUT8ir")>; +def: InstRW<[SKXWriteResGroup248], (instregex "OUT8rr")>; + +def SKXWriteResGroup249 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> { + let Latency = 37; + let NumMicroOps = 21; + let ResourceCycles = [9,7,5]; +} +def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTDZ256rr(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTQZrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup250 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> { + let Latency = 37; + let NumMicroOps = 31; + let ResourceCycles = [1,8,1,21]; +} +def: InstRW<[SKXWriteResGroup250], (instregex "XRSTOR(64)?")>; + +def SKXWriteResGroup251 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> { + let Latency = 38; + let NumMicroOps = 4; + let ResourceCycles = [2,1,1]; +} +def: InstRW<[SKXWriteResGroup251], (instregex "VSQRTPDZm(b?)(k?)(z?)")>; + +def SKXWriteResGroup252 : SchedWriteRes<[SKXPort1,SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort15,SKXPort0156]> { + let Latency = 40; + let NumMicroOps = 18; + let ResourceCycles = [1,1,2,3,1,1,1,8]; +} 
+def: InstRW<[SKXWriteResGroup252], (instregex "VMCLEARm")>; + +def SKXWriteResGroup253 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> { + let Latency = 41; + let NumMicroOps = 39; + let ResourceCycles = [1,10,1,1,26]; +} +def: InstRW<[SKXWriteResGroup253], (instregex "XSAVE64")>; + +def SKXWriteResGroup254 : SchedWriteRes<[SKXPort5,SKXPort0156]> { + let Latency = 42; + let NumMicroOps = 22; + let ResourceCycles = [2,20]; +} +def: InstRW<[SKXWriteResGroup254], (instregex "RDTSCP")>; + +def SKXWriteResGroup255 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> { + let Latency = 42; + let NumMicroOps = 40; + let ResourceCycles = [1,11,1,1,26]; +} +def: InstRW<[SKXWriteResGroup255], (instregex "XSAVE")>; + +def SKXWriteResGroup256 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> { + let Latency = 44; + let NumMicroOps = 22; + let ResourceCycles = [9,7,1,5]; +} +def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTDZ256rm(b?)(k?)(z?)")>; +def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTQZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup258 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05,SKXPort06,SKXPort0156]> { + let Latency = 62; + let NumMicroOps = 64; + let ResourceCycles = [2,8,5,10,39]; +} +def: InstRW<[SKXWriteResGroup258], (instregex "FLDENVm")>; + +def SKXWriteResGroup259 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 63; + let NumMicroOps = 88; + let ResourceCycles = [4,4,31,1,2,1,45]; +} +def: InstRW<[SKXWriteResGroup259], (instregex "FXRSTOR64")>; + +def SKXWriteResGroup260 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> { + let Latency = 63; + let NumMicroOps = 90; + let ResourceCycles = [4,2,33,1,2,1,47]; +} +def: InstRW<[SKXWriteResGroup260], (instregex "FXRSTOR")>; + +def SKXWriteResGroup261 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> { + let Latency = 67; + let NumMicroOps = 35; + let ResourceCycles = [17,11,7]; +} +def: InstRW<[SKXWriteResGroup261], (instregex "VPCONFLICTDZrr(b?)(k?)(z?)")>; + +def SKXWriteResGroup262 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> { + let Latency = 74; + let NumMicroOps = 36; + let ResourceCycles = [17,11,1,7]; +} +def: InstRW<[SKXWriteResGroup262], (instregex "VPCONFLICTDZrm(b?)(k?)(z?)")>; + +def SKXWriteResGroup263 : SchedWriteRes<[SKXPort5,SKXPort05,SKXPort0156]> { + let Latency = 75; + let NumMicroOps = 15; + let ResourceCycles = [6,3,6]; +} +def: InstRW<[SKXWriteResGroup263], (instregex "FNINIT")>; + +def SKXWriteResGroup264 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> { + let Latency = 76; + let NumMicroOps = 32; + let ResourceCycles = [7,2,8,3,1,11]; +} +def: InstRW<[SKXWriteResGroup264], (instregex "DIV(16|32|64)r")>; + +def SKXWriteResGroup265 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort06,SKXPort0156]> { + let Latency = 102; + let NumMicroOps = 66; + let ResourceCycles = [4,2,4,8,14,34]; +} +def: InstRW<[SKXWriteResGroup265], (instregex "IDIV(16|32|64)r")>; + +def SKXWriteResGroup266 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort4,SKXPort5,SKXPort6,SKXPort237,SKXPort06,SKXPort0156]> { + let Latency = 106; + let NumMicroOps = 100; + let ResourceCycles = [9,1,11,16,1,11,21,30]; +} +def: InstRW<[SKXWriteResGroup266], (instregex "FSTENVm")>; + +def SKXWriteResGroup267 : 
SchedWriteRes<[SKXPort6,SKXPort0156]> { + let Latency = 140; + let NumMicroOps = 4; + let ResourceCycles = [1,3]; +} +def: InstRW<[SKXWriteResGroup267], (instregex "PAUSE")>; +} // SchedModel diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td index d831a7974359..2e21a97541b2 100644 --- a/lib/Target/X86/X86Schedule.td +++ b/lib/Target/X86/X86Schedule.td @@ -299,6 +299,7 @@ def IIC_SSE_SHUFP : InstrItinClass; def IIC_SSE_PSHUF_RI : InstrItinClass; def IIC_SSE_PSHUF_MI : InstrItinClass; +def IIC_SSE_PACK : InstrItinClass; def IIC_SSE_UNPCK : InstrItinClass; def IIC_SSE_MOVMSK : InstrItinClass; @@ -384,8 +385,6 @@ def IIC_SSE_CVT_PD_RR : InstrItinClass; def IIC_SSE_CVT_PD_RM : InstrItinClass; def IIC_SSE_CVT_PS_RR : InstrItinClass; def IIC_SSE_CVT_PS_RM : InstrItinClass; -def IIC_SSE_CVT_PI2PS_RR : InstrItinClass; -def IIC_SSE_CVT_PI2PS_RM : InstrItinClass; def IIC_SSE_CVT_Scalar_RR : InstrItinClass; def IIC_SSE_CVT_Scalar_RM : InstrItinClass; def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass; @@ -395,6 +394,8 @@ def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass; def IIC_SSE_CVT_SD2SI_RM : InstrItinClass; def IIC_SSE_CVT_SD2SI_RR : InstrItinClass; +def IIC_AVX_ZERO : InstrItinClass; + // MMX def IIC_MMX_MOV_MM_RM : InstrItinClass; def IIC_MMX_MOV_REG_MM : InstrItinClass; @@ -425,12 +426,21 @@ def IIC_MMX_PSHUF : InstrItinClass; def IIC_MMX_PEXTR : InstrItinClass; def IIC_MMX_PINSRW : InstrItinClass; def IIC_MMX_MASKMOV : InstrItinClass; - +def IIC_MMX_MOVMSK : InstrItinClass; def IIC_MMX_CVT_PD_RR : InstrItinClass; def IIC_MMX_CVT_PD_RM : InstrItinClass; def IIC_MMX_CVT_PS_RR : InstrItinClass; def IIC_MMX_CVT_PS_RM : InstrItinClass; +def IIC_3DNOW_FALU_RM : InstrItinClass; +def IIC_3DNOW_FALU_RR : InstrItinClass; +def IIC_3DNOW_FCVT_F2I_RM : InstrItinClass; +def IIC_3DNOW_FCVT_F2I_RR : InstrItinClass; +def IIC_3DNOW_FCVT_I2F_RM : InstrItinClass; +def IIC_3DNOW_FCVT_I2F_RR : InstrItinClass; +def IIC_3DNOW_MISC_FUNC_REG : InstrItinClass; +def IIC_3DNOW_MISC_FUNC_MEM : InstrItinClass; + def IIC_CMPX_LOCK : InstrItinClass; def IIC_CMPX_LOCK_8 : InstrItinClass; def IIC_CMPX_LOCK_8B : InstrItinClass; @@ -439,6 +449,7 @@ def IIC_CMPX_LOCK_16B : InstrItinClass; def IIC_XADD_LOCK_MEM : InstrItinClass; def IIC_XADD_LOCK_MEM8 : InstrItinClass; +def IIC_FCMOV : InstrItinClass; def IIC_FILD : InstrItinClass; def IIC_FLD : InstrItinClass; def IIC_FLD80 : InstrItinClass; @@ -467,6 +478,8 @@ def IIC_FXTRACT : InstrItinClass; def IIC_FPREM1 : InstrItinClass; def IIC_FPSTP : InstrItinClass; def IIC_FPREM : InstrItinClass; +def IIC_FSIGN : InstrItinClass; +def IIC_FSQRT : InstrItinClass; def IIC_FYL2XP1 : InstrItinClass; def IIC_FSINCOS : InstrItinClass; def IIC_FRNDINT : InstrItinClass; @@ -483,16 +496,31 @@ def IIC_INT : InstrItinClass; def IIC_INT3 : InstrItinClass; def IIC_INVD : InstrItinClass; def IIC_INVLPG : InstrItinClass; +def IIC_INVPCID : InstrItinClass; def IIC_IRET : InstrItinClass; def IIC_HLT : InstrItinClass; def IIC_LXS : InstrItinClass; def IIC_LTR : InstrItinClass; +def IIC_MPX : InstrItinClass; +def IIC_PKU : InstrItinClass; +def IIC_PTWRITE : InstrItinClass; +def IIC_RDPID : InstrItinClass; +def IIC_RDRAND : InstrItinClass; +def IIC_RDSEED : InstrItinClass; def IIC_RDTSC : InstrItinClass; +def IIC_RDTSCP : InstrItinClass; def IIC_RSM : InstrItinClass; def IIC_SIDT : InstrItinClass; def IIC_SGDT : InstrItinClass; def IIC_SLDT : InstrItinClass; +def IIC_SMAP : InstrItinClass; +def IIC_SMX : InstrItinClass; def IIC_STR : InstrItinClass; +def IIC_SKINIT : InstrItinClass; +def 
IIC_SVM : InstrItinClass; +def IIC_VMX : InstrItinClass; +def IIC_CLGI : InstrItinClass; +def IIC_STGI : InstrItinClass; def IIC_SWAPGS : InstrItinClass; def IIC_SYSCALL : InstrItinClass; def IIC_SYS_ENTER_EXIT : InstrItinClass; @@ -522,6 +550,8 @@ def IIC_PUSH_CS : InstrItinClass; def IIC_PUSH_SR : InstrItinClass; def IIC_POP_SR : InstrItinClass; def IIC_POP_SR_SS : InstrItinClass; +def IIC_SEGMENT_BASE_R : InstrItinClass; +def IIC_SEGMENT_BASE_W : InstrItinClass; def IIC_VERR : InstrItinClass; def IIC_VERW_REG : InstrItinClass; def IIC_VERW_MEM : InstrItinClass; @@ -547,6 +577,10 @@ def IIC_PUSH_A : InstrItinClass; def IIC_BSWAP : InstrItinClass; def IIC_BIT_SCAN_MEM : InstrItinClass; def IIC_BIT_SCAN_REG : InstrItinClass; +def IIC_LZCNT_RR : InstrItinClass; +def IIC_LZCNT_RM : InstrItinClass; +def IIC_TZCNT_RR : InstrItinClass; +def IIC_TZCNT_RM : InstrItinClass; def IIC_MOVS : InstrItinClass; def IIC_STOS : InstrItinClass; def IIC_SCAS : InstrItinClass; @@ -659,10 +693,3 @@ def GenericPostRAModel : GenericX86Model { let PostRAScheduler = 1; } -include "X86ScheduleAtom.td" -include "X86SchedSandyBridge.td" -include "X86SchedHaswell.td" -include "X86ScheduleSLM.td" -include "X86ScheduleZnver1.td" -include "X86ScheduleBtVer2.td" - diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td index a5b440182aa9..e052ad98104c 100644 --- a/lib/Target/X86/X86ScheduleAtom.td +++ b/lib/Target/X86/X86ScheduleAtom.td @@ -212,6 +212,7 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >, InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_SSE_PACK, [InstrStage<1, [Port0]>] >, InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >, InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >, @@ -337,6 +338,7 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >, InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >, InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >, + InstrItinData<IIC_MMX_MOVMSK, [InstrStage<3, [Port0]>] >, // conversions // from/to PD InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >, @@ -362,6 +364,7 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >, InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >, + InstrItinData<IIC_FCMOV, [InstrStage<9, [Port0, Port1]>] >, InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >, InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >, InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >, @@ -392,6 +395,8 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >, InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >, InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >, + InstrItinData<IIC_FSIGN, [InstrStage<1, [Port1]>] >, + InstrItinData<IIC_FSQRT, [InstrStage<71, [Port0, Port1]>] >, // System instructions InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >, @@ -404,6 +409,7 @@ def AtomItineraries : ProcessorItineraries< InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >, InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >, InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >, + InstrItinData<IIC_RDTSCP, [InstrStage<30, [Port0, Port1]>] >, InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >, InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >, 
InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >, diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td index 9dcc968a1a7a..6ea81a25e41c 100644 --- a/lib/Target/X86/X86ScheduleBtVer2.td +++ b/lib/Target/X86/X86ScheduleBtVer2.td @@ -135,6 +135,30 @@ def : WriteRes<WriteLEA, [JALU01]>; defm : JWriteResIntPair<WriteShift, JALU01, 1>; +def WriteSHLDrri : SchedWriteRes<[JALU01]> { + let Latency = 3; + let ResourceCycles = [6]; + let NumMicroOps = 6; +} +def: InstRW<[WriteSHLDrri], (instregex "SHLD(16|32|64)rri8")>; +def: InstRW<[WriteSHLDrri], (instregex "SHRD(16|32|64)rri8")>; + +def WriteSHLDrrCL : SchedWriteRes<[JALU01]> { + let Latency = 4; + let ResourceCycles = [8]; + let NumMicroOps = 7; +} +def: InstRW<[WriteSHLDrrCL], (instregex "SHLD(16|32|64)rrCL")>; +def: InstRW<[WriteSHLDrrCL], (instregex "SHRD(16|32|64)rrCL")>; + +def WriteSHLDm : SchedWriteRes<[JLAGU, JALU01]> { + let Latency = 9; + let ResourceCycles = [1, 22]; + let NumMicroOps = 8; +} +def: InstRW<[WriteSHLDm], (instregex "SHLD(16|32|64)mr(i8|CL)")>; +def: InstRW<[WriteSHLDm], (instregex "SHRD(16|32|64)mr(i8|CL)")>; + //////////////////////////////////////////////////////////////////////////////// // Loads, stores, and moves, not folded with other operations. // FIXME: Split x86 and SSE load/store/moves @@ -142,7 +166,10 @@ defm : JWriteResIntPair<WriteShift, JALU01, 1>; def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; } def : WriteRes<WriteStore, [JSAGU]>; -def : WriteRes<WriteMove, [JAny]>; +def : WriteRes<WriteMove, [JALU01]>; + +// Treat misc copies as a move. +def : InstRW<[WriteMove], (instrs COPY)>; //////////////////////////////////////////////////////////////////////////////// // Idioms that clear a register, like xorps %xmm0, %xmm0. @@ -168,6 +195,7 @@ defm : JWriteResIntPair<WriteJump, JALU01, 1>; defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>; defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>; +defm : JWriteResFpuPair<WriteFMA, JFPU1, 2>; // NOTE: Doesn't exist on Jaguar. defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>; defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>; defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>; @@ -199,11 +227,13 @@ defm : JWriteResFpuPair<WriteCvtF2F, JFPU1, 3>; // Float -> Float size conve def : WriteRes<WriteFVarBlend, [JFPU01]> { let Latency = 2; - let ResourceCycles = [2]; + let ResourceCycles = [4]; + let NumMicroOps = 3; } def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> { let Latency = 7; - let ResourceCycles = [1, 2]; + let ResourceCycles = [1, 4]; + let NumMicroOps = 3; } // Vector integer operations. @@ -217,21 +247,20 @@ defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>; def : WriteRes<WriteVarBlend, [JFPU01]> { let Latency = 2; - let ResourceCycles = [2]; + let ResourceCycles = [4]; + let NumMicroOps = 3; } def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> { let Latency = 7; - let ResourceCycles = [1, 2]; + let ResourceCycles = [1, 4]; + let NumMicroOps = 3; } // FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2? 
-def : WriteRes<WriteVarVecShift, [JFPU01]> { - let Latency = 1; - let ResourceCycles = [1]; -} +def : WriteRes<WriteVarVecShift, [JFPU01]> {} def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> { let Latency = 6; - let ResourceCycles = [1, 1]; + let ResourceCycles = [1, 2]; } def : WriteRes<WriteMPSAD, [JFPU0]> { @@ -249,43 +278,49 @@ def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> { // FIXME: approximate latencies + pipe dependencies //////////////////////////////////////////////////////////////////////////////// -def : WriteRes<WritePCmpIStrM, [JFPU01]> { - let Latency = 7; - let ResourceCycles = [2]; +def : WriteRes<WritePCmpIStrM, [JFPU1,JFPU0]> { + let Latency = 8; + let ResourceCycles = [2, 2]; + let NumMicroOps = 3; } -def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU01]> { - let Latency = 12; - let ResourceCycles = [1, 2]; +def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU1, JFPU0]> { + let Latency = 13; + let ResourceCycles = [1, 2, 2]; + let NumMicroOps = 3; } // Packed Compare Explicit Length Strings, Return Mask -def : WriteRes<WritePCmpEStrM, [JFPU01]> { - let Latency = 13; - let ResourceCycles = [5]; +def : WriteRes<WritePCmpEStrM, [JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> { + let Latency = 14; + let ResourceCycles = [5, 5, 5, 5, 5]; + let NumMicroOps = 9; } -def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU01]> { - let Latency = 18; - let ResourceCycles = [1, 5]; +def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> { + let Latency = 19; + let ResourceCycles = [1, 5, 5, 5, 5, 5]; + let NumMicroOps = 9; } // Packed Compare Implicit Length Strings, Return Index -def : WriteRes<WritePCmpIStrI, [JFPU01]> { - let Latency = 6; - let ResourceCycles = [2]; +def : WriteRes<WritePCmpIStrI, [JFPU1, JFPU0]> { + let Latency = 7; + let ResourceCycles = [2, 2]; } -def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU01]> { - let Latency = 11; - let ResourceCycles = [1, 2]; +def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU1, JFPU0]> { + let Latency = 12; + let ResourceCycles = [1, 2, 2]; } // Packed Compare Explicit Length Strings, Return Index -def : WriteRes<WritePCmpEStrI, [JFPU01]> { - let Latency = 13; - let ResourceCycles = [5]; +def : WriteRes<WritePCmpEStrI, [JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> { + let Latency = 14; + let ResourceCycles = [5, 5, 5, 5, 5]; + let NumMicroOps = 9; } -def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU01]> { - let Latency = 18; - let ResourceCycles = [1, 5]; +def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> { + let Latency = 19; + let ResourceCycles = [1, 5, 5, 5, 5, 5]; + let NumMicroOps = 9; } //////////////////////////////////////////////////////////////////////////////// @@ -371,6 +406,38 @@ def : WriteRes<WriteFence, [JSAGU]>; def : WriteRes<WriteNop, []>; //////////////////////////////////////////////////////////////////////////////// +// SSE4.1 instructions. 
+//////////////////////////////////////////////////////////////////////////////// + +def WriteDPPS: SchedWriteRes<[JFPU0, JFPU1]> { + let Latency = 11; + let ResourceCycles = [3,3]; + let NumMicroOps = 5; +} +def : InstRW<[WriteDPPS], (instregex "(V)?DPPSrri")>; + +def WriteDPPSLd: SchedWriteRes<[JLAGU, JFPU0, JFPU1]> { + let Latency = 16; + let ResourceCycles = [1,3,3]; + let NumMicroOps = 6; +} +def : InstRW<[WriteDPPSLd], (instregex "(V)?DPPSrmi")>; + +def WriteDPPD: SchedWriteRes<[JFPU0, JFPU1]> { + let Latency = 9; + let ResourceCycles = [3,3]; + let NumMicroOps = 3; +} +def : InstRW<[WriteDPPD], (instregex "(V)?DPPDrri")>; + +def WriteDPPDLd: SchedWriteRes<[JLAGU, JFPU0, JFPU1]> { + let Latency = 14; + let ResourceCycles = [1,3,3]; + let NumMicroOps = 3; +} +def : InstRW<[WriteDPPDLd], (instregex "(V)?DPPDrmi")>; + +//////////////////////////////////////////////////////////////////////////////// // SSE4A instructions. //////////////////////////////////////////////////////////////////////////////// @@ -387,9 +454,73 @@ def WriteINSERTQ: SchedWriteRes<[JFPU01]> { def : InstRW<[WriteINSERTQ], (instregex "INSERTQ")>; //////////////////////////////////////////////////////////////////////////////// +// F16C instructions. +//////////////////////////////////////////////////////////////////////////////// + +def WriteCVT3: SchedWriteRes<[JFPU1]> { + let Latency = 3; +} +def : InstRW<[WriteCVT3], (instregex "VCVTPS2PHrr")>; +def : InstRW<[WriteCVT3], (instregex "VCVTPH2PSrr")>; + +def WriteCVT3St: SchedWriteRes<[JFPU1, JSAGU]> { + let Latency = 3; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteCVT3St], (instregex "VCVTPS2PHmr")>; + +def WriteCVT3Ld: SchedWriteRes<[JLAGU, JFPU1]> { + let Latency = 8; + let ResourceCycles = [1, 1]; +} +def : InstRW<[WriteCVT3Ld], (instregex "VCVTPH2PSrm")>; + +def WriteCVTPS2PHY: SchedWriteRes<[JFPU1, JFPU01]> { + let Latency = 6; + let ResourceCycles = [2,2]; + let NumMicroOps = 3; +} +def : InstRW<[WriteCVTPS2PHY], (instregex "VCVTPS2PHYrr")>; + +def WriteCVTPS2PHYSt: SchedWriteRes<[JFPU1, JFPU01, JSAGU]> { + let Latency = 11; + let ResourceCycles = [2,2,1]; + let NumMicroOps = 3; +} +def : InstRW<[WriteCVTPS2PHYSt], (instregex "VCVTPS2PHYmr")>; + +def WriteCVTPH2PSY: SchedWriteRes<[JFPU1]> { + let Latency = 3; + let ResourceCycles = [2]; + let NumMicroOps = 2; +} +def : InstRW<[WriteCVTPH2PSY], (instregex "VCVTPH2PSYrr")>; + +def WriteCVTPH2PSYLd: SchedWriteRes<[JLAGU, JFPU1]> { + let Latency = 8; + let ResourceCycles = [1,2]; + let NumMicroOps = 2; +} +def : InstRW<[WriteCVTPH2PSYLd], (instregex "VCVTPH2PSYrm")>; + +//////////////////////////////////////////////////////////////////////////////// // AVX instructions. 
//////////////////////////////////////////////////////////////////////////////// +def WriteVDPPSY: SchedWriteRes<[JFPU1, JFPU0]> { + let Latency = 12; + let ResourceCycles = [6, 6]; + let NumMicroOps = 10; +} +def : InstRW<[WriteVDPPSY], (instregex "VDPPSYrr")>; + +def WriteVDPPSYLd: SchedWriteRes<[JLAGU, JFPU1, JFPU0]> { + let Latency = 17; + let ResourceCycles = [1, 6, 6]; + let NumMicroOps = 11; +} +def : InstRW<[WriteVDPPSYLd, ReadAfterLd], (instregex "VDPPSYrm")>; + def WriteFAddY: SchedWriteRes<[JFPU0]> { let Latency = 3; let ResourceCycles = [2]; @@ -438,6 +569,152 @@ def WriteVMULYPSLd: SchedWriteRes<[JLAGU, JFPU1]> { } def : InstRW<[WriteVMULYPSLd, ReadAfterLd], (instregex "VMULPSYrm", "VRCPPSYm", "VRSQRTPSYm")>; +def WriteVCVTY: SchedWriteRes<[JSTC]> { + let Latency = 3; + let ResourceCycles = [2]; +} +def : InstRW<[WriteVCVTY], (instregex "VCVTDQ2P(S|D)Yrr")>; +def : InstRW<[WriteVCVTY], (instregex "VROUNDYP(S|D)r")>; +def : InstRW<[WriteVCVTY], (instregex "VCVTPS2DQYrr")>; +def : InstRW<[WriteVCVTY], (instregex "VCVTTPS2DQYrr")>; + +def WriteVCVTYLd: SchedWriteRes<[JLAGU, JSTC]> { + let Latency = 8; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTDQ2P(S|D)Yrm")>; +def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VROUNDYP(S|D)m")>; +def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTPS2DQYrm")>; +def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTTPS2DQYrm")>; + +def WriteVMONTPSt: SchedWriteRes<[JSTC, JLAGU]> { + let Latency = 3; + let ResourceCycles = [2,1]; +} +def : InstRW<[WriteVMONTPSt], (instregex "VMOVNTP(S|D)Ymr")>; +def : InstRW<[WriteVMONTPSt], (instregex "VMOVNTDQYmr")>; + +def WriteVCVTPDY: SchedWriteRes<[JSTC, JFPU01]> { + let Latency = 6; + let ResourceCycles = [2, 4]; +} +def : InstRW<[WriteVCVTPDY], (instregex "VCVTPD2(DQ|PS)Yrr")>; +def : InstRW<[WriteVCVTPDY], (instregex "VCVTTPD2DQYrr")>; + +def WriteVCVTPDYLd: SchedWriteRes<[JLAGU, JSTC, JFPU01]> { + let Latency = 11; + let ResourceCycles = [1, 2, 4]; +} +def : InstRW<[WriteVCVTPDYLd, ReadAfterLd], (instregex "VCVTPD2(DQ|PS)Yrm")>; +def : InstRW<[WriteVCVTPDYLd, ReadAfterLd], (instregex "VCVTTPD2DQYrm")>; + +def WriteVBlendVPY: SchedWriteRes<[JFPU01]> { + let Latency = 3; + let ResourceCycles = [6]; +} +def : InstRW<[WriteVBlendVPY], (instregex "VBLENDVP(S|D)Yrr", "VPERMILP(D|S)Yrr")>; + +def WriteVBlendVPYLd: SchedWriteRes<[JLAGU, JFPU01]> { + let Latency = 8; + let ResourceCycles = [1, 6]; +} +def : InstRW<[WriteVBlendVPYLd, ReadAfterLd], (instregex "VBLENDVP(S|D)Yrm")>; + +def WriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01]> { + let Latency = 6; + let ResourceCycles = [1, 4]; +} +def : InstRW<[WriteVBROADCASTYLd, ReadAfterLd], (instregex "VBROADCASTS(S|D)Yrm")>; + +def WriteFPAY22: SchedWriteRes<[JFPU0]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : InstRW<[WriteFPAY22], (instregex "VCMPP(S|D)Yrri", "VM(AX|IN)P(D|S)Yrr")>; + +def WriteFPAY22Ld: SchedWriteRes<[JLAGU, JFPU0]> { + let Latency = 7; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteFPAY22Ld, ReadAfterLd], (instregex "VCMPP(S|D)Yrmi", "VM(AX|IN)P(D|S)Yrm")>; + +def WriteVHAddSubY: SchedWriteRes<[JFPU0]> { + let Latency = 3; + let ResourceCycles = [2]; +} +def : InstRW<[WriteVHAddSubY], (instregex "VH(ADD|SUB)P(D|S)Yrr")>; + +def WriteVHAddSubYLd: SchedWriteRes<[JLAGU, JFPU0]> { + let Latency = 8; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteVHAddSubYLd], (instregex "VH(ADD|SUB)P(D|S)Yrm")>; + +def WriteVMaskMovLd: SchedWriteRes<[JLAGU,JFPU01]> { + let 
Latency = 6; + let ResourceCycles = [1, 2]; +} +def : InstRW<[WriteVMaskMovLd], (instregex "VMASKMOVP(D|S)rm")>; + +def WriteVMaskMovYLd: SchedWriteRes<[JLAGU,JFPU01]> { + let Latency = 6; + let ResourceCycles = [1, 4]; +} +def : InstRW<[WriteVMaskMovYLd], (instregex "VMASKMOVP(D|S)Yrm")>; + +def WriteVMaskMovSt: SchedWriteRes<[JFPU01,JSAGU]> { + let Latency = 6; + let ResourceCycles = [4, 1]; +} +def : InstRW<[WriteVMaskMovSt], (instregex "VMASKMOVP(D|S)mr")>; + +def WriteVMaskMovYSt: SchedWriteRes<[JFPU01,JSAGU]> { + let Latency = 6; + let ResourceCycles = [4, 1]; +} +def : InstRW<[WriteVMaskMovYSt], (instregex "VMASKMOVP(D|S)Ymr")>; + +// TODO: In fact we have latency '2+i'. The +i represents an additional 1 cycle transfer +// operation which moves the floating point result to the integer unit. During this +// additional cycle the floating point unit execution resources are not occupied +// and ALU0 in the integer unit is occupied instead. +def WriteVMOVMSK: SchedWriteRes<[JFPU0]> { + let Latency = 3; +} +def : InstRW<[WriteVMOVMSK], (instregex "VMOVMSKP(D|S)(Y)?rr")>; + +// TODO: In fact we have latency '3+i'. The +i represents an additional 1 cycle transfer +// operation which moves the floating point result to the integer unit. During this +// additional cycle the floating point unit execution resources are not occupied +// and ALU0 in the integer unit is occupied instead. +def WriteVTESTY: SchedWriteRes<[JFPU01, JFPU0]> { + let Latency = 4; + let ResourceCycles = [2, 2]; + let NumMicroOps = 3; +} +def : InstRW<[WriteVTESTY], (instregex "VTESTP(S|D)Yrr")>; +def : InstRW<[WriteVTESTY], (instregex "VPTESTYrr")>; + +def WriteVTESTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPU0]> { + let Latency = 9; + let ResourceCycles = [1, 2, 2]; + let NumMicroOps = 3; +} +def : InstRW<[WriteVTESTYLd], (instregex "VTESTP(S|D)Yrm")>; +def : InstRW<[WriteVTESTYLd], (instregex "VPTESTYrm")>; + +def WriteVTEST: SchedWriteRes<[JFPU0]> { + let Latency = 3; +} +def : InstRW<[WriteVTEST], (instregex "VTESTP(S|D)rr")>; +def : InstRW<[WriteVTEST], (instregex "VPTESTrr")>; + +def WriteVTESTLd: SchedWriteRes<[JLAGU, JFPU0]> { + let Latency = 8; +} +def : InstRW<[WriteVTESTLd], (instregex "VTESTP(S|D)rm")>; +def : InstRW<[WriteVTESTLd], (instregex "VPTESTrm")>; + def WriteVSQRTYPD: SchedWriteRes<[JFPU1]> { let Latency = 54; let ResourceCycles = [54]; @@ -462,5 +739,16 @@ def WriteVSQRTYPSLd: SchedWriteRes<[JLAGU, JFPU1]> { } def : InstRW<[WriteVSQRTYPSLd], (instregex "VSQRTPSYm")>; +def WriteJVZEROALL: SchedWriteRes<[]> { + let Latency = 90; + let NumMicroOps = 73; +} +def : InstRW<[WriteJVZEROALL], (instregex "VZEROALL")>; + +def WriteJVZEROUPPER: SchedWriteRes<[]> { + let Latency = 46; + let NumMicroOps = 37; +} +def : InstRW<[WriteJVZEROUPPER], (instregex "VZEROUPPER")>; } // SchedModel diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td index 03ed2db2350d..35ec7488db72 100644 --- a/lib/Target/X86/X86ScheduleSLM.td +++ b/lib/Target/X86/X86ScheduleSLM.td @@ -32,7 +32,6 @@ def SLMModel : SchedMachineModel { let SchedModel = SLMModel in { // Silvermont has 5 reservation stations for micro-ops - def IEC_RSV0 : ProcResource<1>; def IEC_RSV1 : ProcResource<1>; def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; } @@ -78,6 +77,9 @@ def : WriteRes<WriteLoad, [MEC_RSV]> { let Latency = 3; } def : WriteRes<WriteMove, [IEC_RSV01]>; def : WriteRes<WriteZero, []>; +// Treat misc copies as a move. 
+def : InstRW<[WriteMove], (instrs COPY)>; + defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>; defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>; defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>; @@ -249,7 +251,7 @@ def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; } def : WriteRes<WriteFence, [MEC_RSV]>; def : WriteRes<WriteNop, []>; -// AVX is not supported on that architecture, but we should define the basic +// AVX/FMA is not supported on that architecture, but we should define the basic // scheduling resources anyway. def : WriteRes<WriteIMulH, [FPC_RSV0]>; defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>; @@ -257,4 +259,5 @@ defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>; defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>; defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>; defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>; +defm : SMWriteResPair<WriteFMA, FPC_RSV0, 1>; } // SchedModel diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td index d5b4cfe2ddee..a4e5327213c2 100644 --- a/lib/Target/X86/X86ScheduleZnver1.td +++ b/lib/Target/X86/X86ScheduleZnver1.td @@ -92,7 +92,7 @@ def ZnDivider : ProcResource<1>; def : ReadAdvance<ReadAfterLd, 4>; // (a folded load is an instruction that loads and does some operation) -// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops +// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops // Instructions with folded loads are usually micro-fused, so they only appear // as two micro-ops. // a. load and @@ -104,9 +104,10 @@ multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW, // Register variant takes 1-cycle on Execution Port. def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; } - // Memory variant also uses a cycle on ZnAGU + // Memory variant also uses a cycle on ZnAGU // adds 4 cycles to the latency. def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> { + let NumMicroOps = 2; let Latency = !add(Lat, 4); } } @@ -125,7 +126,7 @@ multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW, } } -// WriteRMW is set for instructions with Memory write +// WriteRMW is set for instructions with Memory write // operation in codegen def : WriteRes<WriteRMW, [ZnAGU]>; @@ -139,6 +140,9 @@ defm : ZnWriteResPair<WriteALU, ZnALU, 1>; defm : ZnWriteResPair<WriteShift, ZnALU, 1>; defm : ZnWriteResPair<WriteJump, ZnALU, 1>; +// Treat misc copies as a move. +def : InstRW<[WriteMove], (instrs COPY)>; + // IDIV def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> { let Latency = 41; @@ -174,6 +178,7 @@ defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>; defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>; defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>; defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>; +defm : ZnWriteResFpuPair<WriteFMA, ZnFPU03, 5>; defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>; defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>; defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>; @@ -220,4 +225,1550 @@ let Latency = 100 in { def : WriteRes<WritePCmpIStrI, []>; def : WriteRes<WritePCmpIStrILd, []>; } + +//=== Regex based itineraries ===// +// Notation: +// - r: register. +// - m = memory. +// - i = immediate +// - mm: 64 bit mmx register. +// - x = 128 bit xmm register. +// - (x)mm = mmx or xmm register. +// - y = 256 bit ymm register. +// - v = any vector register. + +//=== Integer Instructions ===// +//-- Move instructions --// +// MOV. +// r16,m. +def : InstRW<[WriteALULd, ReadAfterLd], (instregex "MOV16rm")>; + +// MOVSX, MOVZX. +// r,m. 
+def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>; + +// CMOVcc. +// r,r. +def : InstRW<[WriteALU], + (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>; +// r,m. +def : InstRW<[WriteALULd, ReadAfterLd], + (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>; + +// XCHG. +// r,r. +def ZnWriteXCHG : SchedWriteRes<[ZnALU]> { + let NumMicroOps = 2; + let ResourceCycles = [2]; +} + +def : InstRW<[ZnWriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>; + +// r,m. +def ZnWriteXCHGrm : SchedWriteRes<[ZnAGU, ZnALU]> { + let Latency = 5; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>; + +def : InstRW<[WriteMicrocoded], (instregex "XLAT")>; + +// POP16. +// r. +def ZnWritePop16r : SchedWriteRes<[ZnAGU]>{ + let Latency = 5; + let NumMicroOps = 2; +} +def : InstRW<[ZnWritePop16r], (instregex "POP16rmm")>; +def : InstRW<[WriteMicrocoded], (instregex "POPF(16|32)")>; +def : InstRW<[WriteMicrocoded], (instregex "POPA(16|32)")>; + + +// PUSH. +// r. Has default values. +// m. +def ZnWritePUSH : SchedWriteRes<[ZnAGU]>{ + let Latency = 4; +} +def : InstRW<[ZnWritePUSH], (instregex "PUSH(16|32)rmm")>; + +//PUSHF +def : InstRW<[WriteMicrocoded], (instregex "PUSHF(16|32)")>; + +// PUSHA. +def ZnWritePushA : SchedWriteRes<[ZnAGU]> { + let Latency = 8; +} +def : InstRW<[ZnWritePushA], (instregex "PUSHA(16|32)")>; + +//LAHF +def : InstRW<[WriteMicrocoded], (instregex "LAHF")>; + +// SAHF. +def ZnWriteSAHF : SchedWriteRes<[ZnALU]> { + let Latency = 2; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteSAHF], (instregex "SAHF")>; + +// BSWAP. +def ZnWriteBSwap : SchedWriteRes<[ZnALU]> { + let ResourceCycles = [4]; +} +def : InstRW<[ZnWriteBSwap], (instregex "BSWAP")>; + +// MOVBE. +// r,m. +def ZnWriteMOVBE : SchedWriteRes<[ZnAGU, ZnALU]> { + let Latency = 5; +} +def : InstRW<[ZnWriteMOVBE, ReadAfterLd], (instregex "MOVBE(16|32|64)rm")>; + +// m16,r16. +def : InstRW<[ZnWriteMOVBE], (instregex "MOVBE(16|32|64)mr")>; + +//-- Arithmetic instructions --// + +// ADD SUB. +// m,r/i. +def : InstRW<[WriteALULd], (instregex "(ADD|SUB)(8|16|32|64)m(r|i)", + "(ADD|SUB)(8|16|32|64)mi8", + "(ADD|SUB)64mi32")>; + +// ADC SBB. +// r,r/i. +def : InstRW<[WriteALU], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)", + "(ADC|SBB)(16|32|64)ri8", + "(ADC|SBB)64ri32", + "(ADC|SBB)(8|16|32|64)rr_REV")>; + +// r,m. +def : InstRW<[WriteALULd, ReadAfterLd], + (instregex "(ADC|SBB)(8|16|32|64)rm")>; + +// m,r/i. +def : InstRW<[WriteALULd], + (instregex "(ADC|SBB)(8|16|32|64)m(r|i)", + "(ADC|SBB)(16|32|64)mi8", + "(ADC|SBB)64mi32")>; + +// INC DEC NOT NEG. +// m. +def : InstRW<[WriteALULd], + (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m", + "(INC|DEC)64(16|32)m")>; + +// MUL IMUL. +// r16. +def ZnWriteMul16 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { + let Latency = 3; +} +def : InstRW<[ZnWriteMul16], (instregex "IMUL16r", "MUL16r")>; + +// m16. +def ZnWriteMul16Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 8; +} +def : InstRW<[ZnWriteMul16Ld, ReadAfterLd], (instregex "IMUL16m", "MUL16m")>; + +// r32. +def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { + let Latency = 3; +} +def : InstRW<[ZnWriteMul32], (instregex "IMUL32r", "MUL32r")>; + +// m32. +def ZnWriteMul32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 8; +} +def : InstRW<[ZnWriteMul32Ld, ReadAfterLd], (instregex "IMUL32m", "MUL32m")>; + +// r64. 
+def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { + let Latency = 4; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteMul64], (instregex "IMUL64r", "MUL64r")>; + +// m64. +def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 9; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteMul64Ld, ReadAfterLd], (instregex "IMUL64m", "MUL64m")>; + +// r16,r16. +def ZnWriteMul16rri : SchedWriteRes<[ZnALU1, ZnMultiplier]> { + let Latency = 3; +} +def : InstRW<[ZnWriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>; + +// r16,m16. +def ZnWriteMul16rmi : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 8; +} +def : InstRW<[ZnWriteMul16rmi, ReadAfterLd], (instregex "IMUL16rmi", "IMUL16rmi8")>; + +// MULX. +// r32,r32,r32. +def ZnWriteMulX32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> { + let Latency = 3; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWriteMulX32], (instregex "MULX32rr")>; + +// r32,r32,m32. +def ZnWriteMulX32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 8; + let ResourceCycles = [1, 2, 2]; +} +def : InstRW<[ZnWriteMulX32Ld, ReadAfterLd], (instregex "MULX32rm")>; + +// r64,r64,r64. +def ZnWriteMulX64 : SchedWriteRes<[ZnALU1]> { + let Latency = 3; +} +def : InstRW<[ZnWriteMulX64], (instregex "MULX64rr")>; + +// r64,r64,m64. +def ZnWriteMulX64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> { + let Latency = 8; +} +def : InstRW<[ZnWriteMulX64Ld, ReadAfterLd], (instregex "MULX64rm")>; + +// DIV, IDIV. +// r8. +def ZnWriteDiv8 : SchedWriteRes<[ZnALU2, ZnDivider]> { + let Latency = 15; +} +def : InstRW<[ZnWriteDiv8], (instregex "DIV8r", "IDIV8r")>; + +// r16. +def ZnWriteDiv16 : SchedWriteRes<[ZnALU2, ZnDivider]> { + let Latency = 17; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteDiv16], (instregex "DIV16r", "IDIV16r")>; + +// r32. +def ZnWriteDiv32 : SchedWriteRes<[ZnALU2, ZnDivider]> { + let Latency = 25; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteDiv32], (instregex "DIV32r", "IDIV32r")>; + +// r64. +def ZnWriteDiv64 : SchedWriteRes<[ZnALU2, ZnDivider]> { + let Latency = 41; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteDiv64], (instregex "DIV64r", "IDIV64r")>; + +//-- Control transfer instructions --// + +// J(E|R)CXZ. +def ZnWriteJCXZ : SchedWriteRes<[ZnALU03]>; +def : InstRW<[ZnWriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>; + +// INTO +def : InstRW<[WriteMicrocoded], (instregex "INTO")>; + +// LOOP. +def ZnWriteLOOP : SchedWriteRes<[ZnALU03]>; +def : InstRW<[ZnWriteLOOP], (instregex "LOOP")>; + +// LOOP(N)E, LOOP(N)Z +def ZnWriteLOOPE : SchedWriteRes<[ZnALU03]>; +def : InstRW<[ZnWriteLOOPE], (instregex "LOOPE", "LOOPNE", + "LOOPZ", "LOOPNZ")>; + +// CALL. +// r. +def ZnWriteCALLr : SchedWriteRes<[ZnAGU, ZnALU03]>; +def : InstRW<[ZnWriteCALLr], (instregex "CALL(16|32)r")>; + +def : InstRW<[WriteMicrocoded], (instregex "CALL(16|32)m")>; + +// RET. +def ZnWriteRET : SchedWriteRes<[ZnALU03]> { + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)", + "IRET(D|Q)", "RETF")>; + +//-- Logic instructions --// + +// AND OR XOR. +// m,r/i. +def : InstRW<[WriteALULd], + (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)", + "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>; + +// ANDN. +// r,r. +def : InstRW<[WriteALU], (instregex "ANDN(32|64)rr")>; +// r,m. 
+def : InstRW<[WriteALULd, ReadAfterLd], (instregex "ANDN(32|64)rm")>; + +// Define ALU latency variants +def ZnWriteALULat2 : SchedWriteRes<[ZnALU]> { + let Latency = 2; +} +def ZnWriteALULat2Ld : SchedWriteRes<[ZnAGU, ZnALU]> { + let Latency = 6; +} + +def ZnWriteALULat3 : SchedWriteRes<[ZnALU]> { + let Latency = 3; +} +def ZnWriteALULat3Ld : SchedWriteRes<[ZnAGU, ZnALU]> { + let Latency = 7; +} + +// BSF BSR. +// r,r. +def : InstRW<[ZnWriteALULat3], (instregex "BS(R|F)(16|32|64)rr")>; +// r,m. +def : InstRW<[ZnWriteALULat3Ld, ReadAfterLd], (instregex "BS(R|F)(16|32|64)rm")>; + +// BT. +// r,r/i. +def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>; + +def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mr")>; +def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>; + +// BTR BTS BTC. +// r,r,i. +def ZnWriteBTRSC : SchedWriteRes<[ZnALU]> { + let Latency = 2; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteBTRSC], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>; + + +// m,r,i. +def ZnWriteBTRSCm : SchedWriteRes<[ZnAGU, ZnALU]> { + let Latency = 6; + let NumMicroOps = 2; +} +// m,r,i. +def : InstRW<[ZnWriteBTRSCm], (instregex "BT(R|S|C)(16|32|64)m(r|i8)")>; + +// BLSI BLSMSK BLSR. +// r,r. +def : InstRW<[ZnWriteALULat2], (instregex "BLS(I|MSK|R)(32|64)rr")>; +// r,m. +def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "BLS(I|MSK|R)(32|64)rm")>; + +// BEXTR. +// r,r,r. +def : InstRW<[WriteALU], (instregex "BEXTR(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BEXTR(32|64)rm")>; + +// BZHI. +// r,r,r. +def : InstRW<[WriteALU], (instregex "BZHI(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BZHI(32|64)rm")>; + +// CLD STD. +def : InstRW<[WriteALU], (instregex "STD", "CLD")>; + +// PDEP PEXT. +// r,r,r. +def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>; +// r,m,r. +def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>; + +// ROR ROL. +def : InstRW<[WriteShift], (instregex "RO(R|L)(8|16|32|64)r1")>; + +// RCR RCL. +// r,1. +def : InstRW<[WriteShift], (instregex "RC(R|L)(8|16|32|64)r1")>; + +// m,1. +def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m1")>; + +// i. +def : InstRW<[WriteShift], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>; + +// m,i. +def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>; + +// SHR SHL SAR. +// m,i. +def : InstRW<[WriteShiftLd], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>; + +// SHRD SHLD. +// r,r +def : InstRW<[WriteShift], (instregex "SH(R|L)D(16|32|64)rri8")>; + +// m,r +def : InstRW<[WriteShiftLd], (instregex "SH(R|L)D(16|32|64)mri8")>; + +// r,r,cl. +def : InstRW<[WriteMicrocoded], (instregex "SHLD(16|32|64)rrCL")>; + +// r,r,cl. +def : InstRW<[WriteMicrocoded], (instregex "SHRD(16|32|64)rrCL")>; + +// m,r,cl. +def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)mrCL")>; + +// SETcc. +// r. +def : InstRW<[WriteShift], + (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>; +// m. +def : InstRW<[WriteShift], + (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>; + +// LZCNT TZCNT. +// r,r. +def : InstRW<[ZnWriteALULat2], (instregex "(LZCNT|TZCNT)(16|32|64)rr")>; +// r,m. +def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "(LZCNT|TZCNT)(16|32|64)rm")>; + +//-- Misc instructions --// +// CMPXCHG. 
+def ZnWriteCMPXCHG : SchedWriteRes<[ZnAGU, ZnALU]> {
+  let Latency = 8;
+  let NumMicroOps = 5;
+}
+def : InstRW<[ZnWriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
+
+// CMPXCHG8B.
+def ZnWriteCMPXCHG8B : SchedWriteRes<[ZnAGU, ZnALU]> {
+  let NumMicroOps = 18;
+}
+def : InstRW<[ZnWriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
+
+def : InstRW<[WriteMicrocoded], (instregex "CMPXCHG16B")>;
+
+// LEAVE
+def ZnWriteLEAVE : SchedWriteRes<[ZnALU, ZnAGU]> {
+  let Latency = 8;
+  let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteLEAVE], (instregex "LEAVE")>;
+
+// PAUSE.
+def : InstRW<[WriteMicrocoded], (instregex "PAUSE")>;
+
+// RDTSC.
+def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>;
+
+// RDPMC.
+def : InstRW<[WriteMicrocoded], (instregex "RDPMC")>;
+
+// RDRAND.
+def : InstRW<[WriteMicrocoded], (instregex "RDRAND(16|32|64)r")>;
+
+// XGETBV.
+def : InstRW<[WriteMicrocoded], (instregex "XGETBV")>;
+
+//-- String instructions --//
+// CMPS.
+def : InstRW<[WriteMicrocoded], (instregex "CMPS(B|L|Q|W)")>;
+
+// LODSB/W.
+def : InstRW<[WriteMicrocoded], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[WriteMicrocoded], (instregex "LODS(L|Q)")>;
+
+// MOVS.
+def : InstRW<[WriteMicrocoded], (instregex "MOVS(B|L|Q|W)")>;
+
+// SCAS.
+def : InstRW<[WriteMicrocoded], (instregex "SCAS(B|W|L|Q)")>;
+
+// STOS
+def : InstRW<[WriteMicrocoded], (instregex "STOS(B|L|Q|W)")>;
+
+// XADD.
+def : InstRW<[WriteMicrocoded], (instregex "XADD(8|16|32|64)rm")>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+def ZnWriteFLDr : SchedWriteRes<[ZnFPU13]>;
+
+def ZnWriteSTr: SchedWriteRes<[ZnFPU23]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+// LD_F.
+// r.
+def : InstRW<[ZnWriteFLDr], (instregex "LD_Frr")>;
+
+// m.
+def ZnWriteLD_F80m : SchedWriteRes<[ZnAGU, ZnFPU13]> {
+  let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteLD_F80m], (instregex "LD_F80m")>;
+
+// FBLD.
+def : InstRW<[WriteMicrocoded], (instregex "FBLDm")>;
+
+// FST(P).
+// r.
+def : InstRW<[ZnWriteSTr], (instregex "ST_(F|FP)rr")>;
+
+// m80.
+def ZnWriteST_FP80m : SchedWriteRes<[ZnAGU, ZnFPU23]> {
+  let Latency = 5;
+}
+def : InstRW<[ZnWriteST_FP80m], (instregex "ST_FP80m")>;
+
+// FBSTP.
+// m80.
+def : InstRW<[WriteMicrocoded], (instregex "FBSTPm")>;
+
+def ZnWriteFXCH : SchedWriteRes<[ZnFPU]>;
+
+// FXCH.
+def : InstRW<[ZnWriteFXCH], (instregex "XCH_F")>;
+
+// FILD.
+def ZnWriteFILD : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+  let Latency = 11;
+  let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteFILD], (instregex "ILD_F(16|32|64)m")>;
+
+// FIST(P) FISTTP.
+def ZnWriteFIST : SchedWriteRes<[ZnAGU, ZnFPU23]> {
+  let Latency = 12;
+}
+def : InstRW<[ZnWriteFIST], (instregex "IS(T|TT)_(F|FP)(16|32|64)m")>;
+
+def ZnWriteFPU13 : SchedWriteRes<[ZnAGU, ZnFPU13]> {
+  let Latency = 8;
+}
+
+def ZnWriteFPU3 : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+  let Latency = 11;
+}
+
+// FLDZ.
+def : InstRW<[ZnWriteFPU13], (instregex "LD_F0")>;
+
+// FLD1.
+def : InstRW<[ZnWriteFPU3], (instregex "LD_F1")>;
+
+// FLDPI FLDL2E etc.
+def : InstRW<[ZnWriteFPU3], (instregex "FLDPI", "FLDL2(T|E)", "FLDL(G|N)2")>;
+
+def : InstRW<[WriteMicrocoded], (instregex "CMOV(B|BE|E|P|NB|NBE|NE|NP)_F")>;
+
+// FNSTSW.
+// AX.
+def : InstRW<[WriteMicrocoded], (instregex "FNSTSW16r")>;
+
+// m16.
+def : InstRW<[WriteMicrocoded], (instregex "FNSTSWm")>;
+
+// FLDCW.
+def : InstRW<[WriteMicrocoded], (instregex "FLDCW16m")>;
+
+// FNSTCW.
+def : InstRW<[WriteMicrocoded], (instregex "FNSTCW16m")>;
+
+// FINCSTP FDECSTP.
+def : InstRW<[ZnWriteFPU3], (instregex "FINCSTP", "FDECSTP")>; + +// FFREE. +def : InstRW<[ZnWriteFPU3], (instregex "FFREE")>; + +// FNSAVE. +def : InstRW<[WriteMicrocoded], (instregex "FSAVEm")>; + +// FRSTOR. +def : InstRW<[WriteMicrocoded], (instregex "FRSTORm")>; + +//-- Arithmetic instructions --// + +def ZnWriteFPU3Lat2 : SchedWriteRes<[ZnFPU3]> { + let Latency = 2; +} + +def ZnWriteFPU3Lat2Ld : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 9; +} + +def ZnWriteFPU3Lat1 : SchedWriteRes<[ZnFPU3]> ; + +def ZnWriteFPU0Lat1 : SchedWriteRes<[ZnFPU0]> ; + +def ZnWriteFPU0Lat1Ld : SchedWriteRes<[ZnAGU, ZnFPU0]> { + let Latency = 8; +} + +// FABS. +def : InstRW<[ZnWriteFPU3Lat2], (instregex "ABS_F")>; + +// FCHS. +def : InstRW<[ZnWriteFPU3Lat1], (instregex "CHS_F")>; + +// FCOM(P) FUCOM(P). +// r. +def : InstRW<[ZnWriteFPU0Lat1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr", + "UCOM_FPr")>; +// m. +def : InstRW<[ZnWriteFPU0Lat1Ld], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>; + +// FCOMPP FUCOMPP. +// r. +def : InstRW<[ZnWriteFPU0Lat1], (instregex "FCOMPP", "UCOM_FPPr")>; + +def ZnWriteFPU02 : SchedWriteRes<[ZnAGU, ZnFPU02]> +{ + let Latency = 9; +} + +// FCOMI(P) FUCOMI(P). +// m. +def : InstRW<[ZnWriteFPU02], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr", + "UCOM_FIPr")>; + +def ZnWriteFPU03 : SchedWriteRes<[ZnAGU, ZnFPU03]> +{ + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,3]; +} + +// FICOM(P). +def : InstRW<[ZnWriteFPU03], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>; + +// FTST. +def : InstRW<[ZnWriteFPU0Lat1], (instregex "TST_F")>; + +// FXAM. +def : InstRW<[ZnWriteFPU3Lat1], (instregex "FXAM")>; + +// FPREM. +def : InstRW<[WriteMicrocoded], (instregex "FPREM")>; + +// FPREM1. +def : InstRW<[WriteMicrocoded], (instregex "FPREM1")>; + +// FRNDINT. +def : InstRW<[WriteMicrocoded], (instregex "FRNDINT")>; + +// FSCALE. +def : InstRW<[WriteMicrocoded], (instregex "FSCALE")>; + +// FXTRACT. +def : InstRW<[WriteMicrocoded], (instregex "FXTRACT")>; + +// FNOP. +def : InstRW<[ZnWriteFPU0Lat1], (instregex "FNOP")>; + +// WAIT. +def : InstRW<[ZnWriteFPU0Lat1], (instregex "WAIT")>; + +// FNCLEX. +def : InstRW<[WriteMicrocoded], (instregex "FNCLEX")>; + +// FNINIT. +def : InstRW<[WriteMicrocoded], (instregex "FNINIT")>; + +//=== Integer MMX and XMM Instructions ===// +//-- Move instructions --// + +// Moves from GPR to FPR incurs a penalty +def ZnWriteFPU2 : SchedWriteRes<[ZnFPU2]> { + let Latency = 3; +} + +// Move to ALU doesn't incur penalty +def ZnWriteToALU2 : SchedWriteRes<[ZnFPU2]> { + let Latency = 2; +} + +def ZnWriteFPU : SchedWriteRes<[ZnFPU]>; +def ZnWriteFPUY : SchedWriteRes<[ZnFPU]> { + let NumMicroOps = 2; + let Latency=2; +} + +// MOVD. +// r32/64 <- (x)mm. +def : InstRW<[ZnWriteToALU2], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr", + "VMOVPDI2DIrr", "MOVPDI2DIrr")>; + +// (x)mm <- r32/64. +def : InstRW<[ZnWriteFPU2], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr", + "VMOVDI2PDIrr", "MOVDI2PDIrr")>; + +// MOVQ. +// r64 <- (x)mm. +def : InstRW<[ZnWriteToALU2], (instregex "VMOVPQIto64rr")>; + +// (x)mm <- r64. +def : InstRW<[ZnWriteFPU2], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>; + +// (x)mm <- (x)mm. +def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVQ64rr")>; + +// (V)MOVDQA/U. +// x <- x. +def : InstRW<[ZnWriteFPU], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr", + "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV")>; + +// y <- y. +def : InstRW<[ZnWriteFPUY], (instregex "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>; + +// MOVDQ2Q. 
+def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVDQ2Qrr")>; + +// MOVQ2DQ. +def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVQ2DQrr")>; + +// PACKSSWB/DW. +// mm <- mm. +def ZnWriteFPU12 : SchedWriteRes<[ZnFPU12]> ; +def ZnWriteFPU12Y : SchedWriteRes<[ZnFPU12]> { + let NumMicroOps = 2; +} +def ZnWriteFPU12m : SchedWriteRes<[ZnAGU, ZnFPU12]> ; + +def : InstRW<[ZnWriteFPU12], (instregex "MMX_PACKSSDWirr", + "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>; +def : InstRW<[ZnWriteFPU12m], (instregex "MMX_PACKSSDWirm", + "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>; + +// VPMOVSX/ZX BW BD BQ DW DQ. +// y <- x. +def : InstRW<[ZnWriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>; + +def ZnWriteFPU013 : SchedWriteRes<[ZnFPU013]> ; +def ZnWriteFPU013Y : SchedWriteRes<[ZnFPU013]> { + let Latency = 2; +} +def ZnWriteFPU013m : SchedWriteRes<[ZnAGU, ZnFPU013]> { + let Latency = 8; + let NumMicroOps = 2; +} +def ZnWriteFPU013Ld : SchedWriteRes<[ZnAGU, ZnFPU013]> { + let Latency = 8; + let NumMicroOps = 2; +} +def ZnWriteFPU013LdY : SchedWriteRes<[ZnAGU, ZnFPU013]> { + let Latency = 9; + let NumMicroOps = 2; +} + +// PBLENDW. +// x,x,i / v,v,v,i +def : InstRW<[ZnWriteFPU013], (instregex "(V?)PBLENDWrri")>; +// ymm +def : InstRW<[ZnWriteFPU013Y], (instregex "(V?)PBLENDWYrri")>; + +// x,m,i / v,v,m,i +def : InstRW<[ZnWriteFPU013Ld], (instregex "(V?)PBLENDWrmi")>; +// y,m,i +def : InstRW<[ZnWriteFPU013LdY], (instregex "(V?)PBLENDWYrmi")>; + +def ZnWriteFPU01 : SchedWriteRes<[ZnFPU01]> ; +def ZnWriteFPU01Y : SchedWriteRes<[ZnFPU01]> { + let NumMicroOps = 2; +} + +// VPBLENDD. +// v,v,v,i. +def : InstRW<[ZnWriteFPU01], (instregex "VPBLENDDrri")>; +// ymm +def : InstRW<[ZnWriteFPU01Y], (instregex "VPBLENDDYrri")>; + +// v,v,m,i +def ZnWriteFPU01Op2 : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let NumMicroOps = 2; + let Latency = 8; + let ResourceCycles = [1, 2]; +} +def ZnWriteFPU01Op2Y : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let NumMicroOps = 2; + let Latency = 9; + let ResourceCycles = [1, 3]; +} +def : InstRW<[ZnWriteFPU01Op2], (instregex "VPBLENDDrmi")>; +def : InstRW<[ZnWriteFPU01Op2Y], (instregex "VPBLENDDYrmi")>; + +// MASKMOVQ. +def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>; + +// MASKMOVDQU. +def : InstRW<[WriteMicrocoded], (instregex "(V?)MASKMOVDQU(64)?")>; + +// VPMASKMOVQ. +// ymm +def : InstRW<[ZnWriteFPU01Op2],(instregex "VPMASKMOVQrm")>; +def : InstRW<[ZnWriteFPU01Op2Y],(instregex "VPMASKMOVQYrm")>; + +def : InstRW<[WriteMicrocoded], + (instregex "VPMASKMOVD(Y?)rm")>; +// m, v,v. +def : InstRW<[WriteMicrocoded], (instregex "VPMASKMOV(D|Q)(Y?)mr")>; + +// PMOVMSKB. +def ZnWritePMOVMSKB : SchedWriteRes<[ZnFPU2]> { + let NumMicroOps = 2; +} +def ZnWritePMOVMSKBY : SchedWriteRes<[ZnFPU2]> { + let Latency = 2; +} +def : InstRW<[ZnWritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKBrr")>; +def : InstRW<[ZnWritePMOVMSKBY], (instregex "(V|MMX_)?PMOVMSKBYrr")>; + +// PEXTR B/W/D/Q. +// r32,x,i. +def ZnWritePEXTRr : SchedWriteRes<[ZnFPU12, ZnFPU2]> { + let Latency = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>; + +def ZnWritePEXTRm : SchedWriteRes<[ZnAGU, ZnFPU12, ZnFPU2]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 2, 3]; +} +// m8,x,i. +def : InstRW<[ZnWritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>; + +// VPBROADCAST B/W. +// x, m8/16. 
+def ZnWriteVPBROADCAST128Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWriteVPBROADCAST128Ld], + (instregex "VPBROADCAST(B|W)rm")>; + +// y, m8/16 +def ZnWriteVPBROADCAST256Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWriteVPBROADCAST256Ld], + (instregex "VPBROADCAST(B|W)Yrm")>; + +// VPGATHER. +def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>; + +//-- Arithmetic instructions --// + +// HADD, HSUB PS/PD +// PHADD|PHSUB (S) W/D. +def : InstRW<[WriteMicrocoded], (instregex "MMX_PHADD(W?)r(r|m)64", + "MMX_PHADDSWr(r|m)64", + "MMX_PHSUB(W|D)r(r|m)64", + "MMX_PHSUBSWrr64", + "(V?)PH(ADD|SUB)(W|D)(Y?)r(r|m)", + "(V?)PH(ADD|SUB)SWr(r|m)(256)?")>; + + +// PCMPGTQ. +def ZnWritePCMPGTQr : SchedWriteRes<[ZnFPU03]>; +def : InstRW<[ZnWritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>; + +// x <- x,m. +def ZnWritePCMPGTQm : SchedWriteRes<[ZnAGU, ZnFPU03]> { + let Latency = 8; +} +// ymm. +def ZnWritePCMPGTQYm : SchedWriteRes<[ZnAGU, ZnFPU03]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1,2]; +} +def : InstRW<[ZnWritePCMPGTQm], (instregex "(V?)PCMPGTQrm")>; +def : InstRW<[ZnWritePCMPGTQYm], (instregex "(V?)PCMPGTQYrm")>; + +// PMULLD. +// x,x. +def ZnWritePMULLDr : SchedWriteRes<[ZnFPU0]> { + let Latency = 4; +} +// ymm. +def ZnWritePMULLDYr : SchedWriteRes<[ZnFPU0]> { + let Latency = 5; + let ResourceCycles = [2]; +} +def : InstRW<[ZnWritePMULLDr], (instregex "(V?)PMULLDrr")>; +def : InstRW<[ZnWritePMULLDYr], (instregex "(V?)PMULLDYrr")>; + +// x,m. +def ZnWritePMULLDm : SchedWriteRes<[ZnAGU, ZnFPU0]> { + let Latency = 11; + let NumMicroOps = 2; +} +// y,m. +def ZnWritePMULLDYm : SchedWriteRes<[ZnAGU, ZnFPU0]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWritePMULLDm], (instregex "(V?)PMULLDrm")>; +def : InstRW<[ZnWritePMULLDYm], (instregex "(V?)PMULLDYrm")>; + +//-- Logic instructions --// + +// PTEST. +// v,v. +def ZnWritePTESTr : SchedWriteRes<[ZnFPU12]> { + let ResourceCycles = [2]; +} +def : InstRW<[ZnWritePTESTr], (instregex "(V?)PTEST(Y?)rr")>; + +// v,m. +def ZnWritePTESTm : SchedWriteRes<[ZnAGU, ZnFPU12]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWritePTESTm], (instregex "(V?)PTEST(Y?)rm")>; + +// PSLL,PSRL,PSRA W/D/Q. +// x,x / v,v,x. +def ZnWritePShift : SchedWriteRes<[ZnFPU2]> ; +def ZnWritePShiftY : SchedWriteRes<[ZnFPU2]> { + let Latency = 2; +} +def ZnWritePShiftLd : SchedWriteRes<[ZnAGU,ZnFPU2]> { + let Latency = 8; +} +def ZnWritePShiftYLd : SchedWriteRes<[ZnAGU, ZnFPU2]> { + let Latency = 9; +} +def : InstRW<[ZnWritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)rr")>; +def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)Yrr")>; + +def : InstRW<[ZnWritePShiftLd], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)rm")>; +def : InstRW<[ZnWritePShiftYLd], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)Yrm")>; + +// PSLL,PSRL DQ. +def : InstRW<[ZnWritePShift], (instregex "(V?)PS(R|L)LDQri")>; +def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(R|L)LDQYri")>; + +//=== Floating Point XMM and YMM Instructions ===// +//-- Move instructions --// + +// MOVMSKP S/D. +// r32 <- x,y. +def ZnWriteMOVMSKPr : SchedWriteRes<[ZnFPU2]> ; +def : InstRW<[ZnWriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)(Y?)rr")>; + +// VPERM2F128. 
+def : InstRW<[WriteMicrocoded], (instregex "VPERM2F128rr")>; +def : InstRW<[WriteMicrocoded], (instregex "VPERM2F128rm")>; + +// BLENDVP S/D. +def ZnWriteFPU01Lat3 : SchedWriteRes<[ZnFPU013]> { + let Latency = 3; +} +def ZnWriteFPU01Lat3Ld : SchedWriteRes<[ZnAGU, ZnFPU013]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWriteFPU01Lat3], (instregex "BLENDVP(S|D)rr0")>; +def : InstRW<[ZnWriteFPU01Lat3Ld, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>; + +def ZnWriteBROADCAST : SchedWriteRes<[ZnAGU, ZnFPU13]> { + let NumMicroOps = 2; + let Latency = 8; +} +// VBROADCASTF128. +def : InstRW<[ZnWriteBROADCAST], (instregex "VBROADCASTF128")>; + +// EXTRACTPS. +// r32,x,i. +def ZnWriteEXTRACTPSr : SchedWriteRes<[ZnFPU12, ZnFPU2]> { + let Latency = 2; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>; + +def ZnWriteEXTRACTPSm : SchedWriteRes<[ZnAGU,ZnFPU12, ZnFPU2]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [5, 1, 2]; +} +// m32,x,i. +def : InstRW<[ZnWriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>; + +// VEXTRACTF128. +// x,y,i. +def : InstRW<[ZnWriteFPU013], (instregex "VEXTRACTF128rr")>; + +// m128,y,i. +def : InstRW<[ZnWriteFPU013m], (instregex "VEXTRACTF128mr")>; + +def ZnWriteVINSERT128r: SchedWriteRes<[ZnFPU013]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def ZnWriteVINSERT128Ld: SchedWriteRes<[ZnAGU,ZnFPU013]> { + let Latency = 9; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +// VINSERTF128. +// y,y,x,i. +def : InstRW<[ZnWriteVINSERT128r], (instregex "VINSERTF128rr")>; +def : InstRW<[ZnWriteVINSERT128Ld], (instregex "VINSERTF128rm")>; + +// VMASKMOVP S/D. +// x,x,m. +def ZnWriteVMASKMOVPLd : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let Latency = 8; +} +// y,y,m. +def ZnWriteVMASKMOVPLdY : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let Latency = 8; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def ZnWriteVMASKMOVPm : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let Latency = 4; +} +def : InstRW<[ZnWriteVMASKMOVPLd], (instregex "VMASKMOVP(S|D)rm")>; +def : InstRW<[ZnWriteVMASKMOVPLdY], (instregex "VMASKMOVP(S|D)Yrm")>; +def : InstRW<[ZnWriteVMASKMOVPm], (instregex "VMASKMOVP(S|D)mr")>; + +// m256,y,y. +def ZnWriteVMASKMOVPYmr : SchedWriteRes<[ZnAGU,ZnFPU01]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [1, 2]; +} +def : InstRW<[ZnWriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>; + +// VGATHERDPS. +// x. +def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPSrm")>; +// y. +def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPSYrm")>; + +// VGATHERQPS. +// x. +def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPSrm")>; + +// y. +def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPSYrm")>; + +// VGATHERDPD. +// x. +def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPDrm")>; + +// y. +def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPDYrm")>; + +// VGATHERQPD. +// x. +def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPDrm")>; + +// y. +def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPDYrm")>; + +//-- Conversion instructions --// +def ZnWriteCVTPD2PSr: SchedWriteRes<[ZnFPU3]> { + let Latency = 4; +} +// CVTPD2PS. +// x,x. +def : InstRW<[ZnWriteCVTPD2PSr], (instregex "(V?)CVTPD2PSrr")>; + +def ZnWriteCVTPD2PSLd: SchedWriteRes<[ZnAGU,ZnFPU03]> { + let Latency = 11; + let NumMicroOps = 2; + let ResourceCycles = [1,2]; +} +// x,m128. 
+def : InstRW<[ZnWriteCVTPD2PSLd], (instregex "(V?)CVTPD2PS(X?)rm")>;
+
+// x,y.
+def ZnWriteCVTPD2PSYr : SchedWriteRes<[ZnFPU3]> {
+  let Latency = 5;
+}
+def : InstRW<[ZnWriteCVTPD2PSYr], (instregex "(V?)CVTPD2PSYrr")>;
+
+// x,m256.
+def ZnWriteCVTPD2PSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+  let Latency = 11;
+}
+def : InstRW<[ZnWriteCVTPD2PSYLd], (instregex "(V?)CVTPD2PSYrm")>;
+
+// CVTSD2SS.
+// x,x.
+// Same as WriteCVTPD2PSr
+def : InstRW<[ZnWriteCVTPD2PSr], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+
+// x,m64.
+def : InstRW<[ZnWriteCVTPD2PSLd], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+
+// CVTPS2PD.
+// x,x.
+def ZnWriteCVTPS2PDr : SchedWriteRes<[ZnFPU3]> {
+  let Latency = 3;
+}
+def : InstRW<[ZnWriteCVTPS2PDr], (instregex "(V?)CVTPS2PDrr")>;
+
+// x,m64.
+// y,m128.
+def ZnWriteCVTPS2PDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteCVTPS2PDLd], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+
+// y,x.
+def ZnWriteVCVTPS2PDY : SchedWriteRes<[ZnFPU3]> {
+  let Latency = 3;
+}
+def : InstRW<[ZnWriteVCVTPS2PDY], (instregex "VCVTPS2PDYrr")>;
+
+// CVTSS2SD.
+// x,x.
+def ZnWriteCVTSS2SDr : SchedWriteRes<[ZnFPU3]> {
+  let Latency = 4;
+}
+def : InstRW<[ZnWriteCVTSS2SDr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+
+// x,m32.
+def ZnWriteCVTSS2SDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+  let Latency = 11;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteCVTSS2SDLd], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+
+def ZnWriteCVTDQ2PDr: SchedWriteRes<[ZnFPU12,ZnFPU3]> {
+  let Latency = 5;
+}
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>;
+
+// Same as xmm
+// y,x.
+def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "VCVTDQ2PDYrr")>;
+
+def ZnWriteCVTPD2DQr: SchedWriteRes<[ZnFPU12, ZnFPU3]> {
+  let Latency = 5;
+}
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[ZnWriteCVTPD2DQr], (instregex "(V?)CVT(T?)PD2DQrr")>;
+
+def ZnWriteCVTPD2DQLd: SchedWriteRes<[ZnAGU,ZnFPU12,ZnFPU3]> {
+  let Latency = 12;
+  let NumMicroOps = 2;
+}
+// x,m128.
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// same as xmm handling
+// x,y.
+def : InstRW<[ZnWriteCVTPD2DQr], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQYrm")>;
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQ(64)?rm")>;
+
+def ZnWriteCVTPS2PIr: SchedWriteRes<[ZnFPU3]> {
+  let Latency = 4;
+}
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[ZnWriteCVTPS2PDr], (instregex "MMX_CVT(T?)PI2PDirr")>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+def ZnWriteCVSTSI2SSr: SchedWriteRes<[ZnFPU3]> {
+  let Latency = 5;
+}
+// CVTSI2SS.
+// x,r32.
+def : InstRW<[ZnWriteCVSTSI2SSr], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
+
+// same as CVTPD2DQr
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[ZnWriteCVTPD2DQr], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+// same as CVTPD2DQm
+// r32,m32.
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+
+def ZnWriteCVSTSI2SDr: SchedWriteRes<[ZnFPU013, ZnFPU3]> {
+  let Latency = 5;
+}
+// CVTSI2SD.
+// x,r32/64.
+def : InstRW<[ZnWriteCVSTSI2SDr], (instregex "(Int_)?(V?)CVTSI2SD(64)?rr")>;
+
+
+def ZnWriteCVSTSI2SIr: SchedWriteRes<[ZnFPU3, ZnFPU2]> {
+  let Latency = 5;
+}
+def ZnWriteCVSTSI2SILd: SchedWriteRes<[ZnAGU, ZnFPU3, ZnFPU2]> {
+  let Latency = 12;
+}
+// CVTSD2SI.
+// r32/64 +def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(Int_)?CVT(T?)SD2SI(64)?rr")>; +// r32,m32. +def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(Int_)?CVT(T?)SD2SI(64)?rm")>; + + +def ZnWriteVCVSTSI2SIr: SchedWriteRes<[ZnFPU3]> { + let Latency = 5; +} +def ZnWriteVCVSTSI2SILd: SchedWriteRes<[ZnFPU3, ZnAGU]> { + let Latency = 12; +} +// VCVTSD2SI. +// r32/64 +def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(Int_)?VCVT(T?)SD2SI(64)?rr")>; +// r32,m32. +def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(Int_)?VCVT(T?)SD2SI(64)?rm")>; + +// VCVTPS2PH. +// x,v,i. +def : InstRW<[WriteMicrocoded], (instregex "VCVTPS2PH(Y?)rr")>; +// m,v,i. +def : InstRW<[WriteMicrocoded], (instregex "VCVTPS2PH(Y?)mr")>; + +// VCVTPH2PS. +// v,x. +def : InstRW<[WriteMicrocoded], (instregex "VCVTPH2PS(Y?)rr")>; +// v,m. +def : InstRW<[WriteMicrocoded], (instregex "VCVTPH2PS(Y?)rm")>; + +//-- SSE4A instructions --// +// EXTRQ +def ZnWriteEXTRQ: SchedWriteRes<[ZnFPU12, ZnFPU2]> { + let Latency = 2; +} +def : InstRW<[ZnWriteEXTRQ], (instregex "EXTRQ")>; + +// INSERTQ +def ZnWriteINSERTQ: SchedWriteRes<[ZnFPU03,ZnFPU1]> { + let Latency = 4; +} +def : InstRW<[ZnWriteINSERTQ], (instregex "INSERTQ")>; + +// MOVNTSS/MOVNTSD +def ZnWriteMOVNT: SchedWriteRes<[ZnAGU,ZnFPU2]> { + let Latency = 8; +} +def : InstRW<[ZnWriteMOVNT], (instregex "MOVNTS(S|D)")>; + +//-- SHA instructions --// +// SHA256MSG2 +def : InstRW<[WriteMicrocoded], (instregex "SHA256MSG2(Y?)r(r|m)")>; + +// SHA1MSG1, SHA256MSG1 +// x,x. +def ZnWriteSHA1MSG1r : SchedWriteRes<[ZnFPU12]> { + let Latency = 2; + let ResourceCycles = [2]; +} +def : InstRW<[ZnWriteSHA1MSG1r], (instregex "SHA(1|256)MSG1rr")>; +// x,m. +def ZnWriteSHA1MSG1Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> { + let Latency = 9; + let ResourceCycles = [1,2]; +} +def : InstRW<[ZnWriteSHA1MSG1Ld], (instregex "SHA(1|256)MSG1rm")>; + +// SHA1MSG2 +// x,x. +def ZnWriteSHA1MSG2r : SchedWriteRes<[ZnFPU12]> ; +def : InstRW<[ZnWriteSHA1MSG2r], (instregex "SHA1MSG2rr")>; +// x,m. +def ZnWriteSHA1MSG2Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> { + let Latency = 8; +} +def : InstRW<[ZnWriteSHA1MSG2Ld], (instregex "SHA1MSG2rm")>; + +// SHA1NEXTE +// x,x. +def ZnWriteSHA1NEXTEr : SchedWriteRes<[ZnFPU1]> ; +def : InstRW<[ZnWriteSHA1NEXTEr], (instregex "SHA1NEXTErr")>; +// x,m. +def ZnWriteSHA1NEXTELd : SchedWriteRes<[ZnAGU, ZnFPU1]> { + let Latency = 8; +} +def : InstRW<[ZnWriteSHA1NEXTELd], (instregex "SHA1NEXTErm")>; + +// SHA1RNDS4 +// x,x. +def ZnWriteSHA1RNDS4r : SchedWriteRes<[ZnFPU1]> { + let Latency = 6; +} +def : InstRW<[ZnWriteSHA1RNDS4r], (instregex "SHA1RNDS4rr")>; +// x,m. +def ZnWriteSHA1RNDS4Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> { + let Latency = 13; +} +def : InstRW<[ZnWriteSHA1RNDS4Ld], (instregex "SHA1RNDS4rm")>; + +// SHA256RNDS2 +// x,x. +def ZnWriteSHA256RNDS2r : SchedWriteRes<[ZnFPU1]> { + let Latency = 4; +} +def : InstRW<[ZnWriteSHA256RNDS2r], (instregex "SHA256RNDS2rr")>; +// x,m. +def ZnWriteSHA256RNDS2Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> { + let Latency = 11; +} +def : InstRW<[ZnWriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>; + +//-- Arithmetic instructions --// + +// HADD, HSUB PS/PD +def : InstRW<[WriteMicrocoded], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)r(r|m)")>; + +// MULL SS/SD PS/PD. +// x,x / v,v,v. +def ZnWriteMULr : SchedWriteRes<[ZnFPU01]> { + let Latency = 3; +} +// ymm. +def ZnWriteMULYr : SchedWriteRes<[ZnFPU01]> { + let Latency = 4; +} +def : InstRW<[ZnWriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>; +def : InstRW<[ZnWriteMULYr], (instregex "(V?)MUL(P|S)(S|D)Yrr")>; + +// x,m / v,v,m. 
+def ZnWriteMULLd : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let Latency = 10; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteMULLd], (instregex "(V?)MUL(P|S)(S|D)rm")>; + +// ymm +def ZnWriteMULYLd : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let Latency = 11; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteMULYLd], (instregex "(V?)MUL(P|S)(S|D)Yrm")>; + +// VDIVPS. +// y,y,y. +def ZnWriteVDIVPSYr : SchedWriteRes<[ZnFPU3]> { + let Latency = 12; + let ResourceCycles = [12]; +} +def : InstRW<[ZnWriteVDIVPSYr], (instregex "VDIVPSYrr")>; + +// y,y,m256. +def ZnWriteVDIVPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 19; + let NumMicroOps = 2; + let ResourceCycles = [1, 19]; +} +def : InstRW<[ZnWriteVDIVPSYLd], (instregex "VDIVPSYrm")>; + +// VDIVPD. +// y,y,y. +def ZnWriteVDIVPDY : SchedWriteRes<[ZnFPU3]> { + let Latency = 15; + let ResourceCycles = [15]; +} +def : InstRW<[ZnWriteVDIVPDY], (instregex "VDIVPDYrr")>; + +// y,y,m256. +def ZnWriteVDIVPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 22; + let NumMicroOps = 2; + let ResourceCycles = [1,22]; +} +def : InstRW<[ZnWriteVDIVPDYLd], (instregex "VDIVPDYrm")>; + +// VRCPPS. +// y,y. +def ZnWriteVRCPPSr : SchedWriteRes<[ZnFPU01]> { + let Latency = 5; +} +def : InstRW<[ZnWriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>; + +// y,m256. +def ZnWriteVRCPPSLd : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let Latency = 12; + let NumMicroOps = 3; +} +def : InstRW<[ZnWriteVRCPPSLd], (instregex "VRCPPSYm(_Int)?")>; + +// ROUND SS/SD PS/PD. +// v,v,i. +def ZnWriteROUNDr : SchedWriteRes<[ZnFPU3]> { + let Latency = 4; +} +def : InstRW<[ZnWriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>; + +// VFMADD. +// v,v,v. +def ZnWriteFMADDr : SchedWriteRes<[ZnFPU03]> { + let Latency = 5; +} +def : InstRW<[ZnWriteFMADDr], + (instregex + "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(213|132|231)(Y)?r", + "VF(N?)M(ADD|SUB)(132|231|213)S(S|D)r", + "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?", + "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>; + +// v,v,m. +def ZnWriteFMADDm : SchedWriteRes<[ZnAGU, ZnFPU03]> { + let Latency = 12; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteFMADDm], + (instregex + "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)(213|132|231)P(S|D)(Y)?m", + "VF(N?)M(ADD|SUB)(132|231|213)S(S|D)m", + "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?", + "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>; + +// v,m,i. +def ZnWriteROUNDm : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 11; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>; + +// DPPS. +// x,x,i / v,v,v,i. +def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPS(Y?)rri")>; + +// x,m,i / v,v,m,i. +def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPS(Y?)rmi")>; + +// DPPD. +// x,x,i. +def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPDrri")>; + +// x,m,i. +def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPDrmi")>; + +// VSQRTPS. +// y,y. +def ZnWriteVSQRTPSYr : SchedWriteRes<[ZnFPU3]> { + let Latency = 28; + let ResourceCycles = [28]; +} +def : InstRW<[ZnWriteVSQRTPSYr], (instregex "VSQRTPSYr")>; + +// y,m256. +def ZnWriteVSQRTPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 35; + let ResourceCycles = [1,35]; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteVSQRTPSYLd], (instregex "VSQRTPSYm")>; + +// VSQRTPD. +// y,y. +def ZnWriteVSQRTPDYr : SchedWriteRes<[ZnFPU3]> { + let Latency = 40; + let ResourceCycles = [40]; +} +def : InstRW<[ZnWriteVSQRTPDYr], (instregex "VSQRTPDYr")>; + +// y,m256. 
+def ZnWriteVSQRTPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> { + let Latency = 47; + let NumMicroOps = 2; + let ResourceCycles = [1,47]; +} +def : InstRW<[ZnWriteVSQRTPDYLd], (instregex "VSQRTPDYm")>; + +// RSQRTSS +// x,x. +def ZnWriteRSQRTSSr : SchedWriteRes<[ZnFPU02]> { + let Latency = 5; +} +def : InstRW<[ZnWriteRSQRTSSr], (instregex "(V?)RSQRTSS(Y?)r(_Int)?")>; + +// RSQRTPS +// x,x. +def ZnWriteRSQRTPSr : SchedWriteRes<[ZnFPU01]> { + let Latency = 5; +} +def : InstRW<[ZnWriteRSQRTPSr], (instregex "(V?)RSQRTPS(Y?)r(_Int)?")>; + +// RSQRTSSm +// x,m128. +def ZnWriteRSQRTSSLd: SchedWriteRes<[ZnAGU, ZnFPU02]> { + let Latency = 12; + let NumMicroOps = 2; + let ResourceCycles = [1,2]; +} +def : InstRW<[ZnWriteRSQRTSSLd], (instregex "(V?)RSQRTSSm(_Int)?")>; + +// RSQRTPSm +def ZnWriteRSQRTPSLd : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let Latency = 12; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteRSQRTPSLd], (instregex "(V?)RSQRTPSm(_Int)?")>; + +// RSQRTPS 256. +// y,y. +def ZnWriteRSQRTPSYr : SchedWriteRes<[ZnFPU01]> { + let Latency = 5; + let NumMicroOps = 2; + let ResourceCycles = [2]; +} +def : InstRW<[ZnWriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>; + +// y,m256. +def ZnWriteRSQRTPSYLd : SchedWriteRes<[ZnAGU, ZnFPU01]> { + let Latency = 12; + let NumMicroOps = 2; +} +def : InstRW<[ZnWriteRSQRTPSYLd], (instregex "VRSQRTPSYm(_Int)?")>; + +//-- Logic instructions --// + +// AND, ANDN, OR, XOR PS/PD. +// x,x / v,v,v. +def : InstRW<[WriteVecLogic], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>; +// x,m / v,v,m. +def : InstRW<[WriteVecLogicLd], + (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>; + +//-- Other instructions --// + +// VZEROUPPER. +def : InstRW<[WriteMicrocoded], (instregex "VZEROUPPER")>; + +// VZEROALL. +def : InstRW<[WriteMicrocoded], (instregex "VZEROALL")>; + +// LDMXCSR. +def : InstRW<[WriteMicrocoded], (instregex "(V)?LDMXCSR")>; + +// STMXCSR. +def : InstRW<[WriteMicrocoded], (instregex "(V)?STMXCSR")>; + +} // SchedModel diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp index c67aa04aebea..1e04997ad294 100644 --- a/lib/Target/X86/X86SelectionDAGInfo.cpp +++ b/lib/Target/X86/X86SelectionDAGInfo.cpp @@ -17,8 +17,8 @@ #include "X86RegisterInfo.h" #include "X86Subtarget.h" #include "llvm/CodeGen/SelectionDAG.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/DerivedTypes.h" -#include "llvm/Target/TargetLowering.h" using namespace llvm; @@ -247,7 +247,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy( Repeats.AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32; if (Repeats.BytesLeft() > 0 && - DAG.getMachineFunction().getFunction()->optForMinSize()) { + DAG.getMachineFunction().getFunction().optForMinSize()) { // When agressively optimizing for size, avoid generating the code to // handle BytesLeft. 
Repeats.AVT = MVT::i8; diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp index 2cebb76022ef..c7ddf93f8e85 100644 --- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp +++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp @@ -12,10 +12,8 @@ // //===----------------------------------------------------------------------===// -#include "X86ShuffleDecodeConstantPool.h" #include "Utils/X86ShuffleDecode.h" #include "llvm/ADT/APInt.h" -#include "llvm/CodeGen/MachineValueType.h" #include "llvm/IR/Constants.h" //===----------------------------------------------------------------------===// diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp index 24845beac22d..8b08766b6171 100644 --- a/lib/Target/X86/X86Subtarget.cpp +++ b/lib/Target/X86/X86Subtarget.cpp @@ -13,21 +13,15 @@ #include "X86.h" -#ifdef LLVM_BUILD_GLOBAL_ISEL #include "X86CallLowering.h" #include "X86LegalizerInfo.h" #include "X86RegisterBankInfo.h" -#endif #include "X86Subtarget.h" #include "MCTargetDesc/X86BaseInfo.h" #include "X86TargetMachine.h" #include "llvm/ADT/Triple.h" -#ifdef LLVM_BUILD_GLOBAL_ISEL #include "llvm/CodeGen/GlobalISel/CallLowering.h" #include "llvm/CodeGen/GlobalISel/InstructionSelect.h" -#include "llvm/CodeGen/GlobalISel/Legalizer.h" -#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" -#endif #include "llvm/IR/Attributes.h" #include "llvm/IR/ConstantRange.h" #include "llvm/IR/Function.h" @@ -39,8 +33,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetMachine.h" -#include <cassert> -#include <string> #if defined(_MSC_VER) #include <intrin.h> @@ -151,7 +143,12 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, if (TM.shouldAssumeDSOLocal(M, GV)) return X86II::MO_NO_FLAG; - assert(!isTargetCOFF()); + if (isTargetCOFF()) { + assert(GV->hasDLLImportStorageClass() && + "shouldAssumeDSOLocal gave inconsistent answer"); + return X86II::MO_DLLIMPORT; + } + const Function *F = dyn_cast_or_null<Function>(GV); if (isTargetELF()) { @@ -160,6 +157,8 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV, // In Regcall calling convention those registers are used for passing // parameters. Thus we need to prevent lazy binding in Regcall. return X86II::MO_GOTPCREL; + if (F && F->hasFnAttribute(Attribute::NonLazyBind) && is64Bit()) + return X86II::MO_GOTPCREL; return X86II::MO_PLT; } @@ -189,9 +188,12 @@ const char *X86Subtarget::getBZeroEntry() const { } bool X86Subtarget::hasSinCos() const { - return getTargetTriple().isMacOSX() && - !getTargetTriple().isMacOSXVersionLT(10, 9) && - is64Bit(); + if (getTargetTriple().isMacOSX()) { + return !getTargetTriple().isMacOSXVersionLT(10, 9) && is64Bit(); + } else if (getTargetTriple().isOSFuchsia()) { + return true; + } + return false; } /// Return true if the subtarget allows calls to immediate address. @@ -263,6 +265,17 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) { else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() || isTargetKFreeBSD() || In64BitMode) stackAlignment = 16; + + // Some CPUs have more overhead for gather. The specified overhead is relative + // to the Load operation. "2" is the number provided by Intel architects. This + // parameter is used for cost estimation of Gather Op and comparison with + // other alternatives. + // TODO: Remove the explicit hasAVX512()?, That would mean we would only + // enable gather with a -march. 
+ if (hasAVX512() || (hasAVX2() && hasFastGather())) + GatherOverhead = 2; + if (hasAVX512()) + ScatterOverhead = 2; } void X86Subtarget::initializeEnvironment() { @@ -274,12 +287,15 @@ void X86Subtarget::initializeEnvironment() { HasPOPCNT = false; HasSSE4A = false; HasAES = false; + HasVAES = false; HasFXSR = false; HasXSAVE = false; HasXSAVEOPT = false; HasXSAVEC = false; HasXSAVES = false; HasPCLMUL = false; + HasVPCLMULQDQ = false; + HasGFNI = false; HasFMA = false; HasFMA4 = false; HasXOP = false; @@ -293,6 +309,7 @@ void X86Subtarget::initializeEnvironment() { HasBMI = false; HasBMI2 = false; HasVBMI = false; + HasVBMI2 = false; HasIFMA = false; HasRTM = false; HasERI = false; @@ -304,6 +321,8 @@ void X86Subtarget::initializeEnvironment() { HasVLX = false; HasADX = false; HasPKU = false; + HasVNNI = false; + HasBITALG = false; HasSHA = false; HasPRFCHW = false; HasRDSEED = false; @@ -311,10 +330,11 @@ void X86Subtarget::initializeEnvironment() { HasMWAITX = false; HasCLZERO = false; HasMPX = false; + HasSHSTK = false; + HasIBT = false; HasSGX = false; HasCLFLUSHOPT = false; HasCLWB = false; - IsBTMemSlow = false; IsPMULLDSlow = false; IsSHLDSlow = false; IsUAMem16Slow = false; @@ -323,15 +343,17 @@ void X86Subtarget::initializeEnvironment() { HasCmpxchg16b = false; UseLeaForSP = false; HasFastPartialYMMorZMMWrite = false; + HasFastGather = false; HasFastScalarFSQRT = false; HasFastVectorFSQRT = false; HasFastLZCNT = false; HasFastSHLDRotate = false; + HasMacroFusion = false; HasERMSB = false; HasSlowDivide32 = false; HasSlowDivide64 = false; PadShortFunctions = false; - CallRegIndirect = false; + SlowTwoMemOps = false; LEAUsesAG = false; SlowLEA = false; Slow3OpsLEA = false; @@ -340,6 +362,9 @@ void X86Subtarget::initializeEnvironment() { // FIXME: this is a known good value for Yonah. How about others? 
MaxInlineSizeThreshold = 128; UseSoftFloat = false; + X86ProcFamily = Others; + GatherOverhead = 1024; + ScatterOverhead = 1024; } X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, @@ -349,35 +374,6 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU, return *this; } -#ifdef LLVM_BUILD_GLOBAL_ISEL -namespace { - -struct X86GISelActualAccessor : public GISelAccessor { - std::unique_ptr<CallLowering> CallLoweringInfo; - std::unique_ptr<LegalizerInfo> Legalizer; - std::unique_ptr<RegisterBankInfo> RegBankInfo; - std::unique_ptr<InstructionSelector> InstSelector; - - const CallLowering *getCallLowering() const override { - return CallLoweringInfo.get(); - } - - const InstructionSelector *getInstructionSelector() const override { - return InstSelector.get(); - } - - const LegalizerInfo *getLegalizerInfo() const override { - return Legalizer.get(); - } - - const RegisterBankInfo *getRegBankInfo() const override { - return RegBankInfo.get(); - } -}; - -} // end anonymous namespace -#endif - X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, unsigned StackAlignOverride) @@ -402,39 +398,29 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, setPICStyle(PICStyles::StubPIC); else if (isTargetELF()) setPICStyle(PICStyles::GOT); -#ifndef LLVM_BUILD_GLOBAL_ISEL - GISelAccessor *GISel = new GISelAccessor(); -#else - X86GISelActualAccessor *GISel = new X86GISelActualAccessor(); - GISel->CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering())); - GISel->Legalizer.reset(new X86LegalizerInfo(*this, TM)); + CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering())); + Legalizer.reset(new X86LegalizerInfo(*this, TM)); auto *RBI = new X86RegisterBankInfo(*getRegisterInfo()); - GISel->RegBankInfo.reset(RBI); - GISel->InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI)); -#endif - setGISelAccessor(*GISel); + RegBankInfo.reset(RBI); + InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI)); } const CallLowering *X86Subtarget::getCallLowering() const { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getCallLowering(); + return CallLoweringInfo.get(); } const InstructionSelector *X86Subtarget::getInstructionSelector() const { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getInstructionSelector(); + return InstSelector.get(); } const LegalizerInfo *X86Subtarget::getLegalizerInfo() const { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getLegalizerInfo(); + return Legalizer.get(); } const RegisterBankInfo *X86Subtarget::getRegBankInfo() const { - assert(GISel && "Access to GlobalISel APIs not set"); - return GISel->getRegBankInfo(); + return RegBankInfo.get(); } bool X86Subtarget::enableEarlyIfConversion() const { diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 427a0001bef9..be4d46c470de 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -20,11 +20,14 @@ #include "X86SelectionDAGInfo.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Triple.h" -#include "llvm/CodeGen/GlobalISel/GISelAccessor.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" +#include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/CallingConv.h" #include 
"llvm/MC/MCInstrItineraries.h" #include "llvm/Target/TargetMachine.h" -#include "llvm/Target/TargetSubtargetInfo.h" #include <memory> #define GET_SUBTARGETINFO_HEADER @@ -48,6 +51,21 @@ enum Style { } // end namespace PICStyles class X86Subtarget final : public X86GenSubtargetInfo { +public: + enum X86ProcFamilyEnum { + Others, + IntelAtom, + IntelSLM, + IntelGLM, + IntelHaswell, + IntelBroadwell, + IntelSkylake, + IntelKNL, + IntelSKX, + IntelCannonlake, + IntelIcelake, + }; + protected: enum X86SSEEnum { NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F @@ -57,10 +75,6 @@ protected: NoThreeDNow, MMX, ThreeDNow, ThreeDNowA }; - enum X86ProcFamilyEnum { - Others, IntelAtom, IntelSLM, IntelGLM - }; - /// X86 processor family: Intel Atom, and others X86ProcFamilyEnum X86ProcFamily; @@ -93,6 +107,7 @@ protected: /// Target has AES instructions bool HasAES; + bool HasVAES; /// Target has FXSAVE/FXRESTOR instructions bool HasFXSR; @@ -111,6 +126,10 @@ protected: /// Target has carry-less multiplication bool HasPCLMUL; + bool HasVPCLMULQDQ; + + /// Target has Galois Field Arithmetic instructions + bool HasGFNI; /// Target has 3-operand fused multiply-add bool HasFMA; @@ -151,6 +170,9 @@ protected: /// Processor has VBMI instructions. bool HasVBMI; + /// Processor has VBMI2 instructions. + bool HasVBMI2; + /// Processor has Integer Fused Multiply Add bool HasIFMA; @@ -181,9 +203,6 @@ protected: /// Processor has Prefetch with intent to Write instruction bool HasPFPREFETCHWT1; - /// True if BT (bit test) of memory instructions are slow. - bool IsBTMemSlow; - /// True if SHLD instructions are slow. bool IsSHLDSlow; @@ -213,6 +232,10 @@ protected: /// of a YMM or ZMM register without clearing the upper part. bool HasFastPartialYMMorZMMWrite; + /// True if gather is reasonably fast. This is true for Skylake client and + /// all AVX-512 CPUs. + bool HasFastGather; + /// True if hardware SQRTSS instruction is at least as fast (latency) as /// RSQRTSS followed by a Newton-Raphson iteration. bool HasFastScalarFSQRT; @@ -235,6 +258,9 @@ protected: /// True if SHLD based rotate is fast. bool HasFastSHLDRotate; + /// True if the processor supports macrofusion. + bool HasMacroFusion; + /// True if the processor has enhanced REP MOVSB/STOSB. bool HasERMSB; @@ -242,9 +268,9 @@ protected: /// a stall when returning too early. bool PadShortFunctions; - /// True if the Calls with memory reference should be converted - /// to a register-based indirect call. - bool CallRegIndirect; + /// True if two memory operand instructions should use a temporary register + /// instead. + bool SlowTwoMemOps; /// True if the LEA instruction inputs have to be ready at address generation /// (AG) time. @@ -285,9 +311,23 @@ protected: /// Processor has PKU extenstions bool HasPKU; + /// Processor has AVX-512 Vector Neural Network Instructions + bool HasVNNI; + + /// Processor has AVX-512 Bit Algorithms instructions + bool HasBITALG; + /// Processor supports MPX - Memory Protection Extensions bool HasMPX; + /// Processor supports CET SHSTK - Control-Flow Enforcement Technology + /// using Shadow Stack + bool HasSHSTK; + + /// Processor supports CET IBT - Control-Flow Enforcement Technology + /// using Indirect Branch Tracking + bool HasIBT; + /// Processor has Software Guard Extensions bool HasSGX; @@ -314,10 +354,11 @@ protected: /// Instruction itineraries for scheduling InstrItineraryData InstrItins; - /// Gather the accessor points to GlobalISel-related APIs. 
- /// This is used to avoid ifndefs spreading around while GISel is - /// an optional library. - std::unique_ptr<GISelAccessor> GISel; + /// GlobalISel related APIs. + std::unique_ptr<CallLowering> CallLoweringInfo; + std::unique_ptr<LegalizerInfo> Legalizer; + std::unique_ptr<RegisterBankInfo> RegBankInfo; + std::unique_ptr<InstructionSelector> InstSelector; private: /// Override the stack alignment. @@ -332,6 +373,10 @@ private: /// True if compiling for 16-bit, false for 32-bit or 64-bit. bool In16BitMode; + /// Contains the Overhead of gather\scatter instructions + int GatherOverhead; + int ScatterOverhead; + X86SelectionDAGInfo TSInfo; // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which // X86TargetLowering needs. @@ -346,9 +391,6 @@ public: X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS, const X86TargetMachine &TM, unsigned StackAlignOverride); - /// This object will take onwership of \p GISelAccessor. - void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); } - const X86TargetLowering *getTargetLowering() const override { return &TLInfo; } @@ -441,15 +483,18 @@ public: bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; } bool hasPOPCNT() const { return HasPOPCNT; } bool hasAES() const { return HasAES; } + bool hasVAES() const { return HasVAES; } bool hasFXSR() const { return HasFXSR; } bool hasXSAVE() const { return HasXSAVE; } bool hasXSAVEOPT() const { return HasXSAVEOPT; } bool hasXSAVEC() const { return HasXSAVEC; } bool hasXSAVES() const { return HasXSAVES; } bool hasPCLMUL() const { return HasPCLMUL; } + bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; } + bool hasGFNI() const { return HasGFNI; } // Prefer FMA4 to FMA - its better for commutation/memory folding and // has equal or better performance on all supported targets. 
- bool hasFMA() const { return (HasFMA || hasAVX512()) && !HasFMA4; } + bool hasFMA() const { return HasFMA; } bool hasFMA4() const { return HasFMA4; } bool hasAnyFMA() const { return hasFMA() || hasFMA4(); } bool hasXOP() const { return HasXOP; } @@ -463,6 +508,7 @@ public: bool hasBMI() const { return HasBMI; } bool hasBMI2() const { return HasBMI2; } bool hasVBMI() const { return HasVBMI; } + bool hasVBMI2() const { return HasVBMI2; } bool hasIFMA() const { return HasIFMA; } bool hasRTM() const { return HasRTM; } bool hasADX() const { return HasADX; } @@ -472,26 +518,29 @@ public: bool hasLAHFSAHF() const { return HasLAHFSAHF; } bool hasMWAITX() const { return HasMWAITX; } bool hasCLZERO() const { return HasCLZERO; } - bool isBTMemSlow() const { return IsBTMemSlow; } bool isSHLDSlow() const { return IsSHLDSlow; } bool isPMULLDSlow() const { return IsPMULLDSlow; } bool isUnalignedMem16Slow() const { return IsUAMem16Slow; } bool isUnalignedMem32Slow() const { return IsUAMem32Slow; } + int getGatherOverhead() const { return GatherOverhead; } + int getScatterOverhead() const { return ScatterOverhead; } bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; } bool hasCmpxchg16b() const { return HasCmpxchg16b; } bool useLeaForSP() const { return UseLeaForSP; } bool hasFastPartialYMMorZMMWrite() const { return HasFastPartialYMMorZMMWrite; } + bool hasFastGather() const { return HasFastGather; } bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; } bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } + bool hasMacroFusion() const { return HasMacroFusion; } bool hasERMSB() const { return HasERMSB; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } bool padShortFunctions() const { return PadShortFunctions; } - bool callRegIndirect() const { return CallRegIndirect; } + bool slowTwoMemOps() const { return SlowTwoMemOps; } bool LEAusesAG() const { return LEAUsesAG; } bool slowLEA() const { return SlowLEA; } bool slow3OpsLEA() const { return Slow3OpsLEA; } @@ -504,11 +553,19 @@ public: bool hasBWI() const { return HasBWI; } bool hasVLX() const { return HasVLX; } bool hasPKU() const { return HasPKU; } + bool hasVNNI() const { return HasVNNI; } + bool hasBITALG() const { return HasBITALG; } bool hasMPX() const { return HasMPX; } + bool hasSHSTK() const { return HasSHSTK; } + bool hasIBT() const { return HasIBT; } bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; } + bool hasCLWB() const { return HasCLWB; } bool isXRaySupported() const override { return is64Bit(); } + X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; } + + /// TODO: to be removed later and replaced with suitable properties bool isAtom() const { return X86ProcFamily == IntelAtom; } bool isSLM() const { return X86ProcFamily == IntelSLM; } bool useSoftFloat() const { return UseSoftFloat; } @@ -568,13 +625,9 @@ public: bool isOSWindows() const { return TargetTriple.isOSWindows(); } - bool isTargetWin64() const { - return In64BitMode && TargetTriple.isOSWindows(); - } + bool isTargetWin64() const { return In64BitMode && isOSWindows(); } - bool isTargetWin32() const { - return !In64BitMode && (isTargetCygMing() || isTargetKnownWindowsMSVC()); - } + bool isTargetWin32() const { return !In64BitMode && isOSWindows(); } bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; } bool isPICStyleRIPRel() const { return PICStyle 
== PICStyles::RIPRel; } @@ -590,6 +643,7 @@ public: // On Win64, all these conventions just use the default convention. case CallingConv::C: case CallingConv::Fast: + case CallingConv::Swift: case CallingConv::X86_FastCall: case CallingConv::X86_StdCall: case CallingConv::X86_ThisCall: @@ -655,6 +709,8 @@ public: AntiDepBreakMode getAntiDepBreakMode() const override { return TargetSubtargetInfo::ANTIDEP_CRITICAL; } + + bool enableAdvancedRASplitCost() const override { return true; } }; } // end namespace llvm diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp index 08c2cdaefe71..ea8c9862230e 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -11,13 +11,13 @@ // //===----------------------------------------------------------------------===// +#include "X86TargetMachine.h" #include "MCTargetDesc/X86MCTargetDesc.h" #include "X86.h" #include "X86CallLowering.h" #include "X86LegalizerInfo.h" #include "X86MacroFusion.h" #include "X86Subtarget.h" -#include "X86TargetMachine.h" #include "X86TargetObjectFile.h" #include "X86TargetTransformInfo.h" #include "llvm/ADT/Optional.h" @@ -34,6 +34,7 @@ #include "llvm/CodeGen/GlobalISel/RegBankSelect.h" #include "llvm/CodeGen/MachineScheduler.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/DataLayout.h" @@ -43,7 +44,6 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" -#include "llvm/Target/TargetLoweringObjectFile.h" #include "llvm/Target/TargetOptions.h" #include <memory> #include <string> @@ -58,7 +58,10 @@ namespace llvm { void initializeWinEHStatePassPass(PassRegistry &); void initializeFixupLEAPassPass(PassRegistry &); +void initializeX86CallFrameOptimizationPass(PassRegistry &); +void initializeX86CmovConverterPassPass(PassRegistry &); void initializeX86ExecutionDepsFixPass(PassRegistry &); +void initializeX86DomainReassignmentPass(PassRegistry &); } // end namespace llvm @@ -73,7 +76,10 @@ extern "C" void LLVMInitializeX86Target() { initializeFixupBWInstPassPass(PR); initializeEvexToVexInstPassPass(PR); initializeFixupLEAPassPass(PR); + initializeX86CallFrameOptimizationPass(PR); + initializeX86CmovConverterPassPass(PR); initializeX86ExecutionDepsFixPass(PR); + initializeX86DomainReassignmentPass(PR); } static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) { @@ -181,15 +187,27 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT, return *RM; } +static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM, + bool JIT, bool Is64Bit) { + if (CM) + return *CM; + if (JIT) + return Is64Bit ? CodeModel::Large : CodeModel::Small; + return CodeModel::Small; +} + /// Create an X86 target. 
/// X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, Optional<Reloc::Model> RM, - CodeModel::Model CM, CodeGenOpt::Level OL) - : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options, - getEffectiveRelocModel(TT, RM), CM, OL), + Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT) + : LLVMTargetMachine( + T, computeDataLayout(TT), TT, CPU, FS, Options, + getEffectiveRelocModel(TT, RM), + getEffectiveCodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL), TLOF(createTLOF(getTargetTriple())) { // Windows stack unwinder gets confused when execution flow "falls through" // after a call to 'noreturn' function. @@ -294,14 +312,13 @@ public: void addIRPasses() override; bool addInstSelector() override; -#ifdef LLVM_BUILD_GLOBAL_ISEL bool addIRTranslator() override; bool addLegalizeMachineIR() override; bool addRegBankSelect() override; bool addGlobalInstructionSelect() override; -#endif bool addILPOpts() override; bool addPreISel() override; + void addMachineSSAOptimization() override; void addPreRegAlloc() override; void addPostRegAlloc() override; void addPreEmitPass() override; @@ -349,7 +366,6 @@ bool X86PassConfig::addInstSelector() { return false; } -#ifdef LLVM_BUILD_GLOBAL_ISEL bool X86PassConfig::addIRTranslator() { addPass(new IRTranslator()); return false; @@ -369,7 +385,6 @@ bool X86PassConfig::addGlobalInstructionSelect() { addPass(new InstructionSelect()); return false; } -#endif bool X86PassConfig::addILPOpts() { addPass(&EarlyIfConverterID); @@ -397,6 +412,10 @@ void X86PassConfig::addPreRegAlloc() { addPass(createX86WinAllocaExpander()); } +void X86PassConfig::addMachineSSAOptimization() { + addPass(createX86DomainReassignmentPass()); + TargetPassConfig::addMachineSSAOptimization(); +} void X86PassConfig::addPostRegAlloc() { addPass(createX86FloatingPointStackifierPass()); diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index c16207973b39..952bd1321ff9 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -35,13 +35,14 @@ class X86TargetMachine final : public LLVMTargetMachine { public: X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU, StringRef FS, const TargetOptions &Options, - Optional<Reloc::Model> RM, CodeModel::Model CM, - CodeGenOpt::Level OL); + Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM, + CodeGenOpt::Level OL, bool JIT); ~X86TargetMachine() override; const X86Subtarget *getSubtargetImpl(const Function &F) const override; - // The no argument getSubtargetImpl, while it exists on some targets, is - // deprecated and should not be used. + // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget, + // subtargets are per-function entities based on the target-specific + // attributes of each function. 
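The new getEffectiveCodeModel helper above defaults the code model when the front end does not request one: an explicit model wins, a 64-bit JIT target gets CodeModel::Large, and everything else falls back to CodeModel::Small. Below is a minimal standalone restatement of that selection logic, with std::optional and a local enum as stand-ins for the LLVM types (illustration only):

// Standalone sketch of the default code-model selection added above.
// CodeModel and the helper name are local stand-ins, not the LLVM types.
#include <cassert>
#include <optional>

enum class CodeModel { Small, Kernel, Medium, Large };

// An explicit request wins; a 64-bit JIT defaults to Large (JIT'd code may be
// placed anywhere in the address space); everything else defaults to Small.
static CodeModel effectiveCodeModel(std::optional<CodeModel> Requested,
                                    bool IsJIT, bool Is64Bit) {
  if (Requested)
    return *Requested;
  if (IsJIT)
    return Is64Bit ? CodeModel::Large : CodeModel::Small;
  return CodeModel::Small;
}

int main() {
  assert(effectiveCodeModel(std::nullopt, /*IsJIT=*/true, /*Is64Bit=*/true) ==
         CodeModel::Large);
  assert(effectiveCodeModel(CodeModel::Medium, true, true) == CodeModel::Medium);
  assert(effectiveCodeModel(std::nullopt, false, true) == CodeModel::Small);
  return 0;
}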
const X86Subtarget *getSubtargetImpl() const = delete; TargetIRAnalysis getTargetIRAnalysis() override; diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp index 8627c06d4431..fb35a6b2ec1a 100644 --- a/lib/Target/X86/X86TargetObjectFile.cpp +++ b/lib/Target/X86/X86TargetObjectFile.cpp @@ -11,6 +11,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/BinaryFormat/COFF.h" #include "llvm/BinaryFormat/Dwarf.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/Mangler.h" #include "llvm/IR/Operator.h" #include "llvm/MC/MCContext.h" @@ -18,7 +19,6 @@ #include "llvm/MC/MCSectionCOFF.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCValue.h" -#include "llvm/Target/TargetLowering.h" using namespace llvm; using namespace dwarf; diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h index f6aa570b6332..76e9cd5db2a0 100644 --- a/lib/Target/X86/X86TargetObjectFile.h +++ b/lib/Target/X86/X86TargetObjectFile.h @@ -10,8 +10,8 @@ #ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H #define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H +#include "llvm/CodeGen/TargetLoweringObjectFile.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" -#include "llvm/Target/TargetLoweringObjectFile.h" namespace llvm { diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp index c9924f264939..223eed3048db 100644 --- a/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -42,10 +42,10 @@ #include "X86TargetTransformInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" +#include "llvm/CodeGen/CostTable.h" +#include "llvm/CodeGen/TargetLowering.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/Support/Debug.h" -#include "llvm/Target/CostTable.h" -#include "llvm/Target/TargetLowering.h" using namespace llvm; @@ -66,6 +66,57 @@ X86TTIImpl::getPopcntSupport(unsigned TyWidth) { return ST->hasPOPCNT() ? 
TTI::PSK_FastHardware : TTI::PSK_Software; } +llvm::Optional<unsigned> X86TTIImpl::getCacheSize( + TargetTransformInfo::CacheLevel Level) const { + switch (Level) { + case TargetTransformInfo::CacheLevel::L1D: + // - Penryn + // - Nehalem + // - Westmere + // - Sandy Bridge + // - Ivy Bridge + // - Haswell + // - Broadwell + // - Skylake + // - Kabylake + return 32 * 1024; // 32 KByte + case TargetTransformInfo::CacheLevel::L2D: + // - Penryn + // - Nehalem + // - Westmere + // - Sandy Bridge + // - Ivy Bridge + // - Haswell + // - Broadwell + // - Skylake + // - Kabylake + return 256 * 1024; // 256 KByte + } + + llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); +} + +llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity( + TargetTransformInfo::CacheLevel Level) const { + // - Penryn + // - Nehalem + // - Westmere + // - Sandy Bridge + // - Ivy Bridge + // - Haswell + // - Broadwell + // - Skylake + // - Kabylake + switch (Level) { + case TargetTransformInfo::CacheLevel::L1D: + LLVM_FALLTHROUGH; + case TargetTransformInfo::CacheLevel::L2D: + return 8; + } + + llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); +} + unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) { if (Vector && !ST->hasSSE1()) return 0; @@ -144,9 +195,9 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::FSUB, MVT::v2f64, 2 }, // subpd // v2i64/v4i64 mul is custom lowered as a series of long: // multiplies(3), shifts(3) and adds(2) - // slm muldq version throughput is 2 and addq throughput 4 + // slm muldq version throughput is 2 and addq throughput 4 // thus: 3X2 (muldq throughput) + 3X1 (shift throuput) + - // 3X4 (addq throughput) = 17 + // 3X4 (addq throughput) = 17 { ISD::MUL, MVT::v2i64, 17 }, // slm addq\subq throughput is 4 { ISD::ADD, MVT::v2i64, 4 }, @@ -838,11 +889,22 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw { TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb + { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd + { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps { TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq { TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd - { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb + { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb // + vpblendvb - { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 } // vperm2i128 + 2 * vpshufb + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb + // + vpblendvb + + { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd + { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps + { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd + { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd + { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb + // + vpblendvb + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb // + vpblendvb }; @@ -850,6 +912,28 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second)) return LT.first * Entry->Cost; + static const CostTblEntry XOPShuffleTbl[] = { + { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd + { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps + { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd + { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps + { 
TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm + // + vinsertf128 + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm + // + vinsertf128 + + { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm + // + vinsertf128 + { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm + // + vinsertf128 + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm + }; + + if (ST->hasXOP()) + if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second)) + return LT.first * Entry->Cost; + static const CostTblEntry AVX1ShuffleTbl[] = { { TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd { TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps @@ -872,7 +956,25 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps { TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps { TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor - { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor + { TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor + + { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd + { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd + { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps + { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps + { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb + // + 2*por + vinsertf128 + { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb + // + 2*por + vinsertf128 + + { TTI::SK_PermuteTwoSrc, MVT::v4f64, 4 }, // 2*vperm2f128 + 2*vshufpd + { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps + { TTI::SK_PermuteTwoSrc, MVT::v4i64, 4 }, // 2*vperm2f128 + 2*vshufpd + { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps + { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb + // + 4*por + vinsertf128 + { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb + // + 4*por + vinsertf128 }; if (ST->hasAVX()) @@ -899,11 +1001,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb { TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb - { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por - { TTI::SK_Alternate, MVT::v16i8, 3 }, // pshufb + pshufb + por + { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por + { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb - { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 } // pshufb + { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb + + { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por }; if (ST->hasSSSE3()) @@ -914,13 +1019,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd { TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd { TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd - { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd + { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd { TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd { TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd { TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd { TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd - { TTI::SK_Reverse, MVT::v8i16, 3 }, // 
pshuflw + pshufhw + pshufd + { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd { TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw // + 2*pshufd + 2*unpck + packus @@ -930,8 +1035,19 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, { TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por { TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por - { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd - { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 } // pshufd + { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd + { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd + { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd + { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw + // + pshufd/unpck + { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw + // + 2*pshufd + 2*unpck + 2*packus + + { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd + { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd + { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd} + { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute + { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute }; if (ST->hasSSE2()) @@ -939,9 +1055,11 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, return LT.first * Entry->Cost; static const CostTblEntry SSE1ShuffleTbl[] = { - { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps - { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps - { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps + { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps + { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps + { TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps + { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps + { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps }; if (ST->hasSSE1()) @@ -1052,7 +1170,11 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 }, { ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 }, { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 }, + { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 }, + { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 }, { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 }, + { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 }, + { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 }, }; static const TypeConversionCostTblEntry AVX2ConversionTbl[] = { @@ -1315,7 +1437,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, return Entry->Cost; } - return BaseT::getCastInstrCost(Opcode, Dst, Src); + return BaseT::getCastInstrCost(Opcode, Dst, Src, I); } int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy, @@ -1805,8 +1927,8 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE, return BaseT::getAddressComputationCost(Ty, SE, Ptr); } -int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, - bool IsPairwise) { +int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy, + bool IsPairwise) { std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); @@ -1874,7 +1996,153 @@ int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy, return LT.first * Entry->Cost; } - return BaseT::getReductionCost(Opcode, ValTy, IsPairwise); + return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise); +} + +int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy, + bool IsPairwise, bool IsUnsigned) { + std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy); + + MVT MTy = LT.second; + + int ISD; + 
if (ValTy->isIntOrIntVectorTy()) { + ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN; + } else { + assert(ValTy->isFPOrFPVectorTy() && + "Expected float point or integer vector type."); + ISD = ISD::FMINNUM; + } + + // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput + // and make it as the cost. + + static const CostTblEntry SSE42CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v2f64, 3}, + {ISD::FMINNUM, MVT::v4f32, 2}, + {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" + {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6" + {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" + {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" + {ISD::SMIN, MVT::v8i16, 2}, + {ISD::UMIN, MVT::v8i16, 2}, + }; + + static const CostTblEntry AVX1CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v4f32, 1}, + {ISD::FMINNUM, MVT::v4f64, 1}, + {ISD::FMINNUM, MVT::v8f32, 2}, + {ISD::SMIN, MVT::v2i64, 3}, + {ISD::UMIN, MVT::v2i64, 3}, + {ISD::SMIN, MVT::v4i32, 1}, + {ISD::UMIN, MVT::v4i32, 1}, + {ISD::SMIN, MVT::v8i16, 1}, + {ISD::UMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v8i32, 3}, + {ISD::UMIN, MVT::v8i32, 3}, + }; + + static const CostTblEntry AVX2CostTblPairWise[] = { + {ISD::SMIN, MVT::v4i64, 2}, + {ISD::UMIN, MVT::v4i64, 2}, + {ISD::SMIN, MVT::v8i32, 1}, + {ISD::UMIN, MVT::v8i32, 1}, + {ISD::SMIN, MVT::v16i16, 1}, + {ISD::UMIN, MVT::v16i16, 1}, + {ISD::SMIN, MVT::v32i8, 2}, + {ISD::UMIN, MVT::v32i8, 2}, + }; + + static const CostTblEntry AVX512CostTblPairWise[] = { + {ISD::FMINNUM, MVT::v8f64, 1}, + {ISD::FMINNUM, MVT::v16f32, 2}, + {ISD::SMIN, MVT::v8i64, 2}, + {ISD::UMIN, MVT::v8i64, 2}, + {ISD::SMIN, MVT::v16i32, 1}, + {ISD::UMIN, MVT::v16i32, 1}, + }; + + static const CostTblEntry SSE42CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v2f64, 3}, + {ISD::FMINNUM, MVT::v4f32, 3}, + {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8" + {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6" + {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5" + {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8" + {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5" + {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8" + }; + + static const CostTblEntry AVX1CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v4f32, 1}, + {ISD::FMINNUM, MVT::v4f64, 1}, + {ISD::FMINNUM, MVT::v8f32, 1}, + {ISD::SMIN, MVT::v2i64, 3}, + {ISD::UMIN, MVT::v2i64, 3}, + {ISD::SMIN, MVT::v4i32, 1}, + {ISD::UMIN, MVT::v4i32, 1}, + {ISD::SMIN, MVT::v8i16, 1}, + {ISD::UMIN, MVT::v8i16, 1}, + {ISD::SMIN, MVT::v8i32, 2}, + {ISD::UMIN, MVT::v8i32, 2}, + }; + + static const CostTblEntry AVX2CostTblNoPairWise[] = { + {ISD::SMIN, MVT::v4i64, 1}, + {ISD::UMIN, MVT::v4i64, 1}, + {ISD::SMIN, MVT::v8i32, 1}, + {ISD::UMIN, MVT::v8i32, 1}, + {ISD::SMIN, MVT::v16i16, 1}, + {ISD::UMIN, MVT::v16i16, 1}, + {ISD::SMIN, MVT::v32i8, 1}, + {ISD::UMIN, MVT::v32i8, 1}, + }; + + static const CostTblEntry AVX512CostTblNoPairWise[] = { + {ISD::FMINNUM, MVT::v8f64, 1}, + {ISD::FMINNUM, MVT::v16f32, 2}, + {ISD::SMIN, MVT::v8i64, 1}, + {ISD::UMIN, MVT::v8i64, 1}, + {ISD::SMIN, MVT::v16i32, 1}, + {ISD::UMIN, MVT::v16i32, 1}, + }; + + if (IsPairwise) { + if (ST->hasAVX512()) + if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy)) + return LT.first * 
Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } else { + if (ST->hasAVX512()) + if (const auto *Entry = + CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX2()) + if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasAVX()) + if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + + if (ST->hasSSE42()) + if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy)) + return LT.first * Entry->Cost; + } + + return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned); } /// \brief Calculate the cost of materializing a 64-bit value. This helper @@ -2046,6 +2314,21 @@ int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, return X86TTIImpl::getIntImmCost(Imm, Ty); } +unsigned X86TTIImpl::getUserCost(const User *U, + ArrayRef<const Value *> Operands) { + if (isa<StoreInst>(U)) { + Value *Ptr = U->getOperand(1); + // Store instruction with index and scale costs 2 Uops. + // Check the preceding GEP to identify non-const indices. + if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) { + if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); })) + return TTI::TCC_Basic * 2; + } + return TTI::TCC_Basic; + } + return BaseT::getUserCost(U, Operands); +} + // Return an average cost of Gather / Scatter instruction, maybe improved later int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, unsigned Alignment, unsigned AddressSpace) { @@ -2085,8 +2368,9 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, // Trying to reduce IndexSize to 32 bits for vector 16. // By default the IndexSize is equal to pointer size. - unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) : - DL.getPointerSizeInBits(); + unsigned IndexSize = (ST->hasAVX512() && VF >= 16) + ? getIndexSizeInBits(Ptr, DL) + : DL.getPointerSizeInBits(); Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(), IndexSize), VF); @@ -2102,7 +2386,9 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr, // The gather / scatter cost is given by Intel architects. It is a rough // number since we are looking at one instruction in a time. - const int GSOverhead = 2; + const int GSOverhead = (Opcode == Instruction::Load) + ? ST->getGatherOverhead() + : ST->getScatterOverhead(); return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(), Alignment, AddressSpace); } @@ -2173,7 +2459,7 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, // the mask vector will add more instructions. Right now we give the scalar // cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction // is better in the VariableMask case. 
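The min/max reduction tables above record throughput costs (several measured with IACA) for horizontal SMIN/UMIN/FMINNUM reductions. For intuition, here is a standalone sketch of the kind of shuffle-plus-pmin ladder a v4i32 signed-min reduction boils down to on SSE4.1; it is an illustration of what those entries are pricing, not the exact sequence the backend emits:

// Horizontal signed-min reduction of four i32 lanes in log2(VF) steps.
// Requires SSE4.1 (_mm_min_epi32). Illustrative only.
#include <cassert>
#include <smmintrin.h>

static int hmin_epi32(__m128i v) {
  // min(v, v with halves swapped): each lane now holds min of {0,2} or {1,3}.
  __m128i m = _mm_min_epi32(v, _mm_shuffle_epi32(v, _MM_SHUFFLE(1, 0, 3, 2)));
  // min with adjacent lanes swapped: lane 0 now holds the overall minimum.
  m = _mm_min_epi32(m, _mm_shuffle_epi32(m, _MM_SHUFFLE(2, 3, 0, 1)));
  return _mm_cvtsi128_si32(m);
}

int main() {
  __m128i v = _mm_set_epi32(7, -3, 42, 5); // lanes 0..3 = 5, 42, -3, 7
  assert(hmin_epi32(v) == -3);
  return 0;
}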
- if (VF == 2 || (VF == 4 && !ST->hasVLX())) + if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX()))) Scalarize = true; if (Scalarize) @@ -2183,7 +2469,21 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy, return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace); } +bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1, + TargetTransformInfo::LSRCost &C2) { + // X86 specific here are "instruction number 1st priority". + return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, + C1.NumIVMuls, C1.NumBaseAdds, + C1.ScaleCost, C1.ImmCost, C1.SetupCost) < + std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, + C2.NumIVMuls, C2.NumBaseAdds, + C2.ScaleCost, C2.ImmCost, C2.SetupCost); +} + bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) { + // The backend can't handle a single element vector. + if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1) + return false; Type *ScalarTy = DataTy->getScalarType(); int DataWidth = isa<PointerType>(ScalarTy) ? DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); @@ -2207,20 +2507,40 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) { // the vector type. // The Scalarizer asks again about legality. It sends a vector type. // In this case we can reject non-power-of-2 vectors. - if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements())) - return false; + // We also reject single element vectors as the type legalizer can't + // scalarize it. + if (isa<VectorType>(DataTy)) { + unsigned NumElts = DataTy->getVectorNumElements(); + if (NumElts == 1 || !isPowerOf2_32(NumElts)) + return false; + } Type *ScalarTy = DataTy->getScalarType(); int DataWidth = isa<PointerType>(ScalarTy) ? DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits(); - // AVX-512 allows gather and scatter - return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512(); + // Some CPUs have better gather performance than others. + // TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only + // enable gather with a -march. + return (DataWidth == 32 || DataWidth == 64) && + (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2())); } bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) { + // AVX2 doesn't support scatter + if (!ST->hasAVX512()) + return false; return isLegalMaskedGather(DataType); } +bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) { + EVT VT = TLI->getValueType(DL, DataType); + return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT); +} + +bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) { + return false; +} + bool X86TTIImpl::areInlineCompatible(const Function *Caller, const Function *Callee) const { const TargetMachine &TM = getTLI()->getTargetMachine(); @@ -2237,10 +2557,35 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller, return (CallerBits & CalleeBits) == CalleeBits; } -bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) { - // TODO: We can increase these based on available vector ops. - MaxLoadSize = ST->is64Bit() ? 8 : 4; - return true; +const X86TTIImpl::TTI::MemCmpExpansionOptions * +X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const { + // Only enable vector loads for equality comparison. + // Right now the vector version is not as fast, see #33329. 
+ static const auto ThreeWayOptions = [this]() { + TTI::MemCmpExpansionOptions Options; + if (ST->is64Bit()) { + Options.LoadSizes.push_back(8); + } + Options.LoadSizes.push_back(4); + Options.LoadSizes.push_back(2); + Options.LoadSizes.push_back(1); + return Options; + }(); + static const auto EqZeroOptions = [this]() { + TTI::MemCmpExpansionOptions Options; + // TODO: enable AVX512 when the DAG is ready. + // if (ST->hasAVX512()) Options.LoadSizes.push_back(64); + if (ST->hasAVX2()) Options.LoadSizes.push_back(32); + if (ST->hasSSE2()) Options.LoadSizes.push_back(16); + if (ST->is64Bit()) { + Options.LoadSizes.push_back(8); + } + Options.LoadSizes.push_back(4); + Options.LoadSizes.push_back(2); + Options.LoadSizes.push_back(1); + return Options; + }(); + return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions; } bool X86TTIImpl::enableInterleavedAccessVectorization() { @@ -2288,7 +2633,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, unsigned VF = VecTy->getVectorNumElements() / Factor; Type *ScalarTy = VecTy->getVectorElementType(); - + // Calculate the number of memory operations (NumOfMemOps), required // for load/store the VecTy. unsigned VecTySize = DL.getTypeStoreSize(VecTy); @@ -2300,7 +2645,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, LegalVT.getVectorNumElements()); unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); - + VectorType *VT = VectorType::get(ScalarTy, VF); EVT ETy = TLI->getValueType(DL, VT); if (!ETy.isSimple()) @@ -2315,31 +2660,40 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, // The cost of the loads/stores is accounted for separately. // static const CostTblEntry AVX2InterleavedLoadTbl[] = { + { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64 + { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64 + { 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8 { 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8 { 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8 - { 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8 - { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8 - + { 3, MVT::v16i8, 11}, //(load 48i8 and) deinterleave into 3 x 16i8 + { 3, MVT::v32i8, 13}, //(load 96i8 and) deinterleave into 3 x 32i8 + { 3, MVT::v8f32, 17 }, //(load 24f32 and)deinterleave into 3 x 8f32 + { 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8 { 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8 { 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8 { 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8 - { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8 + { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8 + + { 8, MVT::v8f32, 40 } //(load 64f32 and)deinterleave into 8 x 8f32 }; static const CostTblEntry AVX2InterleavedStoreTbl[] = { + { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store) + { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store) + { 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store) { 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store) { 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store) - { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store) - { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store) + { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store) + { 
3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store) { 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store) { 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store) - { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store) - { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store) - { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store) + { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store) + { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store) + { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store) }; if (Opcode == Instruction::Load) { @@ -2349,7 +2703,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy, } else { assert(Opcode == Instruction::Store && "Expected Store Instruction at this point"); - if (const auto *Entry = + if (const auto *Entry = CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT())) return NumOfMemOps * MemOpCost + Entry->Cost; } @@ -2385,7 +2739,27 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace); + unsigned VF = VecTy->getVectorNumElements() / Factor; + MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF); + if (Opcode == Instruction::Load) { + // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl) + // contain the cost of the optimized shuffle sequence that the + // X86InterleavedAccess pass will generate. + // The cost of loads and stores are computed separately from the table. + + // X86InterleavedAccess support only the following interleaved-access group. + static const CostTblEntry AVX512InterleavedLoadTbl[] = { + {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8 + {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8 + {3, MVT::v64i8, 22}, //(load 96i8 and) deinterleave into 3 x 32i8 + }; + + if (const auto *Entry = + CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT)) + return NumOfMemOps * MemOpCost + Entry->Cost; + //If an entry does not exist, fallback to the default implementation. + // Kind of shuffle depends on number of loaded values. // If we load the entire data in one register, we can use a 1-src shuffle. // Otherwise, we'll merge 2 sources in each operation. @@ -2428,6 +2802,22 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy, // Store. assert(Opcode == Instruction::Store && "Expected Store Instruction at this point"); + // X86InterleavedAccess support only the following interleaved-access group. + static const CostTblEntry AVX512InterleavedStoreTbl[] = { + {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store) + {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store) + {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 96i8 (and store) + + {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store) + {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store) + {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store) + {4, MVT::v64i8, 24} // interleave 4 x 32i8 into 256i8 (and store) + }; + + if (const auto *Entry = + CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT)) + return NumOfMemOps * MemOpCost + Entry->Cost; + //If an entry does not exist, fallback to the default implementation. // There is no strided stores meanwhile. And store can't be folded in // shuffle. 
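The enableMemCmpExpansion hook above only allows wide vector loads for equality comparisons (the EqZeroOptions path), enabling 16-byte blocks with SSE2 and 32-byte blocks with AVX2. The standalone sketch below shows the shape of such an equality-only expansion for a 16-byte compare with SSE2 intrinsics; it is an illustration of the transform, not the code the backend actually generates:

// Equality-only memcmp expansion with a single 16-byte compare, the case the
// EqZeroOptions above enable when SSE2 is available. Illustrative only.
#include <cassert>
#include <cstring>
#include <emmintrin.h>

// Equivalent to memcmp(a, b, 16) == 0, using one vector compare instead of a
// byte loop or a libcall.
static bool equal16(const void *a, const void *b) {
  __m128i va = _mm_loadu_si128(static_cast<const __m128i *>(a));
  __m128i vb = _mm_loadu_si128(static_cast<const __m128i *>(b));
  __m128i eq = _mm_cmpeq_epi8(va, vb);    // 0xFF in each byte that matches
  return _mm_movemask_epi8(eq) == 0xFFFF; // all 16 bytes matched
}

int main() {
  char x[16] = "0123456789abcde";
  char y[16] = "0123456789abcde";
  assert(equal16(x, y) == (std::memcmp(x, y, 16) == 0));
  y[7] ^= 1;
  assert(equal16(x, y) == (std::memcmp(x, y, 16) == 0));
  return 0;
}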
@@ -2449,27 +2839,22 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, ArrayRef<unsigned> Indices, unsigned Alignment, unsigned AddressSpace) { - auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) { - RequiresBW = false; + auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) { Type *EltTy = VecTy->getVectorElementType(); if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) || EltTy->isIntegerTy(32) || EltTy->isPointerTy()) return true; - if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) { - RequiresBW = true; - return true; - } + if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) + return HasBW; return false; }; - bool RequiresBW; - bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW); - if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI())) + if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI())) return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); if (ST->hasAVX2()) return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); - + return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices, Alignment, AddressSpace); } diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h index ad0a0a211301..6f01a6fd11df 100644 --- a/lib/Target/X86/X86TargetTransformInfo.h +++ b/lib/Target/X86/X86TargetTransformInfo.h @@ -21,7 +21,7 @@ #include "X86TargetMachine.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/CodeGen/BasicTTIImpl.h" -#include "llvm/Target/TargetLowering.h" +#include "llvm/CodeGen/TargetLowering.h" namespace llvm { @@ -47,6 +47,14 @@ public: /// @} + /// \name Cache TTI Implementation + /// @{ + llvm::Optional<unsigned> getCacheSize( + TargetTransformInfo::CacheLevel Level) const; + llvm::Optional<unsigned> getCacheAssociativity( + TargetTransformInfo::CacheLevel Level) const; + /// @} + /// \name Vector TTI Implementations /// @{ @@ -85,7 +93,11 @@ public: ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF = 1); - int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm); + int getArithmeticReductionCost(unsigned Opcode, Type *Ty, + bool IsPairwiseForm); + + int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm, + bool IsUnsigned); int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices, @@ -101,16 +113,23 @@ public: int getIntImmCost(const APInt &Imm, Type *Ty); + unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands); + int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty); int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm, Type *Ty); + bool isLSRCostLess(TargetTransformInfo::LSRCost &C1, + TargetTransformInfo::LSRCost &C2); bool isLegalMaskedLoad(Type *DataType); bool isLegalMaskedStore(Type *DataType); bool isLegalMaskedGather(Type *DataType); bool isLegalMaskedScatter(Type *DataType); + bool hasDivRemOp(Type *DataType, bool IsSigned); + bool isFCmpOrdCheaperThanFCmpZero(Type *Ty); bool areInlineCompatible(const Function *Caller, const Function *Callee) const; - bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize); + const TTI::MemCmpExpansionOptions *enableMemCmpExpansion( + bool IsZeroCmp) const; bool enableInterleavedAccessVectorization(); private: int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask, diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp 
index d17dfac6a997..224262830b12 100644 --- a/lib/Target/X86/X86VZeroUpper.cpp +++ b/lib/Target/X86/X86VZeroUpper.cpp @@ -1,4 +1,4 @@ -//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===// +//===- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter ------------===// // // The LLVM Compiler Infrastructure // @@ -17,14 +17,25 @@ #include "X86.h" #include "X86InstrInfo.h" #include "X86Subtarget.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetRegisterInfo.h" +#include "llvm/IR/CallingConv.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/IR/Function.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" +#include <cassert> + using namespace llvm; #define DEBUG_TYPE "x86-vzeroupper" @@ -35,23 +46,25 @@ namespace { class VZeroUpperInserter : public MachineFunctionPass { public: - VZeroUpperInserter() : MachineFunctionPass(ID) {} + bool runOnMachineFunction(MachineFunction &MF) override; + MachineFunctionProperties getRequiredProperties() const override { return MachineFunctionProperties().set( MachineFunctionProperties::Property::NoVRegs); } + StringRef getPassName() const override { return "X86 vzeroupper inserter"; } private: - void processBasicBlock(MachineBasicBlock &MBB); void insertVZeroUpper(MachineBasicBlock::iterator I, MachineBasicBlock &MBB); void addDirtySuccessor(MachineBasicBlock &MBB); - typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState; + using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY }; + static const char* getBlockExitStateName(BlockExitState ST); // Core algorithm state: @@ -73,13 +86,15 @@ namespace { // to be guarded until we discover a predecessor that // is DIRTY_OUT. 
struct BlockState { - BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {} - BlockExitState ExitState; - bool AddedToDirtySuccessors; + BlockExitState ExitState = PASS_THROUGH; + bool AddedToDirtySuccessors = false; MachineBasicBlock::iterator FirstUnguardedCall; + + BlockState() = default; }; - typedef SmallVector<BlockState, 8> BlockStateMap; - typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList; + + using BlockStateMap = SmallVector<BlockState, 8>; + using DirtySuccessorsWorkList = SmallVector<MachineBasicBlock *, 8>; BlockStateMap BlockStates; DirtySuccessorsWorkList DirtySuccessors; @@ -90,8 +105,9 @@ namespace { static char ID; }; - char VZeroUpperInserter::ID = 0; -} +} // end anonymous namespace + +char VZeroUpperInserter::ID = 0; FunctionPass *llvm::createX86IssueVZeroUpperPass() { return new VZeroUpperInserter(); @@ -116,9 +132,8 @@ static bool isYmmOrZmmReg(unsigned Reg) { } static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) { - for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(), - E = MRI.livein_end(); I != E; ++I) - if (isYmmOrZmmReg(I->first)) + for (std::pair<unsigned, unsigned> LI : MRI.liveins()) + if (isYmmOrZmmReg(LI.first)) return true; return false; @@ -220,7 +235,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) { // If the call has no RegMask, skip it as well. It usually happens on // helper function calls (such as '_chkstk', '_ftol2') where standard // calling convention is not used (RegMask is not used to mark register - // clobbered and register usage (def/imp-def/use) is well-defined and + // clobbered and register usage (def/implicit-def/use) is well-defined and // explicitly specified. if (IsCall && !callHasRegMask(MI)) continue; @@ -270,7 +285,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) { TII = ST.getInstrInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); EverMadeChange = false; - IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR; + IsX86INTR = MF.getFunction().getCallingConv() == CallingConv::X86_INTR; bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI); diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp index fc08f1582ad7..1046696587d9 100644 --- a/lib/Target/X86/X86WinAllocaExpander.cpp +++ b/lib/Target/X86/X86WinAllocaExpander.cpp @@ -25,9 +25,9 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/Passes.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/IR/Function.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Target/TargetInstrInfo.h" using namespace llvm; @@ -279,9 +279,9 @@ bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) { SlotSize = TRI->getSlotSize(); StackProbeSize = 4096; - if (MF.getFunction()->hasFnAttribute("stack-probe-size")) { + if (MF.getFunction().hasFnAttribute("stack-probe-size")) { MF.getFunction() - ->getFnAttribute("stack-probe-size") + .getFnAttribute("stack-probe-size") .getValueAsString() .getAsInteger(0, StackProbeSize); } diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index 0c3b34341476..0472a85f50da 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -401,6 +401,8 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape( ParentFunc->getName()), TheModule); + if (auto *C = ParentFunc->getComdat()) + 
Trampoline->setComdat(C); BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); IRBuilder<> Builder(EntryBB); Value *LSDA = emitEHLSDA(Builder, ParentFunc);
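Several of the TTI changes earlier in this patch, notably the new isLSRCostLess override, compare cost records lexicographically with std::tie so that instruction count dominates register pressure and the remaining components only break ties. A minimal standalone sketch of that idiom follows; the struct and field names are illustrative stand-ins, not the LLVM types:

// Lexicographic cost comparison via std::tie: fewer instructions wins
// outright, ties fall through to register count, then to setup cost.
#include <cassert>
#include <tuple>

struct LoopCost {
  unsigned Insns;
  unsigned NumRegs;
  unsigned SetupCost;
};

static bool isCheaper(const LoopCost &A, const LoopCost &B) {
  return std::tie(A.Insns, A.NumRegs, A.SetupCost) <
         std::tie(B.Insns, B.NumRegs, B.SetupCost);
}

int main() {
  LoopCost A{4, 9, 1}, B{5, 2, 0};
  assert(isCheaper(A, B)); // fewer instructions beats fewer registers
  return 0;
}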
