Diffstat (limited to 'lib/Target/X86')
-rw-r--r--  lib/Target/X86/AsmParser/LLVMBuild.txt | 2
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp | 23
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmInstrumentation.h | 6
-rw-r--r--  lib/Target/X86/AsmParser/X86AsmParser.cpp | 1118
-rw-r--r--  lib/Target/X86/AsmParser/X86Operand.h | 26
-rw-r--r--  lib/Target/X86/CMakeLists.txt | 30
-rw-r--r--  lib/Target/X86/Disassembler/X86Disassembler.cpp | 134
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp | 340
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoder.h | 20
-rw-r--r--  lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h | 4
-rw-r--r--  lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp | 8
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstComments.cpp | 43
-rw-r--r--  lib/Target/X86/InstPrinter/X86InstComments.h | 5
-rw-r--r--  lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp | 9
-rw-r--r--  lib/Target/X86/MCTargetDesc/CMakeLists.txt | 3
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 26
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 22
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp | 12
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp | 13
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 11
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 190
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h | 36
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp | 13
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86TargetStreamer.h | 34
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp | 9
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp | 31
-rw-r--r--  lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp | 415
-rw-r--r--  lib/Target/X86/README-SSE.txt | 62
-rw-r--r--  lib/Target/X86/README-X86-64.txt | 8
-rw-r--r--  lib/Target/X86/README.txt | 14
-rw-r--r--  lib/Target/X86/TargetInfo/X86TargetInfo.cpp | 4
-rw-r--r--  lib/Target/X86/X86.h | 6
-rw-r--r--  lib/Target/X86/X86.td | 363
-rw-r--r--  lib/Target/X86/X86AsmPrinter.cpp | 34
-rw-r--r--  lib/Target/X86/X86AsmPrinter.h | 15
-rw-r--r--  lib/Target/X86/X86CallFrameOptimization.cpp | 131
-rw-r--r--  lib/Target/X86/X86CallLowering.cpp | 272
-rw-r--r--  lib/Target/X86/X86CallLowering.h | 28
-rw-r--r--  lib/Target/X86/X86CallingConv.td | 7
-rw-r--r--  lib/Target/X86/X86CmovConversion.cpp | 359
-rw-r--r--  lib/Target/X86/X86DomainReassignment.cpp | 753
-rwxr-xr-x  lib/Target/X86/X86EvexToVex.cpp | 133
-rw-r--r--  lib/Target/X86/X86ExpandPseudo.cpp | 2
-rw-r--r--  lib/Target/X86/X86FastISel.cpp | 279
-rw-r--r--  lib/Target/X86/X86FixupBWInsts.cpp | 101
-rw-r--r--  lib/Target/X86/X86FixupLEAs.cpp | 8
-rw-r--r--  lib/Target/X86/X86FloatingPoint.cpp | 22
-rw-r--r--  lib/Target/X86/X86FrameLowering.cpp | 205
-rw-r--r--  lib/Target/X86/X86FrameLowering.h | 18
-rw-r--r--  lib/Target/X86/X86GenRegisterBankInfo.def | 4
-rw-r--r--  lib/Target/X86/X86ISelDAGToDAG.cpp | 765
-rw-r--r--  lib/Target/X86/X86ISelLowering.cpp | 6349
-rw-r--r--  lib/Target/X86/X86ISelLowering.h | 263
-rw-r--r--  lib/Target/X86/X86Instr3DNow.td | 135
-rw-r--r--  lib/Target/X86/X86InstrAVX512.td | 6652
-rw-r--r--  lib/Target/X86/X86InstrArithmetic.td | 60
-rw-r--r--  lib/Target/X86/X86InstrCMovSetCC.td | 4
-rw-r--r--  lib/Target/X86/X86InstrCompiler.td | 399
-rw-r--r--  lib/Target/X86/X86InstrControl.td | 24
-rw-r--r--  lib/Target/X86/X86InstrExtension.td | 26
-rw-r--r--  lib/Target/X86/X86InstrFMA.td | 370
-rw-r--r--  lib/Target/X86/X86InstrFPStack.td | 186
-rw-r--r--  lib/Target/X86/X86InstrFormats.td | 45
-rw-r--r--  lib/Target/X86/X86InstrFragmentsSIMD.td | 469
-rw-r--r--  lib/Target/X86/X86InstrInfo.cpp | 1143
-rw-r--r--  lib/Target/X86/X86InstrInfo.h | 137
-rw-r--r--  lib/Target/X86/X86InstrInfo.td | 442
-rw-r--r--  lib/Target/X86/X86InstrMMX.td | 7
-rw-r--r--  lib/Target/X86/X86InstrMPX.td | 34
-rw-r--r--  lib/Target/X86/X86InstrSGX.td | 2
-rw-r--r--  lib/Target/X86/X86InstrSSE.td | 2685
-rw-r--r--  lib/Target/X86/X86InstrSVM.td | 27
-rw-r--r--  lib/Target/X86/X86InstrShiftRotate.td | 84
-rw-r--r--  lib/Target/X86/X86InstrSystem.td | 254
-rw-r--r--  lib/Target/X86/X86InstrTSX.td | 5
-rw-r--r--  lib/Target/X86/X86InstrVMX.td | 52
-rw-r--r--  lib/Target/X86/X86InstrVecCompiler.td | 586
-rw-r--r--  lib/Target/X86/X86InstrXOP.td | 91
-rw-r--r--  lib/Target/X86/X86InstructionSelector.cpp | 461
-rw-r--r--  lib/Target/X86/X86InterleavedAccess.cpp | 625
-rw-r--r--  lib/Target/X86/X86IntrinsicsInfo.h | 554
-rw-r--r--  lib/Target/X86/X86LegalizerInfo.cpp | 202
-rw-r--r--  lib/Target/X86/X86MCInstLower.cpp | 195
-rw-r--r--  lib/Target/X86/X86MachineFunctionInfo.cpp | 2
-rw-r--r--  lib/Target/X86/X86MacroFusion.cpp | 16
-rw-r--r--  lib/Target/X86/X86OptimizeLEAs.cpp | 49
-rw-r--r--  lib/Target/X86/X86PadShortFunction.cpp | 8
-rw-r--r--  lib/Target/X86/X86RegisterBankInfo.cpp | 26
-rw-r--r--  lib/Target/X86/X86RegisterInfo.cpp | 63
-rw-r--r--  lib/Target/X86/X86RegisterInfo.h | 2
-rw-r--r--  lib/Target/X86/X86RegisterInfo.td | 5
-rwxr-xr-x  lib/Target/X86/X86SchedBroadwell.td | 3869
-rw-r--r--  lib/Target/X86/X86SchedHaswell.td | 4699
-rw-r--r--  lib/Target/X86/X86SchedSandyBridge.td | 2645
-rw-r--r--  lib/Target/X86/X86SchedSkylakeClient.td | 3993
-rwxr-xr-x  lib/Target/X86/X86SchedSkylakeServer.td | 6500
-rw-r--r--  lib/Target/X86/X86Schedule.td | 47
-rw-r--r--  lib/Target/X86/X86ScheduleAtom.td | 6
-rw-r--r--  lib/Target/X86/X86ScheduleBtVer2.td | 356
-rw-r--r--  lib/Target/X86/X86ScheduleSLM.td | 7
-rw-r--r--  lib/Target/X86/X86ScheduleZnver1.td | 1557
-rw-r--r--  lib/Target/X86/X86SelectionDAGInfo.cpp | 4
-rw-r--r--  lib/Target/X86/X86ShuffleDecodeConstantPool.cpp | 2
-rw-r--r--  lib/Target/X86/X86Subtarget.cpp | 108
-rw-r--r--  lib/Target/X86/X86Subtarget.h | 112
-rw-r--r--  lib/Target/X86/X86TargetMachine.cpp | 37
-rw-r--r--  lib/Target/X86/X86TargetMachine.h | 9
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.cpp | 2
-rw-r--r--  lib/Target/X86/X86TargetObjectFile.h | 2
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.cpp | 495
-rw-r--r--  lib/Target/X86/X86TargetTransformInfo.h | 25
-rw-r--r--  lib/Target/X86/X86VZeroUpper.cpp | 51
-rw-r--r--  lib/Target/X86/X86WinAllocaExpander.cpp | 6
-rw-r--r--  lib/Target/X86/X86WinEHState.cpp | 2
114 files changed, 41020 insertions(+), 12429 deletions(-)
diff --git a/lib/Target/X86/AsmParser/LLVMBuild.txt b/lib/Target/X86/AsmParser/LLVMBuild.txt
index 9f94d5d38864..67c0d1358d80 100644
--- a/lib/Target/X86/AsmParser/LLVMBuild.txt
+++ b/lib/Target/X86/AsmParser/LLVMBuild.txt
@@ -19,5 +19,5 @@
type = Library
name = X86AsmParser
parent = X86
-required_libraries = MC MCParser Support X86Desc X86Info
+required_libraries = MC MCParser Support X86Desc X86Info X86AsmPrinter
add_to_library_groups = X86
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index f7e31de65f6d..2c376fd062ca 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -38,7 +38,7 @@
// Currently we have only AddressSanitizer instrumentation, but we're
// planning to implement MemorySanitizer for inline assembly too. If
// you're not familiar with AddressSanitizer algorithm, please, read
-// https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm.
+// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
//
// When inline assembly is parsed by an instance of X86AsmParser, all
// instructions are emitted via EmitInstruction method. That's the
@@ -193,11 +193,10 @@ public:
~X86AddressSanitizer() override = default;
// X86AsmInstrumentation implementation:
- void InstrumentAndEmitInstruction(const MCInst &Inst,
- OperandVector &Operands,
- MCContext &Ctx,
- const MCInstrInfo &MII,
- MCStreamer &Out) override {
+ void InstrumentAndEmitInstruction(const MCInst &Inst, OperandVector &Operands,
+ MCContext &Ctx, const MCInstrInfo &MII,
+ MCStreamer &Out,
+ /* unused */ bool) override {
InstrumentMOVS(Inst, Operands, Ctx, MII, Out);
if (RepPrefix)
EmitInstruction(Out, MCInstBuilder(X86::REP_PREFIX));
@@ -611,7 +610,7 @@ private:
EmitInstruction(Out, MCInstBuilder(X86::CLD));
EmitInstruction(Out, MCInstBuilder(X86::MMX_EMMS));
- EmitInstruction(Out, MCInstBuilder(X86::AND64ri8)
+ EmitInstruction(Out, MCInstBuilder(X86::AND32ri8)
.addReg(X86::ESP)
.addReg(X86::ESP)
.addImm(-16));
@@ -1045,13 +1044,13 @@ X86AsmInstrumentation::~X86AsmInstrumentation() = default;
void X86AsmInstrumentation::InstrumentAndEmitInstruction(
const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
- const MCInstrInfo &MII, MCStreamer &Out) {
- EmitInstruction(Out, Inst);
+ const MCInstrInfo &MII, MCStreamer &Out, bool PrintSchedInfoEnabled) {
+ EmitInstruction(Out, Inst, PrintSchedInfoEnabled);
}
-void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out,
- const MCInst &Inst) {
- Out.EmitInstruction(Inst, *STI);
+void X86AsmInstrumentation::EmitInstruction(MCStreamer &Out, const MCInst &Inst,
+ bool PrintSchedInfoEnabled) {
+ Out.EmitInstruction(Inst, *STI, PrintSchedInfoEnabled);
}
unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index 97a55cd8ad98..42a9dc3ba26a 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -42,7 +42,8 @@ public:
virtual void InstrumentAndEmitInstruction(
const MCInst &Inst,
SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
- MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
+ MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out,
+ bool PrintSchedInfoEnabled);
protected:
friend X86AsmInstrumentation *
@@ -54,7 +55,8 @@ protected:
unsigned GetFrameRegGeneric(const MCContext &Ctx, MCStreamer &Out);
- void EmitInstruction(MCStreamer &Out, const MCInst &Inst);
+ void EmitInstruction(MCStreamer &Out, const MCInst &Inst,
+ bool PrintSchedInfoEnabled = false);
const MCSubtargetInfo *&STI;
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index c1d216c8b7af..87c65347e334 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -7,7 +7,9 @@
//
//===----------------------------------------------------------------------===//
+#include "InstPrinter/X86IntelInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86TargetStreamer.h"
#include "X86AsmInstrumentation.h"
#include "X86AsmParserCommon.h"
#include "X86Operand.h"
@@ -37,6 +39,14 @@
using namespace llvm;
+static bool checkScale(unsigned Scale, StringRef &ErrMsg) {
+ if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
+ ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
+ return true;
+ }
+ return false;
+}
+
namespace {
static const char OpPrecedence[] = {
@@ -59,7 +69,6 @@ static const char OpPrecedence[] = {
};
class X86AsmParser : public MCTargetAsmParser {
- const MCInstrInfo &MII;
ParseInstructionInfo *InstInfo;
std::unique_ptr<X86AsmInstrumentation> Instrumentation;
bool Code16GCC;
@@ -72,6 +81,13 @@ private:
return Result;
}
+ X86TargetStreamer &getTargetStreamer() {
+ assert(getParser().getStreamer().getTargetStreamer() &&
+ "do not have a target streamer");
+ MCTargetStreamer &TS = *getParser().getStreamer().getTargetStreamer();
+ return static_cast<X86TargetStreamer &>(TS);
+ }
+
unsigned MatchInstruction(const OperandVector &Operands, MCInst &Inst,
uint64_t &ErrorInfo, bool matchingInlineAsm,
unsigned VariantID = 0) {
@@ -125,8 +141,8 @@ private:
int64_t popOperand() {
assert (!PostfixStack.empty() && "Poped an empty stack!");
ICToken Op = PostfixStack.pop_back_val();
- assert ((Op.first == IC_IMM || Op.first == IC_REGISTER)
- && "Expected and immediate or register!");
+ if (!(Op.first == IC_IMM || Op.first == IC_REGISTER))
+ return -1; // The invalid Scale value will be caught later by checkScale
return Op.second;
}
void pushOperand(InfixCalculatorTok Op, int64_t Val = 0) {
@@ -293,6 +309,7 @@ private:
};
enum IntelExprState {
+ IES_INIT,
IES_OR,
IES_XOR,
IES_AND,
@@ -320,16 +337,20 @@ private:
int64_t Imm;
const MCExpr *Sym;
StringRef SymName;
- bool StopOnLBrac, AddImmPrefix;
InfixCalculator IC;
InlineAsmIdentifierInfo Info;
+ short BracCount;
+ bool MemExpr;
public:
- IntelExprStateMachine(int64_t imm, bool stoponlbrac, bool addimmprefix) :
- State(IES_PLUS), PrevState(IES_ERROR), BaseReg(0), IndexReg(0), TmpReg(0),
- Scale(1), Imm(imm), Sym(nullptr), StopOnLBrac(stoponlbrac),
- AddImmPrefix(addimmprefix) { Info.clear(); }
-
+ IntelExprStateMachine()
+ : State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0),
+ TmpReg(0), Scale(1), Imm(0), Sym(nullptr), BracCount(0),
+ MemExpr(false) {}
+
+ void addImm(int64_t imm) { Imm += imm; }
+ short getBracCount() { return BracCount; }
+ bool isMemExpr() { return MemExpr; }
unsigned getBaseReg() { return BaseReg; }
unsigned getIndexReg() { return IndexReg; }
unsigned getScale() { return Scale; }
@@ -339,13 +360,8 @@ private:
bool isValidEndState() {
return State == IES_RBRAC || State == IES_INTEGER;
}
- bool getStopOnLBrac() { return StopOnLBrac; }
- bool getAddImmPrefix() { return AddImmPrefix; }
bool hadError() { return State == IES_ERROR; }
-
- InlineAsmIdentifierInfo &getIdentifierInfo() {
- return Info;
- }
+ InlineAsmIdentifierInfo &getIdentifierInfo() { return Info; }
void onOr() {
IntelExprState CurrState = State;
@@ -422,7 +438,7 @@ private:
}
PrevState = CurrState;
}
- void onPlus() {
+ bool onPlus(StringRef &ErrMsg) {
IntelExprState CurrState = State;
switch (State) {
default:
@@ -439,7 +455,10 @@ private:
if (!BaseReg) {
BaseReg = TmpReg;
} else {
- assert (!IndexReg && "BaseReg/IndexReg already set!");
+ if (IndexReg) {
+ ErrMsg = "BaseReg/IndexReg already set!";
+ return true;
+ }
IndexReg = TmpReg;
Scale = 1;
}
@@ -447,8 +466,9 @@ private:
break;
}
PrevState = CurrState;
+ return false;
}
- void onMinus() {
+ bool onMinus(StringRef &ErrMsg) {
IntelExprState CurrState = State;
switch (State) {
default:
@@ -470,12 +490,17 @@ private:
case IES_RBRAC:
case IES_INTEGER:
case IES_REGISTER:
+ case IES_INIT:
State = IES_MINUS;
// push minus operator if it is not a negate operator
if (CurrState == IES_REGISTER || CurrState == IES_RPAREN ||
CurrState == IES_INTEGER || CurrState == IES_RBRAC)
IC.pushOperator(IC_MINUS);
- else
+ else if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
+ // We have negate operator for Scale: it's illegal
+ ErrMsg = "Scale can't be negative";
+ return true;
+ } else
IC.pushOperator(IC_NEG);
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
@@ -483,7 +508,10 @@ private:
if (!BaseReg) {
BaseReg = TmpReg;
} else {
- assert (!IndexReg && "BaseReg/IndexReg already set!");
+ if (IndexReg) {
+ ErrMsg = "BaseReg/IndexReg already set!";
+ return true;
+ }
IndexReg = TmpReg;
Scale = 1;
}
@@ -491,6 +519,7 @@ private:
break;
}
PrevState = CurrState;
+ return false;
}
void onNot() {
IntelExprState CurrState = State;
@@ -511,13 +540,15 @@ private:
case IES_MOD:
case IES_LPAREN:
case IES_LBRAC:
+ case IES_INIT:
State = IES_NOT;
IC.pushOperator(IC_NOT);
break;
}
PrevState = CurrState;
}
- void onRegister(unsigned Reg) {
+
+ bool onRegister(unsigned Reg, StringRef &ErrMsg) {
IntelExprState CurrState = State;
switch (State) {
default:
@@ -525,6 +556,7 @@ private:
break;
case IES_PLUS:
case IES_LPAREN:
+ case IES_LBRAC:
State = IES_REGISTER;
TmpReg = Reg;
IC.pushOperand(IC_REGISTER);
@@ -532,11 +564,16 @@ private:
case IES_MULTIPLY:
// Index Register - Scale * Register
if (PrevState == IES_INTEGER) {
- assert (!IndexReg && "IndexReg already set!");
+ if (IndexReg) {
+ ErrMsg = "BaseReg/IndexReg already set!";
+ return true;
+ }
State = IES_REGISTER;
IndexReg = Reg;
// Get the scale and replace the 'Scale * Register' with '0'.
Scale = IC.popOperand();
+ if (checkScale(Scale, ErrMsg))
+ return true;
IC.pushOperand(IC_IMM);
IC.popOperator();
} else {
@@ -545,9 +582,20 @@ private:
break;
}
PrevState = CurrState;
+ return false;
}
- void onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName) {
+ bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName,
+ const InlineAsmIdentifierInfo &IDInfo,
+ bool ParsingInlineAsm, StringRef &ErrMsg) {
+ // InlineAsm: Treat an enum value as an integer
+ if (ParsingInlineAsm)
+ if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
+ return onInteger(IDInfo.Enum.EnumVal, ErrMsg);
+ // Treat a symbolic constant like an integer
+ if (auto *CE = dyn_cast<MCConstantExpr>(SymRef))
+ return onInteger(CE->getValue(), ErrMsg);
PrevState = State;
+ bool HasSymbol = Sym != nullptr;
switch (State) {
default:
State = IES_ERROR;
@@ -555,12 +603,20 @@ private:
case IES_PLUS:
case IES_MINUS:
case IES_NOT:
+ case IES_INIT:
+ case IES_LBRAC:
+ MemExpr = true;
State = IES_INTEGER;
Sym = SymRef;
SymName = SymRefName;
IC.pushOperand(IC_IMM);
+ if (ParsingInlineAsm)
+ Info = IDInfo;
break;
}
+ if (HasSymbol)
+ ErrMsg = "cannot use more than one symbol in memory operand";
+ return HasSymbol;
}
bool onInteger(int64_t TmpInt, StringRef &ErrMsg) {
IntelExprState CurrState = State;
@@ -580,16 +636,19 @@ private:
case IES_MOD:
case IES_MULTIPLY:
case IES_LPAREN:
+ case IES_INIT:
+ case IES_LBRAC:
State = IES_INTEGER;
if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
// Index Register - Register * Scale
- assert (!IndexReg && "IndexReg already set!");
+ if (IndexReg) {
+ ErrMsg = "BaseReg/IndexReg already set!";
+ return true;
+ }
IndexReg = TmpReg;
Scale = TmpInt;
- if(Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
- ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
+ if (checkScale(Scale, ErrMsg))
return true;
- }
// Get the scale and replace the 'Register * Scale' with '0'.
IC.popOperator();
} else {
@@ -640,19 +699,30 @@ private:
break;
}
}
- void onLBrac() {
+ bool onLBrac() {
+ if (BracCount)
+ return true;
PrevState = State;
switch (State) {
default:
State = IES_ERROR;
break;
case IES_RBRAC:
+ case IES_INTEGER:
+ case IES_RPAREN:
State = IES_PLUS;
IC.pushOperator(IC_PLUS);
break;
+ case IES_INIT:
+ assert(!BracCount && "BracCount should be zero on parsing's start");
+ State = IES_LBRAC;
+ break;
}
+ MemExpr = true;
+ BracCount++;
+ return false;
}
- void onRBrac() {
+ bool onRBrac() {
IntelExprState CurrState = State;
switch (State) {
default:
@@ -661,6 +731,8 @@ private:
case IES_INTEGER:
case IES_REGISTER:
case IES_RPAREN:
+ if (BracCount-- != 1)
+ return true;
State = IES_RBRAC;
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
// If we already have a BaseReg, then assume this is the IndexReg with
@@ -676,6 +748,7 @@ private:
break;
}
PrevState = CurrState;
+ return false;
}
void onLParen() {
IntelExprState CurrState = State;
@@ -695,6 +768,8 @@ private:
case IES_DIVIDE:
case IES_MOD:
case IES_LPAREN:
+ case IES_INIT:
+ case IES_LBRAC:
State = IES_LPAREN;
IC.pushOperator(IC_LPAREN);
break;
@@ -747,34 +822,41 @@ private:
std::unique_ptr<X86Operand> ParseATTOperand();
std::unique_ptr<X86Operand> ParseIntelOperand();
std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
- bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
- unsigned IdentifyIntelOperator(StringRef Name);
- unsigned ParseIntelOperator(unsigned OpKind);
- std::unique_ptr<X86Operand>
- ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
+ bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End);
+ unsigned IdentifyIntelInlineAsmOperator(StringRef Name);
+ unsigned ParseIntelInlineAsmOperator(unsigned OpKind);
std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM);
+ void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start,
+ SMLoc End);
bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
- std::unique_ptr<X86Operand>
- ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp,
- bool isSymbol, unsigned Size);
- bool ParseIntelIdentifier(const MCExpr *&Val, StringRef &Identifier,
- InlineAsmIdentifierInfo &Info,
- bool IsUnevaluatedOperand, SMLoc &End);
+ bool ParseIntelInlineAsmIdentifier(const MCExpr *&Val, StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End);
std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg, SMLoc StartLoc);
+ bool ParseIntelMemoryOperandSize(unsigned &Size);
std::unique_ptr<X86Operand>
CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
unsigned IndexReg, unsigned Scale, SMLoc Start,
SMLoc End, unsigned Size, StringRef Identifier,
- InlineAsmIdentifierInfo &Info,
- bool AllowBetterSizeMatch = false);
+ const InlineAsmIdentifierInfo &Info);
bool parseDirectiveEven(SMLoc L);
bool ParseDirectiveWord(unsigned Size, SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
+ /// CodeView FPO data directives.
+ bool parseDirectiveFPOProc(SMLoc L);
+ bool parseDirectiveFPOSetFrame(SMLoc L);
+ bool parseDirectiveFPOPushReg(SMLoc L);
+ bool parseDirectiveFPOStackAlloc(SMLoc L);
+ bool parseDirectiveFPOEndPrologue(SMLoc L);
+ bool parseDirectiveFPOEndProc(SMLoc L);
+ bool parseDirectiveFPOData(SMLoc L);
+
+ bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
bool processInstruction(MCInst &Inst, const OperandVector &Ops);
/// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
@@ -828,7 +910,7 @@ private:
MCSubtargetInfo &STI = copySTI();
FeatureBitset AllModes({X86::Mode64Bit, X86::Mode32Bit, X86::Mode16Bit});
FeatureBitset OldMode = STI.getFeatureBits() & AllModes;
- unsigned FB = ComputeAvailableFeatures(
+ uint64_t FB = ComputeAvailableFeatures(
STI.ToggleFeature(OldMode.flip(mode)));
setAvailableFeatures(FB);
@@ -858,7 +940,7 @@ public:
X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
const MCInstrInfo &mii, const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr),
+ : MCTargetAsmParser(Options, sti, mii), InstInfo(nullptr),
Code16GCC(false) {
// Initialize the set of available features.
@@ -885,8 +967,8 @@ static unsigned MatchRegisterName(StringRef Name);
/// }
-static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg,
- StringRef &ErrMsg) {
+static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
+ unsigned Scale, StringRef &ErrMsg) {
// If we have both a base register and an index register make sure they are
// both 64-bit or 32-bit registers.
// To support VSIB, IndexReg can be 128-bit or 256-bit registers.
@@ -925,7 +1007,7 @@ static bool CheckBaseRegAndIndexReg(unsigned BaseReg, unsigned IndexReg,
}
}
}
- return false;
+ return checkScale(Scale, ErrMsg);
}
bool X86AsmParser::ParseRegister(unsigned &RegNo,
@@ -1016,19 +1098,31 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
EndLoc = Parser.getTok().getEndLoc();
- // If this is "db[0-7]", match it as an alias
- // for dr[0-7].
- if (RegNo == 0 && Tok.getString().size() == 3 &&
- Tok.getString().startswith("db")) {
- switch (Tok.getString()[2]) {
- case '0': RegNo = X86::DR0; break;
- case '1': RegNo = X86::DR1; break;
- case '2': RegNo = X86::DR2; break;
- case '3': RegNo = X86::DR3; break;
- case '4': RegNo = X86::DR4; break;
- case '5': RegNo = X86::DR5; break;
- case '6': RegNo = X86::DR6; break;
- case '7': RegNo = X86::DR7; break;
+ // If this is "db[0-15]", match it as an alias
+ // for dr[0-15].
+ if (RegNo == 0 && Tok.getString().startswith("db")) {
+ if (Tok.getString().size() == 3) {
+ switch (Tok.getString()[2]) {
+ case '0': RegNo = X86::DR0; break;
+ case '1': RegNo = X86::DR1; break;
+ case '2': RegNo = X86::DR2; break;
+ case '3': RegNo = X86::DR3; break;
+ case '4': RegNo = X86::DR4; break;
+ case '5': RegNo = X86::DR5; break;
+ case '6': RegNo = X86::DR6; break;
+ case '7': RegNo = X86::DR7; break;
+ case '8': RegNo = X86::DR8; break;
+ case '9': RegNo = X86::DR9; break;
+ }
+ } else if (Tok.getString().size() == 4 && Tok.getString()[2] == '1') {
+ switch (Tok.getString()[3]) {
+ case '0': RegNo = X86::DR10; break;
+ case '1': RegNo = X86::DR11; break;
+ case '2': RegNo = X86::DR12; break;
+ case '3': RegNo = X86::DR13; break;
+ case '4': RegNo = X86::DR14; break;
+ case '5': RegNo = X86::DR15; break;
+ }
}
if (RegNo != 0) {
@@ -1198,124 +1292,48 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
return ParseATTOperand();
}
-/// getIntelMemOperandSize - Return intel memory operand size.
-static unsigned getIntelMemOperandSize(StringRef OpStr) {
- unsigned Size = StringSwitch<unsigned>(OpStr)
- .Cases("BYTE", "byte", 8)
- .Cases("WORD", "word", 16)
- .Cases("DWORD", "dword", 32)
- .Cases("FWORD", "fword", 48)
- .Cases("QWORD", "qword", 64)
- .Cases("MMWORD","mmword", 64)
- .Cases("XWORD", "xword", 80)
- .Cases("TBYTE", "tbyte", 80)
- .Cases("XMMWORD", "xmmword", 128)
- .Cases("YMMWORD", "ymmword", 256)
- .Cases("ZMMWORD", "zmmword", 512)
- .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter
- .Default(0);
- return Size;
-}
-
std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
- InlineAsmIdentifierInfo &Info, bool AllowBetterSizeMatch) {
+ const InlineAsmIdentifierInfo &Info) {
// If we found a decl other than a VarDecl, then assume it is a FuncDecl or
// some other label reference.
- if (isa<MCSymbolRefExpr>(Disp) && Info.OpDecl && !Info.IsVarDecl) {
+ if (Info.isKind(InlineAsmIdentifierInfo::IK_Label)) {
// Insert an explicit size if the user didn't have one.
if (!Size) {
Size = getPointerWidth();
InstInfo->AsmRewrites->emplace_back(AOK_SizeDirective, Start,
/*Len=*/0, Size);
}
-
// Create an absolute memory reference in order to match against
// instructions taking a PC relative operand.
return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size,
- Identifier, Info.OpDecl);
+ Identifier, Info.Label.Decl);
}
-
-
// We either have a direct symbol reference, or an offset from a symbol. The
// parser always puts the symbol on the LHS, so look there for size
// calculation purposes.
unsigned FrontendSize = 0;
- const MCBinaryExpr *BinOp = dyn_cast<MCBinaryExpr>(Disp);
- bool IsSymRef =
- isa<MCSymbolRefExpr>(BinOp ? BinOp->getLHS() : Disp);
- if (IsSymRef && !Size && Info.Type)
- FrontendSize = Info.Type * 8; // Size is in terms of bits in this context.
-
- // When parsing inline assembly we set the base register to a non-zero value
+ void *Decl = nullptr;
+ bool IsGlobalLV = false;
+ if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
+ // Size is in terms of bits in this context.
+ FrontendSize = Info.Var.Type * 8;
+ Decl = Info.Var.Decl;
+ IsGlobalLV = Info.Var.IsGlobalLV;
+ }
+ // It is widely common for MS InlineAsm to use a global variable and one/two
+ // registers in a mmory expression, and though unaccessible via rip/eip.
+ if (IsGlobalLV && (BaseReg || IndexReg)) {
+ return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End);
+ // Otherwise, we set the base register to a non-zero value
// if we don't know the actual value at this time. This is necessary to
// get the matching correct in some cases.
- BaseReg = BaseReg ? BaseReg : 1;
- return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
- IndexReg, Scale, Start, End, Size, Identifier,
- Info.OpDecl, FrontendSize);
-}
-
-static void
-RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
- StringRef SymName, int64_t ImmDisp,
- int64_t FinalImmDisp, SMLoc &BracLoc,
- SMLoc &StartInBrac, SMLoc &End) {
- // Remove the '[' and ']' from the IR string.
- AsmRewrites.emplace_back(AOK_Skip, BracLoc, 1);
- AsmRewrites.emplace_back(AOK_Skip, End, 1);
-
- // If ImmDisp is non-zero, then we parsed a displacement before the
- // bracketed expression (i.e., ImmDisp [ BaseReg + Scale*IndexReg + Disp])
- // If ImmDisp doesn't match the displacement computed by the state machine
- // then we have an additional displacement in the bracketed expression.
- if (ImmDisp != FinalImmDisp) {
- if (ImmDisp) {
- // We have an immediate displacement before the bracketed expression.
- // Adjust this to match the final immediate displacement.
- bool Found = false;
- for (AsmRewrite &AR : AsmRewrites) {
- if (AR.Loc.getPointer() > BracLoc.getPointer())
- continue;
- if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm) {
- assert (!Found && "ImmDisp already rewritten.");
- AR.Kind = AOK_Imm;
- AR.Len = BracLoc.getPointer() - AR.Loc.getPointer();
- AR.Val = FinalImmDisp;
- Found = true;
- break;
- }
- }
- assert (Found && "Unable to rewrite ImmDisp.");
- (void)Found;
- } else {
- // We have a symbolic and an immediate displacement, but no displacement
- // before the bracketed expression. Put the immediate displacement
- // before the bracketed expression.
- AsmRewrites.emplace_back(AOK_Imm, BracLoc, 0, FinalImmDisp);
- }
- }
- // Remove all the ImmPrefix rewrites within the brackets.
- // We may have some Imm rewrties as a result of an operator applying,
- // remove them as well
- for (AsmRewrite &AR : AsmRewrites) {
- if (AR.Loc.getPointer() < StartInBrac.getPointer())
- continue;
- if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm)
- AR.Kind = AOK_Delete;
- }
- const char *SymLocPtr = SymName.data();
- // Skip everything before the symbol.
- if (unsigned Len = SymLocPtr - StartInBrac.getPointer()) {
- assert(Len > 0 && "Expected a non-negative length.");
- AsmRewrites.emplace_back(AOK_Skip, StartInBrac, Len);
- }
- // Skip everything after the symbol.
- if (unsigned Len = End.getPointer() - (SymLocPtr + SymName.size())) {
- SMLoc Loc = SMLoc::getFromPointer(SymLocPtr + SymName.size());
- assert(Len > 0 && "Expected a non-negative length.");
- AsmRewrites.emplace_back(AOK_Skip, Loc, Len);
+ } else {
+ BaseReg = BaseReg ? BaseReg : 1;
+ return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
+ IndexReg, Scale, Start, End, Size, Identifier,
+ Decl, FrontendSize);
}
}
@@ -1348,77 +1366,80 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine
bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
+ StringRef ErrMsg;
AsmToken::TokenKind PrevTK = AsmToken::Error;
bool Done = false;
while (!Done) {
bool UpdateLocLex = true;
-
AsmToken::TokenKind TK = getLexer().getKind();
- // The period in the dot operator (e.g., [ebx].foo.bar) is parsed as an
- // identifier. Don't try an parse it as a register.
- if (PrevTK != AsmToken::Error && Tok.getString().startswith(".") &&
- TK != AsmToken::Identifier)
- break;
-
- // If we're parsing an immediate expression, we don't expect a '['.
- if (SM.getStopOnLBrac() && getLexer().getKind() == AsmToken::LBrac)
- break;
switch (TK) {
- default: {
- if (SM.isValidEndState()) {
- Done = true;
+ default:
+ if ((Done = SM.isValidEndState()))
break;
- }
return Error(Tok.getLoc(), "unknown token in expression");
- }
- case AsmToken::EndOfStatement: {
+ case AsmToken::EndOfStatement:
Done = true;
break;
- }
+ case AsmToken::Real:
+ // DotOperator: [ebx].0
+ UpdateLocLex = false;
+ if (ParseIntelDotOperator(SM, End))
+ return true;
+ break;
case AsmToken::String:
case AsmToken::Identifier: {
- // This could be a register or a symbolic displacement.
- unsigned TmpReg;
- const MCExpr *Val;
SMLoc IdentLoc = Tok.getLoc();
StringRef Identifier = Tok.getString();
UpdateLocLex = false;
- if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) {
- SM.onRegister(TmpReg);
- } else if (ParseIntelNamedOperator(Identifier, SM)) {
- UpdateLocLex = true;
- } else if (!isParsingInlineAsm()) {
- if (getParser().parsePrimaryExpr(Val, End))
+ // Register
+ unsigned Reg;
+ if (Tok.isNot(AsmToken::String) && !ParseRegister(Reg, IdentLoc, End)) {
+ if (SM.onRegister(Reg, ErrMsg))
+ return Error(Tok.getLoc(), ErrMsg);
+ break;
+ }
+ // Operator synonymous ("not", "or" etc.)
+ if ((UpdateLocLex = ParseIntelNamedOperator(Identifier, SM)))
+ break;
+ // Symbol reference, when parsing assembly content
+ InlineAsmIdentifierInfo Info;
+ const MCExpr *Val;
+ if (!isParsingInlineAsm()) {
+ if (getParser().parsePrimaryExpr(Val, End)) {
return Error(Tok.getLoc(), "Unexpected identifier!");
- SM.onIdentifierExpr(Val, Identifier);
- } else if (unsigned OpKind = IdentifyIntelOperator(Identifier)) {
- if (OpKind == IOK_OFFSET)
+ } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) {
+ return Error(IdentLoc, ErrMsg);
+ } else
+ break;
+ }
+ // MS InlineAsm operators (TYPE/LENGTH/SIZE)
+ if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) {
+ if (OpKind == IOK_OFFSET)
return Error(IdentLoc, "Dealing OFFSET operator as part of"
"a compound immediate expression is yet to be supported");
- int64_t Val = ParseIntelOperator(OpKind);
- if (!Val)
+ if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
+ if (SM.onInteger(Val, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ } else
return true;
- StringRef ErrMsg;
- if (SM.onInteger(Val, ErrMsg))
- return Error(IdentLoc, ErrMsg);
- } else if (Identifier.find('.') != StringRef::npos &&
- PrevTK == AsmToken::RBrac) {
- return false;
- } else {
- InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
- if (ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated=*/false, End))
+ break;
+ }
+ // MS Dot Operator expression
+ if (Identifier.count('.') && PrevTK == AsmToken::RBrac) {
+ if (ParseIntelDotOperator(SM, End))
return true;
- SM.onIdentifierExpr(Val, Identifier);
+ break;
}
+ // MS InlineAsm identifier
+ if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
+ return true;
+ else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
break;
}
case AsmToken::Integer: {
- StringRef ErrMsg;
- if (isParsingInlineAsm() && SM.getAddImmPrefix())
- InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Tok.getLoc());
// Look for 'b' or 'f' following an Integer as a directional label
SMLoc Loc = getTok().getLoc();
int64_t IntVal = getTok().getIntVal();
@@ -1435,7 +1456,10 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
if (IDVal == "b" && Sym->isUndefined())
return Error(Loc, "invalid reference to undefined symbol");
StringRef Identifier = Sym->getName();
- SM.onIdentifierExpr(Val, Identifier);
+ InlineAsmIdentifierInfo Info;
+ if (SM.onIdentifierExpr(Val, Identifier, Info,
+ isParsingInlineAsm(), ErrMsg))
+ return Error(Loc, ErrMsg);
End = consumeToken();
} else {
if (SM.onInteger(IntVal, ErrMsg))
@@ -1447,11 +1471,18 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
}
break;
}
- case AsmToken::Plus: SM.onPlus(); break;
- case AsmToken::Minus: SM.onMinus(); break;
+ case AsmToken::Plus:
+ if (SM.onPlus(ErrMsg))
+ return Error(getTok().getLoc(), ErrMsg);
+ break;
+ case AsmToken::Minus:
+ if (SM.onMinus(ErrMsg))
+ return Error(getTok().getLoc(), ErrMsg);
+ break;
case AsmToken::Tilde: SM.onNot(); break;
case AsmToken::Star: SM.onStar(); break;
case AsmToken::Slash: SM.onDivide(); break;
+ case AsmToken::Percent: SM.onMod(); break;
case AsmToken::Pipe: SM.onOr(); break;
case AsmToken::Caret: SM.onXor(); break;
case AsmToken::Amp: SM.onAnd(); break;
@@ -1459,8 +1490,14 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
SM.onLShift(); break;
case AsmToken::GreaterGreater:
SM.onRShift(); break;
- case AsmToken::LBrac: SM.onLBrac(); break;
- case AsmToken::RBrac: SM.onRBrac(); break;
+ case AsmToken::LBrac:
+ if (SM.onLBrac())
+ return Error(Tok.getLoc(), "unexpected bracket encountered");
+ break;
+ case AsmToken::RBrac:
+ if (SM.onRBrac())
+ return Error(Tok.getLoc(), "unexpected bracket encountered");
+ break;
case AsmToken::LParen: SM.onLParen(); break;
case AsmToken::RParen: SM.onRParen(); break;
}
@@ -1475,112 +1512,49 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return false;
}
-std::unique_ptr<X86Operand>
-X86AsmParser::ParseIntelBracExpression(unsigned SegReg, SMLoc Start,
- int64_t ImmDisp, bool isSymbol,
- unsigned Size) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
- SMLoc BracLoc = Tok.getLoc(), End = Tok.getEndLoc();
- if (getLexer().isNot(AsmToken::LBrac))
- return ErrorOperand(BracLoc, "Expected '[' token!");
- Parser.Lex(); // Eat '['
-
- SMLoc StartInBrac = Parser.getTok().getLoc();
- // Parse [ Symbol + ImmDisp ] and [ BaseReg + Scale*IndexReg + ImmDisp ]. We
- // may have already parsed an immediate displacement before the bracketed
- // expression.
- IntelExprStateMachine SM(ImmDisp, /*StopOnLBrac=*/false, /*AddImmPrefix=*/true);
- if (ParseIntelExpression(SM, End))
- return nullptr;
-
- const MCExpr *Disp = nullptr;
- if (const MCExpr *Sym = SM.getSym()) {
- // A symbolic displacement.
- Disp = Sym;
- if (isParsingInlineAsm())
- RewriteIntelBracExpression(*InstInfo->AsmRewrites, SM.getSymName(),
- ImmDisp, SM.getImm(), BracLoc, StartInBrac,
- End);
- }
-
- if (SM.getImm() || !Disp) {
- const MCExpr *Imm = MCConstantExpr::create(SM.getImm(), getContext());
- if (Disp)
- Disp = MCBinaryExpr::createAdd(Disp, Imm, getContext());
- else
- Disp = Imm; // An immediate displacement only.
- }
-
- // Parse struct field access. Intel requires a dot, but MSVC doesn't. MSVC
- // will in fact do global lookup the field name inside all global typedefs,
- // but we don't emulate that.
- if ((Parser.getTok().getKind() == AsmToken::Identifier ||
- Parser.getTok().getKind() == AsmToken::Dot ||
- Parser.getTok().getKind() == AsmToken::Real) &&
- Parser.getTok().getString().find('.') != StringRef::npos) {
- const MCExpr *NewDisp;
- if (ParseIntelDotOperator(Disp, NewDisp))
- return nullptr;
-
- End = Tok.getEndLoc();
- Parser.Lex(); // Eat the field.
- Disp = NewDisp;
- }
-
- if (isSymbol) {
- if (SM.getSym()) {
- Error(Start, "cannot use more than one symbol in memory operand");
- return nullptr;
- }
- if (SM.getBaseReg()) {
- Error(Start, "cannot use base register with variable reference");
- return nullptr;
- }
- if (SM.getIndexReg()) {
- Error(Start, "cannot use index register with variable reference");
- return nullptr;
- }
- }
-
- int BaseReg = SM.getBaseReg();
- int IndexReg = SM.getIndexReg();
- int Scale = SM.getScale();
- if (!isParsingInlineAsm()) {
- // handle [-42]
- if (!BaseReg && !IndexReg) {
- if (!SegReg)
- return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size);
- return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, 0, 0, 1,
- Start, End, Size);
- }
- StringRef ErrMsg;
- if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
- Error(StartInBrac, ErrMsg);
- return nullptr;
- }
- return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
- IndexReg, Scale, Start, End, Size);
- }
-
- InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
- return CreateMemForInlineAsm(SegReg, Disp, BaseReg, IndexReg, Scale, Start,
- End, Size, SM.getSymName(), Info,
- isParsingInlineAsm());
+void X86AsmParser::RewriteIntelExpression(IntelExprStateMachine &SM,
+ SMLoc Start, SMLoc End) {
+ SMLoc Loc = Start;
+ unsigned ExprLen = End.getPointer() - Start.getPointer();
+ // Skip everything before a symbol displacement (if we have one)
+ if (SM.getSym()) {
+ StringRef SymName = SM.getSymName();
+ if (unsigned Len = SymName.data() - Start.getPointer())
+ InstInfo->AsmRewrites->emplace_back(AOK_Skip, Start, Len);
+ Loc = SMLoc::getFromPointer(SymName.data() + SymName.size());
+ ExprLen = End.getPointer() - (SymName.data() + SymName.size());
+ // If we have only a symbol than there's no need for complex rewrite,
+ // simply skip everything after it
+ if (!(SM.getBaseReg() || SM.getIndexReg() || SM.getImm())) {
+ if (ExprLen)
+ InstInfo->AsmRewrites->emplace_back(AOK_Skip, Loc, ExprLen);
+ return;
+ }
+ }
+ // Build an Intel Expression rewrite
+ StringRef BaseRegStr;
+ StringRef IndexRegStr;
+ if (SM.getBaseReg())
+ BaseRegStr = X86IntelInstPrinter::getRegisterName(SM.getBaseReg());
+ if (SM.getIndexReg())
+ IndexRegStr = X86IntelInstPrinter::getRegisterName(SM.getIndexReg());
+ // Emit it
+ IntelExpr Expr(BaseRegStr, IndexRegStr, SM.getScale(), SM.getImm(), SM.isMemExpr());
+ InstInfo->AsmRewrites->emplace_back(Loc, ExprLen, Expr);
}
// Inline assembly may use variable names with namespace alias qualifiers.
-bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
- StringRef &Identifier,
- InlineAsmIdentifierInfo &Info,
- bool IsUnevaluatedOperand, SMLoc &End) {
+bool X86AsmParser::ParseIntelInlineAsmIdentifier(const MCExpr *&Val,
+ StringRef &Identifier,
+ InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand,
+ SMLoc &End) {
MCAsmParser &Parser = getParser();
assert(isParsingInlineAsm() && "Expected to be parsing inline assembly.");
Val = nullptr;
StringRef LineBuf(Identifier.data());
- void *Result =
- SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
+ SemaCallback->LookupInlineAsmIdentifier(LineBuf, Info, IsUnevaluatedOperand);
const AsmToken &Tok = Parser.getTok();
SMLoc Loc = Tok.getLoc();
@@ -1596,12 +1570,13 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
// The frontend should end parsing on an assembler token boundary, unless it
// failed parsing.
- assert((End.getPointer() == EndPtr || !Result) &&
- "frontend claimed part of a token?");
+ assert((End.getPointer() == EndPtr ||
+ Info.isKind(InlineAsmIdentifierInfo::IK_Invalid)) &&
+ "frontend claimed part of a token?");
// If the identifier lookup was unsuccessful, assume that we are dealing with
// a label.
- if (!Result) {
+ if (Info.isKind(InlineAsmIdentifierInfo::IK_Invalid)) {
StringRef InternalName =
SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(),
Loc, false);
@@ -1609,8 +1584,8 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
// Push a rewrite for replacing the identifier name with the internal name.
InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(),
InternalName);
- }
-
+ } else if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
+ return false;
// Create the symbol reference.
MCSymbol *Sym = getContext().getOrCreateSymbol(Identifier);
MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
@@ -1618,57 +1593,6 @@ bool X86AsmParser::ParseIntelIdentifier(const MCExpr *&Val,
return false;
}
-/// \brief Parse intel style segment override.
-std::unique_ptr<X86Operand>
-X86AsmParser::ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start,
- unsigned Size) {
- MCAsmParser &Parser = getParser();
- assert(SegReg != 0 && "Tried to parse a segment override without a segment!");
- const AsmToken &Tok = Parser.getTok(); // Eat colon.
- if (Tok.isNot(AsmToken::Colon))
- return ErrorOperand(Tok.getLoc(), "Expected ':' token!");
- Parser.Lex(); // Eat ':'
-
- int64_t ImmDisp = 0;
- if (getLexer().is(AsmToken::Integer)) {
- ImmDisp = Tok.getIntVal();
- AsmToken ImmDispToken = Parser.Lex(); // Eat the integer.
-
- if (isParsingInlineAsm())
- InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, ImmDispToken.getLoc());
-
- if (getLexer().isNot(AsmToken::LBrac)) {
- // An immediate following a 'segment register', 'colon' token sequence can
- // be followed by a bracketed expression. If it isn't we know we have our
- // final segment override.
- const MCExpr *Disp = MCConstantExpr::create(ImmDisp, getContext());
- return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp,
- /*BaseReg=*/0, /*IndexReg=*/0, /*Scale=*/1,
- Start, ImmDispToken.getEndLoc(), Size);
- }
- }
-
- if (getLexer().is(AsmToken::LBrac))
- return ParseIntelBracExpression(SegReg, Start, ImmDisp, false, Size);
-
- const MCExpr *Val;
- SMLoc End;
- if (!isParsingInlineAsm()) {
- if (getParser().parsePrimaryExpr(Val, End))
- return ErrorOperand(Tok.getLoc(), "unknown token in expression");
-
- return X86Operand::CreateMem(getPointerWidth(), Val, Start, End, Size);
- }
-
- InlineAsmIdentifierInfo Info;
- StringRef Identifier = Tok.getString();
- if (ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated=*/false, End))
- return nullptr;
- return CreateMemForInlineAsm(/*SegReg=*/0, Val, /*BaseReg=*/0,/*IndexReg=*/0,
- /*Scale=*/1, Start, End, Size, Identifier, Info);
-}
-
//ParseRoundingModeOp - Parse AVX-512 rounding mode operand
std::unique_ptr<X86Operand>
X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
@@ -1708,17 +1632,9 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start, SMLoc End) {
}
/// Parse the '.' operator.
-bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
- const MCExpr *&NewDisp) {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
- int64_t OrigDispVal, DotDispVal;
-
- // FIXME: Handle non-constant expressions.
- if (const MCConstantExpr *OrigDisp = dyn_cast<MCConstantExpr>(Disp))
- OrigDispVal = OrigDisp->getValue();
- else
- return Error(Tok.getLoc(), "Non-constant offsets are not supported!");
+bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) {
+ const AsmToken &Tok = getTok();
+ unsigned Offset;
// Drop the optional '.'.
StringRef DotDispStr = Tok.getString();
@@ -1729,24 +1645,21 @@ bool X86AsmParser::ParseIntelDotOperator(const MCExpr *Disp,
if (Tok.is(AsmToken::Real)) {
APInt DotDisp;
DotDispStr.getAsInteger(10, DotDisp);
- DotDispVal = DotDisp.getZExtValue();
+ Offset = DotDisp.getZExtValue();
} else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
- unsigned DotDisp;
std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
- DotDisp))
+ Offset))
return Error(Tok.getLoc(), "Unable to lookup field reference!");
- DotDispVal = DotDisp;
} else
return Error(Tok.getLoc(), "Unexpected token type!");
- if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
- SMLoc Loc = SMLoc::getFromPointer(DotDispStr.data());
- unsigned Len = DotDispStr.size();
- InstInfo->AsmRewrites->emplace_back(AOK_DotOperator, Loc, Len, DotDispVal);
- }
-
- NewDisp = MCConstantExpr::create(OrigDispVal + DotDispVal, getContext());
+ // Eat the DotExpression and update End
+ End = SMLoc::getFromPointer(DotDispStr.data());
+ const char *DotExprEndLoc = DotDispStr.data() + DotDispStr.size();
+ while (Tok.getLoc().getPointer() < DotExprEndLoc)
+ Lex();
+ SM.addImm(Offset);
return false;
}
@@ -1762,10 +1675,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
InlineAsmIdentifierInfo Info;
SMLoc Start = Tok.getLoc(), End;
StringRef Identifier = Tok.getString();
- if (ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated=*/false, End))
+ if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/false, End))
return nullptr;
+ void *Decl = nullptr;
+ // FIXME: MS evaluates "offset <Constant>" to the underlying integral
+ if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
+ return ErrorOperand(Start, "offset operator cannot yet handle constants");
+ else if (Info.isKind(InlineAsmIdentifierInfo::IK_Var))
+ Decl = Info.Var.Decl;
// Don't emit the offset operator.
InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7);
@@ -1776,12 +1695,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
unsigned RegNo = is64BitMode() ? X86::RBX : (Parse32 ? X86::EBX : X86::BX);
return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
- OffsetOfLoc, Identifier, Info.OpDecl);
+ OffsetOfLoc, Identifier, Decl);
}
// Query a candidate string for being an Intel assembly operator
// Report back its kind, or IOK_INVALID if does not evaluated as a known one
-unsigned X86AsmParser::IdentifyIntelOperator(StringRef Name) {
+unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) {
return StringSwitch<unsigned>(Name)
.Cases("TYPE","type",IOK_TYPE)
.Cases("SIZE","size",IOK_SIZE)
@@ -1796,41 +1715,62 @@ unsigned X86AsmParser::IdentifyIntelOperator(StringRef Name) {
/// variable. A variable's size is the product of its LENGTH and TYPE. The
/// TYPE operator returns the size of a C or C++ type or variable. If the
/// variable is an array, TYPE returns the size of a single element.
-unsigned X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+unsigned X86AsmParser::ParseIntelInlineAsmOperator(unsigned OpKind) {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
- SMLoc TypeLoc = Tok.getLoc();
Parser.Lex(); // Eat operator.
const MCExpr *Val = nullptr;
InlineAsmIdentifierInfo Info;
SMLoc Start = Tok.getLoc(), End;
StringRef Identifier = Tok.getString();
- if (ParseIntelIdentifier(Val, Identifier, Info,
- /*Unevaluated=*/true, End))
+ if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info,
+ /*Unevaluated=*/true, End))
return 0;
- if (!Info.OpDecl) {
+ if (!Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
Error(Start, "unable to lookup expression");
return 0;
}
-
+
unsigned CVal = 0;
switch(OpKind) {
default: llvm_unreachable("Unexpected operand kind!");
- case IOK_LENGTH: CVal = Info.Length; break;
- case IOK_SIZE: CVal = Info.Size; break;
- case IOK_TYPE: CVal = Info.Type; break;
+ case IOK_LENGTH: CVal = Info.Var.Length; break;
+ case IOK_SIZE: CVal = Info.Var.Size; break;
+ case IOK_TYPE: CVal = Info.Var.Type; break;
}
- // Rewrite the type operator and the C or C++ type or variable in terms of an
- // immediate. E.g. TYPE foo -> $$4
- unsigned Len = End.getPointer() - TypeLoc.getPointer();
- InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal);
-
return CVal;
}
+bool X86AsmParser::ParseIntelMemoryOperandSize(unsigned &Size) {
+ Size = StringSwitch<unsigned>(getTok().getString())
+ .Cases("BYTE", "byte", 8)
+ .Cases("WORD", "word", 16)
+ .Cases("DWORD", "dword", 32)
+ .Cases("FLOAT", "float", 32)
+ .Cases("LONG", "long", 32)
+ .Cases("FWORD", "fword", 48)
+ .Cases("DOUBLE", "double", 64)
+ .Cases("QWORD", "qword", 64)
+ .Cases("MMWORD","mmword", 64)
+ .Cases("XWORD", "xword", 80)
+ .Cases("TBYTE", "tbyte", 80)
+ .Cases("XMMWORD", "xmmword", 128)
+ .Cases("YMMWORD", "ymmword", 256)
+ .Cases("ZMMWORD", "zmmword", 512)
+ .Cases("OPAQUE", "opaque", -1U) // needs to be non-zero, but doesn't matter
+ .Default(0);
+ if (Size) {
+ const AsmToken &Tok = Lex(); // Eat operand size (e.g., byte, word).
+ if (!(Tok.getString().equals("PTR") || Tok.getString().equals("ptr")))
+ return Error(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
+ Lex(); // Eat ptr.
+ }
+ return false;
+}
+
std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
MCAsmParser &Parser = getParser();
const AsmToken &Tok = Parser.getTok();
@@ -1840,100 +1780,76 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
// Should be handled as part of immediate expression, as other operators
// Currently, only supported as a stand-alone operand
if (isParsingInlineAsm())
- if (IdentifyIntelOperator(Tok.getString()) == IOK_OFFSET)
+ if (IdentifyIntelInlineAsmOperator(Tok.getString()) == IOK_OFFSET)
return ParseIntelOffsetOfOperator();
- bool PtrInOperand = false;
- unsigned Size = getIntelMemOperandSize(Tok.getString());
- if (Size) {
- Parser.Lex(); // Eat operand size (e.g., byte, word).
- if (Tok.getString() != "PTR" && Tok.getString() != "ptr")
- return ErrorOperand(Tok.getLoc(), "Expected 'PTR' or 'ptr' token!");
- Parser.Lex(); // Eat ptr.
- PtrInOperand = true;
- }
+ // Parse optional Size directive.
+ unsigned Size;
+ if (ParseIntelMemoryOperandSize(Size))
+ return nullptr;
+ bool PtrInOperand = bool(Size);
Start = Tok.getLoc();
- // rounding mode token
+ // Rounding mode operand.
if (getSTI().getFeatureBits()[X86::FeatureAVX512] &&
getLexer().is(AsmToken::LCurly))
return ParseRoundingModeOp(Start, End);
- // Register.
+ // Register operand.
unsigned RegNo = 0;
- if (getLexer().is(AsmToken::Identifier) &&
- !ParseRegister(RegNo, Start, End)) {
- // If this is a segment register followed by a ':', then this is the start
- // of a segment override, otherwise this is a normal register reference.
- // In case it is a normal register and there is ptr in the operand this
- // is an error
+ if (Tok.is(AsmToken::Identifier) && !ParseRegister(RegNo, Start, End)) {
if (RegNo == X86::RIP)
return ErrorOperand(Start, "rip can only be used as a base register");
- if (getLexer().isNot(AsmToken::Colon)) {
- if (PtrInOperand) {
- return ErrorOperand(Start, "expected memory operand after "
- "'ptr', found register operand instead");
- }
- return X86Operand::CreateReg(RegNo, Start, End);
- }
- return ParseIntelSegmentOverride(/*SegReg=*/RegNo, Start, Size);
+ // A Register followed by ':' is considered a segment override
+ if (Tok.isNot(AsmToken::Colon))
+ return !PtrInOperand ? X86Operand::CreateReg(RegNo, Start, End) :
+ ErrorOperand(Start, "expected memory operand after 'ptr', "
+ "found register operand instead");
+ // An alleged segment override. check if we have a valid segment register
+ if (!X86MCRegisterClasses[X86::SEGMENT_REGRegClassID].contains(RegNo))
+ return ErrorOperand(Start, "invalid segment register");
+ // Eat ':' and update Start location
+ Start = Lex().getLoc();
}
// Immediates and Memory
-
- // Parse [ BaseReg + Scale*IndexReg + Disp ].
- if (getLexer().is(AsmToken::LBrac))
- return ParseIntelBracExpression(/*SegReg=*/0, Start, /*ImmDisp=*/0, false,
- Size);
-
- AsmToken StartTok = Tok;
- IntelExprStateMachine SM(/*Imm=*/0, /*StopOnLBrac=*/true,
- /*AddImmPrefix=*/false);
+ IntelExprStateMachine SM;
if (ParseIntelExpression(SM, End))
return nullptr;
- bool isSymbol = SM.getSym() && SM.getSym()->getKind() != MCExpr::Constant;
+ if (isParsingInlineAsm())
+ RewriteIntelExpression(SM, Start, Tok.getLoc());
+
int64_t Imm = SM.getImm();
- if (SM.getSym() && SM.getSym()->getKind() == MCExpr::Constant)
- SM.getSym()->evaluateAsAbsolute(Imm);
-
- if (StartTok.isNot(AsmToken::Identifier) &&
- StartTok.isNot(AsmToken::String) && isParsingInlineAsm()) {
- unsigned Len = Tok.getLoc().getPointer() - Start.getPointer();
- if (StartTok.getString().size() == Len)
- // Just add a prefix if this wasn't a complex immediate expression.
- InstInfo->AsmRewrites->emplace_back(AOK_ImmPrefix, Start);
- else
- // Otherwise, rewrite the complex expression as a single immediate.
- InstInfo->AsmRewrites->emplace_back(AOK_Imm, Start, Len, Imm);
- }
-
- if (getLexer().isNot(AsmToken::LBrac)) {
- // If a directional label (ie. 1f or 2b) was parsed above from
- // ParseIntelExpression() then SM.getSym() was set to a pointer to
- // to the MCExpr with the directional local symbol and this is a
- // memory operand not an immediate operand.
- if (isSymbol) {
- if (isParsingInlineAsm())
- return CreateMemForInlineAsm(/*SegReg=*/0, SM.getSym(), /*BaseReg=*/0,
- /*IndexReg=*/0,
- /*Scale=*/1, Start, End, Size,
- SM.getSymName(), SM.getIdentifierInfo());
- return X86Operand::CreateMem(getPointerWidth(), SM.getSym(), Start, End,
- Size);
- }
-
- const MCExpr *ImmExpr = MCConstantExpr::create(Imm, getContext());
- return X86Operand::CreateImm(ImmExpr, Start, End);
- }
-
- // Only positive immediates are valid.
- if (Imm < 0)
- return ErrorOperand(Start, "expected a positive immediate displacement "
- "before bracketed expr.");
-
- return ParseIntelBracExpression(/*SegReg=*/0, Start, Imm, isSymbol, Size);
+ const MCExpr *Disp = SM.getSym();
+ const MCExpr *ImmDisp = MCConstantExpr::create(Imm, getContext());
+ if (Disp && Imm)
+ Disp = MCBinaryExpr::createAdd(Disp, ImmDisp, getContext());
+ if (!Disp)
+ Disp = ImmDisp;
+
+ // RegNo != 0 specifies a valid segment register,
+ // and we are parsing a segment override
+ if (!SM.isMemExpr() && !RegNo)
+ return X86Operand::CreateImm(Disp, Start, End);
+
+ StringRef ErrMsg;
+ unsigned BaseReg = SM.getBaseReg();
+ unsigned IndexReg = SM.getIndexReg();
+ unsigned Scale = SM.getScale();
+
+ if ((BaseReg || IndexReg) &&
+ CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, ErrMsg))
+ return ErrorOperand(Start, ErrMsg);
+ if (isParsingInlineAsm())
+ return CreateMemForInlineAsm(RegNo, Disp, BaseReg, IndexReg,
+ Scale, Start, End, Size, SM.getSymName(),
+ SM.getIdentifierInfo());
+ if (!(BaseReg || IndexReg || RegNo))
+ return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size);
+ return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
+ BaseReg, IndexReg, Scale, Start, End, Size);
}
std::unique_ptr<X86Operand> X86AsmParser::ParseATTOperand() {
@@ -2055,14 +1971,20 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
// no errors.
// Query for the need of further parsing for a {%k<NUM>} mark
if (!Z || getLexer().is(AsmToken::LCurly)) {
- const SMLoc StartLoc = Z ? consumeToken() : consumedToken;
+ SMLoc StartLoc = Z ? consumeToken() : consumedToken;
// Parse an op-mask register mark ({%k<NUM>}), which is now to be
// expected
- if (std::unique_ptr<X86Operand> Op = ParseOperand()) {
+ unsigned RegNo;
+ SMLoc RegLoc;
+ if (!ParseRegister(RegNo, RegLoc, StartLoc) &&
+ X86MCRegisterClasses[X86::VK1RegClassID].contains(RegNo)) {
+ if (RegNo == X86::K0)
+ return Error(RegLoc, "Register k0 can't be used as write mask");
if (!getLexer().is(AsmToken::RCurly))
return Error(getLexer().getLoc(), "Expected } at this point");
Operands.push_back(X86Operand::CreateToken("{", StartLoc));
- Operands.push_back(std::move(Op));
+ Operands.push_back(
+ X86Operand::CreateReg(RegNo, StartLoc, StartLoc));
Operands.push_back(X86Operand::CreateToken("}", consumeToken()));
} else
return Error(getLexer().getLoc(),
@@ -2072,7 +1994,8 @@ bool X86AsmParser::HandleAVX512Operand(OperandVector &Operands,
  // Have we found a parsing error, or found no (expected) {z} mark
// - report an error
if (ParseZ(Z, consumeToken()) || !Z)
- return true;
+ return Error(getLexer().getLoc(),
+ "Expected a {z} mark at this point");
}
// '{z}' on its own is meaningless, hence should be ignored.
@@ -2125,9 +2048,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
// memory operand consumed.
} else {
SMLoc ExprEnd;
+ getLexer().UnLex(AsmToken(AsmToken::LParen, "("));
- // It must be an parenthesized expression, parse it now.
- if (getParser().parseParenExpression(Disp, ExprEnd))
+    // It must be either a parenthesized expression, or an expression that
+    // begins with a parenthesized expression; parse it now. Examples: (1+2)
+    // or (1+2)+3.
+ if (getParser().parseExpression(Disp, ExprEnd))
return nullptr;
// After parsing the base expression we could either have a parenthesized
@@ -2258,7 +2184,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseMemOperand(unsigned SegReg,
}
StringRef ErrMsg;
- if (CheckBaseRegAndIndexReg(BaseReg, IndexReg, ErrMsg)) {
+ if (CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, ErrMsg)) {
Error(BaseLoc, ErrMsg);
return nullptr;
}
@@ -2275,7 +2201,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
InstInfo = &Info;
StringRef PatchedName = Name;
- if (Name == "jmp" && isParsingIntelSyntax() && isParsingInlineAsm()) {
+ if ((Name.equals("jmp") || Name.equals("jc") || Name.equals("jz")) &&
+ isParsingIntelSyntax() && isParsingInlineAsm()) {
StringRef NextTok = Parser.getTok().getString();
if (NextTok == "short") {
SMLoc NameEndLoc =
@@ -2417,22 +2344,57 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
}
- Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
// Determine whether this is an instruction prefix.
- bool isPrefix =
- Name == "lock" || Name == "rep" ||
- Name == "repe" || Name == "repz" ||
- Name == "repne" || Name == "repnz" ||
- Name == "rex64" || Name == "data16" || Name == "data32";
+ // FIXME:
+  // Improve the robustness of prefix integrity checking. For example, the
+  // following forms are currently tolerated:
+ // repz repnz <insn> ; GAS errors for the use of two similar prefixes
+ // lock addq %rax, %rbx ; Destination operand must be of memory type
+ // xacquire <insn> ; xacquire must be accompanied by 'lock'
+ bool isPrefix = StringSwitch<bool>(Name)
+ .Cases("rex64", "data32", "data16", true)
+ .Cases("xacquire", "xrelease", true)
+ .Cases("acquire", "release", isParsingIntelSyntax())
+ .Default(false);
+
+ auto isLockRepeatPrefix = [](StringRef N) {
+ return StringSwitch<bool>(N)
+ .Cases("lock", "rep", "repe", "repz", "repne", "repnz", true)
+ .Default(false);
+ };
bool CurlyAsEndOfStatement = false;
+
+ unsigned Flags = X86::IP_NO_PREFIX;
+ while (isLockRepeatPrefix(Name.lower())) {
+ unsigned Prefix =
+ StringSwitch<unsigned>(Name)
+ .Cases("lock", "lock", X86::IP_HAS_LOCK)
+ .Cases("rep", "repe", "repz", X86::IP_HAS_REPEAT)
+ .Cases("repne", "repnz", X86::IP_HAS_REPEAT_NE)
+ .Default(X86::IP_NO_PREFIX); // Invalid prefix (impossible)
+ Flags |= Prefix;
+ Name = Parser.getTok().getString();
+ Parser.Lex(); // eat the prefix
+ // Hack: we could have something like
+ // "lock; cmpxchg16b $1" or "lock\0A\09incl" or "lock/incl"
+ while (Name.startswith(";") || Name.startswith("\n") ||
+ Name.startswith("\t") || Name.startswith("/")) {
+ Name = Parser.getTok().getString();
+ Parser.Lex(); // go to next prefix or instr
+ }
+ }
+
+ if (Flags)
+ PatchedName = Name;
+ Operands.push_back(X86Operand::CreateToken(PatchedName, NameLoc));
+
// This does the actual operand parsing. Don't parse any more if we have a
// prefix juxtaposed with an operation like "lock incl 4(%rax)", because we
// just want to parse the "lock" as the first instruction and the "incl" as
// the next one.
if (getLexer().isNot(AsmToken::EndOfStatement) && !isPrefix) {
-
// Parse '*' modifier.
if (getLexer().is(AsmToken::Star))
Operands.push_back(X86Operand::CreateToken("*", consumeToken()));
@@ -2670,6 +2632,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
}
}
+ if (Flags)
+ Operands.push_back(X86Operand::CreatePrefix(Flags, NameLoc, NameLoc));
return false;
}
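To illustrate the prefix flow end to end (a sketch of the intended behaviour, not text from the patch): for the AT&T input "lock incl (%rax)", the loop above consumes "lock", ORs X86::IP_HAS_LOCK into Flags, and treats "incl" as the real mnemonic; the accumulated flags then travel as a trailing operand,

    Operands.push_back(X86Operand::CreatePrefix(Flags, NameLoc, NameLoc));

which MatchAndEmitATTInstruction/MatchAndEmitIntelInstruction later pop via getPrefixes() and attach to the matched instruction with Inst.setFlags(Prefixes).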
@@ -2677,12 +2641,79 @@ bool X86AsmParser::processInstruction(MCInst &Inst, const OperandVector &Ops) {
return false;
}
+bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
+ const MCRegisterInfo *MRI = getContext().getRegisterInfo();
+
+ switch (Inst.getOpcode()) {
+ case X86::VGATHERDPDYrm:
+ case X86::VGATHERDPDrm:
+ case X86::VGATHERDPSYrm:
+ case X86::VGATHERDPSrm:
+ case X86::VGATHERQPDYrm:
+ case X86::VGATHERQPDrm:
+ case X86::VGATHERQPSYrm:
+ case X86::VGATHERQPSrm:
+ case X86::VPGATHERDDYrm:
+ case X86::VPGATHERDDrm:
+ case X86::VPGATHERDQYrm:
+ case X86::VPGATHERDQrm:
+ case X86::VPGATHERQDYrm:
+ case X86::VPGATHERQDrm:
+ case X86::VPGATHERQQYrm:
+ case X86::VPGATHERQQrm: {
+ unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Mask = MRI->getEncodingValue(Inst.getOperand(1).getReg());
+ unsigned Index =
+ MRI->getEncodingValue(Inst.getOperand(3 + X86::AddrIndexReg).getReg());
+ if (Dest == Mask || Dest == Index || Mask == Index)
+ return Warning(Ops[0]->getStartLoc(), "mask, index, and destination "
+ "registers should be distinct");
+ break;
+ }
+ case X86::VGATHERDPDZ128rm:
+ case X86::VGATHERDPDZ256rm:
+ case X86::VGATHERDPDZrm:
+ case X86::VGATHERDPSZ128rm:
+ case X86::VGATHERDPSZ256rm:
+ case X86::VGATHERDPSZrm:
+ case X86::VGATHERQPDZ128rm:
+ case X86::VGATHERQPDZ256rm:
+ case X86::VGATHERQPDZrm:
+ case X86::VGATHERQPSZ128rm:
+ case X86::VGATHERQPSZ256rm:
+ case X86::VGATHERQPSZrm:
+ case X86::VPGATHERDDZ128rm:
+ case X86::VPGATHERDDZ256rm:
+ case X86::VPGATHERDDZrm:
+ case X86::VPGATHERDQZ128rm:
+ case X86::VPGATHERDQZ256rm:
+ case X86::VPGATHERDQZrm:
+ case X86::VPGATHERQDZ128rm:
+ case X86::VPGATHERQDZ256rm:
+ case X86::VPGATHERQDZrm:
+ case X86::VPGATHERQQZ128rm:
+ case X86::VPGATHERQQZ256rm:
+ case X86::VPGATHERQQZrm: {
+ unsigned Dest = MRI->getEncodingValue(Inst.getOperand(0).getReg());
+ unsigned Index =
+ MRI->getEncodingValue(Inst.getOperand(4 + X86::AddrIndexReg).getReg());
+ if (Dest == Index)
+ return Warning(Ops[0]->getStartLoc(), "index and destination registers "
+ "should be distinct");
+ break;
+ }
+ }
+
+ return false;
+}
+
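As a concrete illustration (an assumed example, not taken from the patch): in Intel syntax, "vpgatherdd ymm1, [rdi + ymm1*4], ymm2" names ymm1 as both destination and index, so the AVX2 case above emits the "mask, index, and destination registers should be distinct" diagnostic; the AVX-512 forms use a k-register write mask, so only the destination/index overlap is checked there. These are issued as warnings rather than errors, so by default the instruction still assembles.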
static const char *getSubtargetFeatureName(uint64_t Val);
void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
MCStreamer &Out) {
- Instrumentation->InstrumentAndEmitInstruction(Inst, Operands, getContext(),
- MII, Out);
+ Instrumentation->InstrumentAndEmitInstruction(
+ Inst, Operands, getContext(), MII, Out,
+ getParser().shouldPrintSchedInfo());
}
bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -2737,6 +2768,16 @@ bool X86AsmParser::ErrorMissingFeature(SMLoc IDLoc, uint64_t ErrorInfo,
return Error(IDLoc, OS.str(), SMRange(), MatchingInlineAsm);
}
+static unsigned getPrefixes(OperandVector &Operands) {
+ unsigned Result = 0;
+ X86Operand &Prefix = static_cast<X86Operand &>(*Operands.back());
+ if (Prefix.isPrefix()) {
+ Result = Prefix.getPrefix();
+ Operands.pop_back();
+ }
+ return Result;
+}
+
bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands,
MCStreamer &Out,
@@ -2751,13 +2792,20 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
bool WasOriginallyInvalidOperand = false;
+ unsigned Prefixes = getPrefixes(Operands);
+
MCInst Inst;
+ if (Prefixes)
+ Inst.setFlags(Prefixes);
+
// First, try a direct match.
switch (MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
isParsingIntelSyntax())) {
default: llvm_unreachable("Unexpected match result!");
case Match_Success:
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
+ return true;
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the
// individual transformations can chain off each other.
@@ -2917,12 +2965,16 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
StringRef Mnemonic = Op.getToken();
SMRange EmptyRange = None;
StringRef Base = Op.getToken();
+ unsigned Prefixes = getPrefixes(Operands);
// First, handle aliases that expand to multiple instructions.
MatchFPUWaitAlias(IDLoc, Op, Operands, Out, MatchingInlineAsm);
MCInst Inst;
+ if (Prefixes)
+ Inst.setFlags(Prefixes);
+
// Find one unsized memory operand, if present.
X86Operand *UnsizedMemOp = nullptr;
for (const auto &Op : Operands) {
@@ -3043,6 +3095,8 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
// instruction will already have been filled in correctly, since the failing
// matches won't have modified it).
if (NumSuccessfulMatches == 1) {
+ if (!MatchingInlineAsm && validateInstruction(Inst, Operands))
+ return true;
// Some instructions need post-processing to, for example, tweak which
// encoding is selected. Loop on it while changes happen so the individual
// transformations can chain off each other.
@@ -3121,6 +3175,19 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) {
return false;
} else if (IDVal == ".even")
return parseDirectiveEven(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_proc")
+ return parseDirectiveFPOProc(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_setframe")
+ return parseDirectiveFPOSetFrame(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_pushreg")
+ return parseDirectiveFPOPushReg(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_stackalloc")
+ return parseDirectiveFPOStackAlloc(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_endprologue")
+ return parseDirectiveFPOEndPrologue(DirectiveID.getLoc());
+ else if (IDVal == ".cv_fpo_endproc")
+ return parseDirectiveFPOEndProc(DirectiveID.getLoc());
+
return true;
}
@@ -3218,6 +3285,71 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
return false;
}
+// .cv_fpo_proc foo
+bool X86AsmParser::parseDirectiveFPOProc(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ StringRef ProcName;
+ int64_t ParamsSize;
+ if (Parser.parseIdentifier(ProcName))
+ return Parser.TokError("expected symbol name");
+ if (Parser.parseIntToken(ParamsSize, "expected parameter byte count"))
+ return true;
+ if (!isUIntN(32, ParamsSize))
+ return Parser.TokError("parameters size out of range");
+ if (Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_proc' directive");
+ MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName);
+ return getTargetStreamer().emitFPOProc(ProcSym, ParamsSize, L);
+}
+
+// .cv_fpo_setframe ebp
+bool X86AsmParser::parseDirectiveFPOSetFrame(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ unsigned Reg;
+ SMLoc DummyLoc;
+ if (ParseRegister(Reg, DummyLoc, DummyLoc) ||
+ Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_setframe' directive");
+ return getTargetStreamer().emitFPOSetFrame(Reg, L);
+}
+
+// .cv_fpo_pushreg ebx
+bool X86AsmParser::parseDirectiveFPOPushReg(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ unsigned Reg;
+ SMLoc DummyLoc;
+ if (ParseRegister(Reg, DummyLoc, DummyLoc) ||
+ Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_pushreg' directive");
+ return getTargetStreamer().emitFPOPushReg(Reg, L);
+}
+
+// .cv_fpo_stackalloc 20
+bool X86AsmParser::parseDirectiveFPOStackAlloc(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ int64_t Offset;
+ if (Parser.parseIntToken(Offset, "expected offset") ||
+ Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_stackalloc' directive");
+ return getTargetStreamer().emitFPOStackAlloc(Offset, L);
+}
+
+// .cv_fpo_endprologue
+bool X86AsmParser::parseDirectiveFPOEndPrologue(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ if (Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_endprologue' directive");
+ return getTargetStreamer().emitFPOEndPrologue(L);
+}
+
+// .cv_fpo_endproc
+bool X86AsmParser::parseDirectiveFPOEndProc(SMLoc L) {
+ MCAsmParser &Parser = getParser();
+ if (Parser.parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_endproc' directive");
+ return getTargetStreamer().emitFPOEndProc(L);
+}
+
// Force static initialization.
extern "C" void LLVMInitializeX86AsmParser() {
RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target());
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 0fba15cc692c..43a0561e769b 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -10,6 +10,7 @@
#ifndef LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
#define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
+#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86AsmParserCommon.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -28,12 +29,7 @@ namespace llvm {
/// X86Operand - Instances of this class represent a parsed X86 machine
/// instruction.
struct X86Operand : public MCParsedAsmOperand {
- enum KindTy {
- Token,
- Register,
- Immediate,
- Memory
- } Kind;
+ enum KindTy { Token, Register, Immediate, Memory, Prefix } Kind;
SMLoc StartLoc, EndLoc;
SMLoc OffsetOfLoc;
@@ -50,6 +46,10 @@ struct X86Operand : public MCParsedAsmOperand {
unsigned RegNo;
};
+ struct PrefOp {
+ unsigned Prefixes;
+ };
+
struct ImmOp {
const MCExpr *Val;
};
@@ -73,6 +73,7 @@ struct X86Operand : public MCParsedAsmOperand {
struct RegOp Reg;
struct ImmOp Imm;
struct MemOp Mem;
+ struct PrefOp Pref;
};
X86Operand(KindTy K, SMLoc Start, SMLoc End)
@@ -111,6 +112,11 @@ struct X86Operand : public MCParsedAsmOperand {
return Reg.RegNo;
}
+ unsigned getPrefix() const {
+ assert(Kind == Prefix && "Invalid access!");
+ return Pref.Prefixes;
+ }
+
const MCExpr *getImm() const {
assert(Kind == Immediate && "Invalid access!");
return Imm.Val;
@@ -387,6 +393,7 @@ struct X86Operand : public MCParsedAsmOperand {
return isMemOffs() && Mem.ModeSize == 64 && (!Mem.Size || Mem.Size == 64);
}
+ bool isPrefix() const { return Kind == Prefix; }
bool isReg() const override { return Kind == Register; }
bool isGR32orGR64() const {
@@ -509,6 +516,13 @@ struct X86Operand : public MCParsedAsmOperand {
return Res;
}
+ static std::unique_ptr<X86Operand>
+ CreatePrefix(unsigned Prefixes, SMLoc StartLoc, SMLoc EndLoc) {
+ auto Res = llvm::make_unique<X86Operand>(Prefix, StartLoc, EndLoc);
+ Res->Pref.Prefixes = Prefixes;
+ return Res;
+ }
+
static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
SMLoc StartLoc, SMLoc EndLoc) {
auto Res = llvm::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 6e08d4cff6ea..7e0df2941467 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -11,33 +11,21 @@ tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables)
-if(LLVM_BUILD_GLOBAL_ISEL)
- tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank)
- tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel)
-endif()
-
-add_public_tablegen_target(X86CommonTableGen)
-
-# Add GlobalISel files if the build option was enabled.
-set(GLOBAL_ISEL_FILES
- X86CallLowering.cpp
- X86LegalizerInfo.cpp
- X86RegisterBankInfo.cpp
- X86InstructionSelector.cpp
- )
+tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank)
+tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel)
-if(LLVM_BUILD_GLOBAL_ISEL)
- set(GLOBAL_ISEL_BUILD_FILES ${GLOBAL_ISEL_FILES})
-else()
- set(GLOBAL_ISEL_BUILD_FILES "")
- set(LLVM_OPTIONAL_SOURCES LLVMGlobalISel ${GLOBAL_ISEL_FILES})
+if (X86_GEN_FOLD_TABLES)
+ tablegen(LLVM X86GenFoldTables.inc -gen-x86-fold-tables)
endif()
+add_public_tablegen_target(X86CommonTableGen)
set(sources
X86AsmPrinter.cpp
X86CallFrameOptimization.cpp
+ X86CallLowering.cpp
X86CmovConversion.cpp
+ X86DomainReassignment.cpp
X86ExpandPseudo.cpp
X86FastISel.cpp
X86FixupBWInsts.cpp
@@ -45,17 +33,20 @@ set(sources
X86FixupSetCC.cpp
X86FloatingPoint.cpp
X86FrameLowering.cpp
+ X86InstructionSelector.cpp
X86ISelDAGToDAG.cpp
X86ISelLowering.cpp
X86InterleavedAccess.cpp
X86InstrFMA3Info.cpp
X86InstrInfo.cpp
X86EvexToVex.cpp
+ X86LegalizerInfo.cpp
X86MCInstLower.cpp
X86MachineFunctionInfo.cpp
X86MacroFusion.cpp
X86OptimizeLEAs.cpp
X86PadShortFunction.cpp
+ X86RegisterBankInfo.cpp
X86RegisterInfo.cpp
X86SelectionDAGInfo.cpp
X86ShuffleDecodeConstantPool.cpp
@@ -67,7 +58,6 @@ set(sources
X86WinAllocaExpander.cpp
X86WinEHState.cpp
X86CallingConv.cpp
- ${GLOBAL_ISEL_BUILD_FILES}
)
add_llvm_target(X86CodeGen ${sources})
diff --git a/lib/Target/X86/Disassembler/X86Disassembler.cpp b/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 4ce908b1da64..c58254ae38c1 100644
--- a/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -74,6 +74,7 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86DisassemblerDecoder.h"
#include "llvm/MC/MCContext.h"
@@ -232,7 +233,24 @@ MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
return Fail;
} else {
Size = InternalInstr.length;
- return (!translateInstruction(Instr, InternalInstr, this)) ? Success : Fail;
+ bool Ret = translateInstruction(Instr, InternalInstr, this);
+ if (!Ret) {
+ unsigned Flags = X86::IP_NO_PREFIX;
+ if (InternalInstr.hasAdSize)
+ Flags |= X86::IP_HAS_AD_SIZE;
+ if (!InternalInstr.mandatoryPrefix) {
+ if (InternalInstr.hasOpSize)
+ Flags |= X86::IP_HAS_OP_SIZE;
+ if (InternalInstr.repeatPrefix == 0xf2)
+ Flags |= X86::IP_HAS_REPEAT_NE;
+ else if (InternalInstr.repeatPrefix == 0xf3 &&
+                 // It must not be the 'pause' instruction (f3 90)
+ InternalInstr.opcode != 0x90)
+ Flags |= X86::IP_HAS_REPEAT;
+ }
+ Instr.setFlags(Flags);
+ }
+ return (!Ret) ? Success : Fail;
}
}
@@ -315,12 +333,12 @@ static bool translateSrcIndex(MCInst &mcInst, InternalInstruction &insn) {
unsigned baseRegNo;
if (insn.mode == MODE_64BIT)
- baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::RSI;
+ baseRegNo = insn.hasAdSize ? X86::ESI : X86::RSI;
else if (insn.mode == MODE_32BIT)
- baseRegNo = insn.prefixPresent[0x67] ? X86::SI : X86::ESI;
+ baseRegNo = insn.hasAdSize ? X86::SI : X86::ESI;
else {
assert(insn.mode == MODE_16BIT);
- baseRegNo = insn.prefixPresent[0x67] ? X86::ESI : X86::SI;
+ baseRegNo = insn.hasAdSize ? X86::ESI : X86::SI;
}
MCOperand baseReg = MCOperand::createReg(baseRegNo);
mcInst.addOperand(baseReg);
@@ -340,12 +358,12 @@ static bool translateDstIndex(MCInst &mcInst, InternalInstruction &insn) {
unsigned baseRegNo;
if (insn.mode == MODE_64BIT)
- baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::RDI;
+ baseRegNo = insn.hasAdSize ? X86::EDI : X86::RDI;
else if (insn.mode == MODE_32BIT)
- baseRegNo = insn.prefixPresent[0x67] ? X86::DI : X86::EDI;
+ baseRegNo = insn.hasAdSize ? X86::DI : X86::EDI;
else {
assert(insn.mode == MODE_16BIT);
- baseRegNo = insn.prefixPresent[0x67] ? X86::EDI : X86::DI;
+ baseRegNo = insn.hasAdSize ? X86::EDI : X86::DI;
}
MCOperand baseReg = MCOperand::createReg(baseRegNo);
mcInst.addOperand(baseReg);
@@ -746,102 +764,6 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
baseReg = MCOperand::createReg(0);
}
- // Check whether we are handling VSIB addressing mode for GATHER.
- // If sibIndex was set to SIB_INDEX_NONE, index offset is 4 and
- // we should use SIB_INDEX_XMM4|YMM4 for VSIB.
- // I don't see a way to get the correct IndexReg in readSIB:
- // We can tell whether it is VSIB or SIB after instruction ID is decoded,
- // but instruction ID may not be decoded yet when calling readSIB.
- uint32_t Opcode = mcInst.getOpcode();
- bool IndexIs128 = (Opcode == X86::VGATHERDPDrm ||
- Opcode == X86::VGATHERDPDYrm ||
- Opcode == X86::VGATHERQPDrm ||
- Opcode == X86::VGATHERDPSrm ||
- Opcode == X86::VGATHERQPSrm ||
- Opcode == X86::VPGATHERDQrm ||
- Opcode == X86::VPGATHERDQYrm ||
- Opcode == X86::VPGATHERQQrm ||
- Opcode == X86::VPGATHERDDrm ||
- Opcode == X86::VPGATHERQDrm ||
- Opcode == X86::VGATHERDPDZ128rm ||
- Opcode == X86::VGATHERDPDZ256rm ||
- Opcode == X86::VGATHERDPSZ128rm ||
- Opcode == X86::VGATHERQPDZ128rm ||
- Opcode == X86::VGATHERQPSZ128rm ||
- Opcode == X86::VPGATHERDDZ128rm ||
- Opcode == X86::VPGATHERDQZ128rm ||
- Opcode == X86::VPGATHERDQZ256rm ||
- Opcode == X86::VPGATHERQDZ128rm ||
- Opcode == X86::VPGATHERQQZ128rm ||
- Opcode == X86::VSCATTERDPDZ128mr ||
- Opcode == X86::VSCATTERDPDZ256mr ||
- Opcode == X86::VSCATTERDPSZ128mr ||
- Opcode == X86::VSCATTERQPDZ128mr ||
- Opcode == X86::VSCATTERQPSZ128mr ||
- Opcode == X86::VPSCATTERDDZ128mr ||
- Opcode == X86::VPSCATTERDQZ128mr ||
- Opcode == X86::VPSCATTERDQZ256mr ||
- Opcode == X86::VPSCATTERQDZ128mr ||
- Opcode == X86::VPSCATTERQQZ128mr);
- bool IndexIs256 = (Opcode == X86::VGATHERQPDYrm ||
- Opcode == X86::VGATHERDPSYrm ||
- Opcode == X86::VGATHERQPSYrm ||
- Opcode == X86::VGATHERDPDZrm ||
- Opcode == X86::VPGATHERDQZrm ||
- Opcode == X86::VPGATHERQQYrm ||
- Opcode == X86::VPGATHERDDYrm ||
- Opcode == X86::VPGATHERQDYrm ||
- Opcode == X86::VGATHERDPSZ256rm ||
- Opcode == X86::VGATHERQPDZ256rm ||
- Opcode == X86::VGATHERQPSZ256rm ||
- Opcode == X86::VPGATHERDDZ256rm ||
- Opcode == X86::VPGATHERQQZ256rm ||
- Opcode == X86::VPGATHERQDZ256rm ||
- Opcode == X86::VSCATTERDPDZmr ||
- Opcode == X86::VPSCATTERDQZmr ||
- Opcode == X86::VSCATTERDPSZ256mr ||
- Opcode == X86::VSCATTERQPDZ256mr ||
- Opcode == X86::VSCATTERQPSZ256mr ||
- Opcode == X86::VPSCATTERDDZ256mr ||
- Opcode == X86::VPSCATTERQQZ256mr ||
- Opcode == X86::VPSCATTERQDZ256mr ||
- Opcode == X86::VGATHERPF0DPDm ||
- Opcode == X86::VGATHERPF1DPDm ||
- Opcode == X86::VSCATTERPF0DPDm ||
- Opcode == X86::VSCATTERPF1DPDm);
- bool IndexIs512 = (Opcode == X86::VGATHERQPDZrm ||
- Opcode == X86::VGATHERDPSZrm ||
- Opcode == X86::VGATHERQPSZrm ||
- Opcode == X86::VPGATHERQQZrm ||
- Opcode == X86::VPGATHERDDZrm ||
- Opcode == X86::VPGATHERQDZrm ||
- Opcode == X86::VSCATTERQPDZmr ||
- Opcode == X86::VSCATTERDPSZmr ||
- Opcode == X86::VSCATTERQPSZmr ||
- Opcode == X86::VPSCATTERQQZmr ||
- Opcode == X86::VPSCATTERDDZmr ||
- Opcode == X86::VPSCATTERQDZmr ||
- Opcode == X86::VGATHERPF0DPSm ||
- Opcode == X86::VGATHERPF0QPDm ||
- Opcode == X86::VGATHERPF0QPSm ||
- Opcode == X86::VGATHERPF1DPSm ||
- Opcode == X86::VGATHERPF1QPDm ||
- Opcode == X86::VGATHERPF1QPSm ||
- Opcode == X86::VSCATTERPF0DPSm ||
- Opcode == X86::VSCATTERPF0QPDm ||
- Opcode == X86::VSCATTERPF0QPSm ||
- Opcode == X86::VSCATTERPF1DPSm ||
- Opcode == X86::VSCATTERPF1QPDm ||
- Opcode == X86::VSCATTERPF1QPSm);
- if (IndexIs128 || IndexIs256 || IndexIs512) {
- unsigned IndexOffset = insn.sibIndex -
- (insn.addressSize == 8 ? SIB_INDEX_RAX:SIB_INDEX_EAX);
- SIBIndex IndexBase = IndexIs512 ? SIB_INDEX_ZMM0 :
- IndexIs256 ? SIB_INDEX_YMM0 : SIB_INDEX_XMM0;
- insn.sibIndex = (SIBIndex)(IndexBase +
- (insn.sibIndex == SIB_INDEX_NONE ? 4 : IndexOffset));
- }
-
if (insn.sibIndex != SIB_INDEX_NONE) {
switch (insn.sibIndex) {
default:
@@ -969,6 +891,9 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_BNDR:
return translateRMRegister(mcInst, insn);
case TYPE_M:
+ case TYPE_MVSIBX:
+ case TYPE_MVSIBY:
+ case TYPE_MVSIBZ:
return translateRMMemory(mcInst, insn, Dis);
}
}
@@ -1034,6 +959,9 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
insn,
Dis);
return false;
+ case ENCODING_IRC:
+ mcInst.addOperand(MCOperand::createImm(insn.RC));
+ return false;
case ENCODING_SI:
return translateSrcIndex(mcInst, insn);
case ENCODING_DI:
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 577b7a776c6d..843d037ad3cd 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -277,38 +277,44 @@ static void dbgprintf(struct InternalInstruction* insn,
insn->dlog(insn->dlogArg, buffer);
}
-/*
- * setPrefixPresent - Marks that a particular prefix is present at a particular
- * location.
- *
- * @param insn - The instruction to be marked as having the prefix.
- * @param prefix - The prefix that is present.
- * @param location - The location where the prefix is located (in the address
- * space of the instruction's reader).
- */
-static void setPrefixPresent(struct InternalInstruction* insn,
- uint8_t prefix,
- uint64_t location)
-{
- insn->prefixPresent[prefix] = 1;
- insn->prefixLocations[prefix] = location;
+static bool isREX(struct InternalInstruction *insn, uint8_t prefix) {
+ if (insn->mode == MODE_64BIT)
+ return prefix >= 0x40 && prefix <= 0x4f;
+ return false;
}
/*
- * isPrefixAtLocation - Queries an instruction to determine whether a prefix is
- * present at a given location.
+ * setPrefixPresent - Records that a particular prefix is present and decides
+ *   whether it should be treated as the mandatory prefix.
*
- * @param insn - The instruction to be queried.
- * @param prefix - The prefix.
- * @param location - The location to query.
- * @return - Whether the prefix is at that location.
+ * @param insn - The instruction to be marked as having the prefix.
+ * @param prefix - The prefix that is present.
*/
-static bool isPrefixAtLocation(struct InternalInstruction* insn,
- uint8_t prefix,
- uint64_t location)
-{
- return insn->prefixPresent[prefix] == 1 &&
- insn->prefixLocations[prefix] == location;
+static void setPrefixPresent(struct InternalInstruction *insn, uint8_t prefix) {
+ uint8_t nextByte;
+ switch (prefix) {
+ case 0xf2:
+ case 0xf3:
+ if (lookAtByte(insn, &nextByte))
+ break;
+ // TODO:
+ // 1. There could be several 0x66
+    //  2. if (nextByte == 0x66) and nextNextByte != 0x0f then
+    //     it's not a mandatory prefix
+    //  3. if (nextByte >= 0x40 && nextByte <= 0x4f) it's a REX prefix and we
+    //     need 0x0f exactly after it for this to be a mandatory prefix
+ if (isREX(insn, nextByte) || nextByte == 0x0f || nextByte == 0x66)
+      // The last of 0xf2/0xf3 seen is the mandatory prefix
+ insn->mandatoryPrefix = prefix;
+ insn->repeatPrefix = prefix;
+ break;
+ case 0x66:
+ if (lookAtByte(insn, &nextByte))
+ break;
+    // 0x66 can't overwrite an existing mandatory prefix and should be ignored
+ if (!insn->mandatoryPrefix && (nextByte == 0x0f || isREX(insn, nextByte)))
+ insn->mandatoryPrefix = prefix;
+ break;
+ }
}
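Two byte sequences illustrate the distinction this is drawing (examples assumed for illustration): in "f2 0f 58 c1" (addsd), the 0xf2 is followed by the 0x0f escape, so it is recorded both as repeatPrefix and as mandatoryPrefix; in "f3 ab" (rep stos), the 0xf3 is followed by a plain one-byte opcode, so only repeatPrefix is set and the later readOpcode()/getID() logic treats it as a legacy REP prefix.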
/*
@@ -322,19 +328,12 @@ static bool isPrefixAtLocation(struct InternalInstruction* insn,
*/
static int readPrefixes(struct InternalInstruction* insn) {
bool isPrefix = true;
- bool prefixGroups[4] = { false };
- uint64_t prefixLocation;
uint8_t byte = 0;
uint8_t nextByte;
- bool hasAdSize = false;
- bool hasOpSize = false;
-
dbgprintf(insn, "readPrefixes()");
while (isPrefix) {
- prefixLocation = insn->readerCursor;
-
/* If we fail reading prefixes, just stop here and let the opcode reader deal with it */
if (consumeByte(insn, &byte))
break;
@@ -343,13 +342,10 @@ static int readPrefixes(struct InternalInstruction* insn) {
* If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
* break and let it be disassembled as a normal "instruction".
*/
- if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0)
+ if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) // LOCK
break;
- if (insn->readerCursor - 1 == insn->startLocation
- && (byte == 0xf2 || byte == 0xf3)
- && !lookAtByte(insn, &nextByte))
- {
+ if ((byte == 0xf2 || byte == 0xf3) && !lookAtByte(insn, &nextByte)) {
/*
* If the byte is 0xf2 or 0xf3, and any of the following conditions are
* met:
@@ -357,39 +353,41 @@ static int readPrefixes(struct InternalInstruction* insn) {
* - it is followed by an xchg instruction
* then it should be disassembled as a xacquire/xrelease not repne/rep.
*/
- if ((byte == 0xf2 || byte == 0xf3) &&
- ((nextByte == 0xf0) ||
- ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90)))
+ if (((nextByte == 0xf0) ||
+ ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) {
insn->xAcquireRelease = true;
+ if (!(byte == 0xf3 && nextByte == 0x90)) // PAUSE instruction support
+ break;
+ }
/*
* Also if the byte is 0xf3, and the following condition is met:
* - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
* "mov mem, imm" (opcode 0xc6/0xc7) instructions.
* then it should be disassembled as an xrelease not rep.
*/
- if (byte == 0xf3 &&
- (nextByte == 0x88 || nextByte == 0x89 ||
- nextByte == 0xc6 || nextByte == 0xc7))
+ if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 ||
+ nextByte == 0xc6 || nextByte == 0xc7)) {
insn->xAcquireRelease = true;
- if (insn->mode == MODE_64BIT && (nextByte & 0xf0) == 0x40) {
- if (consumeByte(insn, &nextByte))
+ if (nextByte != 0x90) // PAUSE instruction support
+ break;
+ }
+ if (isREX(insn, nextByte)) {
+ uint8_t nnextByte;
+        // Step over the REX prefix that follows the current byte
+ if (consumeByte(insn, &nnextByte))
return -1;
- if (lookAtByte(insn, &nextByte))
+ // We should be able to read next byte after REX prefix
+ if (lookAtByte(insn, &nnextByte))
return -1;
unconsumeByte(insn);
}
- if (nextByte != 0x0f && nextByte != 0x90)
- break;
}
switch (byte) {
case 0xf0: /* LOCK */
case 0xf2: /* REPNE/REPNZ */
case 0xf3: /* REP or REPE/REPZ */
- if (prefixGroups[0])
- dbgprintf(insn, "Redundant Group 1 prefix");
- prefixGroups[0] = true;
- setPrefixPresent(insn, byte, prefixLocation);
+ setPrefixPresent(insn, byte);
break;
case 0x2e: /* CS segment override -OR- Branch not taken */
case 0x36: /* SS segment override -OR- Branch taken */
@@ -420,24 +418,15 @@ static int readPrefixes(struct InternalInstruction* insn) {
debug("Unhandled override");
return -1;
}
- if (prefixGroups[1])
- dbgprintf(insn, "Redundant Group 2 prefix");
- prefixGroups[1] = true;
- setPrefixPresent(insn, byte, prefixLocation);
+ setPrefixPresent(insn, byte);
break;
case 0x66: /* Operand-size override */
- if (prefixGroups[2])
- dbgprintf(insn, "Redundant Group 3 prefix");
- prefixGroups[2] = true;
- hasOpSize = true;
- setPrefixPresent(insn, byte, prefixLocation);
+ insn->hasOpSize = true;
+ setPrefixPresent(insn, byte);
break;
case 0x67: /* Address-size override */
- if (prefixGroups[3])
- dbgprintf(insn, "Redundant Group 4 prefix");
- prefixGroups[3] = true;
- hasAdSize = true;
- setPrefixPresent(insn, byte, prefixLocation);
+ insn->hasAdSize = true;
+ setPrefixPresent(insn, byte);
break;
default: /* Not a prefix byte */
isPrefix = false;
@@ -469,7 +458,6 @@ static int readPrefixes(struct InternalInstruction* insn) {
} else {
unconsumeByte(insn); /* unconsume byte1 */
unconsumeByte(insn); /* unconsume byte */
- insn->necessaryPrefixLocation = insn->readerCursor - 2;
}
if (insn->vectorExtensionType == TYPE_EVEX) {
@@ -505,13 +493,10 @@ static int readPrefixes(struct InternalInstruction* insn) {
return -1;
}
- if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0)
insn->vectorExtensionType = TYPE_VEX_3B;
- insn->necessaryPrefixLocation = insn->readerCursor - 1;
- } else {
+ else
unconsumeByte(insn);
- insn->necessaryPrefixLocation = insn->readerCursor - 1;
- }
if (insn->vectorExtensionType == TYPE_VEX_3B) {
insn->vectorExtensionPrefix[0] = byte;
@@ -520,13 +505,12 @@ static int readPrefixes(struct InternalInstruction* insn) {
/* We simulate the REX prefix for simplicity's sake */
- if (insn->mode == MODE_64BIT) {
+ if (insn->mode == MODE_64BIT)
insn->rexPrefix = 0x40
| (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3)
| (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2)
| (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1)
| (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0);
- }
dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
@@ -540,26 +524,24 @@ static int readPrefixes(struct InternalInstruction* insn) {
return -1;
}
- if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) {
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0)
insn->vectorExtensionType = TYPE_VEX_2B;
- } else {
+ else
unconsumeByte(insn);
- }
if (insn->vectorExtensionType == TYPE_VEX_2B) {
insn->vectorExtensionPrefix[0] = byte;
consumeByte(insn, &insn->vectorExtensionPrefix[1]);
- if (insn->mode == MODE_64BIT) {
+ if (insn->mode == MODE_64BIT)
insn->rexPrefix = 0x40
| (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
- }
switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
default:
break;
case VEX_PREFIX_66:
- hasOpSize = true;
+ insn->hasOpSize = true;
break;
}
@@ -575,13 +557,10 @@ static int readPrefixes(struct InternalInstruction* insn) {
return -1;
}
- if ((byte1 & 0x38) != 0x0) { /* 0 in these 3 bits is a POP instruction. */
+ if ((byte1 & 0x38) != 0x0) /* 0 in these 3 bits is a POP instruction. */
insn->vectorExtensionType = TYPE_XOP;
- insn->necessaryPrefixLocation = insn->readerCursor - 1;
- } else {
+ else
unconsumeByte(insn);
- insn->necessaryPrefixLocation = insn->readerCursor - 1;
- }
if (insn->vectorExtensionType == TYPE_XOP) {
insn->vectorExtensionPrefix[0] = byte;
@@ -590,19 +569,18 @@ static int readPrefixes(struct InternalInstruction* insn) {
/* We simulate the REX prefix for simplicity's sake */
- if (insn->mode == MODE_64BIT) {
+ if (insn->mode == MODE_64BIT)
insn->rexPrefix = 0x40
| (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3)
| (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2)
| (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1)
| (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
- }
switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
default:
break;
case VEX_PREFIX_66:
- hasOpSize = true;
+ insn->hasOpSize = true;
break;
}
@@ -610,51 +588,35 @@ static int readPrefixes(struct InternalInstruction* insn) {
insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
insn->vectorExtensionPrefix[2]);
}
- } else {
- if (insn->mode == MODE_64BIT) {
- if ((byte & 0xf0) == 0x40) {
- uint8_t opcodeByte;
-
- if (lookAtByte(insn, &opcodeByte) || ((opcodeByte & 0xf0) == 0x40)) {
- dbgprintf(insn, "Redundant REX prefix");
- return -1;
- }
-
- insn->rexPrefix = byte;
- insn->necessaryPrefixLocation = insn->readerCursor - 2;
-
- dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
- } else {
- unconsumeByte(insn);
- insn->necessaryPrefixLocation = insn->readerCursor - 1;
- }
- } else {
- unconsumeByte(insn);
- insn->necessaryPrefixLocation = insn->readerCursor - 1;
- }
- }
+ } else if (isREX(insn, byte)) {
+ if (lookAtByte(insn, &nextByte))
+ return -1;
+ insn->rexPrefix = byte;
+ dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
+ } else
+ unconsumeByte(insn);
if (insn->mode == MODE_16BIT) {
- insn->registerSize = (hasOpSize ? 4 : 2);
- insn->addressSize = (hasAdSize ? 4 : 2);
- insn->displacementSize = (hasAdSize ? 4 : 2);
- insn->immediateSize = (hasOpSize ? 4 : 2);
+ insn->registerSize = (insn->hasOpSize ? 4 : 2);
+ insn->addressSize = (insn->hasAdSize ? 4 : 2);
+ insn->displacementSize = (insn->hasAdSize ? 4 : 2);
+ insn->immediateSize = (insn->hasOpSize ? 4 : 2);
} else if (insn->mode == MODE_32BIT) {
- insn->registerSize = (hasOpSize ? 2 : 4);
- insn->addressSize = (hasAdSize ? 2 : 4);
- insn->displacementSize = (hasAdSize ? 2 : 4);
- insn->immediateSize = (hasOpSize ? 2 : 4);
+ insn->registerSize = (insn->hasOpSize ? 2 : 4);
+ insn->addressSize = (insn->hasAdSize ? 2 : 4);
+ insn->displacementSize = (insn->hasAdSize ? 2 : 4);
+ insn->immediateSize = (insn->hasOpSize ? 2 : 4);
} else if (insn->mode == MODE_64BIT) {
if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
insn->registerSize = 8;
- insn->addressSize = (hasAdSize ? 4 : 8);
+ insn->addressSize = (insn->hasAdSize ? 4 : 8);
insn->displacementSize = 4;
insn->immediateSize = 4;
} else {
- insn->registerSize = (hasOpSize ? 2 : 4);
- insn->addressSize = (hasAdSize ? 4 : 8);
- insn->displacementSize = (hasOpSize ? 2 : 4);
- insn->immediateSize = (hasOpSize ? 2 : 4);
+ insn->registerSize = (insn->hasOpSize ? 2 : 4);
+ insn->addressSize = (insn->hasAdSize ? 4 : 8);
+ insn->displacementSize = (insn->hasOpSize ? 2 : 4);
+ insn->immediateSize = (insn->hasOpSize ? 2 : 4);
}
}
@@ -758,7 +720,10 @@ static int readOpcode(struct InternalInstruction* insn) {
insn->opcodeType = TWOBYTE;
}
- }
+ } else if (insn->mandatoryPrefix)
+    // An opcode with a mandatory prefix must start with an opcode escape.
+    // If it doesn't, the prefix is really a legacy repeat prefix.
+ insn->mandatoryPrefix = 0;
/*
* At this point we have consumed the full opcode.
@@ -950,19 +915,44 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
} else {
return -1;
}
- } else {
- if (insn->mode != MODE_16BIT && isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation))
+ } else if (!insn->mandatoryPrefix) {
+    // If we don't have a mandatory prefix, use the legacy prefixes here
+ if (insn->hasOpSize && (insn->mode != MODE_16BIT))
attrMask |= ATTR_OPSIZE;
- else if (isPrefixAtLocation(insn, 0x67, insn->necessaryPrefixLocation))
+ if (insn->hasAdSize)
attrMask |= ATTR_ADSIZE;
- else if (isPrefixAtLocation(insn, 0xf3, insn->necessaryPrefixLocation))
- attrMask |= ATTR_XS;
- else if (isPrefixAtLocation(insn, 0xf2, insn->necessaryPrefixLocation))
+ if (insn->opcodeType == ONEBYTE) {
+ if (insn->repeatPrefix == 0xf3 && (insn->opcode == 0x90))
+ // Special support for PAUSE
+ attrMask |= ATTR_XS;
+ } else {
+ if (insn->repeatPrefix == 0xf2)
+ attrMask |= ATTR_XD;
+ else if (insn->repeatPrefix == 0xf3)
+ attrMask |= ATTR_XS;
+ }
+ } else {
+ switch (insn->mandatoryPrefix) {
+ case 0xf2:
attrMask |= ATTR_XD;
+ break;
+ case 0xf3:
+ attrMask |= ATTR_XS;
+ break;
+ case 0x66:
+ if (insn->mode != MODE_16BIT)
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case 0x67:
+ attrMask |= ATTR_ADSIZE;
+ break;
+ }
}
- if (insn->rexPrefix & 0x08)
+ if (insn->rexPrefix & 0x08) {
attrMask |= ATTR_REXW;
+ attrMask &= ~ATTR_ADSIZE;
+ }
/*
* JCXZ/JECXZ need special handling for 16-bit mode because the meaning
@@ -977,8 +967,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
* CALL/JMP/JCC instructions need to ignore 0x66 and consume 4 bytes
*/
- if (insn->mode == MODE_64BIT &&
- isPrefixAtLocation(insn, 0x66, insn->necessaryPrefixLocation)) {
+ if ((insn->mode == MODE_64BIT) && insn->hasOpSize) {
switch (insn->opcode) {
case 0xE8:
case 0xE9:
@@ -1058,9 +1047,9 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
*/
if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) {
/* Make sure we observed the prefixes in any position. */
- if (insn->prefixPresent[0x67])
+ if (insn->hasAdSize)
attrMask |= ATTR_ADSIZE;
- if (insn->prefixPresent[0x66])
+ if (insn->hasOpSize)
attrMask |= ATTR_OPSIZE;
/* In 16-bit, invert the attributes. */
@@ -1075,7 +1064,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
return 0;
}
- if ((insn->mode == MODE_16BIT || insn->prefixPresent[0x66]) &&
+ if ((insn->mode == MODE_16BIT || insn->hasOpSize) &&
!(attrMask & ATTR_OPSIZE)) {
/*
* The instruction tables make no distinction between instructions that
@@ -1108,7 +1097,7 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg);
if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) &&
- (insn->mode == MODE_16BIT) ^ insn->prefixPresent[0x66]) {
+ (insn->mode == MODE_16BIT) ^ insn->hasOpSize) {
insn->instructionID = instructionIDWithOpsize;
insn->spec = specifierForUID(instructionIDWithOpsize);
} else {
@@ -1169,7 +1158,6 @@ static int getID(struct InternalInstruction* insn, const void *miiArg) {
* @return - 0 if the SIB byte was successfully read; nonzero otherwise.
*/
static int readSIB(struct InternalInstruction* insn) {
- SIBIndex sibIndexBase = SIB_INDEX_NONE;
SIBBase sibBaseBase = SIB_BASE_NONE;
uint8_t index, base;
@@ -1185,11 +1173,11 @@ static int readSIB(struct InternalInstruction* insn) {
dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
return -1;
case 4:
- sibIndexBase = SIB_INDEX_EAX;
+ insn->sibIndexBase = SIB_INDEX_EAX;
sibBaseBase = SIB_BASE_EAX;
break;
case 8:
- sibIndexBase = SIB_INDEX_RAX;
+ insn->sibIndexBase = SIB_INDEX_RAX;
sibBaseBase = SIB_BASE_RAX;
break;
}
@@ -1199,26 +1187,10 @@ static int readSIB(struct InternalInstruction* insn) {
index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
- // FIXME: The fifth bit (bit index 4) is only to be used for instructions
- // that understand VSIB indexing. ORing the bit in here is mildy dangerous
- // because performing math on an 'enum SIBIndex' can produce garbage.
- // Excluding the "none" value, it should cover 6 spaces of register names:
- // - 16 possibilities for 16-bit GPR starting at SIB_INDEX_BX_SI
- // - 16 possibilities for 32-bit GPR starting at SIB_INDEX_EAX
- // - 16 possibilities for 64-bit GPR starting at SIB_INDEX_RAX
- // - 32 possibilities for each of XMM, YMM, ZMM registers
- // When sibIndexBase gets assigned SIB_INDEX_RAX as it does in 64-bit mode,
- // summing in a fully decoded index between 0 and 31 can end up with a value
- // that looks like something in the low half of the XMM range.
- // translateRMMemory() tries to reverse the damage, with only partial success,
- // as evidenced by known bugs in "test/MC/Disassembler/X86/x86-64.txt"
- if (insn->vectorExtensionType == TYPE_EVEX)
- index |= v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4;
-
if (index == 0x4) {
insn->sibIndex = SIB_INDEX_NONE;
} else {
- insn->sibIndex = (SIBIndex)(sibIndexBase + index);
+ insn->sibIndex = (SIBIndex)(insn->sibIndexBase + index);
}
insn->sibScale = 1 << scaleFromSIB(insn->sib);
@@ -1483,9 +1455,9 @@ static int readModRM(struct InternalInstruction* insn) {
case TYPE_MM64: \
return prefix##_MM0 + (index & 0x7); \
case TYPE_SEGMENTREG: \
- if (index > 5) \
+ if ((index & 7) > 5) \
*valid = 0; \
- return prefix##_ES + index; \
+ return prefix##_ES + (index & 7); \
case TYPE_DEBUGREG: \
return prefix##_DR0 + index; \
case TYPE_CONTROLREG: \
@@ -1494,6 +1466,12 @@ static int readModRM(struct InternalInstruction* insn) {
if (index > 3) \
*valid = 0; \
return prefix##_BND0 + index; \
+ case TYPE_MVSIBX: \
+ return prefix##_XMM0 + index; \
+ case TYPE_MVSIBY: \
+ return prefix##_YMM0 + index; \
+ case TYPE_MVSIBZ: \
+ return prefix##_ZMM0 + index; \
} \
}
@@ -1549,7 +1527,6 @@ static int fixupReg(struct InternalInstruction *insn,
return -1;
break;
CASE_ENCODING_RM:
- CASE_ENCODING_VSIB:
if (insn->eaBase >= insn->eaRegBase) {
insn->eaBase = (EABase)fixupRMValue(insn,
(OperandType)op->type,
@@ -1747,8 +1724,39 @@ static int readOperands(struct InternalInstruction* insn) {
needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0);
if (readModRM(insn))
return -1;
- if (fixupReg(insn, &Op))
+
+ // Reject if SIB wasn't used.
+ if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64)
+ return -1;
+
+ // If sibIndex was set to SIB_INDEX_NONE, index offset is 4.
+ if (insn->sibIndex == SIB_INDEX_NONE)
+        insn->sibIndex = (SIBIndex)(insn->sibIndexBase + 4);
+
+      // If EVEX.v2 is set, this is one of registers 16-31.
+ if (insn->vectorExtensionType == TYPE_EVEX &&
+ v2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ insn->sibIndex = (SIBIndex)(insn->sibIndex + 16);
+
+ // Adjust the index register to the correct size.
+ switch ((OperandType)Op.type) {
+ default:
+ debug("Unhandled VSIB index type");
return -1;
+ case TYPE_MVSIBX:
+ insn->sibIndex = (SIBIndex)(SIB_INDEX_XMM0 +
+ (insn->sibIndex - insn->sibIndexBase));
+ break;
+ case TYPE_MVSIBY:
+ insn->sibIndex = (SIBIndex)(SIB_INDEX_YMM0 +
+ (insn->sibIndex - insn->sibIndexBase));
+ break;
+ case TYPE_MVSIBZ:
+ insn->sibIndex = (SIBIndex)(SIB_INDEX_ZMM0 +
+ (insn->sibIndex - insn->sibIndexBase));
+ break;
+ }
+
// Apply the AVX512 compressed displacement scaling factor.
if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB);
@@ -1797,6 +1805,10 @@ static int readOperands(struct InternalInstruction* insn) {
if (readImmediate(insn, insn->addressSize))
return -1;
break;
+ case ENCODING_IRC:
+ insn->RC = (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 1) |
+ lFromEVEX4of4(insn->vectorExtensionPrefix[3]);
+ break;
case ENCODING_RB:
if (readOpcodeRegister(insn, 1))
return -1;
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index b07fd0b17d35..ecd9d8dccafa 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -546,24 +546,26 @@ struct InternalInstruction {
// Prefix state
- // 1 if the prefix byte corresponding to the entry is present; 0 if not
- uint8_t prefixPresent[0x100];
- // contains the location (for use with the reader) of the prefix byte
- uint64_t prefixLocations[0x100];
+ // The possible mandatory prefix
+ uint8_t mandatoryPrefix;
// The value of the vector extension prefix(EVEX/VEX/XOP), if present
uint8_t vectorExtensionPrefix[4];
// The type of the vector extension prefix
VectorExtensionType vectorExtensionType;
// The value of the REX prefix, if present
uint8_t rexPrefix;
- // The location where a mandatory prefix would have to be (i.e., right before
- // the opcode, or right before the REX prefix if one is present).
- uint64_t necessaryPrefixLocation;
// The segment override type
SegmentOverride segmentOverride;
// 1 if the prefix byte, 0xf2 or 0xf3 is xacquire or xrelease
bool xAcquireRelease;
+ // Address-size override
+ bool hasAdSize;
+ // Operand-size override
+ bool hasOpSize;
+ // The repeat prefix if any
+ uint8_t repeatPrefix;
+
// Sizes of various critical pieces of data, in bytes
uint8_t registerSize;
uint8_t addressSize;
@@ -637,10 +639,14 @@ struct InternalInstruction {
Reg reg;
// SIB state
+ SIBIndex sibIndexBase;
SIBIndex sibIndex;
uint8_t sibScale;
SIBBase sibBase;
+ // Embedded rounding control.
+ uint8_t RC;
+
ArrayRef<OperandSpecifier> operands;
};
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
index e0f4399b3687..ad1404860fb6 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoderCommon.h
@@ -382,6 +382,7 @@ enum ModRMDecisionType {
\
ENUM_ENTRY(ENCODING_Iv, "Immediate of operand size") \
ENUM_ENTRY(ENCODING_Ia, "Immediate of address size") \
+ ENUM_ENTRY(ENCODING_IRC, "Immediate for static rounding control") \
ENUM_ENTRY(ENCODING_Rv, "Register code of operand size added to the " \
"opcode byte") \
ENUM_ENTRY(ENCODING_DUP, "Duplicate of another operand; ID is encoded " \
@@ -410,6 +411,9 @@ enum OperandEncoding {
ENUM_ENTRY(TYPE_AVX512ICC, "1-byte immediate operand for AVX512 icmp") \
ENUM_ENTRY(TYPE_UIMM8, "1-byte unsigned immediate operand") \
ENUM_ENTRY(TYPE_M, "Memory operand") \
+ ENUM_ENTRY(TYPE_MVSIBX, "Memory operand using XMM index") \
+ ENUM_ENTRY(TYPE_MVSIBY, "Memory operand using YMM index") \
+ ENUM_ENTRY(TYPE_MVSIBZ, "Memory operand using ZMM index") \
ENUM_ENTRY(TYPE_SRCIDX, "memory at source index") \
ENUM_ENTRY(TYPE_DSTIDX, "memory at destination index") \
ENUM_ENTRY(TYPE_MOFFS, "memory offset (relative to segment base)") \
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 4d91300c7ede..0c99dbbe328b 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -50,8 +50,16 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
HasCustomInstComment =
EmitAnyX86InstComments(MI, *CommentStream, getRegisterName);
+ unsigned Flags = MI->getFlags();
if (TSFlags & X86II::LOCK)
OS << "\tlock\t";
+ if (!(TSFlags & X86II::LOCK) && Flags & X86::IP_HAS_LOCK)
+ OS << "\tlock\t";
+
+ if (Flags & X86::IP_HAS_REPEAT_NE)
+ OS << "\trepne\t";
+ else if (Flags & X86::IP_HAS_REPEAT)
+ OS << "\trep\t";
// Output CALLpcrel32 as "callq" in 64-bit mode.
// In Intel annotation it's always emitted as "call".
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp
index f5f3a4cc83dc..a46f22ff40f5 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.cpp
+++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp
@@ -205,16 +205,14 @@ static MVT getZeroExtensionResultType(const MCInst *MI) {
}
/// Wraps the destination register name with AVX512 mask/maskz filtering.
-static std::string getMaskName(const MCInst *MI, const char *DestName,
- const char *(*getRegName)(unsigned)) {
- std::string OpMaskName(DestName);
-
+static void printMasking(raw_ostream &OS, const MCInst *MI,
+ const char *(*getRegName)(unsigned)) {
bool MaskWithZero = false;
const char *MaskRegName = nullptr;
switch (MI->getOpcode()) {
default:
- return OpMaskName;
+ return;
CASE_MASKZ_MOVDUP(MOVDDUP, m)
CASE_MASKZ_MOVDUP(MOVDDUP, r)
CASE_MASKZ_MOVDUP(MOVSHDUP, m)
@@ -293,6 +291,8 @@ static std::string getMaskName(const MCInst *MI, const char *DestName,
CASE_MASKZ_INS_COMMON(BROADCASTI32X4, , rm)
CASE_MASKZ_INS_COMMON(BROADCASTF32X8, , rm)
CASE_MASKZ_INS_COMMON(BROADCASTI32X8, , rm)
+ CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z128, r)
+ CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z128, m)
CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, r)
CASE_MASKZ_INS_COMMON(BROADCASTI32X2, Z256, r)
CASE_MASKZ_INS_COMMON(BROADCASTF32X2, Z256, m)
@@ -382,6 +382,8 @@ static std::string getMaskName(const MCInst *MI, const char *DestName,
CASE_MASK_INS_COMMON(BROADCASTI32X4, , rm)
CASE_MASK_INS_COMMON(BROADCASTF32X8, , rm)
CASE_MASK_INS_COMMON(BROADCASTI32X8, , rm)
+ CASE_MASK_INS_COMMON(BROADCASTI32X2, Z128, r)
+ CASE_MASK_INS_COMMON(BROADCASTI32X2, Z128, m)
CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, r)
CASE_MASK_INS_COMMON(BROADCASTI32X2, Z256, r)
CASE_MASK_INS_COMMON(BROADCASTF32X2, Z256, m)
@@ -395,15 +397,11 @@ static std::string getMaskName(const MCInst *MI, const char *DestName,
}
// MASK: zmmX {%kY}
- OpMaskName += " {%";
- OpMaskName += MaskRegName;
- OpMaskName += "}";
+ OS << " {%" << MaskRegName << "}";
// MASKZ: zmmX {%kY} {z}
if (MaskWithZero)
- OpMaskName += " {z}";
-
- return OpMaskName;
+ OS << " {z}";
}
//===----------------------------------------------------------------------===//
@@ -585,12 +583,12 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPSLLDQYri:
case X86::VPSLLDQZ128rr:
case X86::VPSLLDQZ256rr:
- case X86::VPSLLDQZ512rr:
+ case X86::VPSLLDQZrr:
Src1Name = getRegName(MI->getOperand(1).getReg());
LLVM_FALLTHROUGH;
case X86::VPSLLDQZ128rm:
case X86::VPSLLDQZ256rm:
- case X86::VPSLLDQZ512rm:
+ case X86::VPSLLDQZrm:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
DecodePSLLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
@@ -603,12 +601,12 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::VPSRLDQYri:
case X86::VPSRLDQZ128rr:
case X86::VPSRLDQZ256rr:
- case X86::VPSRLDQZ512rr:
+ case X86::VPSRLDQZrr:
Src1Name = getRegName(MI->getOperand(1).getReg());
LLVM_FALLTHROUGH;
case X86::VPSRLDQZ128rm:
case X86::VPSRLDQZ256rm:
- case X86::VPSRLDQZ512rm:
+ case X86::VPSRLDQZrm:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
DecodePSRLDQMask(getRegOperandVectorVT(MI, MVT::i8, 0),
@@ -1090,6 +1088,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeSubVectorBroadcast(MVT::v16f32, MVT::v8f32, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, r)
+ Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, m)
+ DecodeSubVectorBroadcast(MVT::v4f32, MVT::v2f32, ShuffleMask);
+ DestName = getRegName(MI->getOperand(0).getReg());
+ break;
CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r)
CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
@@ -1149,7 +1154,13 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
return false;
if (!DestName) DestName = Src1Name;
- OS << (DestName ? getMaskName(MI, DestName, getRegName) : "mem") << " = ";
+ if (DestName) {
+ OS << DestName;
+ printMasking(OS, MI, getRegName);
+ } else
+ OS << "mem";
+
+ OS << " = ";
// If the two sources are the same, canonicalize the input elements to be
// from the first src so that we get larger element spans.
diff --git a/lib/Target/X86/InstPrinter/X86InstComments.h b/lib/Target/X86/InstPrinter/X86InstComments.h
index c6d0d85a7d3d..629c02c95c7f 100644
--- a/lib/Target/X86/InstPrinter/X86InstComments.h
+++ b/lib/Target/X86/InstPrinter/X86InstComments.h
@@ -15,10 +15,13 @@
#ifndef LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
#define LLVM_LIB_TARGET_X86_INSTPRINTER_X86INSTCOMMENTS_H
+#include "llvm/CodeGen/MachineInstr.h"
+
namespace llvm {
enum AsmComments {
- AC_EVEX_2_VEX = 0x2 // For instr that was compressed from EVEX to VEX.
+ // For instr that was compressed from EVEX to VEX.
+ AC_EVEX_2_VEX = MachineInstr::TAsmComments
};
class MCInst;
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index d6af6712d5a1..1f02600a7982 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -41,7 +41,13 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
uint64_t TSFlags = Desc.TSFlags;
if (TSFlags & X86II::LOCK)
- OS << "\tlock\n";
+ OS << "\tlock\t";
+
+ unsigned Flags = MI->getFlags();
+ if (Flags & X86::IP_HAS_REPEAT_NE)
+ OS << "\trepne\t";
+ else if (Flags & X86::IP_HAS_REPEAT)
+ OS << "\trep\t";
printInstruction(MI, OS);
@@ -152,6 +158,7 @@ void X86IntelInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
O << formatImm((int64_t)Op.getImm());
} else {
assert(Op.isExpr() && "unknown operand kind in printOperand");
+ O << "offset ";
Op.getExpr()->print(O, &MAI);
}
}
diff --git a/lib/Target/X86/MCTargetDesc/CMakeLists.txt b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
index 33df9ec7dcde..8d0d9fa1215c 100644
--- a/lib/Target/X86/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/X86/MCTargetDesc/CMakeLists.txt
@@ -5,6 +5,7 @@ add_llvm_library(LLVMX86Desc
X86MCCodeEmitter.cpp
X86MachObjectWriter.cpp
X86ELFObjectWriter.cpp
- X86WinCOFFStreamer.cpp
X86WinCOFFObjectWriter.cpp
+ X86WinCOFFStreamer.cpp
+ X86WinCOFFTargetStreamer.cpp
)
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index 733eac7c0321..78385ae1877b 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -20,12 +20,9 @@
#include "llvm/MC/MCMachObjectWriter.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -76,7 +73,7 @@ class X86AsmBackend : public MCAsmBackend {
public:
X86AsmBackend(const Target &T, StringRef CPU)
: MCAsmBackend(), CPU(CPU),
- MaxNopLength((CPU == "slm") ? 7 : 15) {
+ MaxNopLength((CPU == "slm" || CPU == "silvermont") ? 7 : 15) {
HasNopl = CPU != "generic" && CPU != "i386" && CPU != "i486" &&
CPU != "i586" && CPU != "pentium" && CPU != "pentium-mmx" &&
CPU != "i686" && CPU != "k6" && CPU != "k6-2" && CPU != "k6-3" &&
@@ -389,7 +386,8 @@ public:
ELFX86_32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
: ELFX86AsmBackend(T, OSABI, CPU) {}
- MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override {
return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI, ELF::EM_386);
}
};
@@ -399,7 +397,8 @@ public:
ELFX86_X32AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
: ELFX86AsmBackend(T, OSABI, CPU) {}
- MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override {
return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
ELF::EM_X86_64);
}
@@ -410,7 +409,8 @@ public:
ELFX86_IAMCUAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
: ELFX86AsmBackend(T, OSABI, CPU) {}
- MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override {
return createX86ELFObjectWriter(OS, /*IsELF64*/ false, OSABI,
ELF::EM_IAMCU);
}
@@ -421,7 +421,8 @@ public:
ELFX86_64AsmBackend(const Target &T, uint8_t OSABI, StringRef CPU)
: ELFX86AsmBackend(T, OSABI, CPU) {}
- MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override {
return createX86ELFObjectWriter(OS, /*IsELF64*/ true, OSABI, ELF::EM_X86_64);
}
};
@@ -443,7 +444,8 @@ public:
.Default(MCAsmBackend::getFixupKind(Name));
}
- MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override {
return createX86WinCOFFObjectWriter(OS, Is64Bit);
}
};
@@ -804,7 +806,8 @@ public:
StringRef CPU)
: DarwinX86AsmBackend(T, MRI, CPU, false) {}
- MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override {
return createX86MachObjectWriter(OS, /*Is64Bit=*/false,
MachO::CPU_TYPE_I386,
MachO::CPU_SUBTYPE_I386_ALL);
@@ -824,7 +827,8 @@ public:
StringRef CPU, MachO::CPUSubTypeX86 st)
: DarwinX86AsmBackend(T, MRI, CPU, true), Subtype(st) {}
- MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
+ std::unique_ptr<MCObjectWriter>
+ createObjectWriter(raw_pwrite_stream &OS) const override {
return createX86MachObjectWriter(OS, /*Is64Bit=*/true,
MachO::CPU_TYPE_X86_64, Subtype);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index d8953da4abb2..07cc488d047e 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -51,6 +51,18 @@ namespace X86 {
TO_ZERO = 3,
CUR_DIRECTION = 4
};
+
+ /// The constants to describe instr prefixes, if there are any.
+ enum IPREFIXES {
+ IP_NO_PREFIX = 0,
+ IP_HAS_OP_SIZE = 1,
+ IP_HAS_AD_SIZE = 2,
+ IP_HAS_REPEAT_NE = 4,
+ IP_HAS_REPEAT = 8,
+ IP_HAS_LOCK = 16,
+ NO_SCHED_INFO = 32 // Don't add sched comment to the current instr because
+ // it was already added
+ };
} // end namespace X86;
/// X86II - This namespace holds all of the target specific flags that
@@ -356,13 +368,15 @@ namespace X86II {
// OpSize - OpSizeFixed implies instruction never needs a 0x66 prefix.
// OpSize16 means this is a 16-bit instruction and needs 0x66 prefix in
// 32-bit mode. OpSize32 means this is a 32-bit instruction needs a 0x66
- // prefix in 16-bit mode.
+ // prefix in 16-bit mode. OpSizeIgnore means that the instruction may
+ // take an optional 0x66 byte but should not be emitted with one.
OpSizeShift = 7,
OpSizeMask = 0x3 << OpSizeShift,
- OpSizeFixed = 0 << OpSizeShift,
- OpSize16 = 1 << OpSizeShift,
- OpSize32 = 2 << OpSizeShift,
+ OpSizeFixed = 0 << OpSizeShift,
+ OpSize16 = 1 << OpSizeShift,
+ OpSize32 = 2 << OpSizeShift,
+ OpSizeIgnore = 3 << OpSizeShift,
// AsSize - AdSizeX implies this instruction determines its need of 0x67
// prefix from a normal ModRM memory operand. The other types indicate that
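
[editor note] OpSize is a two-bit field packed at bit 7 of TSFlags, and OpSizeIgnore takes the previously unused encoding 3. A small standalone sketch of extracting and testing that field, with the values copied from the hunk above:

#include <cassert>
#include <cstdint>

enum : uint64_t {
  OpSizeShift  = 7,
  OpSizeMask   = 0x3ULL << OpSizeShift,
  OpSizeFixed  = 0ULL << OpSizeShift,
  OpSize16     = 1ULL << OpSizeShift,
  OpSize32     = 2ULL << OpSizeShift,
  OpSizeIgnore = 3ULL << OpSizeShift,
};

// True when a 0x66 operand-size prefix is needed in 32-bit mode.
bool needsOpSizePrefix32(uint64_t TSFlags) {
  return (TSFlags & OpSizeMask) == OpSize16;
}

int main() {
  assert(needsOpSizePrefix32(OpSize16));
  assert(!needsOpSizePrefix32(OpSizeIgnore)); // may take 0x66 but never emits it
  return 0;
}
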
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index 4da4eebec038..4cdbae4d0d96 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -15,6 +15,7 @@
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
@@ -297,10 +298,9 @@ unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
return getRelocType32(Ctx, Modifier, getType32(Type), IsPCRel, Kind);
}
-MCObjectWriter *llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS,
- bool IsELF64, uint8_t OSABI,
- uint16_t EMachine) {
- MCELFObjectTargetWriter *MOTW =
- new X86ELFObjectWriter(IsELF64, OSABI, EMachine);
- return createELFObjectWriter(MOTW, OS, /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectWriter>
+llvm::createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64,
+ uint8_t OSABI, uint16_t EMachine) {
+ auto MOTW = llvm::make_unique<X86ELFObjectWriter>(IsELF64, OSABI, EMachine);
+ return createELFObjectWriter(std::move(MOTW), OS, /*IsLittleEndian=*/true);
}
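
[editor note] The factory now builds the target writer with make_unique, and both it and the returned MCObjectWriter are owned through std::unique_ptr. A standalone illustration of that ownership-forwarding pattern; the types are stand-ins, not the real MC classes:

#include <memory>
#include <utility>

struct TargetWriter { bool IsELF64 = false; };
struct ObjectWriter { std::unique_ptr<TargetWriter> TW; };

// The generic layer takes ownership of the target-specific writer...
std::unique_ptr<ObjectWriter> createGenericObjectWriter(std::unique_ptr<TargetWriter> TW) {
  return std::unique_ptr<ObjectWriter>(new ObjectWriter{std::move(TW)});
}

// ...and the target factory just builds one and forwards it, as
// createX86ELFObjectWriter now does.
std::unique_ptr<ObjectWriter> createTargetObjectWriter(bool IsELF64) {
  return createGenericObjectWriter(std::unique_ptr<TargetWriter>(new TargetWriter{IsELF64}));
}

int main() { return createTargetObjectWriter(true)->TW->IsELF64 ? 0 : 1; }
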
diff --git a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index 1538a515f419..fa7c352a1b63 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -13,10 +13,7 @@
#include "X86MCAsmInfo.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/BinaryFormat/ELF.h"
-#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -27,11 +24,11 @@ enum AsmWriterFlavorTy {
ATT = 0, Intel = 1
};
-static cl::opt<AsmWriterFlavorTy>
-AsmWriterFlavor("x86-asm-syntax", cl::init(ATT),
- cl::desc("Choose style of code to emit from X86 backend:"),
- cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"),
- clEnumValN(Intel, "intel", "Emit Intel-style assembly")));
+static cl::opt<AsmWriterFlavorTy> AsmWriterFlavor(
+ "x86-asm-syntax", cl::init(ATT), cl::Hidden,
+ cl::desc("Choose style of code to emit from X86 backend:"),
+ cl::values(clEnumValN(ATT, "att", "Emit AT&T-style assembly"),
+ clEnumValN(Intel, "intel", "Emit Intel-style assembly")));
static cl::opt<bool>
MarkedJTDataRegions("mark-data-regions", cl::init(true),
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 10e2bbc64d3c..a7059c6914df 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -380,7 +380,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
return X86::reloc_riprel_4byte_movq_load;
case X86::CALL64m:
case X86::JMP64m:
- case X86::TEST64rm:
+ case X86::TEST64mr:
case X86::ADC64rm:
case X86::ADD64rm:
case X86::AND64rm:
@@ -1108,7 +1108,7 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
EmitByte(0x66, CurByte, OS);
// Emit the LOCK opcode prefix.
- if (TSFlags & X86II::LOCK)
+ if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK)
EmitByte(0xF0, CurByte, OS);
switch (TSFlags & X86II::OpPrefixMask) {
@@ -1130,6 +1130,8 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
EmitByte(0x40 | REX, CurByte, OS);
Ret = true;
}
+ } else {
+ assert(!(TSFlags & X86II::REX_W) && "REX.W requires 64bit mode.");
}
// 0x0F escape code must be emitted just before the opcode.
@@ -1159,6 +1161,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MCII.get(Opcode);
uint64_t TSFlags = Desc.TSFlags;
+ unsigned Flags = MI.getFlags();
// Pseudo instructions don't get encoded.
if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
@@ -1194,8 +1197,10 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
MI, OS);
// Emit the repeat opcode prefix as needed.
- if (TSFlags & X86II::REP)
+ if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT)
EmitByte(0xF3, CurByte, OS);
+ if (Flags & X86::IP_HAS_REPEAT_NE)
+ EmitByte(0xF2, CurByte, OS);
// Emit the address size opcode prefix as needed.
bool need_address_override;
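
[editor note] The encoder hunks above emit the legacy prefix bytes from either TSFlags or the instruction's flag word: 0xF0 for lock, 0xF3 for rep, 0xF2 for repne. A standalone sketch of that byte selection (values copied from the IPREFIXES hunk; ordering simplified):

#include <cstdint>
#include <cstdio>
#include <vector>

enum IPrefixes : unsigned { IP_HAS_REPEAT_NE = 4, IP_HAS_REPEAT = 8, IP_HAS_LOCK = 16 };

// Mirrors the prefix emission order: lock, then rep or repne.
std::vector<uint8_t> prefixBytes(unsigned Flags) {
  std::vector<uint8_t> Bytes;
  if (Flags & IP_HAS_LOCK)
    Bytes.push_back(0xF0);
  if (Flags & IP_HAS_REPEAT)
    Bytes.push_back(0xF3);
  if (Flags & IP_HAS_REPEAT_NE)
    Bytes.push_back(0xF2);
  return Bytes;
}

int main() {
  for (uint8_t B : prefixBytes(IP_HAS_LOCK | IP_HAS_REPEAT_NE))
    std::printf("%#04x ", B); // expected: 0xf0 0xf2
  std::printf("\n");
  return 0;
}
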
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 22cb0fac33cb..cdd43478baed 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -16,6 +16,7 @@
#include "InstPrinter/X86IntelInstPrinter.h"
#include "X86MCAsmInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
@@ -72,52 +73,128 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
MRI->mapLLVMRegToSEHReg(Reg, SEH);
}
- // These CodeView registers are numbered sequentially starting at value 1.
- static const MCPhysReg LowCVRegs[] = {
- X86::AL, X86::CL, X86::DL, X86::BL, X86::AH, X86::CH,
- X86::DH, X86::BH, X86::AX, X86::CX, X86::DX, X86::BX,
- X86::SP, X86::BP, X86::SI, X86::DI, X86::EAX, X86::ECX,
- X86::EDX, X86::EBX, X86::ESP, X86::EBP, X86::ESI, X86::EDI,
+ // Mapping from CodeView to MC register id.
+ static const struct {
+ codeview::RegisterId CVReg;
+ MCPhysReg Reg;
+ } RegMap[] = {
+ { codeview::RegisterId::AL, X86::AL},
+ { codeview::RegisterId::CL, X86::CL},
+ { codeview::RegisterId::DL, X86::DL},
+ { codeview::RegisterId::BL, X86::BL},
+ { codeview::RegisterId::AH, X86::AH},
+ { codeview::RegisterId::CH, X86::CH},
+ { codeview::RegisterId::DH, X86::DH},
+ { codeview::RegisterId::BH, X86::BH},
+ { codeview::RegisterId::AX, X86::AX},
+ { codeview::RegisterId::CX, X86::CX},
+ { codeview::RegisterId::DX, X86::DX},
+ { codeview::RegisterId::BX, X86::BX},
+ { codeview::RegisterId::SP, X86::SP},
+ { codeview::RegisterId::BP, X86::BP},
+ { codeview::RegisterId::SI, X86::SI},
+ { codeview::RegisterId::DI, X86::DI},
+ { codeview::RegisterId::EAX, X86::EAX},
+ { codeview::RegisterId::ECX, X86::ECX},
+ { codeview::RegisterId::EDX, X86::EDX},
+ { codeview::RegisterId::EBX, X86::EBX},
+ { codeview::RegisterId::ESP, X86::ESP},
+ { codeview::RegisterId::EBP, X86::EBP},
+ { codeview::RegisterId::ESI, X86::ESI},
+ { codeview::RegisterId::EDI, X86::EDI},
+
+ { codeview::RegisterId::EFLAGS, X86::EFLAGS},
+
+ { codeview::RegisterId::ST0, X86::FP0},
+ { codeview::RegisterId::ST1, X86::FP1},
+ { codeview::RegisterId::ST2, X86::FP2},
+ { codeview::RegisterId::ST3, X86::FP3},
+ { codeview::RegisterId::ST4, X86::FP4},
+ { codeview::RegisterId::ST5, X86::FP5},
+ { codeview::RegisterId::ST6, X86::FP6},
+ { codeview::RegisterId::ST7, X86::FP7},
+
+ { codeview::RegisterId::XMM0, X86::XMM0},
+ { codeview::RegisterId::XMM1, X86::XMM1},
+ { codeview::RegisterId::XMM2, X86::XMM2},
+ { codeview::RegisterId::XMM3, X86::XMM3},
+ { codeview::RegisterId::XMM4, X86::XMM4},
+ { codeview::RegisterId::XMM5, X86::XMM5},
+ { codeview::RegisterId::XMM6, X86::XMM6},
+ { codeview::RegisterId::XMM7, X86::XMM7},
+
+ { codeview::RegisterId::XMM8, X86::XMM8},
+ { codeview::RegisterId::XMM9, X86::XMM9},
+ { codeview::RegisterId::XMM10, X86::XMM10},
+ { codeview::RegisterId::XMM11, X86::XMM11},
+ { codeview::RegisterId::XMM12, X86::XMM12},
+ { codeview::RegisterId::XMM13, X86::XMM13},
+ { codeview::RegisterId::XMM14, X86::XMM14},
+ { codeview::RegisterId::XMM15, X86::XMM15},
+
+ { codeview::RegisterId::SIL, X86::SIL},
+ { codeview::RegisterId::DIL, X86::DIL},
+ { codeview::RegisterId::BPL, X86::BPL},
+ { codeview::RegisterId::SPL, X86::SPL},
+ { codeview::RegisterId::RAX, X86::RAX},
+ { codeview::RegisterId::RBX, X86::RBX},
+ { codeview::RegisterId::RCX, X86::RCX},
+ { codeview::RegisterId::RDX, X86::RDX},
+ { codeview::RegisterId::RSI, X86::RSI},
+ { codeview::RegisterId::RDI, X86::RDI},
+ { codeview::RegisterId::RBP, X86::RBP},
+ { codeview::RegisterId::RSP, X86::RSP},
+ { codeview::RegisterId::R8, X86::R8},
+ { codeview::RegisterId::R9, X86::R9},
+ { codeview::RegisterId::R10, X86::R10},
+ { codeview::RegisterId::R11, X86::R11},
+ { codeview::RegisterId::R12, X86::R12},
+ { codeview::RegisterId::R13, X86::R13},
+ { codeview::RegisterId::R14, X86::R14},
+ { codeview::RegisterId::R15, X86::R15},
+ { codeview::RegisterId::R8B, X86::R8B},
+ { codeview::RegisterId::R9B, X86::R9B},
+ { codeview::RegisterId::R10B, X86::R10B},
+ { codeview::RegisterId::R11B, X86::R11B},
+ { codeview::RegisterId::R12B, X86::R12B},
+ { codeview::RegisterId::R13B, X86::R13B},
+ { codeview::RegisterId::R14B, X86::R14B},
+ { codeview::RegisterId::R15B, X86::R15B},
+ { codeview::RegisterId::R8W, X86::R8W},
+ { codeview::RegisterId::R9W, X86::R9W},
+ { codeview::RegisterId::R10W, X86::R10W},
+ { codeview::RegisterId::R11W, X86::R11W},
+ { codeview::RegisterId::R12W, X86::R12W},
+ { codeview::RegisterId::R13W, X86::R13W},
+ { codeview::RegisterId::R14W, X86::R14W},
+ { codeview::RegisterId::R15W, X86::R15W},
+ { codeview::RegisterId::R8D, X86::R8D},
+ { codeview::RegisterId::R9D, X86::R9D},
+ { codeview::RegisterId::R10D, X86::R10D},
+ { codeview::RegisterId::R11D, X86::R11D},
+ { codeview::RegisterId::R12D, X86::R12D},
+ { codeview::RegisterId::R13D, X86::R13D},
+ { codeview::RegisterId::R14D, X86::R14D},
+ { codeview::RegisterId::R15D, X86::R15D},
+ { codeview::RegisterId::AMD64_YMM0, X86::YMM0},
+ { codeview::RegisterId::AMD64_YMM1, X86::YMM1},
+ { codeview::RegisterId::AMD64_YMM2, X86::YMM2},
+ { codeview::RegisterId::AMD64_YMM3, X86::YMM3},
+ { codeview::RegisterId::AMD64_YMM4, X86::YMM4},
+ { codeview::RegisterId::AMD64_YMM5, X86::YMM5},
+ { codeview::RegisterId::AMD64_YMM6, X86::YMM6},
+ { codeview::RegisterId::AMD64_YMM7, X86::YMM7},
+ { codeview::RegisterId::AMD64_YMM8, X86::YMM8},
+ { codeview::RegisterId::AMD64_YMM9, X86::YMM9},
+ { codeview::RegisterId::AMD64_YMM10, X86::YMM10},
+ { codeview::RegisterId::AMD64_YMM11, X86::YMM11},
+ { codeview::RegisterId::AMD64_YMM12, X86::YMM12},
+ { codeview::RegisterId::AMD64_YMM13, X86::YMM13},
+ { codeview::RegisterId::AMD64_YMM14, X86::YMM14},
+ { codeview::RegisterId::AMD64_YMM15, X86::YMM15},
};
- unsigned CVLowRegStart = 1;
- for (unsigned I = 0; I < array_lengthof(LowCVRegs); ++I)
- MRI->mapLLVMRegToCVReg(LowCVRegs[I], I + CVLowRegStart);
-
- MRI->mapLLVMRegToCVReg(X86::EFLAGS, 34);
-
- // The x87 registers start at 128 and are numbered sequentially.
- unsigned FP0Start = 128;
- for (unsigned I = 0; I < 8; ++I)
- MRI->mapLLVMRegToCVReg(X86::FP0 + I, FP0Start + I);
-
- // The low 8 XMM registers start at 154 and are numbered sequentially.
- unsigned CVXMM0Start = 154;
- for (unsigned I = 0; I < 8; ++I)
- MRI->mapLLVMRegToCVReg(X86::XMM0 + I, CVXMM0Start + I);
-
- // The high 8 XMM registers start at 252 and are numbered sequentially.
- unsigned CVXMM8Start = 252;
- for (unsigned I = 0; I < 8; ++I)
- MRI->mapLLVMRegToCVReg(X86::XMM8 + I, CVXMM8Start + I);
-
- // FIXME: XMM16 and above from AVX512 not yet documented.
-
- // AMD64 registers start at 324 and count up.
- unsigned CVX64RegStart = 324;
- static const MCPhysReg CVX64Regs[] = {
- X86::SIL, X86::DIL, X86::BPL, X86::SPL, X86::RAX, X86::RBX,
- X86::RCX, X86::RDX, X86::RSI, X86::RDI, X86::RBP, X86::RSP,
- X86::R8, X86::R9, X86::R10, X86::R11, X86::R12, X86::R13,
- X86::R14, X86::R15, X86::R8B, X86::R9B, X86::R10B, X86::R11B,
- X86::R12B, X86::R13B, X86::R14B, X86::R15B, X86::R8W, X86::R9W,
- X86::R10W, X86::R11W, X86::R12W, X86::R13W, X86::R14W, X86::R15W,
- X86::R8D, X86::R9D, X86::R10D, X86::R11D, X86::R12D, X86::R13D,
- X86::R14D, X86::R15D, X86::YMM0, X86::YMM1, X86::YMM2, X86::YMM3,
- X86::YMM4, X86::YMM5, X86::YMM6, X86::YMM7, X86::YMM8, X86::YMM9,
- X86::YMM10, X86::YMM11, X86::YMM12, X86::YMM13, X86::YMM14, X86::YMM15,
- };
- for (unsigned I = 0; I < array_lengthof(CVX64Regs); ++I)
- MRI->mapLLVMRegToCVReg(CVX64Regs[I], CVX64RegStart + I);
+ for (unsigned I = 0; I < array_lengthof(RegMap); ++I)
+ MRI->mapLLVMRegToCVReg(RegMap[I].Reg, static_cast<int>(RegMap[I].CVReg));
}
MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
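
[editor note] The rewritten mapping pairs each MC register explicitly with its codeview::RegisterId instead of deriving CodeView numbers from fixed offsets. A standalone sketch of building and querying such a table; the enumerators below are illustrative stand-ins, not the real register numbers:

#include <map>

enum class CVRegisterId : unsigned { EAX = 17, ESP = 21, EBP = 22 }; // illustrative
enum MCReg : unsigned { X86_EAX = 1, X86_ESP = 2, X86_EBP = 3 };     // illustrative

static const struct { CVRegisterId CVReg; MCReg Reg; } RegMap[] = {
    {CVRegisterId::EAX, X86_EAX},
    {CVRegisterId::ESP, X86_ESP},
    {CVRegisterId::EBP, X86_EBP},
};

int main() {
  std::map<unsigned, int> LLVMToCV; // what mapLLVMRegToCVReg populates internally
  for (const auto &Entry : RegMap)
    LLVMToCV[Entry.Reg] = static_cast<int>(Entry.CVReg);
  return LLVMToCV[X86_EBP] == 22 ? 0 : 1;
}
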
@@ -198,18 +275,6 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
return MAI;
}
-static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
- CodeModel::Model &CM) {
- bool is64Bit = TT.getArch() == Triple::x86_64;
-
- // For static codegen, if we're not already set, use Small codegen.
- if (CM == CodeModel::Default)
- CM = CodeModel::Small;
- else if (CM == CodeModel::JITDefault)
- // 64-bit JIT places everything in the same buffer except external funcs.
- CM = is64Bit ? CodeModel::Large : CodeModel::Small;
-}
-
static MCInstPrinter *createX86MCInstPrinter(const Triple &T,
unsigned SyntaxVariant,
const MCAsmInfo &MAI,
@@ -238,9 +303,6 @@ extern "C" void LLVMInitializeX86TargetMC() {
// Register the MC asm info.
RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo);
- // Register the MC codegen info.
- RegisterMCAdjustCodeGenOptsFn Y(*T, adjustCodeGenOpts);
-
// Register the MC instruction info.
TargetRegistry::RegisterMCInstrInfo(*T, createX86MCInstrInfo);
@@ -257,7 +319,13 @@ extern "C" void LLVMInitializeX86TargetMC() {
// Register the code emitter.
TargetRegistry::RegisterMCCodeEmitter(*T, createX86MCCodeEmitter);
- // Register the object streamer.
+ // Register the obj target streamer.
+ TargetRegistry::RegisterObjectTargetStreamer(*T,
+ createX86ObjectTargetStreamer);
+
+ // Register the asm target streamer.
+ TargetRegistry::RegisterAsmTargetStreamer(*T, createX86AsmTargetStreamer);
+
TargetRegistry::RegisterCOFFStreamer(*T, createX86WinCOFFStreamer);
// Register the MCInstPrinter.
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index f73e734b9b0e..c5859b600ad2 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -77,25 +77,41 @@ MCAsmBackend *createX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
const Triple &TT, StringRef CPU,
const MCTargetOptions &Options);
+/// Implements X86-only directives for assembly emission.
+MCTargetStreamer *createX86AsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrint,
+ bool isVerboseAsm);
+
+/// Implements X86-only directives for object files.
+MCTargetStreamer *createX86ObjectTargetStreamer(MCStreamer &OS,
+ const MCSubtargetInfo &STI);
+
/// Construct an X86 Windows COFF machine code streamer which will generate
/// PE/COFF format object files.
///
/// Takes ownership of \p AB and \p CE.
-MCStreamer *createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
- raw_pwrite_stream &OS, MCCodeEmitter *CE,
- bool RelaxAll, bool IncrementalLinkerCompatible);
+MCStreamer *createX86WinCOFFStreamer(MCContext &C,
+ std::unique_ptr<MCAsmBackend> &&AB,
+ raw_pwrite_stream &OS,
+ std::unique_ptr<MCCodeEmitter> &&CE,
+ bool RelaxAll,
+ bool IncrementalLinkerCompatible);
/// Construct an X86 Mach-O object writer.
-MCObjectWriter *createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
- uint32_t CPUType,
- uint32_t CPUSubtype);
+std::unique_ptr<MCObjectWriter> createX86MachObjectWriter(raw_pwrite_stream &OS,
+ bool Is64Bit,
+ uint32_t CPUType,
+ uint32_t CPUSubtype);
/// Construct an X86 ELF object writer.
-MCObjectWriter *createX86ELFObjectWriter(raw_pwrite_stream &OS, bool IsELF64,
- uint8_t OSABI, uint16_t EMachine);
+std::unique_ptr<MCObjectWriter> createX86ELFObjectWriter(raw_pwrite_stream &OS,
+ bool IsELF64,
+ uint8_t OSABI,
+ uint16_t EMachine);
/// Construct an X86 Win COFF object writer.
-MCObjectWriter *createX86WinCOFFObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit);
+std::unique_ptr<MCObjectWriter>
+createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit);
/// Returns the sub or super register of a specific X86 register.
/// e.g. getX86SubSuperRegister(X86::EAX, 16) returns X86::AX.
diff --git a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
index 8f2017e990c5..965f7de809b3 100644
--- a/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp
@@ -597,11 +597,10 @@ void X86MachObjectWriter::RecordX86Relocation(MachObjectWriter *Writer,
Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
}
-MCObjectWriter *llvm::createX86MachObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit, uint32_t CPUType,
- uint32_t CPUSubtype) {
- return createMachObjectWriter(new X86MachObjectWriter(Is64Bit,
- CPUType,
- CPUSubtype),
- OS, /*IsLittleEndian=*/true);
+std::unique_ptr<MCObjectWriter>
+llvm::createX86MachObjectWriter(raw_pwrite_stream &OS, bool Is64Bit,
+ uint32_t CPUType, uint32_t CPUSubtype) {
+ return createMachObjectWriter(
+ llvm::make_unique<X86MachObjectWriter>(Is64Bit, CPUType, CPUSubtype), OS,
+ /*IsLittleEndian=*/true);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h b/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h
new file mode 100644
index 000000000000..8d38cd32b82c
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86TargetStreamer.h
@@ -0,0 +1,34 @@
+//===- X86TargetStreamer.h ------------------------------*- C++ -*---------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86TARGETSTREAMER_H
+#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86TARGETSTREAMER_H
+
+#include "llvm/MC/MCStreamer.h"
+
+namespace llvm {
+
+/// X86 target streamer implementing x86-only assembly directives.
+class X86TargetStreamer : public MCTargetStreamer {
+public:
+ X86TargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+
+ virtual bool emitFPOProc(const MCSymbol *ProcSym, unsigned ParamsSize,
+ SMLoc L = {}) = 0;
+ virtual bool emitFPOEndPrologue(SMLoc L = {}) = 0;
+ virtual bool emitFPOEndProc(SMLoc L = {}) = 0;
+ virtual bool emitFPOData(const MCSymbol *ProcSym, SMLoc L = {}) = 0;
+ virtual bool emitFPOPushReg(unsigned Reg, SMLoc L = {}) = 0;
+ virtual bool emitFPOStackAlloc(unsigned StackAlloc, SMLoc L = {}) = 0;
+ virtual bool emitFPOSetFrame(unsigned Reg, SMLoc L = {}) = 0;
+};
+
+} // end namespace llvm
+
+#endif
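
[editor note] X86TargetStreamer gives the asm and object streamers a common set of .cv_fpo_* hooks. A hedged, cut-down stand-in showing how a caller might drive the printing flavor for a simple 32-bit prologue; the SMLoc and MCSymbol parameters are dropped and all names are placeholders:

#include <cstdio>

// Cut-down stand-in for the X86TargetStreamer interface above.
struct FPOStreamer {
  virtual ~FPOStreamer() = default;
  virtual void emitFPOProc(const char *Proc, unsigned ParamsSize) = 0;
  virtual void emitFPOPushReg(const char *Reg) = 0;
  virtual void emitFPOSetFrame(const char *Reg) = 0;
  virtual void emitFPOStackAlloc(unsigned Size) = 0;
  virtual void emitFPOEndPrologue() = 0;
  virtual void emitFPOEndProc() = 0;
};

// Asm flavor: just prints the directives, as X86WinCOFFAsmTargetStreamer does.
struct AsmFPOStreamer : FPOStreamer {
  void emitFPOProc(const char *P, unsigned N) override { std::printf("\t.cv_fpo_proc\t%s %u\n", P, N); }
  void emitFPOPushReg(const char *R) override { std::printf("\t.cv_fpo_pushreg\t%s\n", R); }
  void emitFPOSetFrame(const char *R) override { std::printf("\t.cv_fpo_setframe\t%s\n", R); }
  void emitFPOStackAlloc(unsigned N) override { std::printf("\t.cv_fpo_stackalloc\t%u\n", N); }
  void emitFPOEndPrologue() override { std::printf("\t.cv_fpo_endprologue\n"); }
  void emitFPOEndProc() override { std::printf("\t.cv_fpo_endproc\n"); }
};

int main() {
  AsmFPOStreamer TS;
  TS.emitFPOProc("_foo", 8); // function with 8 bytes of parameters
  TS.emitFPOPushReg("ebp");  // push ebp
  TS.emitFPOSetFrame("ebp"); // mov ebp, esp
  TS.emitFPOStackAlloc(16);  // sub esp, 16
  TS.emitFPOEndPrologue();
  TS.emitFPOEndProc();
  return 0;
}
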
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 807f7a6ddb19..5139bb46b561 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -13,6 +13,7 @@
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCWinCOFFObjectWriter.h"
#include "llvm/Support/ErrorHandling.h"
@@ -104,8 +105,8 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx,
llvm_unreachable("Unsupported COFF machine type.");
}
-MCObjectWriter *llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS,
- bool Is64Bit) {
- MCWinCOFFObjectTargetWriter *MOTW = new X86WinCOFFObjectWriter(Is64Bit);
- return createWinCOFFObjectWriter(MOTW, OS);
+std::unique_ptr<MCObjectWriter>
+llvm::createX86WinCOFFObjectWriter(raw_pwrite_stream &OS, bool Is64Bit) {
+ auto MOTW = llvm::make_unique<X86WinCOFFObjectWriter>(Is64Bit);
+ return createWinCOFFObjectWriter(std::move(MOTW), OS);
}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index d04511873b46..5b1357ae4a7b 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -8,6 +8,9 @@
//===----------------------------------------------------------------------===//
#include "X86MCTargetDesc.h"
+#include "X86TargetStreamer.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCWin64EH.h"
#include "llvm/MC/MCWinCOFFStreamer.h"
@@ -17,17 +20,18 @@ namespace {
class X86WinCOFFStreamer : public MCWinCOFFStreamer {
Win64EH::UnwindEmitter EHStreamer;
public:
- X86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB, MCCodeEmitter *CE,
- raw_pwrite_stream &OS)
- : MCWinCOFFStreamer(C, AB, *CE, OS) {}
+ X86WinCOFFStreamer(MCContext &C, std::unique_ptr<MCAsmBackend> AB,
+ std::unique_ptr<MCCodeEmitter> CE, raw_pwrite_stream &OS)
+ : MCWinCOFFStreamer(C, std::move(AB), std::move(CE), OS) {}
- void EmitWinEHHandlerData() override;
+ void EmitWinEHHandlerData(SMLoc Loc) override;
void EmitWindowsUnwindTables() override;
+ void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override;
void FinishImpl() override;
};
-void X86WinCOFFStreamer::EmitWinEHHandlerData() {
- MCStreamer::EmitWinEHHandlerData();
+void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
+ MCStreamer::EmitWinEHHandlerData(Loc);
// We have to emit the unwind info now, because this directive
// actually switches to the .xdata section!
@@ -40,6 +44,12 @@ void X86WinCOFFStreamer::EmitWindowsUnwindTables() {
EHStreamer.Emit(*this);
}
+void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) {
+ X86TargetStreamer *XTS =
+ static_cast<X86TargetStreamer *>(getTargetStreamer());
+ XTS->emitFPOData(ProcSym, Loc);
+}
+
void X86WinCOFFStreamer::FinishImpl() {
EmitFrames(nullptr);
EmitWindowsUnwindTables();
@@ -48,11 +58,14 @@ void X86WinCOFFStreamer::FinishImpl() {
}
}
-MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C, MCAsmBackend &AB,
+MCStreamer *llvm::createX86WinCOFFStreamer(MCContext &C,
+ std::unique_ptr<MCAsmBackend> &&AB,
raw_pwrite_stream &OS,
- MCCodeEmitter *CE, bool RelaxAll,
+ std::unique_ptr<MCCodeEmitter> &&CE,
+ bool RelaxAll,
bool IncrementalLinkerCompatible) {
- X86WinCOFFStreamer *S = new X86WinCOFFStreamer(C, AB, CE, OS);
+ X86WinCOFFStreamer *S =
+ new X86WinCOFFStreamer(C, std::move(AB), std::move(CE), OS);
S->getAssembler().setRelaxAll(RelaxAll);
S->getAssembler().setIncrementalLinkerCompatible(IncrementalLinkerCompatible);
return S;
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
new file mode 100644
index 000000000000..093dab4f2f96
--- /dev/null
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -0,0 +1,415 @@
+//===-- X86WinCOFFTargetStreamer.cpp ----------------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MCTargetDesc.h"
+#include "X86TargetStreamer.h"
+#include "llvm/DebugInfo/CodeView/CodeView.h"
+#include "llvm/MC/MCCodeView.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/FormattedStream.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+namespace {
+/// Implements Windows x86-only directives for assembly emission.
+class X86WinCOFFAsmTargetStreamer : public X86TargetStreamer {
+ formatted_raw_ostream &OS;
+ MCInstPrinter &InstPrinter;
+
+public:
+ X86WinCOFFAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+ MCInstPrinter &InstPrinter)
+ : X86TargetStreamer(S), OS(OS), InstPrinter(InstPrinter) {}
+
+ bool emitFPOProc(const MCSymbol *ProcSym, unsigned ParamsSize,
+ SMLoc L) override;
+ bool emitFPOEndPrologue(SMLoc L) override;
+ bool emitFPOEndProc(SMLoc L) override;
+ bool emitFPOData(const MCSymbol *ProcSym, SMLoc L) override;
+ bool emitFPOPushReg(unsigned Reg, SMLoc L) override;
+ bool emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) override;
+ bool emitFPOSetFrame(unsigned Reg, SMLoc L) override;
+};
+
+/// Represents a single FPO directive.
+struct FPOInstruction {
+ MCSymbol *Label;
+ enum Operation {
+ PushReg,
+ StackAlloc,
+ SetFrame,
+ } Op;
+ unsigned RegOrOffset;
+};
+
+struct FPOData {
+ const MCSymbol *Function = nullptr;
+ MCSymbol *Begin = nullptr;
+ MCSymbol *PrologueEnd = nullptr;
+ MCSymbol *End = nullptr;
+ unsigned ParamsSize = 0;
+
+ SmallVector<FPOInstruction, 5> Instructions;
+};
+
+/// Implements Windows x86-only directives for object emission.
+class X86WinCOFFTargetStreamer : public X86TargetStreamer {
+ /// Map from function symbol to its FPO data.
+ DenseMap<const MCSymbol *, std::unique_ptr<FPOData>> AllFPOData;
+
+ /// Current FPO data created by .cv_fpo_proc.
+ std::unique_ptr<FPOData> CurFPOData;
+
+ bool haveOpenFPOData() { return !!CurFPOData; }
+
+ /// Diagnoses an error at L if we are not in an FPO prologue. Return true on
+ /// error.
+ bool checkInFPOPrologue(SMLoc L);
+
+ MCSymbol *emitFPOLabel();
+
+ MCContext &getContext() { return getStreamer().getContext(); }
+
+public:
+ X86WinCOFFTargetStreamer(MCStreamer &S) : X86TargetStreamer(S) {}
+
+ bool emitFPOProc(const MCSymbol *ProcSym, unsigned ParamsSize,
+ SMLoc L) override;
+ bool emitFPOEndPrologue(SMLoc L) override;
+ bool emitFPOEndProc(SMLoc L) override;
+ bool emitFPOData(const MCSymbol *ProcSym, SMLoc L) override;
+ bool emitFPOPushReg(unsigned Reg, SMLoc L) override;
+ bool emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) override;
+ bool emitFPOSetFrame(unsigned Reg, SMLoc L) override;
+};
+} // end namespace
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOProc(const MCSymbol *ProcSym,
+ unsigned ParamsSize, SMLoc L) {
+ OS << "\t.cv_fpo_proc\t";
+ ProcSym->print(OS, getStreamer().getContext().getAsmInfo());
+ OS << ' ' << ParamsSize << '\n';
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOEndPrologue(SMLoc L) {
+ OS << "\t.cv_fpo_endprologue\n";
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOEndProc(SMLoc L) {
+ OS << "\t.cv_fpo_endproc\n";
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOData(const MCSymbol *ProcSym,
+ SMLoc L) {
+ OS << "\t.cv_fpo_data\t";
+ ProcSym->print(OS, getStreamer().getContext().getAsmInfo());
+ OS << '\n';
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOPushReg(unsigned Reg, SMLoc L) {
+ OS << "\t.cv_fpo_pushreg\t";
+ InstPrinter.printRegName(OS, Reg);
+ OS << '\n';
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOStackAlloc(unsigned StackAlloc,
+ SMLoc L) {
+ OS << "\t.cv_fpo_stackalloc\t" << StackAlloc << '\n';
+ return false;
+}
+
+bool X86WinCOFFAsmTargetStreamer::emitFPOSetFrame(unsigned Reg, SMLoc L) {
+ OS << "\t.cv_fpo_setframe\t";
+ InstPrinter.printRegName(OS, Reg);
+ OS << '\n';
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::checkInFPOPrologue(SMLoc L) {
+ if (!haveOpenFPOData() || CurFPOData->PrologueEnd) {
+ getContext().reportError(
+ L,
+ "directive must appear between .cv_fpo_proc and .cv_fpo_endprologue");
+ return true;
+ }
+ return false;
+}
+
+MCSymbol *X86WinCOFFTargetStreamer::emitFPOLabel() {
+ MCSymbol *Label = getContext().createTempSymbol("cfi", true);
+ getStreamer().EmitLabel(Label);
+ return Label;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOProc(const MCSymbol *ProcSym,
+ unsigned ParamsSize, SMLoc L) {
+ if (haveOpenFPOData()) {
+ getContext().reportError(
+ L, "opening new .cv_fpo_proc before closing previous frame");
+ return true;
+ }
+ CurFPOData = llvm::make_unique<FPOData>();
+ CurFPOData->Function = ProcSym;
+ CurFPOData->Begin = emitFPOLabel();
+ CurFPOData->ParamsSize = ParamsSize;
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOEndProc(SMLoc L) {
+ if (!haveOpenFPOData()) {
+ getContext().reportError(L, ".cv_fpo_endproc must appear after .cv_proc");
+ return true;
+ }
+ if (!CurFPOData->PrologueEnd) {
+ // Complain if there were prologue setup instructions but no end prologue.
+ if (!CurFPOData->Instructions.empty()) {
+ getContext().reportError(L, "missing .cv_fpo_endprologue");
+ CurFPOData->Instructions.clear();
+ }
+
+ // Claim there is a zero-length prologue to make the label math work out
+ // later.
+ CurFPOData->PrologueEnd = CurFPOData->Begin;
+ }
+
+ CurFPOData->End = emitFPOLabel();
+ const MCSymbol *Fn = CurFPOData->Function;
+ AllFPOData.insert({Fn, std::move(CurFPOData)});
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOSetFrame(unsigned Reg, SMLoc L) {
+ if (checkInFPOPrologue(L))
+ return true;
+ FPOInstruction Inst;
+ Inst.Label = emitFPOLabel();
+ Inst.Op = FPOInstruction::SetFrame;
+ Inst.RegOrOffset = Reg;
+ CurFPOData->Instructions.push_back(Inst);
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOPushReg(unsigned Reg, SMLoc L) {
+ if (checkInFPOPrologue(L))
+ return true;
+ FPOInstruction Inst;
+ Inst.Label = emitFPOLabel();
+ Inst.Op = FPOInstruction::PushReg;
+ Inst.RegOrOffset = Reg;
+ CurFPOData->Instructions.push_back(Inst);
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOStackAlloc(unsigned StackAlloc, SMLoc L) {
+ if (checkInFPOPrologue(L))
+ return true;
+ FPOInstruction Inst;
+ Inst.Label = emitFPOLabel();
+ Inst.Op = FPOInstruction::StackAlloc;
+ Inst.RegOrOffset = StackAlloc;
+ CurFPOData->Instructions.push_back(Inst);
+ return false;
+}
+
+bool X86WinCOFFTargetStreamer::emitFPOEndPrologue(SMLoc L) {
+ if (checkInFPOPrologue(L))
+ return true;
+ CurFPOData->PrologueEnd = emitFPOLabel();
+ return false;
+}
+
+namespace {
+struct RegSaveOffset {
+ RegSaveOffset(unsigned Reg, unsigned Offset) : Reg(Reg), Offset(Offset) {}
+
+ unsigned Reg = 0;
+ unsigned Offset = 0;
+};
+
+struct FPOStateMachine {
+ explicit FPOStateMachine(const FPOData *FPO) : FPO(FPO) {}
+
+ const FPOData *FPO = nullptr;
+ unsigned FrameReg = 0;
+ unsigned FrameRegOff = 0;
+ unsigned CurOffset = 0;
+ unsigned LocalSize = 0;
+ unsigned SavedRegSize = 0;
+ unsigned Flags = 0; // FIXME: Set HasSEH / HasEH.
+
+ SmallString<128> FrameFunc;
+
+ SmallVector<RegSaveOffset, 4> RegSaveOffsets;
+
+ void emitFrameDataRecord(MCStreamer &OS, MCSymbol *Label);
+};
+} // end namespace
+
+static Printable printFPOReg(const MCRegisterInfo *MRI, unsigned LLVMReg) {
+ return Printable([MRI, LLVMReg](raw_ostream &OS) {
+ switch (LLVMReg) {
+ // MSVC only seems to emit symbolic register names for EIP, EBP, and ESP,
+ // but the format seems to support more than that, so we emit them.
+ case X86::EAX: OS << "$eax"; break;
+ case X86::EBX: OS << "$ebx"; break;
+ case X86::ECX: OS << "$ecx"; break;
+ case X86::EDX: OS << "$edx"; break;
+ case X86::EDI: OS << "$edi"; break;
+ case X86::ESI: OS << "$esi"; break;
+ case X86::ESP: OS << "$esp"; break;
+ case X86::EBP: OS << "$ebp"; break;
+ case X86::EIP: OS << "$eip"; break;
+ // Otherwise, get the codeview register number and print $N.
+ default:
+ OS << '$' << MRI->getCodeViewRegNum(LLVMReg);
+ break;
+ }
+ });
+}
+
+void FPOStateMachine::emitFrameDataRecord(MCStreamer &OS, MCSymbol *Label) {
+ unsigned CurFlags = Flags;
+ if (Label == FPO->Begin)
+ CurFlags |= FrameData::IsFunctionStart;
+
+ // Compute the new FrameFunc string.
+ FrameFunc.clear();
+ raw_svector_ostream FuncOS(FrameFunc);
+ const MCRegisterInfo *MRI = OS.getContext().getRegisterInfo();
+ if (FrameReg) {
+ // CFA is FrameReg + FrameRegOff.
+ FuncOS << "$T0 " << printFPOReg(MRI, FrameReg) << " " << FrameRegOff
+ << " + = ";
+ } else {
+ // The address of the return address is ESP + CurOffset, but we use .raSearch to
+ // match MSVC. This seems to ask the debugger to subtract some combination
+ // of LocalSize and SavedRegSize from ESP and grovel around in that memory
+ // to find the address of a plausible return address.
+ FuncOS << "$T0 .raSearch = ";
+ }
+
+ // Caller's $eip should be dereferenced CFA, and $esp should be CFA plus 4.
+ FuncOS << "$eip $T0 ^ = $esp $T0 4 + = ";
+
+ // Each saved register is stored at an unchanging negative CFA offset.
+ for (RegSaveOffset RO : RegSaveOffsets)
+ FuncOS << printFPOReg(MRI, RO.Reg) << " $T0 " << RO.Offset << " - ^ = ";
+
+ // Add it to the CV string table.
+ CodeViewContext &CVCtx = OS.getContext().getCVContext();
+ unsigned FrameFuncStrTabOff = CVCtx.addToStringTable(FuncOS.str()).second;
+
+ // MSVC has only ever been observed to emit a MaxStackSize of zero.
+ unsigned MaxStackSize = 0;
+
+ // The FrameData record format is:
+ // ulittle32_t RvaStart;
+ // ulittle32_t CodeSize;
+ // ulittle32_t LocalSize;
+ // ulittle32_t ParamsSize;
+ // ulittle32_t MaxStackSize;
+ // ulittle32_t FrameFunc; // String table offset
+ // ulittle16_t PrologSize;
+ // ulittle16_t SavedRegsSize;
+ // ulittle32_t Flags;
+
+ OS.emitAbsoluteSymbolDiff(Label, FPO->Begin, 4); // RvaStart
+ OS.emitAbsoluteSymbolDiff(FPO->End, Label, 4); // CodeSize
+ OS.EmitIntValue(LocalSize, 4);
+ OS.EmitIntValue(FPO->ParamsSize, 4);
+ OS.EmitIntValue(MaxStackSize, 4);
+ OS.EmitIntValue(FrameFuncStrTabOff, 4); // FrameFunc
+ OS.emitAbsoluteSymbolDiff(FPO->PrologueEnd, Label, 2);
+ OS.EmitIntValue(SavedRegSize, 2);
+ OS.EmitIntValue(CurFlags, 4);
+}
+
+/// Compute and emit the real CodeView FrameData subsection.
+bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) {
+ MCStreamer &OS = getStreamer();
+ MCContext &Ctx = OS.getContext();
+
+ auto I = AllFPOData.find(ProcSym);
+ if (I == AllFPOData.end()) {
+ Ctx.reportError(L, Twine("no FPO data found for symbol ") +
+ ProcSym->getName());
+ return true;
+ }
+ const FPOData *FPO = I->second.get();
+ assert(FPO->Begin && FPO->End && FPO->PrologueEnd && "missing FPO label");
+
+ MCSymbol *FrameBegin = Ctx.createTempSymbol(),
+ *FrameEnd = Ctx.createTempSymbol();
+
+ OS.EmitIntValue(unsigned(DebugSubsectionKind::FrameData), 4);
+ OS.emitAbsoluteSymbolDiff(FrameEnd, FrameBegin, 4);
+ OS.EmitLabel(FrameBegin);
+
+ // Start with the RVA of the function in question.
+ OS.EmitValue(MCSymbolRefExpr::create(FPO->Function,
+ MCSymbolRefExpr::VK_COFF_IMGREL32, Ctx),
+ 4);
+
+ // Emit a sequence of FrameData records.
+ FPOStateMachine FSM(FPO);
+
+ FSM.emitFrameDataRecord(OS, FPO->Begin);
+ for (const FPOInstruction &Inst : FPO->Instructions) {
+ switch (Inst.Op) {
+ case FPOInstruction::PushReg:
+ FSM.CurOffset += 4;
+ FSM.SavedRegSize += 4;
+ FSM.RegSaveOffsets.push_back({Inst.RegOrOffset, FSM.CurOffset});
+ break;
+ case FPOInstruction::SetFrame:
+ FSM.FrameReg = Inst.RegOrOffset;
+ FSM.FrameRegOff = FSM.CurOffset;
+ break;
+ case FPOInstruction::StackAlloc:
+ FSM.CurOffset += Inst.RegOrOffset;
+ FSM.LocalSize += Inst.RegOrOffset;
+ // No need to emit FrameData for stack allocations with a frame pointer.
+ if (FSM.FrameReg)
+ continue;
+ break;
+ }
+ FSM.emitFrameDataRecord(OS, Inst.Label);
+ }
+
+ OS.EmitValueToAlignment(4, 0);
+ OS.EmitLabel(FrameEnd);
+ return false;
+}
+
+MCTargetStreamer *llvm::createX86AsmTargetStreamer(MCStreamer &S,
+ formatted_raw_ostream &OS,
+ MCInstPrinter *InstPrinter,
+ bool IsVerboseAsm) {
+ // FIXME: This makes it so we textually assemble COFF directives on ELF.
+ // That's kind of nonsensical.
+ return new X86WinCOFFAsmTargetStreamer(S, OS, *InstPrinter);
+}
+
+MCTargetStreamer *
+llvm::createX86ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+ // No need to register a target streamer.
+ if (!STI.getTargetTriple().isOSBinFormatCOFF())
+ return nullptr;
+ // Registers itself to the MCStreamer.
+ return new X86WinCOFFTargetStreamer(S);
+}
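
[editor note] The FrameFunc string built in emitFrameDataRecord is a small postfix program the debugger evaluates: $T0 is set to the CFA, then the caller's $eip, $esp, and any saved registers are recovered relative to it. A standalone sketch reproducing those strings for the frame-register and .raSearch cases, assuming a single register saved at CFA - 4 (register names hard-coded for illustration):

#include <iostream>
#include <sstream>
#include <string>

// Mirrors FPOStateMachine::emitFrameDataRecord's FrameFunc construction.
std::string frameFunc(bool HasFrameReg, unsigned FrameRegOff) {
  std::ostringstream OS;
  if (HasFrameReg)
    OS << "$T0 $ebp " << FrameRegOff << " + = "; // CFA = frame register + offset
  else
    OS << "$T0 .raSearch = ";                    // ask the debugger to search for the return address
  OS << "$eip $T0 ^ = $esp $T0 4 + = ";          // caller's EIP and ESP from the CFA
  OS << "$ebp $T0 4 - ^ = ";                     // one register saved at CFA - 4
  return OS.str();
}

int main() {
  std::cout << frameFunc(false, 0) << "\n"; // prologue before .cv_fpo_setframe
  std::cout << frameFunc(true, 4) << "\n";  // after the frame register is set
  return 0;
}
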
diff --git a/lib/Target/X86/README-SSE.txt b/lib/Target/X86/README-SSE.txt
index e6896e805568..73cf27692447 100644
--- a/lib/Target/X86/README-SSE.txt
+++ b/lib/Target/X86/README-SSE.txt
@@ -145,15 +145,15 @@ This is the llvm code after instruction scheduling:
cond_next140 (0xa910740, LLVM BB @0xa90beb0):
%reg1078 = MOV32ri -3
- %reg1079 = ADD32rm %reg1078, %reg1068, 1, %NOREG, 0
- %reg1037 = MOV32rm %reg1024, 1, %NOREG, 40
+ %reg1079 = ADD32rm %reg1078, %reg1068, 1, %noreg, 0
+ %reg1037 = MOV32rm %reg1024, 1, %noreg, 40
%reg1080 = IMUL32rr %reg1079, %reg1037
- %reg1081 = MOV32rm %reg1058, 1, %NOREG, 0
+ %reg1081 = MOV32rm %reg1058, 1, %noreg, 0
%reg1038 = LEA32r %reg1081, 1, %reg1080, -3
- %reg1036 = MOV32rm %reg1024, 1, %NOREG, 32
+ %reg1036 = MOV32rm %reg1024, 1, %noreg, 32
%reg1082 = SHL32ri %reg1038, 4
%reg1039 = ADD32rr %reg1036, %reg1082
- %reg1083 = MOVAPSrm %reg1059, 1, %NOREG, 0
+ %reg1083 = MOVAPSrm %reg1059, 1, %noreg, 0
%reg1034 = SHUFPSrr %reg1083, %reg1083, 170
%reg1032 = SHUFPSrr %reg1083, %reg1083, 0
%reg1035 = SHUFPSrr %reg1083, %reg1083, 255
@@ -166,32 +166,32 @@ cond_next140 (0xa910740, LLVM BB @0xa90beb0):
Still ok. After register allocation:
cond_next140 (0xa910740, LLVM BB @0xa90beb0):
- %EAX = MOV32ri -3
- %EDX = MOV32rm <fi#3>, 1, %NOREG, 0
- ADD32rm %EAX<def&use>, %EDX, 1, %NOREG, 0
- %EDX = MOV32rm <fi#7>, 1, %NOREG, 0
- %EDX = MOV32rm %EDX, 1, %NOREG, 40
- IMUL32rr %EAX<def&use>, %EDX
- %ESI = MOV32rm <fi#5>, 1, %NOREG, 0
- %ESI = MOV32rm %ESI, 1, %NOREG, 0
- MOV32mr <fi#4>, 1, %NOREG, 0, %ESI
- %EAX = LEA32r %ESI, 1, %EAX, -3
- %ESI = MOV32rm <fi#7>, 1, %NOREG, 0
- %ESI = MOV32rm %ESI, 1, %NOREG, 32
- %EDI = MOV32rr %EAX
- SHL32ri %EDI<def&use>, 4
- ADD32rr %EDI<def&use>, %ESI
- %XMM0 = MOVAPSrm %ECX, 1, %NOREG, 0
- %XMM1 = MOVAPSrr %XMM0
- SHUFPSrr %XMM1<def&use>, %XMM1, 170
- %XMM2 = MOVAPSrr %XMM0
- SHUFPSrr %XMM2<def&use>, %XMM2, 0
- %XMM3 = MOVAPSrr %XMM0
- SHUFPSrr %XMM3<def&use>, %XMM3, 255
- SHUFPSrr %XMM0<def&use>, %XMM0, 85
- %EBX = MOV32rr %EDI
- AND32ri8 %EBX<def&use>, 15
- CMP32ri8 %EBX, 0
+ %eax = MOV32ri -3
+ %edx = MOV32rm %stack.3, 1, %noreg, 0
+ ADD32rm %eax<def&use>, %edx, 1, %noreg, 0
+ %edx = MOV32rm %stack.7, 1, %noreg, 0
+ %edx = MOV32rm %edx, 1, %noreg, 40
+ IMUL32rr %eax<def&use>, %edx
+ %esi = MOV32rm %stack.5, 1, %noreg, 0
+ %esi = MOV32rm %esi, 1, %noreg, 0
+ MOV32mr %stack.4, 1, %noreg, 0, %esi
+ %eax = LEA32r %esi, 1, %eax, -3
+ %esi = MOV32rm %stack.7, 1, %noreg, 0
+ %esi = MOV32rm %esi, 1, %noreg, 32
+ %edi = MOV32rr %eax
+ SHL32ri %edi<def&use>, 4
+ ADD32rr %edi<def&use>, %esi
+ %xmm0 = MOVAPSrm %ecx, 1, %noreg, 0
+ %xmm1 = MOVAPSrr %xmm0
+ SHUFPSrr %xmm1<def&use>, %xmm1, 170
+ %xmm2 = MOVAPSrr %xmm0
+ SHUFPSrr %xmm2<def&use>, %xmm2, 0
+ %xmm3 = MOVAPSrr %xmm0
+ SHUFPSrr %xmm3<def&use>, %xmm3, 255
+ SHUFPSrr %xmm0<def&use>, %xmm0, 85
+ %ebx = MOV32rr %edi
+ AND32ri8 %ebx<def&use>, 15
+ CMP32ri8 %ebx, 0
JE mbb<cond_next204,0xa914d30>
This looks really bad. The problem is shufps is a destructive opcode. Since it
diff --git a/lib/Target/X86/README-X86-64.txt b/lib/Target/X86/README-X86-64.txt
index 09626e13849d..a3ea4595ac1e 100644
--- a/lib/Target/X86/README-X86-64.txt
+++ b/lib/Target/X86/README-X86-64.txt
@@ -103,20 +103,20 @@ LBB1_3: ## bb
Before regalloc, we have:
- %reg1025<def> = IMUL32rri8 %reg1024, 45, %EFLAGS<imp-def>
+ %reg1025 = IMUL32rri8 %reg1024, 45, implicit-def %eflags
JMP mbb<bb2,0x203afb0>
Successors according to CFG: 0x203afb0 (#3)
bb1: 0x203af60, LLVM BB @0x1e02310, ID#2:
Predecessors according to CFG: 0x203aec0 (#0)
- %reg1026<def> = IMUL32rri8 %reg1024, 78, %EFLAGS<imp-def>
+ %reg1026 = IMUL32rri8 %reg1024, 78, implicit-def %eflags
Successors according to CFG: 0x203afb0 (#3)
bb2: 0x203afb0, LLVM BB @0x1e02340, ID#3:
Predecessors according to CFG: 0x203af10 (#1) 0x203af60 (#2)
- %reg1027<def> = PHI %reg1025, mbb<bb,0x203af10>,
+ %reg1027 = PHI %reg1025, mbb<bb,0x203af10>,
%reg1026, mbb<bb1,0x203af60>
- %reg1029<def> = MOVZX64rr32 %reg1027
+ %reg1029 = MOVZX64rr32 %reg1027
so we'd have to know that IMUL32rri8 leaves the high word zero extended and to
be able to recognize the zero extend. This could also presumably be implemented
diff --git a/lib/Target/X86/README.txt b/lib/Target/X86/README.txt
index 799157c926e6..11652af9f1fc 100644
--- a/lib/Target/X86/README.txt
+++ b/lib/Target/X86/README.txt
@@ -987,11 +987,11 @@ bb7: ; preds = %entry
to:
foo: # @foo
-# BB#0: # %entry
+# %bb.0: # %entry
movl 4(%esp), %ecx
cmpb $0, 16(%esp)
je .LBB0_2
-# BB#1: # %bb
+# %bb.1: # %bb
movl 8(%esp), %eax
addl %ecx, %eax
ret
@@ -1073,7 +1073,7 @@ declare void @exit(i32) noreturn nounwind
This compiles into:
_abort_gzip: ## @abort_gzip
-## BB#0: ## %entry
+## %bb.0: ## %entry
subl $12, %esp
movb _in_exit.4870.b, %al
cmpb $1, %al
@@ -1396,7 +1396,7 @@ define i32 @bar(%struct.B* nocapture %a) nounwind readonly optsize {
}
bar: # @bar
-# BB#0:
+# %bb.0:
movb (%rdi), %al
andb $1, %al
movzbl %al, %eax
@@ -1633,7 +1633,7 @@ In the real code, we get a lot more wrong than this. However, even in this
code we generate:
_foo: ## @foo
-## BB#0: ## %entry
+## %bb.0: ## %entry
movb (%rsi), %al
movb (%rdi), %cl
cmpb %al, %cl
@@ -1646,12 +1646,12 @@ LBB0_2: ## %if.end
movb 1(%rdi), %cl
cmpb %al, %cl
jne LBB0_1
-## BB#3: ## %if.end38
+## %bb.3: ## %if.end38
movb 2(%rsi), %al
movb 2(%rdi), %cl
cmpb %al, %cl
jne LBB0_1
-## BB#4: ## %if.end60
+## %bb.4: ## %if.end60
movb 3(%rdi), %al
cmpb 3(%rsi), %al
LBB0_5: ## %if.end60
diff --git a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index d2654fc67ed5..16c2b56c48b5 100644
--- a/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -22,8 +22,8 @@ Target &llvm::getTheX86_64Target() {
extern "C" void LLVMInitializeX86TargetInfo() {
RegisterTarget<Triple::x86, /*HasJIT=*/true> X(
- getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above");
+ getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above", "X86");
RegisterTarget<Triple::x86_64, /*HasJIT=*/true> Y(
- getTheX86_64Target(), "x86-64", "64-bit X86: EM64T and AMD64");
+ getTheX86_64Target(), "x86-64", "64-bit X86: EM64T and AMD64", "X86");
}
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 91201d1fec85..5631648d2dc8 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -92,9 +92,13 @@ FunctionPass *createX86CmovConverterPass();
/// the upper portions of registers, and to save code size.
FunctionPass *createX86FixupBWInsts();
+/// Return a Machine IR pass that reassigns instruction chains from one domain
+/// to another, when profitable.
+FunctionPass *createX86DomainReassignmentPass();
+
void initializeFixupBWInstPassPass(PassRegistry &);
-/// This pass replaces EVEX ecnoded of AVX-512 instructiosn by VEX
+/// This pass replaces the EVEX encoding of AVX-512 instructions with the VEX
/// encoding when possible in order to reduce code size.
FunctionPass *createX86EvexToVexInsts();
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 54eabeac5126..08731cd0204c 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -95,8 +95,6 @@ def Feature64Bit : SubtargetFeature<"64bit", "HasX86_64", "true",
def FeatureCMPXCHG16B : SubtargetFeature<"cx16", "HasCmpxchg16b", "true",
"64-bit with cmpxchg16b",
[Feature64Bit]>;
-def FeatureSlowBTMem : SubtargetFeature<"slow-bt-mem", "IsBTMemSlow", "true",
- "Bit testing of memory is slow">;
def FeatureSlowSHLD : SubtargetFeature<"slow-shld", "IsSHLDSlow", "true",
"SHLD instruction is slow">;
def FeatureSlowPMULLD : SubtargetFeature<"slow-pmulld", "IsPMULLDSlow", "true",
@@ -118,9 +116,15 @@ def FeatureAVX : SubtargetFeature<"avx", "X86SSELevel", "AVX",
def FeatureAVX2 : SubtargetFeature<"avx2", "X86SSELevel", "AVX2",
"Enable AVX2 instructions",
[FeatureAVX]>;
+def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
+ "Enable three-operand fused multiple-add",
+ [FeatureAVX]>;
+def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
+ "Support 16-bit floating point conversion instructions",
+ [FeatureAVX]>;
def FeatureAVX512 : SubtargetFeature<"avx512f", "X86SSELevel", "AVX512F",
"Enable AVX-512 instructions",
- [FeatureAVX2]>;
+ [FeatureAVX2, FeatureFMA, FeatureF16C]>;
def FeatureERI : SubtargetFeature<"avx512er", "HasERI", "true",
"Enable AVX-512 Exponential and Reciprocal Instructions",
[FeatureAVX512]>;
@@ -148,17 +152,29 @@ def FeatureVLX : SubtargetFeature<"avx512vl", "HasVLX", "true",
def FeatureVBMI : SubtargetFeature<"avx512vbmi", "HasVBMI", "true",
"Enable AVX-512 Vector Byte Manipulation Instructions",
[FeatureBWI]>;
+def FeatureVBMI2 : SubtargetFeature<"avx512vbmi2", "HasVBMI2", "true",
+ "Enable AVX-512 further Vector Byte Manipulation Instructions",
+ [FeatureBWI]>;
def FeatureIFMA : SubtargetFeature<"avx512ifma", "HasIFMA", "true",
"Enable AVX-512 Integer Fused Multiple-Add",
[FeatureAVX512]>;
def FeaturePKU : SubtargetFeature<"pku", "HasPKU", "true",
"Enable protection keys">;
+def FeatureVNNI : SubtargetFeature<"avx512vnni", "HasVNNI", "true",
+ "Enable AVX-512 Vector Neural Network Instructions",
+ [FeatureAVX512]>;
+def FeatureBITALG : SubtargetFeature<"avx512bitalg", "HasBITALG", "true",
+ "Enable AVX-512 Bit Algorithms",
+ [FeatureBWI]>;
def FeaturePCLMUL : SubtargetFeature<"pclmul", "HasPCLMUL", "true",
"Enable packed carry-less multiplication instructions",
[FeatureSSE2]>;
-def FeatureFMA : SubtargetFeature<"fma", "HasFMA", "true",
- "Enable three-operand fused multiple-add",
- [FeatureAVX]>;
+def FeatureGFNI : SubtargetFeature<"gfni", "HasGFNI", "true",
+ "Enable Galois Field Arithmetic Instructions",
+ [FeatureSSE2]>;
+def FeatureVPCLMULQDQ : SubtargetFeature<"vpclmulqdq", "HasVPCLMULQDQ", "true",
+ "Enable vpclmulqdq instructions",
+ [FeatureAVX, FeaturePCLMUL]>;
def FeatureFMA4 : SubtargetFeature<"fma4", "HasFMA4", "true",
"Enable four-operand fused multiple-add",
[FeatureAVX, FeatureSSE4A]>;
@@ -171,6 +187,9 @@ def FeatureSSEUnalignedMem : SubtargetFeature<"sse-unaligned-mem",
def FeatureAES : SubtargetFeature<"aes", "HasAES", "true",
"Enable AES instructions",
[FeatureSSE2]>;
+def FeatureVAES : SubtargetFeature<"vaes", "HasVAES", "true",
+ "Promote selected AES instructions to AVX512/AVX registers",
+ [FeatureAVX, FeatureAES]>;
def FeatureTBM : SubtargetFeature<"tbm", "HasTBM", "true",
"Enable TBM instructions">;
def FeatureLWP : SubtargetFeature<"lwp", "HasLWP", "true",
@@ -179,9 +198,6 @@ def FeatureMOVBE : SubtargetFeature<"movbe", "HasMOVBE", "true",
"Support MOVBE instruction">;
def FeatureRDRAND : SubtargetFeature<"rdrnd", "HasRDRAND", "true",
"Support RDRAND instruction">;
-def FeatureF16C : SubtargetFeature<"f16c", "HasF16C", "true",
- "Support 16-bit floating point conversion instructions",
- [FeatureAVX]>;
def FeatureFSGSBase : SubtargetFeature<"fsgsbase", "HasFSGSBase", "true",
"Support FS/GS Base instructions">;
def FeatureLZCNT : SubtargetFeature<"lzcnt", "HasLZCNT", "true",
@@ -197,6 +213,10 @@ def FeatureADX : SubtargetFeature<"adx", "HasADX", "true",
def FeatureSHA : SubtargetFeature<"sha", "HasSHA", "true",
"Enable SHA instructions",
[FeatureSSE2]>;
+def FeatureSHSTK : SubtargetFeature<"shstk", "HasSHSTK", "true",
+ "Support CET Shadow-Stack instructions">;
+def FeatureIBT : SubtargetFeature<"ibt", "HasIBT", "true",
+ "Support CET Indirect-Branch-Tracking instructions">;
def FeaturePRFCHW : SubtargetFeature<"prfchw", "HasPRFCHW", "true",
"Support PRFCHW instructions">;
def FeatureRDSEED : SubtargetFeature<"rdseed", "HasRDSEED", "true",
@@ -226,14 +246,12 @@ def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
"Flush A Cache Line Optimized">;
def FeatureCLWB : SubtargetFeature<"clwb", "HasCLWB", "true",
"Cache Line Write Back">;
-// TODO: This feature ought to be renamed.
-// What it really refers to are CPUs for which certain instructions
-// (which ones besides the example below?) are microcoded.
-// The best examples of this are the memory forms of CALL and PUSH
-// instructions, which should be avoided in favor of a MOV + register CALL/PUSH.
-def FeatureCallRegIndirect : SubtargetFeature<"call-reg-indirect",
- "CallRegIndirect", "true",
- "Call register indirect">;
+// On some processors, instructions that implicitly take two memory operands are
+// slow. In practice, this means that CALL, PUSH, and POP with memory operands
+// should be avoided in favor of a MOV + register CALL/PUSH/POP.
+def FeatureSlowTwoMemOps : SubtargetFeature<"slow-two-mem-ops",
+ "SlowTwoMemOps", "true",
+ "Two memory operand instructions are slow">;
def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
"LEA instruction needs inputs at AG stage">;
def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
@@ -290,11 +308,50 @@ def FeatureERMSB
"ermsb", "HasERMSB", "true",
"REP MOVS/STOS are fast">;
+// Sandy Bridge and newer processors have many instructions that can be
+// fused with conditional branches and pass through the CPU as a single
+// operation.
+def FeatureMacroFusion
+ : SubtargetFeature<"macrofusion", "HasMacroFusion", "true",
+ "Various instructions can be fused with conditional branches">;
+
+// Gather is available since Haswell (AVX2 set). So technically, we can
+// generate Gathers on all AVX2 processors. But the overhead on HSW is high.
+// Skylake Client processor has faster Gathers than HSW and performance is
+// similar to Skylake Server (AVX-512).
+def FeatureHasFastGather
+ : SubtargetFeature<"fast-gather", "HasFastGather", "true",
+ "Indicates if gather is reasonably fast.">;
+
//===----------------------------------------------------------------------===//
-// X86 processors supported.
+// Register File Description
+//===----------------------------------------------------------------------===//
+
+include "X86RegisterInfo.td"
+include "X86RegisterBanks.td"
+
+//===----------------------------------------------------------------------===//
+// Instruction Descriptions
//===----------------------------------------------------------------------===//
include "X86Schedule.td"
+include "X86InstrInfo.td"
+
+def X86InstrInfo : InstrInfo;
+
+//===----------------------------------------------------------------------===//
+// X86 processors supported.
+//===----------------------------------------------------------------------===//
+
+include "X86ScheduleAtom.td"
+include "X86SchedSandyBridge.td"
+include "X86SchedHaswell.td"
+include "X86SchedBroadwell.td"
+include "X86ScheduleSLM.td"
+include "X86ScheduleZnver1.td"
+include "X86ScheduleBtVer2.td"
+include "X86SchedSkylakeClient.td"
+include "X86SchedSkylakeServer.td"
def ProcIntelAtom : SubtargetFeature<"atom", "X86ProcFamily", "IntelAtom",
"Intel Atom processors">;
@@ -302,6 +359,20 @@ def ProcIntelSLM : SubtargetFeature<"slm", "X86ProcFamily", "IntelSLM",
"Intel Silvermont processors">;
def ProcIntelGLM : SubtargetFeature<"glm", "X86ProcFamily", "IntelGLM",
"Intel Goldmont processors">;
+def ProcIntelHSW : SubtargetFeature<"haswell", "X86ProcFamily",
+ "IntelHaswell", "Intel Haswell processors">;
+def ProcIntelBDW : SubtargetFeature<"broadwell", "X86ProcFamily",
+ "IntelBroadwell", "Intel Broadwell processors">;
+def ProcIntelSKL : SubtargetFeature<"skylake", "X86ProcFamily",
+ "IntelSkylake", "Intel Skylake processors">;
+def ProcIntelKNL : SubtargetFeature<"knl", "X86ProcFamily",
+ "IntelKNL", "Intel Knights Landing processors">;
+def ProcIntelSKX : SubtargetFeature<"skx", "X86ProcFamily",
+ "IntelSKX", "Intel Skylake Server processors">;
+def ProcIntelCNL : SubtargetFeature<"cannonlake", "X86ProcFamily",
+ "IntelCannonlake", "Intel Cannonlake processors">;
+def ProcIntelICL : SubtargetFeature<"icelake", "X86ProcFamily",
+ "IntelIcelake", "Intel Icelake processors">;
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
@@ -312,14 +383,18 @@ def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16]>;
def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
-def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16]>;
-def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
+
+foreach P = ["i686", "pentiumpro"] in {
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMOV]>;
+}
+
def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureCMOV, FeatureFXSR]>;
-def : Proc<"pentium3", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE1, FeatureFXSR]>;
-def : Proc<"pentium3m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE1, FeatureFXSR, FeatureSlowBTMem]>;
+
+foreach P = ["pentium3", "pentium3m"] in {
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE1,
+ FeatureFXSR]>;
+}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
// The intent is to enable it for pentium4 which is the current default
@@ -333,15 +408,13 @@ def : Proc<"pentium3m", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
def : ProcessorModel<"pentium-m", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
-
-def : ProcessorModel<"pentium4", GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
FeatureSSE2, FeatureFXSR]>;
-def : ProcessorModel<"pentium4m", GenericPostRAModel,
- [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
- FeatureSSE2, FeatureFXSR, FeatureSlowBTMem]>;
+foreach P = ["pentium4", "pentium4m"] in {
+ def : ProcessorModel<P, GenericPostRAModel,
+ [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureSSE2, FeatureFXSR]>;
+}
// Intel Quark.
def : Proc<"lakemont", []>;
@@ -349,20 +422,19 @@ def : Proc<"lakemont", []>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
- FeatureFXSR, FeatureSlowBTMem]>;
+ FeatureFXSR]>;
// NetBurst.
def : ProcessorModel<"prescott", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureMMX, FeatureSSE3,
- FeatureFXSR, FeatureSlowBTMem]>;
+ FeatureFXSR]>;
def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureSlowUAMem16,
FeatureMMX,
FeatureSSE3,
FeatureFXSR,
- FeatureCMPXCHG16B,
- FeatureSlowBTMem
+ FeatureCMPXCHG16B
]>;
// Intel Core 2 Solo/Duo.
@@ -373,8 +445,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
FeatureSSSE3,
FeatureFXSR,
FeatureCMPXCHG16B,
- FeatureSlowBTMem,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureMacroFusion
]>;
def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureX87,
@@ -383,8 +455,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureSSE41,
FeatureFXSR,
FeatureCMPXCHG16B,
- FeatureSlowBTMem,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureMacroFusion
]>;
// Atom CPUs.
@@ -397,11 +469,10 @@ class BonnellProc<string Name> : ProcessorModel<Name, AtomModel, [
FeatureFXSR,
FeatureCMPXCHG16B,
FeatureMOVBE,
- FeatureSlowBTMem,
FeatureLEAForSP,
FeatureSlowDivide32,
FeatureSlowDivide64,
- FeatureCallRegIndirect,
+ FeatureSlowTwoMemOps,
FeatureLEAUsesAG,
FeaturePadShortFunctions,
FeatureLAHFSAHF
@@ -421,11 +492,10 @@ class SilvermontProc<string Name> : ProcessorModel<Name, SLMModel, [
FeaturePCLMUL,
FeatureAES,
FeatureSlowDivide64,
- FeatureCallRegIndirect,
+ FeatureSlowTwoMemOps,
FeaturePRFCHW,
FeatureSlowLEA,
FeatureSlowIncDec,
- FeatureSlowBTMem,
FeatureSlowPMULLD,
FeatureLAHFSAHF
]>;
@@ -444,10 +514,9 @@ class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [
FeaturePCLMUL,
FeatureAES,
FeaturePRFCHW,
- FeatureCallRegIndirect,
+ FeatureSlowTwoMemOps,
FeatureSlowLEA,
FeatureSlowIncDec,
- FeatureSlowBTMem,
FeatureLAHFSAHF,
FeatureMPX,
FeatureSHA,
@@ -457,7 +526,8 @@ class GoldmontProc<string Name> : ProcessorModel<Name, SLMModel, [
FeatureXSAVEOPT,
FeatureXSAVEC,
FeatureXSAVES,
- FeatureCLFLUSHOPT
+ FeatureCLFLUSHOPT,
+ FeatureFSGSBase
]>;
def : GoldmontProc<"goldmont">;
@@ -468,9 +538,9 @@ class NehalemProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
FeatureSSE42,
FeatureFXSR,
FeatureCMPXCHG16B,
- FeatureSlowBTMem,
FeaturePOPCNT,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureMacroFusion
]>;
def : NehalemProc<"nehalem">;
def : NehalemProc<"corei7">;
@@ -483,11 +553,11 @@ class WestmereProc<string Name> : ProcessorModel<Name, SandyBridgeModel, [
FeatureSSE42,
FeatureFXSR,
FeatureCMPXCHG16B,
- FeatureSlowBTMem,
FeaturePOPCNT,
FeatureAES,
FeaturePCLMUL,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureMacroFusion
]>;
def : WestmereProc<"westmere">;
@@ -518,12 +588,13 @@ def SNBFeatures : ProcessorFeatures<[], [
FeatureLAHFSAHF,
FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
- FeatureFastSHLDRotate
+ FeatureFastSHLDRotate,
+ FeatureSlowIncDec,
+ FeatureMacroFusion
]>;
class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
SNBFeatures.Value, [
- FeatureSlowBTMem,
FeatureSlowUAMem32
]>;
def : SandyBridgeProc<"sandybridge">;
@@ -537,7 +608,6 @@ def IVBFeatures : ProcessorFeatures<SNBFeatures.Value, [
class IvyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
IVBFeatures.Value, [
- FeatureSlowBTMem,
FeatureSlowUAMem32
]>;
def : IvyBridgeProc<"ivybridge">;
@@ -550,12 +620,13 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureERMSB,
FeatureFMA,
FeatureLZCNT,
- FeatureMOVBE,
- FeatureSlowIncDec
+ FeatureMOVBE
]>;
class HaswellProc<string Name> : ProcModel<Name, HaswellModel,
- HSWFeatures.Value, []>;
+ HSWFeatures.Value, [
+ ProcIntelHSW
+]>;
def : HaswellProc<"haswell">;
def : HaswellProc<"core-avx2">; // Legacy alias.
@@ -563,8 +634,10 @@ def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
FeatureADX,
FeatureRDSEED
]>;
-class BroadwellProc<string Name> : ProcModel<Name, HaswellModel,
- BDWFeatures.Value, []>;
+class BroadwellProc<string Name> : ProcModel<Name, BroadwellModel,
+ BDWFeatures.Value, [
+ ProcIntelBDW
+]>;
def : BroadwellProc<"broadwell">;
def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
@@ -577,14 +650,14 @@ def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
FeatureFastVectorFSQRT
]>;
-// FIXME: define SKL model
-class SkylakeClientProc<string Name> : ProcModel<Name, HaswellModel,
- SKLFeatures.Value, []>;
+class SkylakeClientProc<string Name> : ProcModel<Name, SkylakeClientModel,
+ SKLFeatures.Value, [
+ ProcIntelSKL,
+ FeatureHasFastGather
+]>;
def : SkylakeClientProc<"skylake">;
-// FIXME: define KNL model
-class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
- IVBFeatures.Value, [
+def KNLFeatures : ProcessorFeatures<IVBFeatures.Value, [
FeatureAVX512,
FeatureERI,
FeatureCDI,
@@ -596,11 +669,29 @@ class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
FeatureLZCNT,
FeatureBMI,
FeatureBMI2,
- FeatureFMA,
- FeatureFastPartialYMMorZMMWrite
+ FeatureFMA
+]>;
+
+// FIXME: define KNL model
+class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
+ KNLFeatures.Value, [
+ ProcIntelKNL,
+ FeatureSlowTwoMemOps,
+ FeatureFastPartialYMMorZMMWrite,
+ FeatureHasFastGather
]>;
def : KnightsLandingProc<"knl">;
+class KnightsMillProc<string Name> : ProcModel<Name, HaswellModel,
+ KNLFeatures.Value, [
+ ProcIntelKNL,
+ FeatureSlowTwoMemOps,
+ FeatureFastPartialYMMorZMMWrite,
+ FeatureHasFastGather,
+ FeatureVPOPCNTDQ
+]>;
+def : KnightsMillProc<"knm">; // TODO Add AVX5124FMAPS/AVX5124VNNIW features
+
def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
FeatureAVX512,
FeatureCDI,
@@ -611,9 +702,11 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
FeatureCLWB
]>;
-// FIXME: define SKX model
-class SkylakeServerProc<string Name> : ProcModel<Name, HaswellModel,
- SKXFeatures.Value, []>;
+class SkylakeServerProc<string Name> : ProcModel<Name, SkylakeServerModel,
+ SKXFeatures.Value, [
+ ProcIntelSKX,
+ FeatureHasFastGather
+]>;
def : SkylakeServerProc<"skylake-avx512">;
def : SkylakeServerProc<"skx">; // Legacy alias.
@@ -623,57 +716,60 @@ def CNLFeatures : ProcessorFeatures<SKXFeatures.Value, [
FeatureSHA
]>;
-class CannonlakeProc<string Name> : ProcModel<Name, HaswellModel,
- CNLFeatures.Value, []>;
+class CannonlakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
+ CNLFeatures.Value, [
+ ProcIntelCNL,
+ FeatureHasFastGather
+]>;
def : CannonlakeProc<"cannonlake">;
+def ICLFeatures : ProcessorFeatures<CNLFeatures.Value, [
+ FeatureBITALG,
+ FeatureVAES,
+ FeatureVBMI2,
+ FeatureVNNI,
+ FeatureVPCLMULQDQ,
+ FeatureVPOPCNTDQ,
+ FeatureGFNI
+]>;
+
+class IcelakeProc<string Name> : ProcModel<Name, SkylakeServerModel,
+ ICLFeatures.Value, [
+ ProcIntelICL,
+ FeatureHasFastGather
+]>;
+def : IcelakeProc<"icelake">;
+
// AMD CPUs.
def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"athlon", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-tbird", [FeatureX87, FeatureSlowUAMem16, Feature3DNowA,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-4", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
- Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
- FeatureSlowSHLD]>;
-def : Proc<"athlon-xp", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
- Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
- FeatureSlowSHLD]>;
-def : Proc<"athlon-mp", [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
- Feature3DNowA, FeatureFXSR, FeatureSlowBTMem,
- FeatureSlowSHLD]>;
-def : Proc<"k8", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
- Feature3DNowA, FeatureFXSR, Feature64Bit,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"opteron", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
- Feature3DNowA, FeatureFXSR, Feature64Bit,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon64", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
- Feature3DNowA, FeatureFXSR, Feature64Bit,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon-fx", [FeatureX87, FeatureSlowUAMem16, FeatureSSE2,
- Feature3DNowA, FeatureFXSR, Feature64Bit,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"k8-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
- Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"opteron-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
- Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"athlon64-sse3", [FeatureX87, FeatureSlowUAMem16, FeatureSSE3,
- Feature3DNowA, FeatureFXSR, FeatureCMPXCHG16B,
- FeatureSlowBTMem, FeatureSlowSHLD]>;
-def : Proc<"amdfam10", [FeatureX87, FeatureSSE4A, Feature3DNowA,
- FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT,
- FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD,
- FeatureLAHFSAHF]>;
-def : Proc<"barcelona", [FeatureX87, FeatureSSE4A, Feature3DNowA,
- FeatureFXSR, FeatureCMPXCHG16B, FeatureLZCNT,
- FeaturePOPCNT, FeatureSlowBTMem, FeatureSlowSHLD,
- FeatureLAHFSAHF]>;
+
+foreach P = ["athlon", "athlon-tbird"] in {
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, Feature3DNowA, FeatureSlowSHLD]>;
+}
+
+foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE1,
+ Feature3DNowA, FeatureFXSR, FeatureSlowSHLD]>;
+}
+
+foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE2, Feature3DNowA,
+ FeatureFXSR, Feature64Bit, FeatureSlowSHLD]>;
+}
+
+foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
+ def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureSSE3, Feature3DNowA,
+ FeatureFXSR, FeatureCMPXCHG16B, FeatureSlowSHLD]>;
+}
+
+foreach P = ["amdfam10", "barcelona"] in {
+ def : Proc<P, [FeatureX87, FeatureSSE4A, Feature3DNowA, FeatureFXSR,
+ FeatureCMPXCHG16B, FeatureLZCNT, FeaturePOPCNT,
+ FeatureSlowSHLD, FeatureLAHFSAHF]>;
+}
// Bobcat
def : Proc<"btver1", [
@@ -732,7 +828,8 @@ def : Proc<"bdver1", [
FeatureXSAVE,
FeatureLWP,
FeatureSlowSHLD,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureMacroFusion
]>;
// Piledriver
def : Proc<"bdver2", [
@@ -756,7 +853,8 @@ def : Proc<"bdver2", [
FeatureLWP,
FeatureFMA,
FeatureSlowSHLD,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureMacroFusion
]>;
// Steamroller
@@ -783,7 +881,8 @@ def : Proc<"bdver3", [
FeatureXSAVEOPT,
FeatureSlowSHLD,
FeatureFSGSBase,
- FeatureLAHFSAHF
+ FeatureLAHFSAHF,
+ FeatureMacroFusion
]>;
// Excavator
@@ -811,7 +910,8 @@ def : Proc<"bdver4", [
FeatureSlowSHLD,
FeatureFSGSBase,
FeatureLAHFSAHF,
- FeatureMWAITX
+ FeatureMWAITX,
+ FeatureMacroFusion
]>;
// Znver1
@@ -831,6 +931,7 @@ def: ProcessorModel<"znver1", Znver1Model, [
FeatureFastLZCNT,
FeatureLAHFSAHF,
FeatureLZCNT,
+ FeatureMacroFusion,
FeatureMMX,
FeatureMOVBE,
FeatureMWAITX,
@@ -866,24 +967,16 @@ def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
// covers a huge swath of x86 processors. If there are specific scheduling
// knobs which need to be tuned differently for AMD chips, we might consider
// forming a common base for them.
-def : ProcessorModel<"x86-64", SandyBridgeModel,
- [FeatureX87, FeatureMMX, FeatureSSE2, FeatureFXSR,
- Feature64Bit, FeatureSlowBTMem ]>;
-
-//===----------------------------------------------------------------------===//
-// Register File Description
-//===----------------------------------------------------------------------===//
-
-include "X86RegisterInfo.td"
-include "X86RegisterBanks.td"
-
-//===----------------------------------------------------------------------===//
-// Instruction Descriptions
-//===----------------------------------------------------------------------===//
-
-include "X86InstrInfo.td"
-
-def X86InstrInfo : InstrInfo;
+def : ProcessorModel<"x86-64", SandyBridgeModel, [
+ FeatureX87,
+ FeatureMMX,
+ FeatureSSE2,
+ FeatureFXSR,
+ Feature64Bit,
+ FeatureSlow3OpsLEA,
+ FeatureSlowIncDec,
+ FeatureMacroFusion
+]>;
//===----------------------------------------------------------------------===//
// Calling Conventions
diff --git a/lib/Target/X86/X86AsmPrinter.cpp b/lib/Target/X86/X86AsmPrinter.cpp
index dc15aeadaa61..71526dd77f11 100644
--- a/lib/Target/X86/X86AsmPrinter.cpp
+++ b/lib/Target/X86/X86AsmPrinter.cpp
@@ -15,6 +15,7 @@
#include "X86AsmPrinter.h"
#include "InstPrinter/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86TargetStreamer.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "llvm/BinaryFormat/COFF.h"
@@ -22,12 +23,10 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -40,6 +39,10 @@
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
+X86AsmPrinter::X86AsmPrinter(TargetMachine &TM,
+ std::unique_ptr<MCStreamer> Streamer)
+ : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this) {}
+
//===----------------------------------------------------------------------===//
// Primitive Helper Functions.
//===----------------------------------------------------------------------===//
@@ -51,13 +54,16 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
SMShadowTracker.startFunction(MF);
CodeEmitter.reset(TM.getTarget().createMCCodeEmitter(
- *MF.getSubtarget().getInstrInfo(), *MF.getSubtarget().getRegisterInfo(),
+ *Subtarget->getInstrInfo(), *Subtarget->getRegisterInfo(),
MF.getContext()));
+ EmitFPOData =
+ Subtarget->isTargetWin32() && MF.getMMI().getModule()->getCodeViewFlag();
+
SetupMachineFunction(MF);
if (Subtarget->isTargetCOFF()) {
- bool Local = MF.getFunction()->hasLocalLinkage();
+ bool Local = MF.getFunction().hasLocalLinkage();
OutStreamer->BeginCOFFSymbolDef(CurrentFnSym);
OutStreamer->EmitCOFFSymbolStorageClass(
Local ? COFF::IMAGE_SYM_CLASS_STATIC : COFF::IMAGE_SYM_CLASS_EXTERNAL);
@@ -72,10 +78,30 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// Emit the XRay table for this function.
emitXRayTable();
+ EmitFPOData = false;
+
// We didn't modify anything.
return false;
}
+void X86AsmPrinter::EmitFunctionBodyStart() {
+ if (EmitFPOData) {
+ X86TargetStreamer *XTS =
+ static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer());
+ unsigned ParamsSize =
+ MF->getInfo<X86MachineFunctionInfo>()->getArgumentStackSize();
+ XTS->emitFPOProc(CurrentFnSym, ParamsSize);
+ }
+}
+
+void X86AsmPrinter::EmitFunctionBodyEnd() {
+ if (EmitFPOData) {
+ X86TargetStreamer *XTS =
+ static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer());
+ XTS->emitFPOEndProc();
+ }
+}
+
/// printSymbolOperand - Print a raw symbol reference operand. This handles
/// jump tables, constant pools, global address and external symbols, all of
/// which print to a label with various suffixes for relocation types etc.
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index d7c3b74d3efb..7e70789ac82c 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -14,6 +14,7 @@
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/FaultMaps.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/Target/TargetMachine.h"
// Implemented in X86MCInstLower.cpp
@@ -30,6 +31,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
StackMaps SM;
FaultMaps FM;
std::unique_ptr<MCCodeEmitter> CodeEmitter;
+ bool EmitFPOData = false;
// This utility class tracks the length of a stackmap instruction's 'shadow'.
// It is used by the X86AsmPrinter to ensure that the stackmap shadow
@@ -95,14 +97,11 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
- // Helper function that emits the XRay sleds we've collected for a particular
- // function.
- void EmitXRayTable();
+ // Choose between emitting .seh_ directives and .cv_fpo_ directives.
+ void EmitSEHInstruction(const MachineInstr *MI);
public:
- explicit X86AsmPrinter(TargetMachine &TM,
- std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)), SM(*this), FM(*this) {}
+ X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
StringRef getPassName() const override {
return "X86 Assembly Printer";
@@ -117,6 +116,7 @@ public:
void EmitInstruction(const MachineInstr *MI) override;
void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override {
+ AsmPrinter::EmitBasicBlockEnd(MBB);
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
}
@@ -133,10 +133,13 @@ public:
bool doInitialization(Module &M) override {
SMShadowTracker.reset(0);
SM.reset();
+ FM.reset();
return AsmPrinter::doInitialization(M);
}
bool runOnMachineFunction(MachineFunction &F) override;
+ void EmitFunctionBodyStart() override;
+ void EmitFunctionBodyEnd() override;
};
} // end namespace llvm
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 765af67de160..522dc7926b94 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -34,14 +34,14 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -56,18 +56,27 @@ static cl::opt<bool>
cl::desc("Avoid optimizing x86 call frames for size"),
cl::init(false), cl::Hidden);
+namespace llvm {
+void initializeX86CallFrameOptimizationPass(PassRegistry &);
+}
+
namespace {
class X86CallFrameOptimization : public MachineFunctionPass {
public:
- X86CallFrameOptimization() : MachineFunctionPass(ID) {}
+ X86CallFrameOptimization() : MachineFunctionPass(ID) {
+ initializeX86CallFrameOptimizationPass(
+ *PassRegistry::getPassRegistry());
+ }
bool runOnMachineFunction(MachineFunction &MF) override;
+ static char ID;
+
private:
// Information we know about a particular call site
struct CallContext {
- CallContext() : FrameSetup(nullptr), MovVector(4, nullptr) {}
+ CallContext() : FrameSetup(nullptr), ArgStoreVector(4, nullptr) {}
// Iterator referring to the frame setup instruction
MachineBasicBlock::iterator FrameSetup;
@@ -81,8 +90,8 @@ private:
// The total displacement of all passed parameters
int64_t ExpectedDist = 0;
- // The sequence of movs used to pass the parameters
- SmallVector<MachineInstr *, 4> MovVector;
+ // The sequence of storing instructions used to pass the parameters
+ SmallVector<MachineInstr *, 4> ArgStoreVector;
// True if this call site has no stack parameters
bool NoStackParams = false;
@@ -120,12 +129,12 @@ private:
MachineRegisterInfo *MRI;
unsigned SlotSize;
unsigned Log2SlotSize;
- static char ID;
};
-char X86CallFrameOptimization::ID = 0;
-
} // end anonymous namespace
+char X86CallFrameOptimization::ID = 0;
+INITIALIZE_PASS(X86CallFrameOptimization, DEBUG_TYPE,
+ "X86 Call Frame Optimization", false, false)
// This checks whether the transformation is legal.
// Also returns false in cases where it's potentially legal, but
@@ -139,7 +148,7 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
// is a danger of that being generated.
if (STI->isTargetDarwin() &&
(!MF.getLandingPads().empty() ||
- (MF.getFunction()->needsUnwindTableEntry() && !TFL->hasFP(MF))))
+ (MF.getFunction().needsUnwindTableEntry() && !TFL->hasFP(MF))))
return false;
// It is not valid to change the stack pointer outside the prolog/epilog
@@ -234,7 +243,7 @@ bool X86CallFrameOptimization::runOnMachineFunction(MachineFunction &MF) {
assert(isPowerOf2_32(SlotSize) && "Expect power of 2 stack slot size");
Log2SlotSize = Log2_32(SlotSize);
- if (skipFunction(*MF.getFunction()) || !isLegal(MF))
+ if (skipFunction(MF.getFunction()) || !isLegal(MF))
return false;
unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
@@ -271,11 +280,27 @@ X86CallFrameOptimization::classifyInstruction(
if (MI == MBB.end())
return Exit;
- // The instructions we actually care about are movs onto the stack
- int Opcode = MI->getOpcode();
- if (Opcode == X86::MOV32mi || Opcode == X86::MOV32mr ||
- Opcode == X86::MOV64mi32 || Opcode == X86::MOV64mr)
- return Convert;
+ // The instructions we actually care about are movs onto the stack, or
+ // special cases of constant stores to the stack.
+ switch (MI->getOpcode()) {
+ case X86::AND16mi8:
+ case X86::AND32mi8:
+ case X86::AND64mi8: {
+ MachineOperand ImmOp = MI->getOperand(X86::AddrNumOperands);
+ return ImmOp.getImm() == 0 ? Convert : Exit;
+ }
+ case X86::OR16mi8:
+ case X86::OR32mi8:
+ case X86::OR64mi8: {
+ MachineOperand ImmOp = MI->getOperand(X86::AddrNumOperands);
+ return ImmOp.getImm() == -1 ? Convert : Exit;
+ }
+ case X86::MOV32mi:
+ case X86::MOV32mr:
+ case X86::MOV64mi32:
+ case X86::MOV64mr:
+ return Convert;
+ }
// Not all calling conventions have only stack MOVs between the stack
// adjust and the call.
@@ -354,32 +379,40 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
++I;
unsigned StackPtr = RegInfo.getStackRegister();
+ auto StackPtrCopyInst = MBB.end();
// SelectionDAG (but not FastISel) inserts a copy of ESP into a virtual
- // register here. If it's there, use that virtual register as stack pointer
- // instead.
- if (I->isCopy() && I->getOperand(0).isReg() && I->getOperand(1).isReg() &&
- I->getOperand(1).getReg() == StackPtr) {
- Context.SPCopy = &*I++;
- StackPtr = Context.SPCopy->getOperand(0).getReg();
- }
+ // register. If it's there, use that virtual register as stack pointer
+ // instead. Also, we need to locate this instruction so that we can later
+ // safely ignore it while doing the conservative processing of the call chain.
+ // The COPY can be located anywhere between the call-frame setup
+ // instruction and its first use. We use the call instruction as a boundary
+ // because it is usually cheaper to check if an instruction is a call than
+ // checking if an instruction uses a register.
+ for (auto J = I; !J->isCall(); ++J)
+ if (J->isCopy() && J->getOperand(0).isReg() && J->getOperand(1).isReg() &&
+ J->getOperand(1).getReg() == StackPtr) {
+ StackPtrCopyInst = J;
+ Context.SPCopy = &*J++;
+ StackPtr = Context.SPCopy->getOperand(0).getReg();
+ break;
+ }
// Scan the call setup sequence for the pattern we're looking for.
// We only handle a simple case - a sequence of store instructions that
// push a sequence of stack-slot-aligned values onto the stack, with
// no gaps between them.
if (MaxAdjust > 4)
- Context.MovVector.resize(MaxAdjust, nullptr);
+ Context.ArgStoreVector.resize(MaxAdjust, nullptr);
- InstClassification Classification;
DenseSet<unsigned int> UsedRegs;
- while ((Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs)) !=
- Exit) {
- if (Classification == Skip) {
- ++I;
+ for (InstClassification Classification = Skip; Classification != Exit; ++I) {
+ // If this is the COPY of the stack pointer, it's ok to ignore.
+ if (I == StackPtrCopyInst)
+ continue;
+ Classification = classifyInstruction(MBB, I, RegInfo, UsedRegs);
+ if (Classification != Convert)
continue;
- }
-
// We know the instruction has a supported store opcode.
// We only want movs of the form:
// mov imm/reg, k(%StackPtr)
@@ -407,13 +440,13 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
return;
StackDisp >>= Log2SlotSize;
- assert((size_t)StackDisp < Context.MovVector.size() &&
+ assert((size_t)StackDisp < Context.ArgStoreVector.size() &&
"Function call has more parameters than the stack is adjusted for.");
// If the same stack slot is being filled twice, something's fishy.
- if (Context.MovVector[StackDisp] != nullptr)
+ if (Context.ArgStoreVector[StackDisp] != nullptr)
return;
- Context.MovVector[StackDisp] = &*I;
+ Context.ArgStoreVector[StackDisp] = &*I;
for (const MachineOperand &MO : I->uses()) {
if (!MO.isReg())
@@ -422,10 +455,10 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
if (RegInfo.isPhysicalRegister(Reg))
UsedRegs.insert(Reg);
}
-
- ++I;
}
+ --I;
+
// We now expect the end of the sequence. If we stopped early,
// or reached the end of the block without finding a call, bail.
if (I == MBB.end() || !I->isCall())
@@ -436,14 +469,14 @@ void X86CallFrameOptimization::collectCallInfo(MachineFunction &MF,
return;
// Now, go through the vector, and see that we don't have any gaps,
- // but only a series of MOVs.
- auto MMI = Context.MovVector.begin(), MME = Context.MovVector.end();
+ // but only a series of storing instructions.
+ auto MMI = Context.ArgStoreVector.begin(), MME = Context.ArgStoreVector.end();
for (; MMI != MME; ++MMI, Context.ExpectedDist += SlotSize)
if (*MMI == nullptr)
break;
// If the call had no parameters, do nothing
- if (MMI == Context.MovVector.begin())
+ if (MMI == Context.ArgStoreVector.begin())
return;
// We are either at the last parameter, or a gap.
@@ -466,17 +499,23 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
DebugLoc DL = FrameSetup->getDebugLoc();
bool Is64Bit = STI->is64Bit();
- // Now, iterate through the vector in reverse order, and replace the movs
- // with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
+ // Now, iterate through the vector in reverse order, and replace the store to
+ // stack with pushes. MOVmi/MOVmr doesn't have any defs, so no need to
// replace uses.
for (int Idx = (Context.ExpectedDist >> Log2SlotSize) - 1; Idx >= 0; --Idx) {
- MachineBasicBlock::iterator MOV = *Context.MovVector[Idx];
- MachineOperand PushOp = MOV->getOperand(X86::AddrNumOperands);
+ MachineBasicBlock::iterator Store = *Context.ArgStoreVector[Idx];
+ MachineOperand PushOp = Store->getOperand(X86::AddrNumOperands);
MachineBasicBlock::iterator Push = nullptr;
unsigned PushOpcode;
- switch (MOV->getOpcode()) {
+ switch (Store->getOpcode()) {
default:
llvm_unreachable("Unexpected Opcode!");
+ case X86::AND16mi8:
+ case X86::AND32mi8:
+ case X86::AND64mi8:
+ case X86::OR16mi8:
+ case X86::OR32mi8:
+ case X86::OR64mi8:
case X86::MOV32mi:
case X86::MOV64mi32:
PushOpcode = Is64Bit ? X86::PUSH64i32 : X86::PUSHi32;
@@ -497,7 +536,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
// If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
// in preparation for the PUSH64. The upper 32 bits can be undef.
- if (Is64Bit && MOV->getOpcode() == X86::MOV32mr) {
+ if (Is64Bit && Store->getOpcode() == X86::MOV32mr) {
unsigned UndefReg = MRI->createVirtualRegister(&X86::GR64RegClass);
Reg = MRI->createVirtualRegister(&X86::GR64RegClass);
BuildMI(MBB, Context.Call, DL, TII->get(X86::IMPLICIT_DEF), UndefReg);
@@ -541,7 +580,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
MBB, std::next(Push), DL,
MCCFIInstruction::createAdjustCfaOffset(nullptr, SlotSize));
- MBB.erase(MOV);
+ MBB.erase(Store);
}
// The stack-pointer copy is no longer used in the call sequences.
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index 99aeec67c326..ccb982f9ac16 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering -----------===//
+//===- llvm/lib/Target/X86/X86CallLowering.cpp - Call lowering ------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,33 +6,50 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-///
+//
/// \file
/// This file implements the lowering of LLVM calls to machine code calls for
/// GlobalISel.
-///
+//
//===----------------------------------------------------------------------===//
#include "X86CallLowering.h"
#include "X86CallingConv.h"
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
-#include "X86TargetMachine.h"
-
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineValueType.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include <cassert>
+#include <cstdint>
using namespace llvm;
#include "X86GenCallingConv.inc"
-#ifndef LLVM_BUILD_GLOBAL_ISEL
-#error "This shouldn't be built without GISel"
-#endif
-
X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
: CallLowering(&TLI) {}
@@ -41,7 +58,6 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
const DataLayout &DL,
MachineRegisterInfo &MRI,
SplitArgTy PerformArgSplit) const {
-
const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
LLVMContext &Context = OrigArg.Ty->getContext();
@@ -82,14 +98,29 @@ bool X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
}
namespace {
-struct FuncReturnHandler : public CallLowering::ValueHandler {
- FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
- : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+
+struct OutgoingValueHandler : public CallLowering::ValueHandler {
+ OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
+ DL(MIRBuilder.getMF().getDataLayout()),
+ STI(MIRBuilder.getMF().getSubtarget<X86Subtarget>()) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
- llvm_unreachable("Don't know how to get a stack address yet");
+ LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0));
+ LLT SType = LLT::scalar(DL.getPointerSizeInBits(0));
+ unsigned SPReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildCopy(SPReg, STI.getRegisterInfo()->getStackRegister());
+
+ unsigned OffsetReg = MRI.createGenericVirtualRegister(SType);
+ MIRBuilder.buildConstant(OffsetReg, Offset);
+
+ unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+ MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+
+ MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+ return AddrReg;
}
void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
@@ -101,16 +132,43 @@ struct FuncReturnHandler : public CallLowering::ValueHandler {
void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- llvm_unreachable("Don't know how to assign a value to an address yet");
+ unsigned ExtReg = extendRegister(ValVReg, VA);
+ auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
+ /* Alignment */ 0);
+ MIRBuilder.buildStore(ExtReg, Addr, *MMO);
+ }
+
+ bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ const CallLowering::ArgInfo &Info, CCState &State) override {
+ bool Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+ StackSize = State.getNextStackOffset();
+
+ static const MCPhysReg XMMArgRegs[] = {X86::XMM0, X86::XMM1, X86::XMM2,
+ X86::XMM3, X86::XMM4, X86::XMM5,
+ X86::XMM6, X86::XMM7};
+ if (!Info.IsFixed)
+ NumXMMRegs = State.getFirstUnallocated(XMMArgRegs);
+
+ return Res;
}
+ uint64_t getStackSize() { return StackSize; }
+ uint64_t getNumXmmRegs() { return NumXMMRegs; }
+
+protected:
MachineInstrBuilder &MIB;
+ uint64_t StackSize = 0;
+ const DataLayout &DL;
+ const X86Subtarget &STI;
+ unsigned NumXMMRegs = 0;
};
-} // End anonymous namespace.
+
+} // end anonymous namespace
bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
const Value *Val, unsigned VReg) const {
-
assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg");
auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0);
@@ -119,7 +177,7 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
MachineFunction &MF = MIRBuilder.getMF();
MachineRegisterInfo &MRI = MF.getRegInfo();
auto &DL = MF.getDataLayout();
- const Function &F = *MF.getFunction();
+ const Function &F = MF.getFunction();
ArgInfo OrigArg{VReg, Val->getType()};
setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
@@ -131,7 +189,7 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
}))
return false;
- FuncReturnHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
+ OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
}
@@ -141,14 +199,15 @@ bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
}
namespace {
-struct FormalArgHandler : public CallLowering::ValueHandler {
- FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
- CCAssignFn *AssignFn, const DataLayout &DL)
- : ValueHandler(MIRBuilder, MRI, AssignFn), DL(DL) {}
+
+struct IncomingValueHandler : public CallLowering::ValueHandler {
+ IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : ValueHandler(MIRBuilder, MRI, AssignFn),
+ DL(MIRBuilder.getMF().getDataLayout()) {}
unsigned getStackAddress(uint64_t Size, int64_t Offset,
MachinePointerInfo &MPO) override {
-
auto &MFI = MIRBuilder.getMF().getFrameInfo();
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
@@ -161,7 +220,6 @@ struct FormalArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
-
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
0);
@@ -170,13 +228,54 @@ struct FormalArgHandler : public CallLowering::ValueHandler {
void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
CCValAssign &VA) override {
- MIRBuilder.getMBB().addLiveIn(PhysReg);
- MIRBuilder.buildCopy(ValVReg, PhysReg);
+ markPhysRegUsed(PhysReg);
+ switch (VA.getLocInfo()) {
+ default:
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
+ break;
+ case CCValAssign::LocInfo::SExt:
+ case CCValAssign::LocInfo::ZExt:
+ case CCValAssign::LocInfo::AExt: {
+ auto Copy = MIRBuilder.buildCopy(LLT{VA.getLocVT()}, PhysReg);
+ MIRBuilder.buildTrunc(ValVReg, Copy);
+ break;
+ }
+ }
}
+ /// How the physical register gets marked varies between formal
+ /// parameters (it's a basic-block live-in) and a call instruction's
+ /// return value (it's an implicit-def of the call instruction).
+ virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+
+protected:
const DataLayout &DL;
};
-} // namespace
+
+struct FormalArgHandler : public IncomingValueHandler {
+ FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn)
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIRBuilder.getMBB().addLiveIn(PhysReg);
+ }
+};
+
+struct CallReturnHandler : public IncomingValueHandler {
+ CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+ CCAssignFn *AssignFn, MachineInstrBuilder &MIB)
+ : IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+
+ void markPhysRegUsed(unsigned PhysReg) override {
+ MIB.addDef(PhysReg, RegState::Implicit);
+ }
+
+protected:
+ MachineInstrBuilder &MIB;
+};
+
+} // end anonymous namespace
bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
const Function &F,
@@ -219,7 +318,7 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
if (!MBB.empty())
MIRBuilder.setInstr(*MBB.begin());
- FormalArgHandler Handler(MIRBuilder, MRI, CC_X86, DL);
+ FormalArgHandler Handler(MIRBuilder, MRI, CC_X86);
if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
return false;
@@ -228,3 +327,114 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
return true;
}
+
+bool X86CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+ CallingConv::ID CallConv,
+ const MachineOperand &Callee,
+ const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs) const {
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto &DL = F.getParent()->getDataLayout();
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const TargetInstrInfo &TII = *STI.getInstrInfo();
+ auto TRI = STI.getRegisterInfo();
+
+ // Handle only Linux C, X86_64_SysV calling conventions for now.
+ if (!STI.isTargetLinux() ||
+ !(CallConv == CallingConv::C || CallConv == CallingConv::X86_64_SysV))
+ return false;
+
+ unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
+ auto CallSeqStart = MIRBuilder.buildInstr(AdjStackDown);
+
+ // Create a temporarily-floating call instruction so we can add the implicit
+ // uses of arg registers.
+ bool Is64Bit = STI.is64Bit();
+ unsigned CallOpc = Callee.isReg()
+ ? (Is64Bit ? X86::CALL64r : X86::CALL32r)
+ : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
+
+ auto MIB = MIRBuilder.buildInstrNoInsert(CallOpc).add(Callee).addRegMask(
+ TRI->getCallPreservedMask(MF, CallConv));
+
+ SmallVector<ArgInfo, 8> SplitArgs;
+ for (const auto &OrigArg : OrigArgs) {
+
+ // TODO: handle the non-trivial cases.
+ if (OrigArg.Flags.isByVal())
+ return false;
+
+ if (!splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+ [&](ArrayRef<unsigned> Regs) {
+ MIRBuilder.buildUnmerge(Regs, OrigArg.Reg);
+ }))
+ return false;
+ }
+ // Do the actual argument marshalling.
+ OutgoingValueHandler Handler(MIRBuilder, MRI, MIB, CC_X86);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+
+ bool IsFixed = OrigArgs.empty() ? true : OrigArgs.back().IsFixed;
+ if (STI.is64Bit() && !IsFixed && !STI.isCallingConvWin64(CallConv)) {
+ // From AMD64 ABI document:
+ // For calls that may call functions that use varargs or stdargs
+ // (prototype-less calls or calls to functions containing ellipsis (...) in
+ // the declaration) %al is used as hidden argument to specify the number
+ // of SSE registers used. The contents of %al do not need to match exactly
+ // the number of registers, but must be an ubound on the number of SSE
+ // registers used and is in the range 0 - 8 inclusive.
+
+ MIRBuilder.buildInstr(X86::MOV8ri)
+ .addDef(X86::AL)
+ .addImm(Handler.getNumXmmRegs());
+ MIB.addUse(X86::AL, RegState::Implicit);
+ }
+
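To make the %al convention handled above concrete, here is a small hedged example: ordinary C++ calling a variadic function, with the assembly a typical SysV x86-64 compiler emits shown in comments. The example is illustrative only and is not output produced by this patch.

// printf is variadic, so before the call the caller sets %al to an upper
// bound on the number of vector registers carrying arguments, e.g.:
//
//   movsd  .LC0(%rip), %xmm0   # the double travels in %xmm0
//   movb   $1, %al             # one SSE register is used
//   callq  printf
#include <cstdio>

void report(double Value) {
  std::printf("value = %f\n", Value);
}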
+ // Now we can add the actual call instruction to the correct basic block.
+ MIRBuilder.insertInstr(MIB);
+
+ // If Callee is a reg, since it is used by a target specific
+ // instruction, it must have a register class matching the
+ // constraint of that instruction.
+ if (Callee.isReg())
+ MIB->getOperand(0).setReg(constrainOperandRegClass(
+ MF, *TRI, MRI, *MF.getSubtarget().getInstrInfo(),
+ *MF.getSubtarget().getRegBankInfo(), *MIB, MIB->getDesc(),
+ Callee.getReg(), 0));
+
+ // Finally we can copy the returned value back into its virtual-register. In
+ // symmetry with the arguments, the physical register must be an
+ // implicit-define of the call instruction.
+
+ if (OrigRet.Reg) {
+ SplitArgs.clear();
+ SmallVector<unsigned, 8> NewRegs;
+
+ if (!splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
+ [&](ArrayRef<unsigned> Regs) {
+ NewRegs.assign(Regs.begin(), Regs.end());
+ }))
+ return false;
+
+ CallReturnHandler Handler(MIRBuilder, MRI, RetCC_X86, MIB);
+ if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+ return false;
+
+ if (!NewRegs.empty())
+ MIRBuilder.buildMerge(OrigRet.Reg, NewRegs);
+ }
+
+ CallSeqStart.addImm(Handler.getStackSize())
+ .addImm(0 /* see getFrameTotalSize */)
+ .addImm(0 /* see getFrameAdjustment */);
+
+ unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
+ MIRBuilder.buildInstr(AdjStackUp)
+ .addImm(Handler.getStackSize())
+ .addImm(0 /* NumBytesForCalleeToPop */);
+
+ return true;
+}
diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h
index 6a5dabf33a0a..6c9dc1565dad 100644
--- a/lib/Target/X86/X86CallLowering.h
+++ b/lib/Target/X86/X86CallLowering.h
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/X86/X86CallLowering.h - Call lowering -----===//
+//===- llvm/lib/Target/X86/X86CallLowering.h - Call lowering ----*- C++ -*-===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,24 +6,24 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
-///
+//
/// \file
/// This file describes how to lower LLVM calls to machine code calls.
-///
+//
//===----------------------------------------------------------------------===//
-#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING
-#define LLVM_LIB_TARGET_X86_X86CALLLOWERING
+#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING_H
+#define LLVM_LIB_TARGET_X86_X86CALLLOWERING_H
#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include <functional>
namespace llvm {
-class Function;
-class MachineIRBuilder;
+class DataLayout;
+class MachineRegisterInfo;
class X86TargetLowering;
-class Value;
class X86CallLowering : public CallLowering {
public:
@@ -35,14 +35,20 @@ public:
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<unsigned> VRegs) const override;
+ bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+ const MachineOperand &Callee, const ArgInfo &OrigRet,
+ ArrayRef<ArgInfo> OrigArgs) const override;
+
private:
/// A function of this type is used to perform value split action.
- typedef std::function<void(ArrayRef<unsigned>)> SplitArgTy;
+ using SplitArgTy = std::function<void(ArrayRef<unsigned>)>;
bool splitToValueTypes(const ArgInfo &OrigArgInfo,
SmallVectorImpl<ArgInfo> &SplitArgs,
const DataLayout &DL, MachineRegisterInfo &MRI,
SplitArgTy SplitArg) const;
};
-} // namespace llvm
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86CALLLOWERING_H
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index 26461986427d..5d806fe60b86 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -500,7 +500,7 @@ def CC_X86_64_C : CallingConv<[
// A SwiftError is passed in R12.
CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
- // For Swift Calling Convention, pass sret in %RAX.
+ // For Swift Calling Convention, pass sret in %rax.
CCIfCC<"CallingConv::Swift",
CCIfSRet<CCIfType<[i64], CCAssignToReg<[RAX]>>>>,
@@ -592,6 +592,9 @@ def CC_X86_Win64_C : CallingConv<[
// The 'nest' parameter, if any, is passed in R10.
CCIfNest<CCAssignToReg<[R10]>>,
+ // A SwiftError is passed in R12.
+ CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+
// 128 bit vectors are passed by pointer
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
@@ -1047,6 +1050,8 @@ def CSR_Win64_NoSSE : CalleeSavedRegs<(add RBX, RBP, RDI, RSI, R12, R13, R14, R1
def CSR_Win64 : CalleeSavedRegs<(add CSR_Win64_NoSSE,
(sequence "XMM%u", 6, 15))>;
+def CSR_Win64_SwiftError : CalleeSavedRegs<(sub CSR_Win64, R12)>;
+
// The function used by Darwin to obtain the address of a thread-local variable
// uses rdi to pass a single parameter and rax for the return value. All other
// GPRs are preserved.
diff --git a/lib/Target/X86/X86CmovConversion.cpp b/lib/Target/X86/X86CmovConversion.cpp
index bfc834435de5..489d9d86e254 100644
--- a/lib/Target/X86/X86CmovConversion.cpp
+++ b/lib/Target/X86/X86CmovConversion.cpp
@@ -1,4 +1,4 @@
-//====-- X86CmovConversion.cpp - Convert Cmov to Branch -------------------===//
+//====- X86CmovConversion.cpp - Convert Cmov to Branch --------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -6,104 +6,146 @@
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
+//
/// \file
-/// This file implements a pass that converts X86 cmov instructions into branch
-/// when profitable. This pass is conservative, i.e., it applies transformation
-/// if and only if it can gaurantee a gain with high confidence.
+/// This file implements a pass that converts X86 cmov instructions into
+/// branches when profitable. This pass is conservative. It transforms if and
+/// only if it can guarantee a gain with high confidence.
///
/// Thus, the optimization applies under the following conditions:
-/// 1. Consider as a candidate only CMOV in most inner loop, assuming that
-/// most hotspots are represented by these loops.
-/// 2. Given a group of CMOV instructions, that are using same EFLAGS def
+/// 1. Consider as candidates only CMOVs in innermost loops (assume that
+/// most hotspots are represented by these loops).
+/// 2. Given a group of CMOV instructions that are using the same EFLAGS def
/// instruction:
-/// a. Consider them as candidates only if all have same code condition or
-/// opposite one, to prevent generating more than one conditional jump
-/// per EFLAGS def instruction.
+/// a. Consider them as candidates only if all have the same code condition
+/// or the opposite one to prevent generating more than one conditional
+/// jump per EFLAGS def instruction.
/// b. Consider them as candidates only if all are profitable to be
-/// converted, assuming that one bad conversion may casue a degradation.
-/// 3. Apply conversion only for loop that are found profitable and only for
+/// converted (assume that one bad conversion may cause a degradation).
+/// 3. Apply conversion only for loops that are found profitable and only for
/// CMOV candidates that were found profitable.
-/// a. Loop is considered profitable only if conversion will reduce its
-/// depth cost by some thrishold.
+/// a. A loop is considered profitable only if conversion will reduce its
+/// depth cost by some threshold.
/// b. CMOV is considered profitable if the cost of its condition is higher
/// than the average cost of its true-value and false-value by 25% of
-/// branch-misprediction-penalty, this to assure no degredassion even
-/// with 25% branch misprediction.
+/// branch-misprediction-penalty. This assures no degradation even with
+/// 25% branch misprediction.
///
/// Note: This pass is assumed to run on SSA machine code.
+//
//===----------------------------------------------------------------------===//
//
// External interfaces:
// FunctionPass *llvm::createX86CmovConverterPass();
// bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF);
//
+//===----------------------------------------------------------------------===//
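As a rough illustration of the source pattern the pass description above targets (an assumption about typical codegen, not an example taken from this patch), a data-dependent select in a hot innermost loop usually lowers to a CMOV on x86-64:

// Each iteration clamps A[I]; the select typically becomes a CMOV, and the
// pass weighs converting it back into a branch against the cost of the
// compare that feeds it.
unsigned sumClamped(const unsigned *A, unsigned N, unsigned Cap) {
  unsigned Sum = 0;
  for (unsigned I = 0; I != N; ++I)
    Sum += A[I] > Cap ? Cap : A[I];
  return Sum;
}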
#include "X86.h"
#include "X86InstrInfo.h"
-#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSchedule.h"
-#include "llvm/IR/InstIterator.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <utility>
+
using namespace llvm;
-#define DEBUG_TYPE "x86-cmov-converter"
+#define DEBUG_TYPE "x86-cmov-conversion"
STATISTIC(NumOfSkippedCmovGroups, "Number of unsupported CMOV-groups");
STATISTIC(NumOfCmovGroupCandidate, "Number of CMOV-group candidates");
STATISTIC(NumOfLoopCandidate, "Number of CMOV-conversion profitable loops");
STATISTIC(NumOfOptimizedCmovGroups, "Number of optimized CMOV-groups");
-namespace {
+namespace llvm {
+
+void initializeX86CmovConverterPassPass(PassRegistry &);
+
+} // end namespace llvm
+
// This internal switch can be used to turn off the cmov/branch optimization.
static cl::opt<bool>
EnableCmovConverter("x86-cmov-converter",
cl::desc("Enable the X86 cmov-to-branch optimization."),
cl::init(true), cl::Hidden);
+static cl::opt<unsigned>
+ GainCycleThreshold("x86-cmov-converter-threshold",
+ cl::desc("Minimum gain per loop (in cycles) threshold."),
+ cl::init(4), cl::Hidden);
+
+static cl::opt<bool> ForceMemOperand(
+ "x86-cmov-converter-force-mem-operand",
+ cl::desc("Convert cmovs to branches whenever they have memory operands."),
+ cl::init(true), cl::Hidden);
+
+namespace {
+
/// Converts X86 cmov instructions into branches when profitable.
class X86CmovConverterPass : public MachineFunctionPass {
public:
- X86CmovConverterPass() : MachineFunctionPass(ID) {}
- ~X86CmovConverterPass() {}
+ X86CmovConverterPass() : MachineFunctionPass(ID) {
+ initializeX86CmovConverterPassPass(*PassRegistry::getPassRegistry());
+ }
StringRef getPassName() const override { return "X86 cmov Conversion"; }
bool runOnMachineFunction(MachineFunction &MF) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
-private:
/// Pass identification, replacement for typeid.
static char ID;
- const MachineRegisterInfo *MRI;
+private:
+ MachineRegisterInfo *MRI;
const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
TargetSchedModel TSchedModel;
/// List of consecutive CMOV instructions.
- typedef SmallVector<MachineInstr *, 2> CmovGroup;
- typedef SmallVector<CmovGroup, 2> CmovGroups;
+ using CmovGroup = SmallVector<MachineInstr *, 2>;
+ using CmovGroups = SmallVector<CmovGroup, 2>;
/// Collect all CMOV-group-candidates in \p CurrLoop and update \p
/// CmovInstGroups accordingly.
///
- /// \param CurrLoop Loop being processed.
+ /// \param Blocks List of blocks to process.
/// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
/// \returns true iff it found any CMOV-group-candidate.
- bool collectCmovCandidates(MachineLoop *CurrLoop, CmovGroups &CmovInstGroups);
+ bool collectCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
+ CmovGroups &CmovInstGroups,
+ bool IncludeLoads = false);
/// Check if it is profitable to transform each CMOV-group-candidates into
/// branch. Remove all groups that are not profitable from \p CmovInstGroups.
///
- /// \param CurrLoop Loop being processed.
+ /// \param Blocks List of blocks to process.
/// \param CmovInstGroups List of consecutive CMOV instructions in CurrLoop.
/// \returns true iff any CMOV-group-candidate remain.
- bool checkForProfitableCmovCandidates(MachineLoop *CurrLoop,
+ bool checkForProfitableCmovCandidates(ArrayRef<MachineBasicBlock *> Blocks,
CmovGroups &CmovInstGroups);
/// Convert the given list of consecutive CMOV instructions into a branch.
@@ -112,6 +154,8 @@ private:
void convertCmovInstsToBranches(SmallVectorImpl<MachineInstr *> &Group) const;
};
+} // end anonymous namespace
+
char X86CmovConverterPass::ID = 0;
void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -120,7 +164,7 @@ void X86CmovConverterPass::getAnalysisUsage(AnalysisUsage &AU) const {
}
bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
if (!EnableCmovConverter)
return false;
@@ -133,10 +177,36 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
const TargetSubtargetInfo &STI = MF.getSubtarget();
MRI = &MF.getRegInfo();
TII = STI.getInstrInfo();
+ TRI = STI.getRegisterInfo();
TSchedModel.init(STI.getSchedModel(), &STI, TII);
+ // Before we handle the more subtle cases of register-register CMOVs inside
+ // of potentially hot loops, we want to quickly remove all CMOVs with
+ // a memory operand. Such a CMOV risks a stall while it waits for the load
+ // to complete, a latency that speculative execution behind a branch is much
+ // better suited to hide on modern x86 chips.
+ if (ForceMemOperand) {
+ CmovGroups AllCmovGroups;
+ SmallVector<MachineBasicBlock *, 4> Blocks;
+ for (auto &MBB : MF)
+ Blocks.push_back(&MBB);
+ if (collectCmovCandidates(Blocks, AllCmovGroups, /*IncludeLoads*/ true)) {
+ for (auto &Group : AllCmovGroups) {
+ // Skip any group that doesn't contain at least one CMOV with a memory operand.
+ if (!llvm::any_of(Group, [&](MachineInstr *I) { return I->mayLoad(); }))
+ continue;
+
+ // For CMOV groups which we can rewrite and which contain a memory load,
+ // always rewrite them. On x86, a CMOV will dramatically amplify any
+ // memory latency by blocking speculative execution.
+ Changed = true;
+ convertCmovInstsToBranches(Group);
+ }
+ }
+ }
+
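For the memory-operand path added above, here is a minimal hedged source example (names are invented, and the lowering described in the comments is typical rather than guaranteed) of code that tends to produce a CMOV with a folded load:

// Both values are safe to read, and the load has a single use, so isel may
// fold it into the CMOV, e.g. `cmovne (%rdi), %rax`; this pass prefers to
// rewrite that into a branch so the load can be speculated instead of
// serializing behind the flag computation.
long pickCached(bool UseCached, const long *Cached, long Fallback) {
  long Loaded = *Cached;                 // unconditional, speculatable load
  return UseCached ? Loaded : Fallback;  // may lower to a memory-operand CMOV
}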
//===--------------------------------------------------------------------===//
- // Algorithm
+ // Register-operand Conversion Algorithm
// ---------
// For each inner most loop
// collectCmovCandidates() {
@@ -157,32 +227,41 @@ bool X86CmovConverterPass::runOnMachineFunction(MachineFunction &MF) {
//
// Note: For more details, see each function description.
//===--------------------------------------------------------------------===//
- for (MachineBasicBlock &MBB : MF) {
- MachineLoop *CurrLoop = MLI.getLoopFor(&MBB);
+ // Build up the loops in pre-order.
+ SmallVector<MachineLoop *, 4> Loops(MLI.begin(), MLI.end());
+ // Note that we need to check size on each iteration as we accumulate child
+ // loops.
+ for (int i = 0; i < (int)Loops.size(); ++i)
+ for (MachineLoop *Child : Loops[i]->getSubLoops())
+ Loops.push_back(Child);
+
+ for (MachineLoop *CurrLoop : Loops) {
// Optimize only inner most loops.
- if (!CurrLoop || CurrLoop->getHeader() != &MBB ||
- !CurrLoop->getSubLoops().empty())
+ if (!CurrLoop->getSubLoops().empty())
continue;
// List of consecutive CMOV instructions to be processed.
CmovGroups CmovInstGroups;
- if (!collectCmovCandidates(CurrLoop, CmovInstGroups))
+ if (!collectCmovCandidates(CurrLoop->getBlocks(), CmovInstGroups))
continue;
- if (!checkForProfitableCmovCandidates(CurrLoop, CmovInstGroups))
+ if (!checkForProfitableCmovCandidates(CurrLoop->getBlocks(),
+ CmovInstGroups))
continue;
Changed = true;
for (auto &Group : CmovInstGroups)
convertCmovInstsToBranches(Group);
}
+
return Changed;
}
-bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop,
- CmovGroups &CmovInstGroups) {
+bool X86CmovConverterPass::collectCmovCandidates(
+ ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups,
+ bool IncludeLoads) {
//===--------------------------------------------------------------------===//
// Collect all CMOV-group-candidates and add them into CmovInstGroups.
//
@@ -204,24 +283,29 @@ bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop,
// Current processed CMOV-Group.
CmovGroup Group;
- for (auto *MBB : CurrLoop->getBlocks()) {
+ for (auto *MBB : Blocks) {
Group.clear();
// Condition code of first CMOV instruction current processed range and its
// opposite condition code.
- X86::CondCode FirstCC, FirstOppCC;
+ X86::CondCode FirstCC, FirstOppCC, MemOpCC;
// Indicator of a non CMOVrr instruction in the current processed range.
bool FoundNonCMOVInst = false;
// Indicator for current processed CMOV-group if it should be skipped.
bool SkipGroup = false;
for (auto &I : *MBB) {
+ // Skip debug instructions.
+ if (I.isDebugValue())
+ continue;
X86::CondCode CC = X86::getCondFromCMovOpc(I.getOpcode());
// Check if we found a X86::CMOVrr instruction.
- if (CC != X86::COND_INVALID && !I.mayLoad()) {
+ if (CC != X86::COND_INVALID && (IncludeLoads || !I.mayLoad())) {
if (Group.empty()) {
// We found first CMOV in the range, reset flags.
FirstCC = CC;
FirstOppCC = X86::GetOppositeBranchCondition(CC);
+ // Clear out the prior group's memory operand CC.
+ MemOpCC = X86::COND_INVALID;
FoundNonCMOVInst = false;
SkipGroup = false;
}
@@ -231,6 +315,24 @@ bool X86CmovConverterPass::collectCmovCandidates(MachineLoop *CurrLoop,
if (FoundNonCMOVInst || (CC != FirstCC && CC != FirstOppCC))
// Mark the SKipGroup indicator to skip current processed CMOV-Group.
SkipGroup = true;
+ if (I.mayLoad()) {
+ if (MemOpCC == X86::COND_INVALID)
+ // The first memory operand CMOV.
+ MemOpCC = CC;
+ else if (CC != MemOpCC)
+ // Can't handle mixed conditions with memory operands.
+ SkipGroup = true;
+ }
+ // Check if we were relying on zero-extending behavior of the CMOV.
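+      // (A 32-bit CMOV on x86-64 implicitly zeroes bits 63:32 of its
+      // destination register, and a SUBREG_TO_REG user relies on that implicit
+      // zero-extension; the branch/PHI rewrite below does not obviously
+      // preserve it, hence the conservative skip.)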
+ if (!SkipGroup &&
+ llvm::any_of(
+ MRI->use_nodbg_instructions(I.defs().begin()->getReg()),
+ [&](MachineInstr &UseI) {
+ return UseI.getOpcode() == X86::SUBREG_TO_REG;
+ }))
+ // FIXME: We should model the cost of using an explicit MOV to handle
+ // the zero-extension rather than just refusing to handle this.
+ SkipGroup = true;
continue;
}
// If Group is empty, keep looking for first CMOV in the range.
@@ -278,7 +380,7 @@ static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
}
bool X86CmovConverterPass::checkForProfitableCmovCandidates(
- MachineLoop *CurrLoop, CmovGroups &CmovInstGroups) {
+ ArrayRef<MachineBasicBlock *> Blocks, CmovGroups &CmovInstGroups) {
struct DepthInfo {
/// Depth of original loop.
unsigned Depth;
@@ -328,10 +430,13 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
//===--------------------------------------------------------------------===//
for (unsigned I = 0; I < LoopIterations; ++I) {
DepthInfo &MaxDepth = LoopDepth[I];
- for (auto *MBB : CurrLoop->getBlocks()) {
+ for (auto *MBB : Blocks) {
// Clear physical registers Def map.
RegDefMaps[PhyRegType].clear();
for (MachineInstr &MI : *MBB) {
+ // Skip debug instructions.
+ if (MI.isDebugValue())
+ continue;
unsigned MIDepth = 0;
unsigned MIDepthOpt = 0;
bool IsCMOV = CmovInstructions.count(&MI);
@@ -389,19 +494,28 @@ bool X86CmovConverterPass::checkForProfitableCmovCandidates(
// Critical-path is iteration dependent - there is dependency of
// critical-path instructions on critical-path instructions of
// previous iteration.
- // Thus, it is required to check the gradient of the gain - the
- // change in Depth-Diff compared to the change in Loop-Depth between
- // 1st and 2nd iterations.
+ // Thus, check the gain percent of the 2nd iteration (similar to the
+ // previous case), but it is also required to check the gradient of
+ // the gain - the change in Depth-Diff compared to the change in
+ // Loop-Depth between 1st and 2nd iterations.
      // To be conservative, the gradient needs to be at least 50%.
//
+    // In addition, in order not to optimize loops with very small gain, the
+ // gain (in cycles) after 2nd iteration should not be less than a given
+ // threshold. Thus, the check (Diff[1] >= GainCycleThreshold) must apply.
+ //
// If loop is not worth optimizing, remove all CMOV-group-candidates.
//===--------------------------------------------------------------------===//
+ if (Diff[1] < GainCycleThreshold)
+ return false;
+
bool WorthOptLoop = false;
if (Diff[1] == Diff[0])
WorthOptLoop = Diff[0] * 8 >= LoopDepth[0].Depth;
else if (Diff[1] > Diff[0])
WorthOptLoop =
- (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth);
+ (Diff[1] - Diff[0]) * 2 >= (LoopDepth[1].Depth - LoopDepth[0].Depth) &&
+ (Diff[1] * 8 >= LoopDepth[1].Depth);
if (!WorthOptLoop)
return false;
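+  // For illustration only, with arbitrarily chosen numbers: if
+  //   LoopDepth = {40, 60} and Diff = {6, 20},
+  // then the gradient check gives (20 - 6) * 2 = 28 >= (60 - 40) = 20, the
+  // relative-gain check gives 20 * 8 = 160 >= 60, and the loop is considered
+  // worth optimizing (assuming Diff[1] = 20 also meets GainCycleThreshold).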
@@ -481,11 +595,36 @@ static bool checkEFLAGSLive(MachineInstr *MI) {
return false;
}
+/// Given \p First CMOV instruction and \p Last CMOV instruction representing a
+/// group of CMOV instructions, which may contain debug instructions in between,
+/// move all debug instructions to after the last CMOV instruction, making the
+/// CMOV group consecutive.
+static void packCmovGroup(MachineInstr *First, MachineInstr *Last) {
+ assert(X86::getCondFromCMovOpc(Last->getOpcode()) != X86::COND_INVALID &&
+ "Last instruction in a CMOV group must be a CMOV instruction");
+
+ SmallVector<MachineInstr *, 2> DBGInstructions;
+ for (auto I = First->getIterator(), E = Last->getIterator(); I != E; I++) {
+ if (I->isDebugValue())
+ DBGInstructions.push_back(&*I);
+ }
+
+  // Splice the debug instructions after the cmov group.
+ MachineBasicBlock *MBB = First->getParent();
+ for (auto *MI : DBGInstructions)
+ MBB->insertAfter(Last, MI->removeFromParent());
+}
+
void X86CmovConverterPass::convertCmovInstsToBranches(
SmallVectorImpl<MachineInstr *> &Group) const {
assert(!Group.empty() && "No CMOV instructions to convert");
++NumOfOptimizedCmovGroups;
+ // If the CMOV group is not packed, e.g., there are debug instructions between
+  // first CMOV and last CMOV, then pack the group and make the CMOV instructions
+ // consecutive by moving the debug instructions to after the last CMOV.
+ packCmovGroup(Group.front(), Group.back());
+
// To convert a CMOVcc instruction, we actually have to insert the diamond
// control-flow pattern. The incoming instruction knows the destination vreg
// to set, the condition code register to branch on, the true/false values to
@@ -518,8 +657,18 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
MachineInstr &MI = *Group.front();
MachineInstr *LastCMOV = Group.back();
DebugLoc DL = MI.getDebugLoc();
+
X86::CondCode CC = X86::CondCode(X86::getCondFromCMovOpc(MI.getOpcode()));
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+ // Potentially swap the condition codes so that any memory operand to a CMOV
+ // is in the *false* position instead of the *true* position. We can invert
+ // any non-memory operand CMOV instructions to cope with this and we ensure
+ // memory operand CMOVs are only included with a single condition code.
+ if (llvm::any_of(Group, [&](MachineInstr *I) {
+ return I->mayLoad() && X86::getCondFromCMovOpc(I->getOpcode()) == CC;
+ }))
+ std::swap(CC, OppCC);
+
MachineBasicBlock *MBB = MI.getParent();
MachineFunction::iterator It = ++MBB->getIterator();
MachineFunction *F = MBB->getParent();
@@ -556,7 +705,111 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator MIItEnd =
std::next(MachineBasicBlock::iterator(LastCMOV));
+ MachineBasicBlock::iterator FalseInsertionPoint = FalseMBB->begin();
MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
+
+ // First we need to insert an explicit load on the false path for any memory
+ // operand. We also need to potentially do register rewriting here, but it is
+ // simpler as the memory operands are always on the false path so we can
+ // simply take that input, whatever it is.
+ DenseMap<unsigned, unsigned> FalseBBRegRewriteTable;
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd;) {
+ auto &MI = *MIIt++;
+ // Skip any CMOVs in this group which don't load from memory.
+ if (!MI.mayLoad()) {
+ // Remember the false-side register input.
+ unsigned FalseReg =
+ MI.getOperand(X86::getCondFromCMovOpc(MI.getOpcode()) == CC ? 1 : 2)
+ .getReg();
+ // Walk back through any intermediate cmovs referenced.
+ while (true) {
+ auto FRIt = FalseBBRegRewriteTable.find(FalseReg);
+ if (FRIt == FalseBBRegRewriteTable.end())
+ break;
+ FalseReg = FRIt->second;
+ }
+ FalseBBRegRewriteTable[MI.getOperand(0).getReg()] = FalseReg;
+ continue;
+ }
+
+ // The condition must be the *opposite* of the one we've decided to branch
+ // on as the branch will go *around* the load and the load should happen
+ // when the CMOV condition is false.
+ assert(X86::getCondFromCMovOpc(MI.getOpcode()) == OppCC &&
+ "Can only handle memory-operand cmov instructions with a condition "
+ "opposite to the selected branch direction.");
+
+ // The goal is to rewrite the cmov from:
+ //
+ // MBB:
+ // %A = CMOVcc %B (tied), (mem)
+ //
+ // to
+ //
+ // MBB:
+ // %A = CMOVcc %B (tied), %C
+ // FalseMBB:
+ // %C = MOV (mem)
+ //
+ // Which will allow the next loop to rewrite the CMOV in terms of a PHI:
+ //
+ // MBB:
+ // JMP!cc SinkMBB
+ // FalseMBB:
+ // %C = MOV (mem)
+ // SinkMBB:
+ // %A = PHI [ %C, FalseMBB ], [ %B, MBB]
+
+ // Get a fresh register to use as the destination of the MOV.
+ const TargetRegisterClass *RC = MRI->getRegClass(MI.getOperand(0).getReg());
+ unsigned TmpReg = MRI->createVirtualRegister(RC);
+
+ SmallVector<MachineInstr *, 4> NewMIs;
+ bool Unfolded = TII->unfoldMemoryOperand(*MBB->getParent(), MI, TmpReg,
+ /*UnfoldLoad*/ true,
+ /*UnfoldStore*/ false, NewMIs);
+ (void)Unfolded;
+ assert(Unfolded && "Should never fail to unfold a loading cmov!");
+
+ // Move the new CMOV to just before the old one and reset any impacted
+ // iterator.
+ auto *NewCMOV = NewMIs.pop_back_val();
+ assert(X86::getCondFromCMovOpc(NewCMOV->getOpcode()) == OppCC &&
+ "Last new instruction isn't the expected CMOV!");
+ DEBUG(dbgs() << "\tRewritten cmov: "; NewCMOV->dump());
+ MBB->insert(MachineBasicBlock::iterator(MI), NewCMOV);
+ if (&*MIItBegin == &MI)
+ MIItBegin = MachineBasicBlock::iterator(NewCMOV);
+
+ // Sink whatever instructions were needed to produce the unfolded operand
+ // into the false block.
+ for (auto *NewMI : NewMIs) {
+ DEBUG(dbgs() << "\tRewritten load instr: "; NewMI->dump());
+ FalseMBB->insert(FalseInsertionPoint, NewMI);
+ // Re-map any operands that are from other cmovs to the inputs for this block.
+ for (auto &MOp : NewMI->uses()) {
+ if (!MOp.isReg())
+ continue;
+ auto It = FalseBBRegRewriteTable.find(MOp.getReg());
+ if (It == FalseBBRegRewriteTable.end())
+ continue;
+
+ MOp.setReg(It->second);
+ // This might have been a kill when it referenced the cmov result, but
+ // it won't necessarily be once rewritten.
+ // FIXME: We could potentially improve this by tracking whether the
+ // operand to the cmov was also a kill, and then skipping the PHI node
+ // construction below.
+ MOp.setIsKill(false);
+ }
+ }
+ MBB->erase(MachineBasicBlock::iterator(MI),
+ std::next(MachineBasicBlock::iterator(MI)));
+
+ // Add this PHI to the rewrite table.
+ FalseBBRegRewriteTable[NewCMOV->getOperand(0).getReg()] = TmpReg;
+ }
+
// As we are creating the PHIs, we have to be careful if there is more than
// one. Later CMOVs may reference the results of earlier CMOVs, but later
// PHIs have to reference the individual true/false inputs from earlier PHIs.
@@ -604,7 +857,11 @@ void X86CmovConverterPass::convertCmovInstsToBranches(
MBB->erase(MIItBegin, MIItEnd);
}
-} // End anonymous namespace.
+INITIALIZE_PASS_BEGIN(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(X86CmovConverterPass, DEBUG_TYPE, "X86 cmov Conversion",
+ false, false)
FunctionPass *llvm::createX86CmovConverterPass() {
return new X86CmovConverterPass();
diff --git a/lib/Target/X86/X86DomainReassignment.cpp b/lib/Target/X86/X86DomainReassignment.cpp
new file mode 100644
index 000000000000..0a87fb4533c2
--- /dev/null
+++ b/lib/Target/X86/X86DomainReassignment.cpp
@@ -0,0 +1,753 @@
+//===--- X86DomainReassignment.cpp - Selectively switch register classes---===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass attempts to find instruction chains (closures) in one domain,
+// and convert them to equivalent instructions in a different domain,
+// if profitable.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include <bitset>
+
+using namespace llvm;
+
+namespace llvm {
+void initializeX86DomainReassignmentPass(PassRegistry &);
+}
+
+#define DEBUG_TYPE "x86-domain-reassignment"
+
+STATISTIC(NumClosuresConverted, "Number of closures converted by the pass");
+
+static cl::opt<bool> DisableX86DomainReassignment(
+ "disable-x86-domain-reassignment", cl::Hidden,
+ cl::desc("X86: Disable Virtual Register Reassignment."), cl::init(false));
+
+namespace {
+enum RegDomain { NoDomain = -1, GPRDomain, MaskDomain, OtherDomain, NumDomains };
+
+static bool isGPR(const TargetRegisterClass *RC) {
+ return X86::GR64RegClass.hasSubClassEq(RC) ||
+ X86::GR32RegClass.hasSubClassEq(RC) ||
+ X86::GR16RegClass.hasSubClassEq(RC) ||
+ X86::GR8RegClass.hasSubClassEq(RC);
+}
+
+static bool isMask(const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+ return X86::VK16RegClass.hasSubClassEq(RC);
+}
+
+static RegDomain getDomain(const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) {
+ if (isGPR(RC))
+ return GPRDomain;
+ if (isMask(RC, TRI))
+ return MaskDomain;
+ return OtherDomain;
+}
+
+/// Return a register class equivalent to \p SrcRC, in \p Domain.
+static const TargetRegisterClass *getDstRC(const TargetRegisterClass *SrcRC,
+ RegDomain Domain) {
+ assert(Domain == MaskDomain && "add domain");
+ if (X86::GR8RegClass.hasSubClassEq(SrcRC))
+ return &X86::VK8RegClass;
+ if (X86::GR16RegClass.hasSubClassEq(SrcRC))
+ return &X86::VK16RegClass;
+ if (X86::GR32RegClass.hasSubClassEq(SrcRC))
+ return &X86::VK32RegClass;
+ if (X86::GR64RegClass.hasSubClassEq(SrcRC))
+ return &X86::VK64RegClass;
+ llvm_unreachable("add register class");
+ return nullptr;
+}
+
+/// Abstract Instruction Converter class.
+class InstrConverterBase {
+protected:
+ unsigned SrcOpcode;
+
+public:
+ InstrConverterBase(unsigned SrcOpcode) : SrcOpcode(SrcOpcode) {}
+
+ virtual ~InstrConverterBase() {}
+
+ /// \returns true if \p MI is legal to convert.
+ virtual bool isLegal(const MachineInstr *MI,
+ const TargetInstrInfo *TII) const {
+ assert(MI->getOpcode() == SrcOpcode &&
+ "Wrong instruction passed to converter");
+ return true;
+ }
+
+ /// Applies conversion to \p MI.
+ ///
+  /// \returns true if \p MI is no longer needed and can be deleted.
+ virtual bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const = 0;
+
+ /// \returns the cost increment incurred by converting \p MI.
+ virtual double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const = 0;
+};
+
+/// An Instruction Converter which ignores the given instruction.
+/// For example, PHI instructions can be safely ignored since only the registers
+/// need to change.
+class InstrIgnore : public InstrConverterBase {
+public:
+ InstrIgnore(unsigned SrcOpcode) : InstrConverterBase(SrcOpcode) {}
+
+ bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const override {
+ assert(isLegal(MI, TII) && "Cannot convert instruction");
+ return false;
+ }
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ return 0;
+ }
+};
+
+/// An Instruction Converter which replaces an instruction with another.
+class InstrReplacer : public InstrConverterBase {
+public:
+ /// Opcode of the destination instruction.
+ unsigned DstOpcode;
+
+ InstrReplacer(unsigned SrcOpcode, unsigned DstOpcode)
+ : InstrConverterBase(SrcOpcode), DstOpcode(DstOpcode) {}
+
+ bool isLegal(const MachineInstr *MI,
+ const TargetInstrInfo *TII) const override {
+ if (!InstrConverterBase::isLegal(MI, TII))
+ return false;
+ // It's illegal to replace an instruction that implicitly defines a register
+    // with an instruction that doesn't, unless that register is dead.
+ for (auto &MO : MI->implicit_operands())
+ if (MO.isReg() && MO.isDef() && !MO.isDead() &&
+ !TII->get(DstOpcode).hasImplicitDefOfPhysReg(MO.getReg()))
+ return false;
+ return true;
+ }
+
+ bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const override {
+ assert(isLegal(MI, TII) && "Cannot convert instruction");
+ MachineInstrBuilder Bld =
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(DstOpcode));
+ // Transfer explicit operands from original instruction. Implicit operands
+ // are handled by BuildMI.
+ for (auto &Op : MI->explicit_operands())
+ Bld.add(Op);
+ return true;
+ }
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ // Assuming instructions have the same cost.
+ return 0;
+ }
+};
+
+/// An Instruction Converter which replaces an instruction with another, and
+/// adds a COPY from the new instruction's destination to the old one's.
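+/// As a rough sketch (virtual registers and register classes are illustrative),
+/// converting
+///   %dst:gr32 = MOVZX32rm16 <mem>
+/// to the mask domain produces
+///   %tmp:vk16 = KMOVWkm <mem>
+///   %dst = COPY %tmp
+/// and %dst itself is later reassigned to a mask register class when the
+/// enclosing closure is converted.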
+class InstrReplacerDstCOPY : public InstrConverterBase {
+public:
+ unsigned DstOpcode;
+
+ InstrReplacerDstCOPY(unsigned SrcOpcode, unsigned DstOpcode)
+ : InstrConverterBase(SrcOpcode), DstOpcode(DstOpcode) {}
+
+ bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const override {
+ assert(isLegal(MI, TII) && "Cannot convert instruction");
+ MachineBasicBlock *MBB = MI->getParent();
+ auto &DL = MI->getDebugLoc();
+
+ unsigned Reg = MRI->createVirtualRegister(
+ TII->getRegClass(TII->get(DstOpcode), 0, MRI->getTargetRegisterInfo(),
+ *MBB->getParent()));
+ MachineInstrBuilder Bld = BuildMI(*MBB, MI, DL, TII->get(DstOpcode), Reg);
+ for (unsigned Idx = 1, End = MI->getNumOperands(); Idx < End; ++Idx)
+ Bld.add(MI->getOperand(Idx));
+
+ BuildMI(*MBB, MI, DL, TII->get(TargetOpcode::COPY))
+ .add(MI->getOperand(0))
+ .addReg(Reg);
+
+ return true;
+ }
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ // Assuming instructions have the same cost, and that COPY is in the same
+ // domain so it will be eliminated.
+ return 0;
+ }
+};
+
+/// An Instruction Converter for replacing COPY instructions.
+class InstrCOPYReplacer : public InstrReplacer {
+public:
+ RegDomain DstDomain;
+
+ InstrCOPYReplacer(unsigned SrcOpcode, RegDomain DstDomain, unsigned DstOpcode)
+ : InstrReplacer(SrcOpcode, DstOpcode), DstDomain(DstDomain) {}
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ assert(MI->getOpcode() == TargetOpcode::COPY && "Expected a COPY");
+
+ for (auto &MO : MI->operands()) {
+ // Physical registers will not be converted. Assume that converting the
+      // COPY to the destination domain will eventually result in an actual
+ // instruction.
+ if (TargetRegisterInfo::isPhysicalRegister(MO.getReg()))
+ return 1;
+
+ RegDomain OpDomain = getDomain(MRI->getRegClass(MO.getReg()),
+ MRI->getTargetRegisterInfo());
+ // Converting a cross domain COPY to a same domain COPY should eliminate
+      // an instruction.
+ if (OpDomain == DstDomain)
+ return -1;
+ }
+ return 0;
+ }
+};
+
+/// An Instruction Converter which replaces an instruction with a COPY.
+class InstrReplaceWithCopy : public InstrConverterBase {
+public:
+ // Source instruction operand Index, to be used as the COPY source.
+ unsigned SrcOpIdx;
+
+ InstrReplaceWithCopy(unsigned SrcOpcode, unsigned SrcOpIdx)
+ : InstrConverterBase(SrcOpcode), SrcOpIdx(SrcOpIdx) {}
+
+ bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const override {
+ assert(isLegal(MI, TII) && "Cannot convert instruction");
+ BuildMI(*MI->getParent(), MI, MI->getDebugLoc(),
+ TII->get(TargetOpcode::COPY))
+ .add({MI->getOperand(0), MI->getOperand(SrcOpIdx)});
+ return true;
+ }
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ return 0;
+ }
+};
+
+/// An Instruction Converter which completely deletes an instruction.
+/// For example, IMPLICIT_DEF instructions can be deleted when converting from
+/// GPR to mask.
+class InstrDeleter : public InstrConverterBase {
+public:
+ InstrDeleter(unsigned SrcOpcode) : InstrConverterBase(SrcOpcode) {}
+
+ bool convertInstr(MachineInstr *MI, const TargetInstrInfo *TII,
+ MachineRegisterInfo *MRI) const override {
+ assert(isLegal(MI, TII) && "Cannot convert instruction");
+ return true;
+ }
+
+ double getExtraCost(const MachineInstr *MI,
+ MachineRegisterInfo *MRI) const override {
+ return 0;
+ }
+};
+
+// Key type to be used by the Instruction Converters map.
+// A converter is identified by <destination domain, source opcode>
+typedef std::pair<int, unsigned> InstrConverterBaseKeyTy;
+
+typedef DenseMap<InstrConverterBaseKeyTy, InstrConverterBase *>
+ InstrConverterBaseMap;
+
+/// A closure is a set of virtual registers representing all of the edges in
+/// the closure, as well as all of the instructions connected by those edges.
+///
+/// A closure may encompass virtual registers in the same register bank that
+/// have different widths. For example, it may contain 32-bit GPRs as well as
+/// 64-bit GPRs.
+///
+/// A closure that computes an address (i.e. defines a virtual register that is
+/// used in a memory operand) excludes the instructions that contain memory
+/// operands using the address. Such an instruction will be included in a
+/// different closure that manipulates the loaded or stored value.
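+///
+/// A minimal sketch of a closure that could be reassigned from GPR to the mask
+/// domain (virtual registers are illustrative):
+///   %b = MOV16rm <mem>      ; convertible to KMOVWkm
+///   %c = AND16rr %a, %b     ; convertible to KANDWrr (if EFLAGS is dead)
+///   MOV16mr <mem>, %c       ; convertible to KMOVWmk
+/// Here %a, %b and %c are the closure's edges and the three instructions are
+/// its members; the address registers of the load/store belong to other
+/// closures.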
+class Closure {
+private:
+ const TargetInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+
+ /// Virtual registers in the closure.
+ DenseSet<unsigned> Edges;
+
+ /// Instructions in the closure.
+ SmallVector<MachineInstr *, 8> Instrs;
+
+ /// A map of available Instruction Converters.
+ const InstrConverterBaseMap &Converters;
+
+ /// The register domain of this closure.
+ RegDomain Domain;
+
+ /// Domains which this closure can legally be reassigned to.
+ std::bitset<NumDomains> LegalDstDomains;
+
+ /// Enqueue \p Reg to be considered for addition to the closure.
+ void visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist);
+
+ /// Add \p MI to this closure.
+ void encloseInstr(MachineInstr *MI);
+
+ /// Calculate the total cost of reassigning the closure to \p Domain.
+ double calculateCost(RegDomain Domain) const;
+
+ /// All edges that are included in some closure.
+ DenseSet<unsigned> &EnclosedEdges;
+
+ /// All instructions that are included in some closure.
+ DenseMap<MachineInstr *, Closure *> &EnclosedInstrs;
+
+public:
+ Closure(const TargetInstrInfo *TII, MachineRegisterInfo *MRI,
+ const InstrConverterBaseMap &Converters,
+ std::initializer_list<RegDomain> LegalDstDomainList,
+ DenseSet<unsigned> &EnclosedEdges,
+ DenseMap<MachineInstr *, Closure *> &EnclosedInstrs)
+ : TII(TII), MRI(MRI), Converters(Converters), Domain(NoDomain),
+ EnclosedEdges(EnclosedEdges), EnclosedInstrs(EnclosedInstrs) {
+ for (RegDomain D : LegalDstDomainList)
+ LegalDstDomains.set(D);
+ }
+
+  /// Starting from \p Reg, expand the closure as much as possible.
+  void buildClosure(unsigned Reg);
+
+  /// \returns true if it is profitable to reassign the closure to \p Domain.
+ bool isReassignmentProfitable(RegDomain Domain) const;
+
+ /// Reassign the closure to \p Domain.
+ void Reassign(RegDomain Domain) const;
+
+ /// Mark this closure as illegal for reassignment to all domains.
+ void setAllIllegal() { LegalDstDomains.reset(); }
+
+ /// \returns true if this closure has domains which are legal to reassign to.
+ bool hasLegalDstDomain() const { return LegalDstDomains.any(); }
+
+  /// \returns true if it is legal to reassign this closure to domain \p RD.
+ bool isLegal(RegDomain RD) const { return LegalDstDomains[RD]; }
+
+ bool empty() const { return Edges.empty(); }
+};
+
+class X86DomainReassignment : public MachineFunctionPass {
+public:
+ static char ID;
+
+ X86DomainReassignment() : MachineFunctionPass(ID) {
+ initializeX86DomainReassignmentPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Domain Reassignment Pass";
+ }
+
+private:
+ const X86Subtarget *STI;
+ MachineRegisterInfo *MRI;
+ const X86InstrInfo *TII;
+
+ /// A map of available Instruction Converters.
+ InstrConverterBaseMap Converters;
+
+ /// Initialize Converters map.
+ void initConverters();
+};
+
+char X86DomainReassignment::ID = 0;
+
+} // End anonymous namespace.
+
+void Closure::visitRegister(unsigned Reg, SmallVectorImpl<unsigned> &Worklist) {
+ if (EnclosedEdges.count(Reg))
+ return;
+
+ if (!TargetRegisterInfo::isVirtualRegister(Reg))
+ return;
+
+ if (!MRI->hasOneDef(Reg))
+ return;
+
+ RegDomain RD = getDomain(MRI->getRegClass(Reg), MRI->getTargetRegisterInfo());
+ // First edge in closure sets the domain.
+ if (Domain == NoDomain)
+ Domain = RD;
+
+ if (Domain != RD)
+ return;
+
+ Worklist.push_back(Reg);
+}
+
+void Closure::encloseInstr(MachineInstr *MI) {
+ auto I = EnclosedInstrs.find(MI);
+ if (I != EnclosedInstrs.end()) {
+ if (I->second != this)
+ // Instruction already belongs to another closure, avoid conflicts between
+      // closures and mark this closure as illegal.
+ setAllIllegal();
+ return;
+ }
+
+ EnclosedInstrs[MI] = this;
+ Instrs.push_back(MI);
+
+  // Mark the closure as illegal for reassignment to a domain if there is no
+  // converter for the instruction, or if the converter cannot convert the
+  // instruction.
+ for (unsigned i = 0; i != LegalDstDomains.size(); ++i) {
+ if (LegalDstDomains[i]) {
+ InstrConverterBase *IC = Converters.lookup({i, MI->getOpcode()});
+ if (!IC || !IC->isLegal(MI, TII))
+ LegalDstDomains[i] = false;
+ }
+ }
+}
+
+double Closure::calculateCost(RegDomain DstDomain) const {
+ assert(isLegal(DstDomain) && "Cannot calculate cost for illegal closure");
+
+ double Cost = 0.0;
+ for (auto MI : Instrs)
+ Cost +=
+ Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI);
+ return Cost;
+}
+
+bool Closure::isReassignmentProfitable(RegDomain Domain) const {
+ return calculateCost(Domain) < 0.0;
+}
+
+void Closure::Reassign(RegDomain Domain) const {
+ assert(isLegal(Domain) && "Cannot convert illegal closure");
+
+ // Iterate all instructions in the closure, convert each one using the
+ // appropriate converter.
+ SmallVector<MachineInstr *, 8> ToErase;
+ for (auto MI : Instrs)
+ if (Converters.lookup({Domain, MI->getOpcode()})
+ ->convertInstr(MI, TII, MRI))
+ ToErase.push_back(MI);
+
+ // Iterate all registers in the closure, replace them with registers in the
+ // destination domain.
+ for (unsigned Reg : Edges) {
+ MRI->setRegClass(Reg, getDstRC(MRI->getRegClass(Reg), Domain));
+ for (auto &MO : MRI->use_operands(Reg)) {
+ if (MO.isReg())
+ // Remove all subregister references as they are not valid in the
+ // destination domain.
+ MO.setSubReg(0);
+ }
+ }
+
+ for (auto MI : ToErase)
+ MI->eraseFromParent();
+}
+
+/// \returns true when \p Reg is used as part of an address calculation in \p
+/// MI.
+static bool usedAsAddr(const MachineInstr &MI, unsigned Reg,
+ const TargetInstrInfo *TII) {
+ if (!MI.mayLoadOrStore())
+ return false;
+
+ const MCInstrDesc &Desc = TII->get(MI.getOpcode());
+ int MemOpStart = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemOpStart == -1)
+ return false;
+
+ MemOpStart += X86II::getOperandBias(Desc);
+ for (unsigned MemOpIdx = MemOpStart,
+ MemOpEnd = MemOpStart + X86::AddrNumOperands;
+ MemOpIdx < MemOpEnd; ++MemOpIdx) {
+ auto &Op = MI.getOperand(MemOpIdx);
+ if (Op.isReg() && Op.getReg() == Reg)
+ return true;
+ }
+ return false;
+}
+
+void Closure::buildClosure(unsigned Reg) {
+ SmallVector<unsigned, 4> Worklist;
+ visitRegister(Reg, Worklist);
+ while (!Worklist.empty()) {
+ unsigned CurReg = Worklist.pop_back_val();
+
+ // Register already in this closure.
+ if (!Edges.insert(CurReg).second)
+ continue;
+
+ MachineInstr *DefMI = MRI->getVRegDef(CurReg);
+ encloseInstr(DefMI);
+
+    // Add registers used by the defining MI to the worklist.
+    // Do not add registers that are used in address calculation; they will be
+ // added to a different closure.
+ int OpEnd = DefMI->getNumOperands();
+ const MCInstrDesc &Desc = DefMI->getDesc();
+ int MemOp = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemOp != -1)
+ MemOp += X86II::getOperandBias(Desc);
+ for (int OpIdx = 0; OpIdx < OpEnd; ++OpIdx) {
+ if (OpIdx == MemOp) {
+ // skip address calculation.
+ OpIdx += (X86::AddrNumOperands - 1);
+ continue;
+ }
+ auto &Op = DefMI->getOperand(OpIdx);
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ visitRegister(Op.getReg(), Worklist);
+ }
+
+ // Expand closure through register uses.
+ for (auto &UseMI : MRI->use_nodbg_instructions(CurReg)) {
+      // We would like to avoid converting closures that calculate addresses,
+      // as these should remain in GPRs.
+ if (usedAsAddr(UseMI, CurReg, TII)) {
+ setAllIllegal();
+ continue;
+ }
+ encloseInstr(&UseMI);
+
+ for (auto &DefOp : UseMI.defs()) {
+ if (!DefOp.isReg())
+ continue;
+
+ unsigned DefReg = DefOp.getReg();
+ if (!TargetRegisterInfo::isVirtualRegister(DefReg)) {
+ setAllIllegal();
+ continue;
+ }
+ visitRegister(DefReg, Worklist);
+ }
+ }
+ }
+}
+
+void X86DomainReassignment::initConverters() {
+ Converters[{MaskDomain, TargetOpcode::PHI}] =
+ new InstrIgnore(TargetOpcode::PHI);
+
+ Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] =
+ new InstrDeleter(TargetOpcode::IMPLICIT_DEF);
+
+ Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] =
+ new InstrReplaceWithCopy(TargetOpcode::INSERT_SUBREG, 2);
+
+ Converters[{MaskDomain, TargetOpcode::COPY}] =
+ new InstrCOPYReplacer(TargetOpcode::COPY, MaskDomain, TargetOpcode::COPY);
+
+ auto createReplacerDstCOPY = [&](unsigned From, unsigned To) {
+ Converters[{MaskDomain, From}] = new InstrReplacerDstCOPY(From, To);
+ };
+
+ createReplacerDstCOPY(X86::MOVZX32rm16, X86::KMOVWkm);
+ createReplacerDstCOPY(X86::MOVZX64rm16, X86::KMOVWkm);
+
+ createReplacerDstCOPY(X86::MOVZX32rr16, X86::KMOVWkk);
+ createReplacerDstCOPY(X86::MOVZX64rr16, X86::KMOVWkk);
+
+ if (STI->hasDQI()) {
+ createReplacerDstCOPY(X86::MOVZX16rm8, X86::KMOVBkm);
+ createReplacerDstCOPY(X86::MOVZX32rm8, X86::KMOVBkm);
+ createReplacerDstCOPY(X86::MOVZX64rm8, X86::KMOVBkm);
+
+ createReplacerDstCOPY(X86::MOVZX16rr8, X86::KMOVBkk);
+ createReplacerDstCOPY(X86::MOVZX32rr8, X86::KMOVBkk);
+ createReplacerDstCOPY(X86::MOVZX64rr8, X86::KMOVBkk);
+ }
+
+ auto createReplacer = [&](unsigned From, unsigned To) {
+ Converters[{MaskDomain, From}] = new InstrReplacer(From, To);
+ };
+
+ createReplacer(X86::MOV16rm, X86::KMOVWkm);
+ createReplacer(X86::MOV16mr, X86::KMOVWmk);
+ createReplacer(X86::MOV16rr, X86::KMOVWkk);
+ createReplacer(X86::SHR16ri, X86::KSHIFTRWri);
+ createReplacer(X86::SHL16ri, X86::KSHIFTLWri);
+ createReplacer(X86::NOT16r, X86::KNOTWrr);
+ createReplacer(X86::OR16rr, X86::KORWrr);
+ createReplacer(X86::AND16rr, X86::KANDWrr);
+ createReplacer(X86::XOR16rr, X86::KXORWrr);
+
+ if (STI->hasBWI()) {
+ createReplacer(X86::MOV32rm, X86::KMOVDkm);
+ createReplacer(X86::MOV64rm, X86::KMOVQkm);
+
+ createReplacer(X86::MOV32mr, X86::KMOVDmk);
+ createReplacer(X86::MOV64mr, X86::KMOVQmk);
+
+ createReplacer(X86::MOV32rr, X86::KMOVDkk);
+ createReplacer(X86::MOV64rr, X86::KMOVQkk);
+
+ createReplacer(X86::SHR32ri, X86::KSHIFTRDri);
+ createReplacer(X86::SHR64ri, X86::KSHIFTRQri);
+
+ createReplacer(X86::SHL32ri, X86::KSHIFTLDri);
+ createReplacer(X86::SHL64ri, X86::KSHIFTLQri);
+
+ createReplacer(X86::ADD32rr, X86::KADDDrr);
+ createReplacer(X86::ADD64rr, X86::KADDQrr);
+
+ createReplacer(X86::NOT32r, X86::KNOTDrr);
+ createReplacer(X86::NOT64r, X86::KNOTQrr);
+
+ createReplacer(X86::OR32rr, X86::KORDrr);
+ createReplacer(X86::OR64rr, X86::KORQrr);
+
+ createReplacer(X86::AND32rr, X86::KANDDrr);
+ createReplacer(X86::AND64rr, X86::KANDQrr);
+
+ createReplacer(X86::ANDN32rr, X86::KANDNDrr);
+ createReplacer(X86::ANDN64rr, X86::KANDNQrr);
+
+ createReplacer(X86::XOR32rr, X86::KXORDrr);
+ createReplacer(X86::XOR64rr, X86::KXORQrr);
+
+ createReplacer(X86::TEST32rr, X86::KTESTDrr);
+ createReplacer(X86::TEST64rr, X86::KTESTQrr);
+ }
+
+ if (STI->hasDQI()) {
+ createReplacer(X86::ADD8rr, X86::KADDBrr);
+ createReplacer(X86::ADD16rr, X86::KADDWrr);
+
+ createReplacer(X86::AND8rr, X86::KANDBrr);
+
+ createReplacer(X86::MOV8rm, X86::KMOVBkm);
+ createReplacer(X86::MOV8mr, X86::KMOVBmk);
+ createReplacer(X86::MOV8rr, X86::KMOVBkk);
+
+ createReplacer(X86::NOT8r, X86::KNOTBrr);
+
+ createReplacer(X86::OR8rr, X86::KORBrr);
+
+ createReplacer(X86::SHR8ri, X86::KSHIFTRBri);
+ createReplacer(X86::SHL8ri, X86::KSHIFTLBri);
+
+ createReplacer(X86::TEST8rr, X86::KTESTBrr);
+ createReplacer(X86::TEST16rr, X86::KTESTWrr);
+
+ createReplacer(X86::XOR8rr, X86::KXORBrr);
+ }
+}
+
+bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+ if (DisableX86DomainReassignment)
+ return false;
+
+ DEBUG(dbgs() << "***** Machine Function before Domain Reassignment *****\n");
+ DEBUG(MF.print(dbgs()));
+
+ STI = &MF.getSubtarget<X86Subtarget>();
+ // GPR->K is the only transformation currently supported, bail out early if no
+ // AVX512.
+ if (!STI->hasAVX512())
+ return false;
+
+ MRI = &MF.getRegInfo();
+ assert(MRI->isSSA() && "Expected MIR to be in SSA form");
+
+ TII = STI->getInstrInfo();
+ initConverters();
+ bool Changed = false;
+
+ DenseSet<unsigned> EnclosedEdges;
+ DenseMap<MachineInstr *, Closure *> EnclosedInstrs;
+
+ std::vector<Closure> Closures;
+
+ // Go over all virtual registers and calculate a closure.
+ for (unsigned Idx = 0; Idx < MRI->getNumVirtRegs(); ++Idx) {
+ unsigned Reg = TargetRegisterInfo::index2VirtReg(Idx);
+
+    // GPRs are currently the only supported source domain.
+ if (!isGPR(MRI->getRegClass(Reg)))
+ continue;
+
+ // Register already in closure.
+ if (EnclosedEdges.count(Reg))
+ continue;
+
+ // Calculate closure starting with Reg.
+ Closure C(TII, MRI, Converters, {MaskDomain}, EnclosedEdges,
+ EnclosedInstrs);
+ C.buildClosure(Reg);
+
+ // Collect all closures that can potentially be converted.
+ if (!C.empty() && C.isLegal(MaskDomain))
+ Closures.push_back(std::move(C));
+ }
+
+ for (Closure &C : Closures)
+ if (C.isReassignmentProfitable(MaskDomain)) {
+ C.Reassign(MaskDomain);
+ ++NumClosuresConverted;
+ Changed = true;
+ }
+
+ for (auto I : Converters)
+ delete I.second;
+
+ DEBUG(dbgs() << "***** Machine Function after Domain Reassignment *****\n");
+ DEBUG(MF.print(dbgs()));
+
+ return Changed;
+}
+
+INITIALIZE_PASS(X86DomainReassignment, "x86-domain-reassignment",
+ "X86 Domain Reassignment Pass", false, false)
+
+/// Returns an instance of the Domain Reassignment pass.
+FunctionPass *llvm::createX86DomainReassignmentPass() {
+ return new X86DomainReassignment();
+}
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index 6472bbbc9016..6dd4631a4844 100755
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -1,4 +1,4 @@
-//===----------------------- X86EvexToVex.cpp ----------------------------===//
+//===- X86EvexToVex.cpp ---------------------------------------------------===//
// Compress EVEX instructions to VEX encoding when possible to reduce code size
//
// The LLVM Compiler Infrastructure
@@ -6,18 +6,19 @@
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-//===---------------------------------------------------------------------===//
+//===----------------------------------------------------------------------===//
+//
/// \file
/// This file defines the pass that goes over all AVX-512 instructions which
/// are encoded using the EVEX prefix and if possible replaces them by their
/// corresponding VEX encoding which is usually shorter by 2 bytes.
/// EVEX instructions may be encoded via the VEX prefix when the AVX-512
/// instruction has a corresponding AVX/AVX2 opcode and when it does not
-/// use the xmm or the mask registers or xmm/ymm registers wuith indexes
+/// use the xmm or the mask registers or xmm/ymm registers with indexes
/// higher than 15.
/// The pass applies code reduction on the generated code for AVX-512 instrs.
-///
-//===---------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
#include "InstPrinter/X86InstComments.h"
#include "MCTargetDesc/X86BaseInfo.h"
@@ -54,7 +55,7 @@ namespace {
class EvexToVexInstPass : public MachineFunctionPass {
/// X86EvexToVexCompressTable - Evex to Vex encoding opcode map.
- typedef DenseMap<unsigned, uint16_t> EvexToVexTableType;
+ using EvexToVexTableType = DenseMap<unsigned, uint16_t>;
EvexToVexTableType EvexToVex128Table;
EvexToVexTableType EvexToVex256Table;
@@ -101,10 +102,10 @@ private:
const X86InstrInfo *TII;
};
-char EvexToVexInstPass::ID = 0;
-
} // end anonymous namespace
+char EvexToVexInstPass::ID = 0;
+
bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
@@ -118,8 +119,8 @@ bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
/// EVEX encoded instrs by VEX encoding when possible.
for (MachineBasicBlock &MBB : MF) {
- // Traverse the basic block.
- for (MachineInstr &MI : MBB)
+ // Traverse the basic block.
+ for (MachineInstr &MI : MBB)
Changed |= CompressEvexToVexImpl(MI);
}
@@ -131,6 +132,75 @@ void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable,
EvexToVexTable[EvexOp] = VexOp;
}
+static bool usesExtendedRegister(const MachineInstr &MI) {
+ auto isHiRegIdx = [](unsigned Reg) {
+ // Check for XMM register with indexes between 16 - 31.
+ if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
+ return true;
+
+ // Check for YMM register with indexes between 16 - 31.
+ if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
+ return true;
+
+ return false;
+ };
+
+ // Check that operands are not ZMM regs or
+ // XMM/YMM regs with hi indexes between 16 - 31.
+ for (const MachineOperand &MO : MI.explicit_operands()) {
+ if (!MO.isReg())
+ continue;
+
+ unsigned Reg = MO.getReg();
+
+ assert(!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31) &&
+ "ZMM instructions should not be in the EVEX->VEX tables");
+
+ if (isHiRegIdx(Reg))
+ return true;
+ }
+
+ return false;
+}
+
+// Do any custom cleanup needed to finalize the conversion.
+static void performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) {
+ (void)NewOpc;
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ case X86::VALIGNDZ128rri:
+ case X86::VALIGNDZ128rmi:
+ case X86::VALIGNQZ128rri:
+ case X86::VALIGNQZ128rmi: {
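+    // VALIGND/VALIGNQ rotate by whole elements while VPALIGNR rotates by
+    // bytes, so the immediate must be scaled by the element size (4 or 8
+    // bytes).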
+ assert((NewOpc == X86::VPALIGNRrri || NewOpc == X86::VPALIGNRrmi) &&
+ "Unexpected new opcode!");
+ unsigned Scale = (Opc == X86::VALIGNQZ128rri ||
+ Opc == X86::VALIGNQZ128rmi) ? 8 : 4;
+ MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+ Imm.setImm(Imm.getImm() * Scale);
+ break;
+ }
+ case X86::VSHUFF32X4Z256rmi:
+ case X86::VSHUFF32X4Z256rri:
+ case X86::VSHUFF64X2Z256rmi:
+ case X86::VSHUFF64X2Z256rri:
+ case X86::VSHUFI32X4Z256rmi:
+ case X86::VSHUFI32X4Z256rri:
+ case X86::VSHUFI64X2Z256rmi:
+ case X86::VSHUFI64X2Z256rri: {
+ assert((NewOpc == X86::VPERM2F128rr || NewOpc == X86::VPERM2I128rr ||
+ NewOpc == X86::VPERM2F128rm || NewOpc == X86::VPERM2I128rm) &&
+ "Unexpected new opcode!");
+ MachineOperand &Imm = MI.getOperand(MI.getNumExplicitOperands()-1);
+ int64_t ImmVal = Imm.getImm();
+ // Set bit 5, move bit 1 to bit 4, copy bit 0.
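+    // For example: 0x0 -> 0x20, 0x1 -> 0x21, 0x2 -> 0x30, 0x3 -> 0x31.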
+ Imm.setImm(0x20 | ((ImmVal & 2) << 3) | (ImmVal & 1));
+ break;
+ }
+ }
+}
+
// For EVEX instructions that can be encoded using VEX encoding
// replace them by the VEX encoding in order to reduce size.
bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
@@ -147,18 +217,18 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
// Check for EVEX instructions only.
if ((Desc.TSFlags & X86II::EncodingMask) != X86II::EVEX)
return false;
-
- // Check for EVEX instructions with mask or broadcast as in these cases
- // the EVEX prefix is needed in order to carry this information
+
+ // Check for EVEX instructions with mask or broadcast as in these cases
+ // the EVEX prefix is needed in order to carry this information
// thus preventing the transformation to VEX encoding.
if (Desc.TSFlags & (X86II::EVEX_K | X86II::EVEX_B))
return false;
-
+
// Check for non EVEX_V512 instrs only.
// EVEX_V512 instr: bit EVEX_L2 = 1; bit VEX_L = 0.
if ((Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L))
- return false;
-
+ return false;
+
// EVEX_V128 instr: bit EVEX_L2 = 0, bit VEX_L = 0.
bool IsEVEX_V128 =
(!(Desc.TSFlags & X86II::EVEX_L2) && !(Desc.TSFlags & X86II::VEX_L));
@@ -176,7 +246,6 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
if (It != EvexToVex256Table.end())
NewOpc = It->second;
}
-
// Check for EVEX_V128 or Scalar instructions.
else if (IsEVEX_V128) {
// Search for opcode in the EvexToVex128 table.
@@ -188,36 +257,14 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
if (!NewOpc)
return false;
- auto isHiRegIdx = [](unsigned Reg) {
- // Check for XMM register with indexes between 16 - 31.
- if (Reg >= X86::XMM16 && Reg <= X86::XMM31)
- return true;
-
- // Check for YMM register with indexes between 16 - 31.
- if (Reg >= X86::YMM16 && Reg <= X86::YMM31)
- return true;
-
+ if (usesExtendedRegister(MI))
return false;
- };
-
- // Check that operands are not ZMM regs or
- // XMM/YMM regs with hi indexes between 16 - 31.
- for (const MachineOperand &MO : MI.explicit_operands()) {
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
-
- assert (!(Reg >= X86::ZMM0 && Reg <= X86::ZMM31));
+ performCustomAdjustments(MI, NewOpc);
- if (isHiRegIdx(Reg))
- return false;
- }
-
- const MCInstrDesc &MCID = TII->get(NewOpc);
- MI.setDesc(MCID);
+ MI.setDesc(TII->get(NewOpc));
MI.setAsmPrinterFlag(AC_EVEX_2_VEX);
- return true;
+ return true;
}
INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index 5dfd95f71301..ab2ef26d1cc9 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -222,7 +222,7 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
case X86::EH_RESTORE: {
// Restore ESP and EBP, and optionally ESI if required.
bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(
- MBB.getParent()->getFunction()->getPersonalityFn()));
+ MBB.getParent()->getFunction().getPersonalityFn()));
X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH);
MBBI->eraseFromParent();
return true;
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index 527e5d568ac6..5dae485f4c9f 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -110,6 +110,8 @@ private:
bool X86SelectZExt(const Instruction *I);
+ bool X86SelectSExt(const Instruction *I);
+
bool X86SelectBranch(const Instruction *I);
bool X86SelectShift(const Instruction *I);
@@ -208,8 +210,8 @@ getX86SSEConditionCode(CmpInst::Predicate Predicate) {
case CmpInst::FCMP_ULT: NeedSwap = true; LLVM_FALLTHROUGH;
case CmpInst::FCMP_UGT: CC = 6; break;
case CmpInst::FCMP_ORD: CC = 7; break;
- case CmpInst::FCMP_UEQ:
- case CmpInst::FCMP_ONE: CC = 8; break;
+ case CmpInst::FCMP_UEQ: CC = 8; break;
+ case CmpInst::FCMP_ONE: CC = 12; break;
}
return std::make_pair(CC, NeedSwap);
@@ -329,10 +331,6 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
switch (VT.getSimpleVT().SimpleTy) {
default: return false;
case MVT::i1:
- // TODO: Support this properly.
- if (Subtarget->hasAVX512())
- return false;
- LLVM_FALLTHROUGH;
case MVT::i8:
Opc = X86::MOV8rm;
RC = &X86::GR8RegClass;
@@ -353,7 +351,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
case MVT::f32:
if (X86ScalarSSEf32) {
Opc = HasAVX512 ? X86::VMOVSSZrm : HasAVX ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = &X86::FR32RegClass;
+ RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
RC = &X86::RFP32RegClass;
@@ -362,7 +360,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
case MVT::f64:
if (X86ScalarSSEf64) {
Opc = HasAVX512 ? X86::VMOVSDZrm : HasAVX ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = &X86::FR64RegClass;
+ RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
RC = &X86::RFP64RegClass;
@@ -381,7 +379,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
else
Opc = HasVLX ? X86::VMOVUPSZ128rm :
HasAVX ? X86::VMOVUPSrm : X86::MOVUPSrm;
- RC = &X86::VR128RegClass;
+ RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
break;
case MVT::v2f64:
if (IsNonTemporal && Alignment >= 16 && HasSSE41)
@@ -393,7 +391,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
else
Opc = HasVLX ? X86::VMOVUPDZ128rm :
HasAVX ? X86::VMOVUPDrm : X86::MOVUPDrm;
- RC = &X86::VR128RegClass;
+ RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
break;
case MVT::v4i32:
case MVT::v2i64:
@@ -408,7 +406,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
else
Opc = HasVLX ? X86::VMOVDQU64Z128rm :
HasAVX ? X86::VMOVDQUrm : X86::MOVDQUrm;
- RC = &X86::VR128RegClass;
+ RC = HasVLX ? &X86::VR128XRegClass : &X86::VR128RegClass;
break;
case MVT::v8f32:
assert(HasAVX);
@@ -420,19 +418,19 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
Opc = HasVLX ? X86::VMOVAPSZ256rm : X86::VMOVAPSYrm;
else
Opc = HasVLX ? X86::VMOVUPSZ256rm : X86::VMOVUPSYrm;
- RC = &X86::VR256RegClass;
+ RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
break;
case MVT::v4f64:
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
- Opc = X86::VMOVNTDQAYrm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
else if (IsNonTemporal && Alignment >= 16)
return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVAPDZ256rm : X86::VMOVAPDYrm;
else
Opc = HasVLX ? X86::VMOVUPDZ256rm : X86::VMOVUPDYrm;
- RC = &X86::VR256RegClass;
+ RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
break;
case MVT::v8i32:
case MVT::v4i64:
@@ -440,14 +438,14 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
case MVT::v32i8:
assert(HasAVX);
if (IsNonTemporal && Alignment >= 32 && HasAVX2)
- Opc = X86::VMOVNTDQAYrm;
+ Opc = HasVLX ? X86::VMOVNTDQAZ256rm : X86::VMOVNTDQAYrm;
else if (IsNonTemporal && Alignment >= 16)
return false; // Force split for X86::VMOVNTDQArm
else if (Alignment >= 32)
Opc = HasVLX ? X86::VMOVDQA64Z256rm : X86::VMOVDQAYrm;
else
Opc = HasVLX ? X86::VMOVDQU64Z256rm : X86::VMOVDQUYrm;
- RC = &X86::VR256RegClass;
+ RC = HasVLX ? &X86::VR256XRegClass : &X86::VR256RegClass;
break;
case MVT::v16f32:
assert(HasAVX512);
@@ -510,16 +508,6 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
case MVT::f80: // No f80 support yet.
default: return false;
case MVT::i1: {
- // In case ValReg is a K register, COPY to a GPR
- if (MRI.getRegClass(ValReg) == &X86::VK1RegClass) {
- unsigned KValReg = ValReg;
- ValReg = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ValReg)
- .addReg(KValReg);
- ValReg = fastEmitInst_extractsubreg(MVT::i8, ValReg, /*Kill=*/true,
- X86::sub_8bit);
- }
// Mask out all but lowest bit.
unsigned AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -1077,10 +1065,6 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
(AM.Base.Reg != 0 || AM.IndexReg != 0))
return false;
- // Can't handle DLL Import.
- if (GV->hasDLLImportStorageClass())
- return false;
-
// Can't handle TLS.
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
if (GVar->isThreadLocal())
@@ -1089,8 +1073,9 @@ bool X86FastISel::X86SelectCallAddress(const Value *V, X86AddressMode &AM) {
// Okay, we've committed to selecting this global. Set up the basic address.
AM.GV = GV;
- // No ABI requires an extra load for anything other than DLLImport, which
- // we rejected above. Return a direct reference to the global.
+ // Return a direct reference to the global. Fastisel can handle calls to
+ // functions that require loads, such as dllimport and nonlazybind
+ // functions.
if (Subtarget->isPICStyleRIPRel()) {
// Use rip-relative addressing if we can. Above we verified that the
// base and index registers are unused.
@@ -1254,16 +1239,6 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
if (SrcVT == MVT::i1) {
if (Outs[0].Flags.isSExt())
return false;
- // In case SrcReg is a K register, COPY to a GPR
- if (MRI.getRegClass(SrcReg) == &X86::VK1RegClass) {
- unsigned KSrcReg = SrcReg;
- SrcReg = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), SrcReg)
- .addReg(KSrcReg);
- SrcReg = fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
- X86::sub_8bit);
- }
SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
}
@@ -1367,6 +1342,7 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) {
}
static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
+ bool HasAVX512 = Subtarget->hasAVX512();
bool HasAVX = Subtarget->hasAVX();
bool X86ScalarSSEf32 = Subtarget->hasSSE1();
bool X86ScalarSSEf64 = Subtarget->hasSSE2();
@@ -1378,9 +1354,15 @@ static unsigned X86ChooseCmpOpcode(EVT VT, const X86Subtarget *Subtarget) {
case MVT::i32: return X86::CMP32rr;
case MVT::i64: return X86::CMP64rr;
case MVT::f32:
- return X86ScalarSSEf32 ? (HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr) : 0;
+ return X86ScalarSSEf32
+ ? (HasAVX512 ? X86::VUCOMISSZrr
+ : HasAVX ? X86::VUCOMISSrr : X86::UCOMISSrr)
+ : 0;
case MVT::f64:
- return X86ScalarSSEf64 ? (HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr) : 0;
+ return X86ScalarSSEf64
+ ? (HasAVX512 ? X86::VUCOMISDZrr
+ : HasAVX ? X86::VUCOMISDrr : X86::UCOMISDrr)
+ : 0;
}
}
@@ -1453,9 +1435,6 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
if (!isTypeLegal(I->getOperand(0)->getType(), VT))
return false;
- if (I->getType()->isIntegerTy(1) && Subtarget->hasAVX512())
- return false;
-
// Try to optimize or fold the cmp.
CmpInst::Predicate Predicate = optimizeCmpPredicate(CI);
unsigned ResultReg = 0;
@@ -1555,17 +1534,6 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
// Handle zero-extension from i1 to i8, which is common.
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT == MVT::i1) {
- // In case ResultReg is a K register, COPY to a GPR
- if (MRI.getRegClass(ResultReg) == &X86::VK1RegClass) {
- unsigned KResultReg = ResultReg;
- ResultReg = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ResultReg)
- .addReg(KResultReg);
- ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
- X86::sub_8bit);
- }
-
// Set the high bits to zero.
ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
SrcVT = MVT::i8;
@@ -1593,6 +1561,15 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::SUBREG_TO_REG),
ResultReg)
.addImm(0).addReg(Result32).addImm(X86::sub_32bit);
+ } else if (DstVT == MVT::i16) {
+ // i8->i16 doesn't exist in the autogenerated isel table. Need to zero
+ // extend to 32-bits and then extract down to 16-bits.
+ unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8),
+ Result32).addReg(ResultReg);
+
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true,
+ X86::sub_16bit);
} else if (DstVT != MVT::i8) {
ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::ZERO_EXTEND,
ResultReg, /*Kill=*/true);
@@ -1604,6 +1581,52 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
return true;
}
+bool X86FastISel::X86SelectSExt(const Instruction *I) {
+ EVT DstVT = TLI.getValueType(DL, I->getType());
+ if (!TLI.isTypeLegal(DstVT))
+ return false;
+
+ unsigned ResultReg = getRegForValue(I->getOperand(0));
+ if (ResultReg == 0)
+ return false;
+
+ // Handle sign-extension from i1 to i8.
+ MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
+ if (SrcVT == MVT::i1) {
+ // Set the high bits to zero.
+ unsigned ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg,
+ /*TODO: Kill=*/false);
+ if (ZExtReg == 0)
+ return false;
+
+ // Negate the result to make an 8-bit sign extended value.
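+    // (The zero-extended i1 is 0 or 1; negating yields 0 or 0xFF, i.e. the i1
+    // value sign-extended to 8 bits.)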
+ ResultReg = createResultReg(&X86::GR8RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::NEG8r),
+ ResultReg).addReg(ZExtReg);
+
+ SrcVT = MVT::i8;
+ }
+
+ if (DstVT == MVT::i16) {
+ // i8->i16 doesn't exist in the autogenerated isel table. Need to sign
+ // extend to 32-bits and then extract down to 16-bits.
+ unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8),
+ Result32).addReg(ResultReg);
+
+ ResultReg = fastEmitInst_extractsubreg(MVT::i16, Result32, /*Kill=*/true,
+ X86::sub_16bit);
+ } else if (DstVT != MVT::i8) {
+ ResultReg = fastEmit_r(MVT::i8, DstVT.getSimpleVT(), ISD::SIGN_EXTEND,
+ ResultReg, /*Kill=*/true);
+ if (ResultReg == 0)
+ return false;
+ }
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
bool X86FastISel::X86SelectBranch(const Instruction *I) {
// Unconditional branches are selected by tablegen-generated code.
// Handle a conditional branch.
@@ -1766,41 +1789,34 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
bool X86FastISel::X86SelectShift(const Instruction *I) {
unsigned CReg = 0, OpReg = 0;
const TargetRegisterClass *RC = nullptr;
- if (I->getType()->isIntegerTy(8)) {
- CReg = X86::CL;
- RC = &X86::GR8RegClass;
- switch (I->getOpcode()) {
- case Instruction::LShr: OpReg = X86::SHR8rCL; break;
- case Instruction::AShr: OpReg = X86::SAR8rCL; break;
- case Instruction::Shl: OpReg = X86::SHL8rCL; break;
- default: return false;
- }
- } else if (I->getType()->isIntegerTy(16)) {
+ assert(!I->getType()->isIntegerTy(8) &&
+ "i8 shifts should be handled by autogenerated table");
+ if (I->getType()->isIntegerTy(16)) {
CReg = X86::CX;
RC = &X86::GR16RegClass;
switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected shift opcode");
case Instruction::LShr: OpReg = X86::SHR16rCL; break;
case Instruction::AShr: OpReg = X86::SAR16rCL; break;
case Instruction::Shl: OpReg = X86::SHL16rCL; break;
- default: return false;
}
} else if (I->getType()->isIntegerTy(32)) {
CReg = X86::ECX;
RC = &X86::GR32RegClass;
switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected shift opcode");
case Instruction::LShr: OpReg = X86::SHR32rCL; break;
case Instruction::AShr: OpReg = X86::SAR32rCL; break;
case Instruction::Shl: OpReg = X86::SHL32rCL; break;
- default: return false;
}
} else if (I->getType()->isIntegerTy(64)) {
CReg = X86::RCX;
RC = &X86::GR64RegClass;
switch (I->getOpcode()) {
+ default: llvm_unreachable("Unexpected shift opcode");
case Instruction::LShr: OpReg = X86::SHR64rCL; break;
case Instruction::AShr: OpReg = X86::SAR64rCL; break;
case Instruction::Shl: OpReg = X86::SHL64rCL; break;
- default: return false;
}
} else {
return false;
@@ -1820,10 +1836,10 @@ bool X86FastISel::X86SelectShift(const Instruction *I) {
// The shift instruction uses X86::CL. If we defined a super-register
// of X86::CL, emit a subreg KILL to precisely describe what we're doing here.
- if (CReg != X86::CL)
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::KILL), X86::CL)
- .addReg(CReg, RegState::Kill);
+ assert(CReg != X86::CL && "CReg should be a super register of CL");
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::KILL), X86::CL)
+ .addReg(CReg, RegState::Kill);
unsigned ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
@@ -1960,12 +1976,12 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
// Generate the DIV/IDIV instruction.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpDivRem)).addReg(Op1Reg);
- // For i8 remainder, we can't reference AH directly, as we'll end
- // up with bogus copies like %R9B = COPY %AH. Reference AX
- // instead to prevent AH references in a REX instruction.
+ // For i8 remainder, we can't reference ah directly, as we'll end
+ // up with bogus copies like %r9b = COPY %ah. Reference ax
+ // instead to prevent ah references in a rex instruction.
//
// The current assumption of the fast register allocator is that isel
- // won't generate explicit references to the GPR8_NOREX registers. If
+ // won't generate explicit references to the GR8_NOREX registers. If
// the allocator and/or the backend get enhanced to be more robust in
// that regard, this can be, and should be, removed.
unsigned ResultReg = 0;
@@ -2159,7 +2175,7 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
unsigned CC;
bool NeedSwap;
std::tie(CC, NeedSwap) = getX86SSEConditionCode(Predicate);
- if (CC > 7)
+ if (CC > 7 && !Subtarget->hasAVX())
return false;
if (NeedSwap)
@@ -2394,7 +2410,8 @@ bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
if (!Subtarget->hasAVX())
return false;
- if (!I->getOperand(0)->getType()->isIntegerTy(32))
+ Type *InTy = I->getOperand(0)->getType();
+ if (!InTy->isIntegerTy(32) && !InTy->isIntegerTy(64))
return false;
// Select integer to float/double conversion.
@@ -2407,11 +2424,11 @@ bool X86FastISel::X86SelectSIToFP(const Instruction *I) {
if (I->getType()->isDoubleTy()) {
// sitofp int -> double
- Opcode = X86::VCVTSI2SDrr;
+ Opcode = InTy->isIntegerTy(64) ? X86::VCVTSI642SDrr : X86::VCVTSI2SDrr;
RC = &X86::FR64RegClass;
} else if (I->getType()->isFloatTy()) {
// sitofp int -> float
- Opcode = X86::VCVTSI2SSrr;
+ Opcode = InTy->isIntegerTy(64) ? X86::VCVTSI642SSrr : X86::VCVTSI2SSrr;
RC = &X86::FR32RegClass;
} else
return false;
@@ -2461,9 +2478,13 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
bool X86FastISel::X86SelectFPExt(const Instruction *I) {
if (X86ScalarSSEf64 && I->getType()->isDoubleTy() &&
I->getOperand(0)->getType()->isFloatTy()) {
+ bool HasAVX512 = Subtarget->hasAVX512();
// fpext from float to double.
- unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
- return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR64RegClass);
+ unsigned Opc =
+ HasAVX512 ? X86::VCVTSS2SDZrr
+ : Subtarget->hasAVX() ? X86::VCVTSS2SDrr : X86::CVTSS2SDrr;
+ return X86SelectFPExtOrFPTrunc(
+ I, Opc, HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass);
}
return false;
@@ -2472,9 +2493,13 @@ bool X86FastISel::X86SelectFPExt(const Instruction *I) {
bool X86FastISel::X86SelectFPTrunc(const Instruction *I) {
if (X86ScalarSSEf64 && I->getType()->isFloatTy() &&
I->getOperand(0)->getType()->isDoubleTy()) {
+ bool HasAVX512 = Subtarget->hasAVX512();
// fptrunc from double to float.
- unsigned Opc = Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
- return X86SelectFPExtOrFPTrunc(I, Opc, &X86::FR32RegClass);
+ unsigned Opc =
+ HasAVX512 ? X86::VCVTSD2SSZrr
+ : Subtarget->hasAVX() ? X86::VCVTSD2SSrr : X86::CVTSD2SSrr;
+ return X86SelectFPExtOrFPTrunc(
+ I, Opc, HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass);
}
return false;
@@ -2485,8 +2510,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
EVT DstVT = TLI.getValueType(DL, I->getType());
// This code only handles truncation to byte.
- // TODO: Support truncate to i1 with AVX512.
- if (DstVT != MVT::i8 && (DstVT != MVT::i1 || Subtarget->hasAVX512()))
+ if (DstVT != MVT::i8 && DstVT != MVT::i1)
return false;
if (!TLI.isTypeLegal(SrcVT))
return false;
@@ -2502,22 +2526,9 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
return true;
}
- bool KillInputReg = false;
- if (!Subtarget->is64Bit()) {
- // If we're on x86-32; we can't extract an i8 from a general register.
- // First issue a copy to GR16_ABCD or GR32_ABCD.
- const TargetRegisterClass *CopyRC =
- (SrcVT == MVT::i16) ? &X86::GR16_ABCDRegClass : &X86::GR32_ABCDRegClass;
- unsigned CopyReg = createResultReg(CopyRC);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), CopyReg).addReg(InputReg);
- InputReg = CopyReg;
- KillInputReg = true;
- }
-
// Issue an extract_subreg.
unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
- InputReg, KillInputReg,
+ InputReg, false,
X86::sub_8bit);
if (!ResultReg)
return false;
@@ -3300,16 +3311,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Handle zero-extension from i1 to i8, which is common.
if (ArgVT == MVT::i1) {
- // In case SrcReg is a K register, COPY to a GPR
- if (MRI.getRegClass(ArgReg) == &X86::VK1RegClass) {
- unsigned KArgReg = ArgReg;
- ArgReg = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::COPY), ArgReg)
- .addReg(KArgReg);
- ArgReg = fastEmitInst_extractsubreg(MVT::i8, ArgReg, /*Kill=*/true,
- X86::sub_8bit);
- }
// Set the high bits to zero.
ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
ArgVT = MVT::i8;
@@ -3455,19 +3456,26 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
} else {
// Direct call.
assert(GV && "Not a direct call");
- unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
-
// See if we need any target-specific flags on the GV operand.
unsigned char OpFlags = Subtarget->classifyGlobalFunctionReference(GV);
- // Ignore NonLazyBind attribute in FastISel
- if (OpFlags == X86II::MO_GOTPCREL)
- OpFlags = 0;
+
+ // This will be a direct call, or an indirect call through memory for
+ // NonLazyBind calls or dllimport calls.
+ bool NeedLoad =
+ OpFlags == X86II::MO_DLLIMPORT || OpFlags == X86II::MO_GOTPCREL;
+ unsigned CallOpc = NeedLoad
+ ? (Is64Bit ? X86::CALL64m : X86::CALL32m)
+ : (Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(CallOpc));
+ if (NeedLoad)
+ MIB.addReg(Is64Bit ? X86::RIP : 0).addImm(1).addReg(0);
if (Symbol)
MIB.addSym(Symbol, OpFlags);
else
MIB.addGlobalAddress(GV, 0, OpFlags);
+ if (NeedLoad)
+ MIB.addReg(0);
}
// Add a register mask operand representing the call-preserved registers.
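A minimal sketch of the addressing operands the NeedLoad path above ends up building (illustrative values; the exact encoding comes from the MIB calls shown in the hunk):

    // CALL64m / CALL32m take the standard five X86 memory operands, appended
    // in this order by the code above:
    //   base  : %rip on x86-64 (no base register on x86-32)
    //   scale : 1
    //   index : %noreg
    //   disp  : the GlobalAddress, with OpFlags selecting the GOT entry
    //           (MO_GOTPCREL) or the dllimport pointer (MO_DLLIMPORT)
    //   seg   : %noreg
    // Roughly `callq *callee@GOTPCREL(%rip)` for a NonLazyBind callee on x86-64.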
@@ -3515,16 +3523,6 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
report_fatal_error("SSE register return with SSE disabled");
}
- // If the return value is an i1 and AVX-512 is enabled, we need
- // to do a fixup to make the copy legal.
- if (CopyVT == MVT::i1 && SrcReg == X86::AL && Subtarget->hasAVX512()) {
- // Need to copy to a GR32 first.
- // TODO: MOVZX isn't great here. We don't care about the upper bits.
- SrcReg = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::MOVZX32rr8), SrcReg).addReg(X86::AL);
- }
-
// If we prefer to use the value in xmm registers, copy it out as f80 and
// use a truncate to move it from fp stack reg to xmm reg.
if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
@@ -3577,6 +3575,8 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
return X86SelectCmp(I);
case Instruction::ZExt:
return X86SelectZExt(I);
+ case Instruction::SExt:
+ return X86SelectSExt(I);
case Instruction::Br:
return X86SelectBranch(I);
case Instruction::LShr:
@@ -3723,8 +3723,10 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
default: return 0;
case MVT::f32:
if (X86ScalarSSEf32) {
- Opc = Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
- RC = &X86::FR32RegClass;
+ Opc = Subtarget->hasAVX512()
+ ? X86::VMOVSSZrm
+ : Subtarget->hasAVX() ? X86::VMOVSSrm : X86::MOVSSrm;
+ RC = Subtarget->hasAVX512() ? &X86::FR32XRegClass : &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp32m;
RC = &X86::RFP32RegClass;
@@ -3732,8 +3734,10 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
break;
case MVT::f64:
if (X86ScalarSSEf64) {
- Opc = Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
- RC = &X86::FR64RegClass;
+ Opc = Subtarget->hasAVX512()
+ ? X86::VMOVSDZrm
+ : Subtarget->hasAVX() ? X86::VMOVSDrm : X86::MOVSDrm;
+ RC = Subtarget->hasAVX512() ? &X86::FR64XRegClass : &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp64m;
RC = &X86::RFP64RegClass;
@@ -3871,14 +3875,15 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
return 0;
// Get opcode and regclass for the given zero.
+ bool HasAVX512 = Subtarget->hasAVX512();
unsigned Opc = 0;
const TargetRegisterClass *RC = nullptr;
switch (VT.SimpleTy) {
default: return 0;
case MVT::f32:
if (X86ScalarSSEf32) {
- Opc = X86::FsFLD0SS;
- RC = &X86::FR32RegClass;
+ Opc = HasAVX512 ? X86::AVX512_FsFLD0SS : X86::FsFLD0SS;
+ RC = HasAVX512 ? &X86::FR32XRegClass : &X86::FR32RegClass;
} else {
Opc = X86::LD_Fp032;
RC = &X86::RFP32RegClass;
@@ -3886,8 +3891,8 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
break;
case MVT::f64:
if (X86ScalarSSEf64) {
- Opc = X86::FsFLD0SD;
- RC = &X86::FR64RegClass;
+ Opc = HasAVX512 ? X86::AVX512_FsFLD0SD : X86::FsFLD0SD;
+ RC = HasAVX512 ? &X86::FR64XRegClass : &X86::FR64RegClass;
} else {
Opc = X86::LD_Fp064;
RC = &X86::RFP64RegClass;
@@ -3964,7 +3969,7 @@ unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
- Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 3);
+ Op3 = constrainOperandRegClass(II, Op3, II.getNumDefs() + 3);
if (II.getNumDefs() >= 1)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index 95c6f2a3fa34..01d10fe4cae4 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -55,9 +55,9 @@
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
#define FIXUPBW_DESC "X86 Byte/Word Instruction Fixup"
@@ -146,12 +146,12 @@ INITIALIZE_PASS(FixupBWInstPass, FIXUPBW_NAME, FIXUPBW_DESC, false, false)
FunctionPass *llvm::createX86FixupBWInsts() { return new FixupBWInstPass(); }
bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
- if (!FixupBWInsts || skipFunction(*MF.getFunction()))
+ if (!FixupBWInsts || skipFunction(MF.getFunction()))
return false;
this->MF = &MF;
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
- OptForSize = MF.getFunction()->optForSize();
+ OptForSize = MF.getFunction().optForSize();
MLI = &getAnalysis<MachineLoopInfo>();
LiveRegs.init(TII->getRegisterInfo());
@@ -166,15 +166,86 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
return true;
}
-// TODO: This method of analysis can miss some legal cases, because the
-// super-register could be live into the address expression for a memory
-// reference for the instruction, and still be killed/last used by the
-// instruction. However, the existing query interfaces don't seem to
-// easily allow that to be checked.
-//
-// What we'd really like to know is whether after OrigMI, the
-// only portion of SuperDestReg that is alive is the portion that
-// was the destination register of OrigMI.
+/// Check if register \p Reg is live after \p MI.
+///
+/// \p LiveRegs should describe the liveness information at exactly that
+/// point; this function tries to refine the analysis made by \p LiveRegs
+/// by exploiting information about the particular instruction \p MI.
+/// \p MI is expected to be one of the MOVs handled by the X86FixupBWInsts
+/// pass.
+/// Note: similar to LivePhysRegs::contains this would state that
+/// super-register is not used if only some part of it is used.
+///
+/// X86 backend does not have subregister liveness tracking enabled,
+/// so liveness information might be overly conservative. However, for
+/// some specific instructions (this pass only cares about MOVs) we can
+/// produce more precise results by analysing that MOV's operands.
+///
+/// Indeed, if the super-register is not live before the mov, it means it
+/// was originally <read-undef>, so we are free to modify those undef upper
+/// bits. That may happen when the use is in another MBB and the
+/// vreg/physreg corresponding to the mov is wider than necessary
+/// (e.g. due to register coalescing with a "truncate" copy).
+/// So, it handles pattern like this:
+///
+/// %bb.2: derived from LLVM BB %if.then
+/// Live Ins: %rdi
+/// Predecessors according to CFG: %bb.0
+/// %ax = MOV16rm killed %rdi, 1, %noreg, 0, %noreg, implicit-def %eax;
+/// mem:LD2[%p]
+/// No implicit %eax
+/// Successors according to CFG: %bb.3(?%)
+///
+/// %bb.3: derived from LLVM BB %if.end
+/// Live Ins: %eax Only %ax is actually live
+/// Predecessors according to CFG: %bb.2 %bb.1
+/// %ax = KILL %ax, implicit killed %eax
+/// RET 0, %ax
+static bool isLive(const MachineInstr &MI,
+ const LivePhysRegs &LiveRegs,
+ const TargetRegisterInfo *TRI,
+ unsigned Reg) {
+ if (!LiveRegs.contains(Reg))
+ return false;
+
+ unsigned Opc = MI.getOpcode(); (void)Opc;
+ // These are the opcodes currently handled by the pass, if something
+ // else will be added we need to ensure that new opcode has the same
+ // properties.
+ assert((Opc == X86::MOV8rm || Opc == X86::MOV16rm || Opc == X86::MOV8rr ||
+ Opc == X86::MOV16rr) &&
+ "Unexpected opcode.");
+
+ bool IsDefined = false;
+ for (auto &MO: MI.implicit_operands()) {
+ if (!MO.isReg())
+ continue;
+
+ assert((MO.isDef() || MO.isUse()) && "Expected Def or Use only!");
+
+ for (MCSuperRegIterator Supers(Reg, TRI, true); Supers.isValid(); ++Supers) {
+ if (*Supers == MO.getReg()) {
+ if (MO.isDef())
+ IsDefined = true;
+ else
+          return true; // SuperReg is imp-used -> live before the MI
+ }
+ }
+ }
+ // Reg is not Imp-def'ed -> it's live both before/after the instruction.
+ if (!IsDefined)
+ return true;
+
+ // Otherwise, the Reg is not live before the MI and the MOV can't
+ // make it really live, so it's in fact dead even after the MI.
+ return false;
+}
+
+/// \brief Check whether, after \p OrigMI, the only portion of the super
+/// register of \p OrigMI's destination register that is still alive is the
+/// destination register itself.
+///
+/// If so, return that super register in \p SuperDestReg.
bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
unsigned &SuperDestReg) const {
auto *TRI = &TII->getRegisterInfo();
@@ -191,7 +262,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
if (SubRegIdx == X86::sub_8bit_hi)
return false;
- if (LiveRegs.contains(SuperDestReg))
+ if (isLive(*OrigMI, LiveRegs, TRI, SuperDestReg))
return false;
if (SubRegIdx == X86::sub_8bit) {
@@ -201,7 +272,7 @@ bool FixupBWInstPass::getSuperRegDestIfDead(MachineInstr *OrigMI,
unsigned UpperByteReg =
getX86SubSuperRegister(SuperDestReg, 8, /*High=*/true);
- if (LiveRegs.contains(UpperByteReg))
+ if (isLive(*OrigMI, LiveRegs, TRI, UpperByteReg))
return false;
}
@@ -328,7 +399,7 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
MachineInstr *MI = &*I;
-
+
if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB))
MIReplacements.push_back(std::make_pair(MI, NewMI));
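A minimal worked example of the rewrite this pass performs, and of why the refined liveness query matters (opcodes and registers are illustrative of the common case):

    // Before: %ax  = MOV16rm %rdi, 1, %noreg, 0, %noreg      ; partial write of %eax
    // After:  %eax = MOVZX32rm16 %rdi, 1, %noreg, 0, %noreg   ; full-width write
    //
    // The widening is only legal when no other part of %eax is live after the
    // MOV. A plain LiveRegs.contains(%eax) query also rejects the <read-undef>
    // case where the MOV itself implicit-defs %eax; isLive() recognizes that
    // pattern and still allows the transformation.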
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 9f649dad8bc0..b41bf99f19b2 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -17,14 +17,12 @@
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
namespace llvm {
@@ -193,12 +191,12 @@ FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
FunctionPass *llvm::createX86FixupLEAs() { return new FixupLEAPass(); }
bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
- if (skipFunction(*Func.getFunction()))
+ if (skipFunction(Func.getFunction()))
return false;
MF = &Func;
const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
- OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
+ OptIncDec = !ST.slowIncDec() || Func.getFunction().optForMinSize();
OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
if (!OptLEA && !OptIncDec)
diff --git a/lib/Target/X86/X86FloatingPoint.cpp b/lib/Target/X86/X86FloatingPoint.cpp
index 5582526541ba..9a72e7114be0 100644
--- a/lib/Target/X86/X86FloatingPoint.cpp
+++ b/lib/Target/X86/X86FloatingPoint.cpp
@@ -37,13 +37,13 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include <algorithm>
#include <bitset>
using namespace llvm;
@@ -349,7 +349,7 @@ bool FPS::runOnMachineFunction(MachineFunction &MF) {
// In regcall convention, some FP registers may not be passed through
// the stack, so they will need to be assigned to the stack first
- if ((Entry->getParent()->getFunction()->getCallingConv() ==
+ if ((Entry->getParent()->getFunction().getCallingConv() ==
CallingConv::X86_RegCall) && (Bundle.Mask && !Bundle.FixCount)) {
// In the register calling convention, up to one FP argument could be
// saved in the first FP register.
@@ -499,7 +499,7 @@ bool FPS::processBasicBlock(MachineFunction &MF, MachineBasicBlock &BB) {
/// setupBlockStack - Use the live bundles to set up our model of the stack
/// to match predecessors' live out stack.
void FPS::setupBlockStack() {
- DEBUG(dbgs() << "\nSetting up live-ins for BB#" << MBB->getNumber()
+ DEBUG(dbgs() << "\nSetting up live-ins for " << printMBBReference(*MBB)
<< " derived from " << MBB->getName() << ".\n");
StackTop = 0;
// Get the live-in bundle for MBB.
@@ -516,7 +516,7 @@ void FPS::setupBlockStack() {
// Push the fixed live-in registers.
for (unsigned i = Bundle.FixCount; i > 0; --i) {
- DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %FP"
+ DEBUG(dbgs() << "Live-in st(" << (i-1) << "): %fp"
<< unsigned(Bundle.FixStack[i-1]) << '\n');
pushReg(Bundle.FixStack[i-1]);
}
@@ -538,7 +538,7 @@ void FPS::finishBlockStack() {
if (MBB->succ_empty())
return;
- DEBUG(dbgs() << "Setting up live-outs for BB#" << MBB->getNumber()
+ DEBUG(dbgs() << "Setting up live-outs for " << printMBBReference(*MBB)
<< " derived from " << MBB->getName() << ".\n");
// Get MBB's live-out bundle.
@@ -893,7 +893,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
while (Kills && Defs) {
unsigned KReg = countTrailingZeros(Kills);
unsigned DReg = countTrailingZeros(Defs);
- DEBUG(dbgs() << "Renaming %FP" << KReg << " as imp %FP" << DReg << "\n");
+ DEBUG(dbgs() << "Renaming %fp" << KReg << " as imp %fp" << DReg << "\n");
std::swap(Stack[getSlot(KReg)], Stack[getSlot(DReg)]);
std::swap(RegMap[KReg], RegMap[DReg]);
Kills &= ~(1 << KReg);
@@ -907,7 +907,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
unsigned KReg = getStackEntry(0);
if (!(Kills & (1 << KReg)))
break;
- DEBUG(dbgs() << "Popping %FP" << KReg << "\n");
+ DEBUG(dbgs() << "Popping %fp" << KReg << "\n");
popStackAfter(I2);
Kills &= ~(1 << KReg);
}
@@ -916,7 +916,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Manually kill the rest.
while (Kills) {
unsigned KReg = countTrailingZeros(Kills);
- DEBUG(dbgs() << "Killing %FP" << KReg << "\n");
+ DEBUG(dbgs() << "Killing %fp" << KReg << "\n");
freeStackSlotBefore(I, KReg);
Kills &= ~(1 << KReg);
}
@@ -924,7 +924,7 @@ void FPS::adjustLiveRegs(unsigned Mask, MachineBasicBlock::iterator I) {
// Load zeros for all the imp-defs.
while(Defs) {
unsigned DReg = countTrailingZeros(Defs);
- DEBUG(dbgs() << "Defining %FP" << DReg << " as 0\n");
+ DEBUG(dbgs() << "Defining %fp" << DReg << " as 0\n");
BuildMI(*MBB, I, DebugLoc(), TII->get(X86::LD_F0));
pushReg(DReg);
Defs &= ~(1 << DReg);
@@ -973,7 +973,7 @@ void FPS::handleCall(MachineBasicBlock::iterator &I) {
unsigned R = MO.getReg() - X86::FP0;
if (R < 8) {
- if (MF->getFunction()->getCallingConv() != CallingConv::X86_RegCall) {
+ if (MF->getFunction().getCallingConv() != CallingConv::X86_RegCall) {
assert(MO.isDef() && MO.isImplicit());
}
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index f294e819090b..80b1cc192a88 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -148,8 +148,7 @@ static unsigned findDeadCallerSavedReg(MachineBasicBlock &MBB,
const X86RegisterInfo *TRI,
bool Is64Bit) {
const MachineFunction *MF = MBB.getParent();
- const Function *F = MF->getFunction();
- if (!F || MF->callsEHReturn())
+ if (MF->callsEHReturn())
return 0;
const TargetRegisterClass &AvailableRegs = *TRI->getGPRsForTailCall(*MF);
@@ -820,7 +819,7 @@ uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) con
const MachineFrameInfo &MFI = MF.getFrameInfo();
uint64_t MaxAlign = MFI.getMaxAlignment(); // Desired stack alignment.
unsigned StackAlign = getStackAlignment();
- if (MF.getFunction()->hasFnAttribute("stackrealign")) {
+ if (MF.getFunction().hasFnAttribute("stackrealign")) {
if (MFI.hasCalls())
MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
else if (MaxAlign < SlotSize)
@@ -924,6 +923,7 @@ void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
Notes:
- .seh directives are emitted only for Windows 64 ABI
+ - .cv_fpo directives are emitted on win32 when emitting CodeView
- .cfi directives are emitted for all other ABIs
- for 32-bit code, substitute %e?? registers for %r??
*/
@@ -934,31 +934,35 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
"MF used frame lowering for wrong subtarget");
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineFrameInfo &MFI = MF.getFrameInfo();
- const Function *Fn = MF.getFunction();
+ const Function &Fn = MF.getFunction();
MachineModuleInfo &MMI = MF.getMMI();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
uint64_t MaxAlign = calculateMaxStackAlign(MF); // Desired stack alignment.
uint64_t StackSize = MFI.getStackSize(); // Number of bytes to allocate.
bool IsFunclet = MBB.isEHFuncletEntry();
EHPersonality Personality = EHPersonality::Unknown;
- if (Fn->hasPersonalityFn())
- Personality = classifyEHPersonality(Fn->getPersonalityFn());
+ if (Fn.hasPersonalityFn())
+ Personality = classifyEHPersonality(Fn.getPersonalityFn());
bool FnHasClrFunclet =
MF.hasEHFunclets() && Personality == EHPersonality::CoreCLR;
bool IsClrFunclet = IsFunclet && FnHasClrFunclet;
bool HasFP = hasFP(MF);
- bool IsWin64CC = STI.isCallingConvWin64(Fn->getCallingConv());
+ bool IsWin64CC = STI.isCallingConvWin64(Fn.getCallingConv());
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
- bool NeedsWinCFI = IsWin64Prologue && Fn->needsUnwindTableEntry();
+ bool NeedsWin64CFI = IsWin64Prologue && Fn.needsUnwindTableEntry();
+ // FIXME: Emit FPO data for EH funclets.
+ bool NeedsWinFPO =
+ !IsFunclet && STI.isTargetWin32() && MMI.getModule()->getCodeViewFlag();
+ bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;
bool NeedsDwarfCFI =
- !IsWin64Prologue && (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+ !IsWin64Prologue && (MMI.hasDebugInfo() || Fn.needsUnwindTableEntry());
unsigned FramePtr = TRI->getFrameRegister(MF);
const unsigned MachineFramePtr =
STI.isTarget64BitILP32()
? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
unsigned BasePtr = TRI->getBaseRegister();
bool HasWinCFI = false;
-
+
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
@@ -977,16 +981,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// The default stack probe size is 4096 if the function has no stackprobesize
// attribute.
unsigned StackProbeSize = 4096;
- if (Fn->hasFnAttribute("stack-probe-size"))
- Fn->getFnAttribute("stack-probe-size")
+ if (Fn.hasFnAttribute("stack-probe-size"))
+ Fn.getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
// Re-align the stack on 64-bit if the x86-interrupt calling convention is
// used and an error code was pushed, since the x86-64 ABI requires a 16-byte
// stack alignment.
- if (Fn->getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
- Fn->arg_size() == 2) {
+ if (Fn.getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
+ Fn.arg_size() == 2) {
StackSize += 8;
MFI.setStackSize(StackSize);
emitSPUpdate(MBB, MBBI, -8, /*InEpilogue=*/false);
@@ -997,7 +1001,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// pointer, calls, or dynamic alloca then we do not need to adjust the
// stack pointer (we fit in the Red Zone). We also check that we don't
// push and pop from the stack.
- if (Is64Bit && !Fn->hasFnAttribute(Attribute::NoRedZone) &&
+ if (Is64Bit && !Fn.hasFnAttribute(Attribute::NoRedZone) &&
!TRI->needsStackRealignment(MF) &&
!MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
@@ -1120,6 +1124,15 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaRegister(
nullptr, DwarfFramePtr));
}
+
+ if (NeedsWinFPO) {
+ // .cv_fpo_setframe $FramePtr
+ HasWinCFI = true;
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
+ .addImm(FramePtr)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
}
} else {
assert(!IsFunclet && "funclets without FPs not yet implemented");
@@ -1155,8 +1168,9 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (NeedsWinCFI) {
HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg)).addImm(Reg).setMIFlag(
- MachineInstr::FrameSetup);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_PushReg))
+ .addImm(Reg)
+ .setMIFlag(MachineInstr::FrameSetup);
}
}
@@ -1295,6 +1309,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// If this is not a funclet, emit the CFI describing our frame pointer.
if (NeedsWinCFI && !IsFunclet) {
+ assert(!NeedsWinFPO && "this setframe incompatible with FPO data");
HasWinCFI = true;
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SetFrame))
.addImm(FramePtr)
@@ -1333,6 +1348,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
Offset += SEHFrameOffset;
HasWinCFI = true;
+ assert(!NeedsWinFPO && "SEH_SaveXMM incompatible with FPO data");
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_SaveXMM))
.addImm(Reg)
.addImm(Offset)
@@ -1419,8 +1435,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
}
// Emit DWARF info specifying the offsets of the callee-saved registers.
- if (PushedRegs)
- emitCalleeSavedFrameMoves(MBB, MBBI, DL);
+ emitCalleeSavedFrameMoves(MBB, MBBI, DL);
}
// X86 Interrupt handling function cannot assume anything about the direction
@@ -1431,7 +1446,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// 1. The interrupt handling function uses any of the "rep" instructions.
// 2. Interrupt handling function calls another function.
//
- if (Fn->getCallingConv() == CallingConv::X86_INTR)
+ if (Fn.getCallingConv() == CallingConv::X86_INTR)
BuildMI(MBB, MBBI, DL, TII.get(X86::CLD))
.setMIFlag(MachineInstr::FrameSetup);
@@ -1492,7 +1507,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
// This is the amount of stack a funclet needs to allocate.
unsigned UsedSize;
EHPersonality Personality =
- classifyEHPersonality(MF.getFunction()->getPersonalityFn());
+ classifyEHPersonality(MF.getFunction().getPersonalityFn());
if (Personality == EHPersonality::CoreCLR) {
// CLR funclets need to hold enough space to include the PSPSym, at the
// same offset from the stack pointer (immediately after the prolog) as it
@@ -1522,10 +1537,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
- MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
- Optional<unsigned> RetOpcode;
- if (MBBI != MBB.end())
- RetOpcode = MBBI->getOpcode();
+ MachineBasicBlock::iterator Terminator = MBB.getFirstTerminator();
+ MachineBasicBlock::iterator MBBI = Terminator;
DebugLoc DL;
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
@@ -1536,38 +1549,21 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
Is64BitILP32 ? getX86SubSuperRegister(FramePtr, 64) : FramePtr;
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
- bool NeedsWinCFI =
- IsWin64Prologue && MF.getFunction()->needsUnwindTableEntry();
+ bool NeedsWin64CFI =
+ IsWin64Prologue && MF.getFunction().needsUnwindTableEntry();
bool IsFunclet = MBBI == MBB.end() ? false : isFuncletReturnInstr(*MBBI);
- MachineBasicBlock *TargetMBB = nullptr;
// Get the number of bytes to allocate from the FrameInfo.
uint64_t StackSize = MFI.getStackSize();
uint64_t MaxAlign = calculateMaxStackAlign(MF);
unsigned CSSize = X86FI->getCalleeSavedFrameSize();
+ bool HasFP = hasFP(MF);
uint64_t NumBytes = 0;
- if (RetOpcode && *RetOpcode == X86::CATCHRET) {
- // SEH shouldn't use catchret.
- assert(!isAsynchronousEHPersonality(
- classifyEHPersonality(MF.getFunction()->getPersonalityFn())) &&
- "SEH should not use CATCHRET");
-
- NumBytes = getWinEHFuncletFrameSize(MF);
- assert(hasFP(MF) && "EH funclets without FP not yet implemented");
- TargetMBB = MBBI->getOperand(0).getMBB();
-
- // Pop EBP.
- BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
- MachineFramePtr)
- .setMIFlag(MachineInstr::FrameDestroy);
- } else if (RetOpcode && *RetOpcode == X86::CLEANUPRET) {
+ if (IsFunclet) {
+ assert(HasFP && "EH funclets without FP not yet implemented");
NumBytes = getWinEHFuncletFrameSize(MF);
- assert(hasFP(MF) && "EH funclets without FP not yet implemented");
- BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
- MachineFramePtr)
- .setMIFlag(MachineInstr::FrameDestroy);
- } else if (hasFP(MF)) {
+ } else if (HasFP) {
// Calculate required stack adjustment.
uint64_t FrameSize = StackSize - SlotSize;
NumBytes = FrameSize - CSSize;
@@ -1576,16 +1572,18 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// realigned.
if (TRI->needsStackRealignment(MF) && !IsWin64Prologue)
NumBytes = alignTo(FrameSize, MaxAlign);
-
- // Pop EBP.
- BuildMI(MBB, MBBI, DL,
- TII.get(Is64Bit ? X86::POP64r : X86::POP32r), MachineFramePtr)
- .setMIFlag(MachineInstr::FrameDestroy);
} else {
NumBytes = StackSize - CSSize;
}
uint64_t SEHStackAllocAmt = NumBytes;
+ if (HasFP) {
+ // Pop EBP.
+ BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
+ MachineFramePtr)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ }
+
MachineBasicBlock::iterator FirstCSPop = MBBI;
// Skip the callee-saved pop instructions.
while (MBBI != MBB.begin()) {
@@ -1603,26 +1601,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
MBBI = FirstCSPop;
- if (TargetMBB) {
- // Fill EAX/RAX with the address of the target block.
- unsigned ReturnReg = STI.is64Bit() ? X86::RAX : X86::EAX;
- if (STI.is64Bit()) {
- // LEA64r TargetMBB(%rip), %rax
- BuildMI(MBB, FirstCSPop, DL, TII.get(X86::LEA64r), ReturnReg)
- .addReg(X86::RIP)
- .addImm(0)
- .addReg(0)
- .addMBB(TargetMBB)
- .addReg(0);
- } else {
- // MOV32ri $TargetMBB, %eax
- BuildMI(MBB, FirstCSPop, DL, TII.get(X86::MOV32ri), ReturnReg)
- .addMBB(TargetMBB);
- }
- // Record that we've taken the address of TargetMBB and no longer just
- // reference it in a terminator.
- TargetMBB->setHasAddressTaken();
- }
+ if (IsFunclet && Terminator->getOpcode() == X86::CATCHRET)
+ emitCatchRetReturnValue(MBB, FirstCSPop, &*Terminator);
if (MBBI != MBB.end())
DL = MBBI->getDebugLoc();
@@ -1674,19 +1654,17 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
// into the epilogue. To cope with that, we insert an epilogue marker here,
// then replace it with a 'nop' if it ends up immediately after a CALL in the
// final emitted code.
- if (NeedsWinCFI && MF.hasWinCFI())
+ if (NeedsWin64CFI && MF.hasWinCFI())
BuildMI(MBB, MBBI, DL, TII.get(X86::SEH_Epilogue));
- if (!RetOpcode || !isTailCallOpcode(*RetOpcode)) {
+ if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
int Offset = -1 * X86FI->getTCReturnAddrDelta();
assert(Offset >= 0 && "TCDelta should never be positive");
if (Offset) {
- MBBI = MBB.getFirstTerminator();
-
// Check for possible merge with preceding ADD instruction.
- Offset += mergeSPUpdates(MBB, MBBI, true);
- emitSPUpdate(MBB, MBBI, Offset, /*InEpilogue=*/true);
+ Offset += mergeSPUpdates(MBB, Terminator, true);
+ emitSPUpdate(MBB, Terminator, Offset, /*InEpilogue=*/true);
}
}
}
@@ -1997,9 +1975,39 @@ bool X86FrameLowering::spillCalleeSavedRegisters(
return true;
}
+void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineInstr *CatchRet) const {
+ // SEH shouldn't use catchret.
+ assert(!isAsynchronousEHPersonality(classifyEHPersonality(
+ MBB.getParent()->getFunction().getPersonalityFn())) &&
+ "SEH should not use CATCHRET");
+ DebugLoc DL = CatchRet->getDebugLoc();
+ MachineBasicBlock *CatchRetTarget = CatchRet->getOperand(0).getMBB();
+
+ // Fill EAX/RAX with the address of the target block.
+ if (STI.is64Bit()) {
+ // LEA64r CatchRetTarget(%rip), %rax
+ BuildMI(MBB, MBBI, DL, TII.get(X86::LEA64r), X86::RAX)
+ .addReg(X86::RIP)
+ .addImm(0)
+ .addReg(0)
+ .addMBB(CatchRetTarget)
+ .addReg(0);
+ } else {
+ // MOV32ri $CatchRetTarget, %eax
+ BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
+ .addMBB(CatchRetTarget);
+ }
+
+ // Record that we've taken the address of CatchRetTarget and no longer just
+ // reference it in a terminator.
+ CatchRetTarget->setHasAddressTaken();
+}
+
bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -2012,9 +2020,9 @@ bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
// Don't restore CSRs before an SEH catchret. SEH except blocks do not form
// funclets. emitEpilogue transforms these to normal jumps.
if (MI->getOpcode() == X86::CATCHRET) {
- const Function *Func = MBB.getParent()->getFunction();
+ const Function &F = MBB.getParent()->getFunction();
bool IsSEH = isAsynchronousEHPersonality(
- classifyEHPersonality(Func->getPersonalityFn()));
+ classifyEHPersonality(F.getPersonalityFn()));
if (IsSEH)
return true;
}
@@ -2086,8 +2094,8 @@ void X86FrameLowering::determineCalleeSaves(MachineFunction &MF,
static bool
HasNestArgument(const MachineFunction *MF) {
- const Function *F = MF->getFunction();
- for (Function::const_arg_iterator I = F->arg_begin(), E = F->arg_end();
+ const Function &F = MF->getFunction();
+ for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
I != E; I++) {
if (I->hasNestAttr())
return true;
@@ -2101,7 +2109,7 @@ HasNestArgument(const MachineFunction *MF) {
/// needed. Set primary to true for the first register, false for the second.
static unsigned
GetScratchRegister(bool Is64Bit, bool IsLP64, const MachineFunction &MF, bool Primary) {
- CallingConv::ID CallingConvention = MF.getFunction()->getCallingConv();
+ CallingConv::ID CallingConvention = MF.getFunction().getCallingConv();
// Erlang stuff.
if (CallingConvention == CallingConv::HiPE) {
@@ -2151,7 +2159,7 @@ void X86FrameLowering::adjustForSegmentedStacks(
assert(!MF.getRegInfo().isLiveIn(ScratchReg) &&
"Scratch register is live-in");
- if (MF.getFunction()->isVarArg())
+ if (MF.getFunction().isVarArg())
report_fatal_error("Segmented stacks do not support vararg functions.");
if (!STI.isTargetLinux() && !STI.isTargetDarwin() && !STI.isTargetWin32() &&
!STI.isTargetWin64() && !STI.isTargetFreeBSD() &&
@@ -2425,8 +2433,8 @@ void X86FrameLowering::adjustForHiPEPrologue(
Is64Bit ? "AMD64_LEAF_WORDS" : "X86_LEAF_WORDS");
const unsigned CCRegisteredArgs = Is64Bit ? 6 : 5;
const unsigned Guaranteed = HipeLeafWords * SlotSize;
- unsigned CallerStkArity = MF.getFunction()->arg_size() > CCRegisteredArgs ?
- MF.getFunction()->arg_size() - CCRegisteredArgs : 0;
+ unsigned CallerStkArity = MF.getFunction().arg_size() > CCRegisteredArgs ?
+ MF.getFunction().arg_size() - CCRegisteredArgs : 0;
unsigned MaxStack = MFI.getStackSize() + CallerStkArity*SlotSize + SlotSize;
assert(STI.isTargetLinux() &&
@@ -2567,6 +2575,7 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
unsigned Regs[2];
unsigned FoundRegs = 0;
+ auto &MRI = MBB.getParent()->getRegInfo();
auto RegMask = Prev->getOperand(1);
auto &RegClass =
@@ -2580,6 +2589,10 @@ bool X86FrameLowering::adjustStackWithPops(MachineBasicBlock &MBB,
if (!RegMask.clobbersPhysReg(Candidate))
continue;
+ // Don't clobber reserved registers
+ if (MRI.isReserved(Candidate))
+ continue;
+
bool IsDef = false;
for (const MachineOperand &MO : Prev->implicit_operands()) {
if (MO.isReg() && MO.isDef() &&
@@ -2635,10 +2648,10 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
Amount = alignTo(Amount, StackAlign);
MachineModuleInfo &MMI = MF.getMMI();
- const Function *Fn = MF.getFunction();
+ const Function &F = MF.getFunction();
bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
- bool DwarfCFI = !WindowsCFI &&
- (MMI.hasDebugInfo() || Fn->needsUnwindTableEntry());
+ bool DwarfCFI = !WindowsCFI &&
+ (MMI.hasDebugInfo() || F.needsUnwindTableEntry());
// If we have any exception handlers in this function, and we adjust
// the SP before calls, we may need to indicate this to the unwinder
@@ -2680,7 +2693,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
StackAdjustment += mergeSPUpdates(MBB, InsertPos, false);
if (StackAdjustment) {
- if (!(Fn->optForMinSize() &&
+ if (!(F.optForMinSize() &&
adjustStackWithPops(MBB, InsertPos, DL, StackAdjustment)))
BuildStackAdjustment(MBB, InsertPos, DL, StackAdjustment,
/*InEpilogue=*/false);
@@ -2753,13 +2766,13 @@ bool X86FrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
bool X86FrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
// If we may need to emit frameless compact unwind information, give
// up as this is currently broken: PR25614.
- return (MF.getFunction()->hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) &&
+ return (MF.getFunction().hasFnAttribute(Attribute::NoUnwind) || hasFP(MF)) &&
// The lowering of segmented stack and HiPE only support entry blocks
// as prologue blocks: PR26107.
// This limitation may be lifted if we fix:
// - adjustForSegmentedStacks
// - adjustForHiPEPrologue
- MF.getFunction()->getCallingConv() != CallingConv::HiPE &&
+ MF.getFunction().getCallingConv() != CallingConv::HiPE &&
!MF.shouldSplitStack();
}
@@ -2989,9 +3002,9 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
// If this function isn't doing Win64-style C++ EH, we don't need to do
// anything.
- const Function *Fn = MF.getFunction();
+ const Function &F = MF.getFunction();
if (!STI.is64Bit() || !MF.hasEHFunclets() ||
- classifyEHPersonality(Fn->getPersonalityFn()) != EHPersonality::MSVC_CXX)
+ classifyEHPersonality(F.getPersonalityFn()) != EHPersonality::MSVC_CXX)
return;
// Win64 C++ EH needs to allocate the UnwindHelp object at some fixed offset
diff --git a/lib/Target/X86/X86FrameLowering.h b/lib/Target/X86/X86FrameLowering.h
index 7d214cabad53..909319fc18fc 100644
--- a/lib/Target/X86/X86FrameLowering.h
+++ b/lib/Target/X86/X86FrameLowering.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
#define LLVM_LIB_TARGET_X86_X86FRAMELOWERING_H
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
@@ -89,7 +89,7 @@ public:
bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ std::vector<CalleeSavedInfo> &CSI,
const TargetRegisterInfo *TRI) const override;
bool hasFP(const MachineFunction &MF) const override;
@@ -157,15 +157,6 @@ public:
void orderFrameObjects(const MachineFunction &MF,
SmallVectorImpl<int> &ObjectsToAllocate) const override;
- /// convertArgMovsToPushes - This method tries to convert a call sequence
- /// that uses sub and mov instructions to put the argument onto the stack
- /// into a series of pushes.
- /// Returns true if the transformation succeeded, false if not.
- bool convertArgMovsToPushes(MachineFunction &MF,
- MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- uint64_t Amount) const;
-
/// Wraps up getting a CFI index and building a MachineInstr for it.
void BuildCFI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const MCCFIInstruction &CFIInst) const;
@@ -214,6 +205,11 @@ private:
unsigned getPSPSlotOffsetFromSP(const MachineFunction &MF) const;
unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
+
+ /// Materialize the catchret target MBB in RAX.
+ void emitCatchRetReturnValue(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineInstr *CatchRet) const;
};
} // End llvm namespace
diff --git a/lib/Target/X86/X86GenRegisterBankInfo.def b/lib/Target/X86/X86GenRegisterBankInfo.def
index 06be142432f7..9cd3f96f83ac 100644
--- a/lib/Target/X86/X86GenRegisterBankInfo.def
+++ b/lib/Target/X86/X86GenRegisterBankInfo.def
@@ -11,10 +11,6 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
-#ifndef LLVM_BUILD_GLOBAL_ISEL
-#error "You shouldn't build this"
-#endif
-
#ifdef GET_TARGET_REGBANK_INFO_IMPL
RegisterBankInfo::PartialMapping X86GenRegisterBankInfo::PartMappings[]{
/* StartIdx, Length, RegBank */
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8f24f98be681..a6c7c5f22a3a 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "X86.h"
-#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
@@ -21,8 +20,6 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
@@ -194,6 +191,7 @@ namespace {
bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
bool matchWrapper(SDValue N, X86ISelAddressMode &AM);
bool matchAddress(SDValue N, X86ISelAddressMode &AM);
+ bool matchVectorAddress(SDValue N, X86ISelAddressMode &AM);
bool matchAdd(SDValue N, X86ISelAddressMode &AM, unsigned Depth);
bool matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
unsigned Depth);
@@ -204,11 +202,6 @@ namespace {
bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- template <class GatherScatterSDNode>
- bool selectAddrOfGatherScatterNode(GatherScatterSDNode *Parent, SDValue N,
- SDValue &Base, SDValue &Scale,
- SDValue &Index, SDValue &Disp,
- SDValue &Segment);
bool selectMOV64Imm32(SDValue N, SDValue &Imm);
bool selectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
@@ -226,11 +219,19 @@ namespace {
SDValue &NodeWithChain);
bool selectRelocImm(SDValue N, SDValue &Op);
- bool tryFoldLoad(SDNode *P, SDValue N,
+ bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment);
+  // Convenience method where P is also the root.
+ bool tryFoldLoad(SDNode *P, SDValue N,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
+ return tryFoldLoad(P, P, N, Base, Scale, Index, Disp, Segment);
+ }
+
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
@@ -366,6 +367,22 @@ namespace {
return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
}
+ SDValue getExtractVEXTRACTImmediate(SDNode *N, unsigned VecWidth,
+ const SDLoc &DL) {
+ assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
+ uint64_t Index = N->getConstantOperandVal(1);
+ MVT VecVT = N->getOperand(0).getSimpleValueType();
+ return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
+ }
+
+ SDValue getInsertVINSERTImmediate(SDNode *N, unsigned VecWidth,
+ const SDLoc &DL) {
+ assert((VecWidth == 128 || VecWidth == 256) && "Unexpected vector width");
+ uint64_t Index = N->getConstantOperandVal(2);
+ MVT VecVT = N->getSimpleValueType(0);
+ return getI8Imm((Index * VecVT.getScalarSizeInBits()) / VecWidth, DL);
+ }
+
/// Return an SDNode that returns the value of the global base register.
/// Output instructions required to initialize the global base register,
/// if necessary.
@@ -399,10 +416,71 @@ namespace {
return isInt<Width>(CN->getSExtValue());
return isSExtAbsoluteSymbolRef(Width, N);
}
+
+ // Indicates we should prefer to use a non-temporal load for this load.
+ bool useNonTemporalLoad(LoadSDNode *N) const {
+ if (!N->isNonTemporal())
+ return false;
+
+ unsigned StoreSize = N->getMemoryVT().getStoreSize();
+
+ if (N->getAlignment() < StoreSize)
+ return false;
+
+ switch (StoreSize) {
+ default: llvm_unreachable("Unsupported store size");
+ case 16:
+ return Subtarget->hasSSE41();
+ case 32:
+ return Subtarget->hasAVX2();
+ case 64:
+ return Subtarget->hasAVX512();
+ }
+ }
+
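For reference, a hedged mapping from the feature checks above to the non-temporal load instructions they guard (standard ISA facts, not spelled out in the patch):

    // 16 bytes : MOVNTDQA xmm   - requires SSE4.1
    // 32 bytes : VMOVNTDQA ymm  - requires AVX2
    // 64 bytes : VMOVNTDQA zmm  - requires AVX-512
    // Each form requires the load to be aligned to its full width, which is
    // why underaligned loads are rejected before the switch.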
+ bool foldLoadStoreIntoMemOperand(SDNode *Node);
+
+ bool matchBEXTRFromAnd(SDNode *Node);
+
+ bool isMaskZeroExtended(SDNode *N) const;
};
}
+// Returns true if this masked compare can be implemented legally with this
+// type.
+static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode == X86ISD::PCMPEQM || Opcode == X86ISD::PCMPGTM ||
+ Opcode == X86ISD::CMPM || Opcode == X86ISD::TESTM ||
+ Opcode == X86ISD::TESTNM || Opcode == X86ISD::CMPMU ||
+ Opcode == X86ISD::CMPM_RND) {
+ // We can get 256-bit 8 element types here without VLX being enabled. When
+ // this happens we will use 512-bit operations and the mask will not be
+ // zero extended.
+ EVT OpVT = N->getOperand(0).getValueType();
+ if (OpVT == MVT::v8i32 || OpVT == MVT::v8f32)
+ return Subtarget->hasVLX();
+
+ return true;
+ }
+
+ return false;
+}
+
+// Returns true if we can assume the writer of the mask has zero extended it
+// for us.
+bool X86DAGToDAGISel::isMaskZeroExtended(SDNode *N) const {
+ // If this is an AND, check if we have a compare on either side. As long as
+ // one side guarantees the mask is zero extended, the AND will preserve those
+ // zeros.
+ if (N->getOpcode() == ISD::AND)
+ return isLegalMaskCompare(N->getOperand(0).getNode(), Subtarget) ||
+ isLegalMaskCompare(N->getOperand(1).getNode(), Subtarget);
+
+ return isLegalMaskCompare(N, Subtarget);
+}
+
bool
X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
if (OptLevel == CodeGenOpt::None) return false;
@@ -541,8 +619,8 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
void X86DAGToDAGISel::PreprocessISelDAG() {
// OptFor[Min]Size are used in pattern predicates that isel is matching.
- OptForSize = MF->getFunction()->optForSize();
- OptForMinSize = MF->getFunction()->optForMinSize();
+ OptForSize = MF->getFunction().optForSize();
+ OptForMinSize = MF->getFunction().optForMinSize();
assert((!OptForMinSize || OptForSize) && "OptForMinSize implies OptForSize");
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
@@ -552,7 +630,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (OptLevel != CodeGenOpt::None &&
// Only do this when the target doesn't favor register indirect calls.
- ((N->getOpcode() == X86ISD::CALL && !Subtarget->callRegIndirect()) ||
+ ((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
(N->getOpcode() == X86ISD::TC_RETURN &&
// Only do this if the load can be folded into TC_RETURN.
(Subtarget->is64Bit() ||
@@ -675,9 +753,9 @@ void X86DAGToDAGISel::emitSpecialCodeForMain() {
void X86DAGToDAGISel::EmitFunctionEntryCode() {
// If this is main, emit special code for main.
- if (const Function *Fn = MF->getFunction())
- if (Fn->hasExternalLinkage() && Fn->getName() == "main")
- emitSpecialCodeForMain();
+ const Function &F = MF->getFunction();
+ if (F.hasExternalLinkage() && F.getName() == "main")
+ emitSpecialCodeForMain();
}
static bool isDispSafeForFrameIndex(int64_t Val) {
@@ -1423,12 +1501,30 @@ bool X86DAGToDAGISel::matchAddressBase(SDValue N, X86ISelAddressMode &AM) {
return false;
}
-template <class GatherScatterSDNode>
-bool X86DAGToDAGISel::selectAddrOfGatherScatterNode(
- GatherScatterSDNode *Mgs, SDValue N, SDValue &Base, SDValue &Scale,
- SDValue &Index, SDValue &Disp, SDValue &Segment) {
+/// Helper for selectVectorAddr. Handles things that can be folded into a
+/// gather/scatter address. The index register and scale should have already
+/// been handled.
+bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
+ // TODO: Support other operations.
+ switch (N.getOpcode()) {
+ case X86ISD::Wrapper:
+ if (!matchWrapper(N, AM))
+ return false;
+ break;
+ }
+
+ return matchAddressBase(N, AM);
+}
+
+bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
+ SDValue &Scale, SDValue &Index,
+ SDValue &Disp, SDValue &Segment) {
X86ISelAddressMode AM;
- unsigned AddrSpace = Mgs->getPointerInfo().getAddrSpace();
+ auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
+ AM.IndexReg = Mgs->getIndex();
+ AM.Scale = Mgs->getValue().getScalarValueSizeInBits() / 8;
+
+ unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
// AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
if (AddrSpace == 256)
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
@@ -1437,37 +1533,24 @@ bool X86DAGToDAGISel::selectAddrOfGatherScatterNode(
if (AddrSpace == 258)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
- SDLoc DL(N);
- Base = Mgs->getBasePtr();
- Index = Mgs->getIndex();
- unsigned ScalarSize = Mgs->getValue().getScalarValueSizeInBits();
- Scale = getI8Imm(ScalarSize/8, DL);
-
// If Base is 0, the whole address is in index and the Scale is 1
- if (isa<ConstantSDNode>(Base)) {
- assert(cast<ConstantSDNode>(Base)->isNullValue() &&
+ if (isa<ConstantSDNode>(N)) {
+ assert(cast<ConstantSDNode>(N)->isNullValue() &&
"Unexpected base in gather/scatter");
- Scale = getI8Imm(1, DL);
- Base = CurDAG->getRegister(0, MVT::i32);
+ AM.Scale = 1;
}
- if (AM.Segment.getNode())
- Segment = AM.Segment;
- else
- Segment = CurDAG->getRegister(0, MVT::i32);
- Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
- return true;
-}
+ // Otherwise, try to match into the base and displacement fields.
+ else if (matchVectorAddress(N, AM))
+ return false;
-bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index,
- SDValue &Disp, SDValue &Segment) {
- if (auto Mgs = dyn_cast<MaskedGatherScatterSDNode>(Parent))
- return selectAddrOfGatherScatterNode<MaskedGatherScatterSDNode>(
- Mgs, N, Base, Scale, Index, Disp, Segment);
- if (auto X86Gather = dyn_cast<X86MaskedGatherSDNode>(Parent))
- return selectAddrOfGatherScatterNode<X86MaskedGatherSDNode>(
- X86Gather, N, Base, Scale, Index, Disp, Segment);
- return false;
+ MVT VT = N.getSimpleValueType();
+ if (AM.BaseType == X86ISelAddressMode::RegBase) {
+ if (!AM.Base_Reg.getNode())
+ AM.Base_Reg = CurDAG->getRegister(0, VT);
+ }
+
+ getAddressOperands(AM, SDLoc(N), Base, Scale, Index, Disp, Segment);
+ return true;
}
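A short illustration of what selectVectorAddr can now fold, assuming a gather of <8 x float> indexed off a global (names below are hypothetical):

    // Gather of v8f32 through @table with a vector of i32 indices %idx:
    //   AM.IndexReg = %idx                 (taken from the gather node)
    //   AM.Scale    = 4                    (f32 element size in bytes)
    //   AM.Disp     = @table               (folded via matchWrapper)
    // i.e. the global lands in the displacement field instead of requiring a
    // separate instruction to materialize the base address.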
/// Returns true if it is able to pattern match an addressing mode.
@@ -1517,6 +1600,20 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
return true;
}
+// We can only fold a load if all nodes between it and the root node have a
+// single use. If there are additional uses, we could end up duplicating the
+// load.
+static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *N) {
+ SDNode *User = *N->use_begin();
+ while (User != Root) {
+ if (!User->hasOneUse())
+ return false;
+ User = *User->use_begin();
+ }
+
+ return true;
+}
+
/// Match a scalar SSE load. In particular, we want to match a load whose top
/// elements are either undef or zeros. The load flavor is derived from the
/// type of N, which is either v4f32 or v2f64.
@@ -1533,7 +1630,8 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
if (ISD::isNON_EXTLoad(N.getNode())) {
PatternNodeWithChain = N;
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) {
+ IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
+ hasSingleUsesFromRoot(Root, N.getNode())) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1544,7 +1642,8 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
PatternNodeWithChain = N;
if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel)) {
+ IsLegalToFold(PatternNodeWithChain, *N->use_begin(), Root, OptLevel) &&
+ hasSingleUsesFromRoot(Root, N.getNode())) {
auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1558,7 +1657,8 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
PatternNodeWithChain = N.getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
+ IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
+ hasSingleUsesFromRoot(Root, N.getNode())) {
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
Segment);
@@ -1574,7 +1674,8 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
PatternNodeWithChain = N.getOperand(0).getOperand(0);
if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
+ IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel) &&
+ hasSingleUsesFromRoot(Root, N.getNode())) {
// Okay, this is a zero extending load. Fold it.
LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
@@ -1589,7 +1690,7 @@ bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root,
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
uint64_t ImmVal = CN->getZExtValue();
- if ((uint32_t)ImmVal != (uint64_t)ImmVal)
+ if (!isUInt<32>(ImmVal))
return false;
Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
@@ -1792,13 +1893,13 @@ bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
return true;
}
-bool X86DAGToDAGISel::tryFoldLoad(SDNode *P, SDValue N,
+bool X86DAGToDAGISel::tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
SDValue &Base, SDValue &Scale,
SDValue &Index, SDValue &Disp,
SDValue &Segment) {
if (!ISD::isNON_EXTLoad(N.getNode()) ||
- !IsProfitableToFold(N, P, P) ||
- !IsLegalToFold(N, P, P, OptLevel))
+ !IsProfitableToFold(N, P, Root) ||
+ !IsLegalToFold(N, P, Root, OptLevel))
return false;
return selectAddr(N.getNode(),
@@ -1891,15 +1992,79 @@ static bool hasNoSignedComparisonUses(SDNode *N) {
return true;
}
-/// Check whether or not the chain ending in StoreNode is suitable for doing
-/// the {load; increment or decrement; store} to modify transformation.
-static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
- SDValue StoredVal, SelectionDAG *CurDAG,
- LoadSDNode* &LoadNode, SDValue &InputChain) {
-
- // is the value stored the result of a DEC or INC?
- if (!(Opc == X86ISD::DEC || Opc == X86ISD::INC)) return false;
+/// Test whether the given node which sets flags has any uses which require the
+/// CF flag to be accurate.
+static bool hasNoCarryFlagUses(SDNode *N) {
+ // Examine each user of the node.
+ for (SDNode::use_iterator UI = N->use_begin(), UE = N->use_end(); UI != UE;
+ ++UI) {
+ // Only check things that use the flags.
+ if (UI.getUse().getResNo() != 1)
+ continue;
+ // Only examine CopyToReg uses.
+ if (UI->getOpcode() != ISD::CopyToReg)
+ return false;
+ // Only examine CopyToReg uses that copy to EFLAGS.
+ if (cast<RegisterSDNode>(UI->getOperand(1))->getReg() != X86::EFLAGS)
+ return false;
+ // Examine each user of the CopyToReg use.
+ for (SDNode::use_iterator FlagUI = UI->use_begin(), FlagUE = UI->use_end();
+ FlagUI != FlagUE; ++FlagUI) {
+ // Only examine the Flag result.
+ if (FlagUI.getUse().getResNo() != 1)
+ continue;
+ // Anything unusual: assume conservatively.
+ if (!FlagUI->isMachineOpcode())
+ return false;
+ // Examine the opcode of the user.
+ switch (FlagUI->getMachineOpcode()) {
+ // Comparisons which don't examine the CF flag.
+ case X86::SETOr: case X86::SETNOr: case X86::SETEr: case X86::SETNEr:
+ case X86::SETSr: case X86::SETNSr: case X86::SETPr: case X86::SETNPr:
+ case X86::SETLr: case X86::SETGEr: case X86::SETLEr: case X86::SETGr:
+ case X86::JO_1: case X86::JNO_1: case X86::JE_1: case X86::JNE_1:
+ case X86::JS_1: case X86::JNS_1: case X86::JP_1: case X86::JNP_1:
+ case X86::JL_1: case X86::JGE_1: case X86::JLE_1: case X86::JG_1:
+ case X86::CMOVO16rr: case X86::CMOVO32rr: case X86::CMOVO64rr:
+ case X86::CMOVO16rm: case X86::CMOVO32rm: case X86::CMOVO64rm:
+ case X86::CMOVNO16rr: case X86::CMOVNO32rr: case X86::CMOVNO64rr:
+ case X86::CMOVNO16rm: case X86::CMOVNO32rm: case X86::CMOVNO64rm:
+ case X86::CMOVE16rr: case X86::CMOVE32rr: case X86::CMOVE64rr:
+ case X86::CMOVE16rm: case X86::CMOVE32rm: case X86::CMOVE64rm:
+ case X86::CMOVNE16rr: case X86::CMOVNE32rr: case X86::CMOVNE64rr:
+ case X86::CMOVNE16rm: case X86::CMOVNE32rm: case X86::CMOVNE64rm:
+ case X86::CMOVS16rr: case X86::CMOVS32rr: case X86::CMOVS64rr:
+ case X86::CMOVS16rm: case X86::CMOVS32rm: case X86::CMOVS64rm:
+ case X86::CMOVNS16rr: case X86::CMOVNS32rr: case X86::CMOVNS64rr:
+ case X86::CMOVNS16rm: case X86::CMOVNS32rm: case X86::CMOVNS64rm:
+ case X86::CMOVP16rr: case X86::CMOVP32rr: case X86::CMOVP64rr:
+ case X86::CMOVP16rm: case X86::CMOVP32rm: case X86::CMOVP64rm:
+ case X86::CMOVNP16rr: case X86::CMOVNP32rr: case X86::CMOVNP64rr:
+ case X86::CMOVNP16rm: case X86::CMOVNP32rm: case X86::CMOVNP64rm:
+ case X86::CMOVL16rr: case X86::CMOVL32rr: case X86::CMOVL64rr:
+ case X86::CMOVL16rm: case X86::CMOVL32rm: case X86::CMOVL64rm:
+ case X86::CMOVGE16rr: case X86::CMOVGE32rr: case X86::CMOVGE64rr:
+ case X86::CMOVGE16rm: case X86::CMOVGE32rm: case X86::CMOVGE64rm:
+ case X86::CMOVLE16rr: case X86::CMOVLE32rr: case X86::CMOVLE64rr:
+ case X86::CMOVLE16rm: case X86::CMOVLE32rm: case X86::CMOVLE64rm:
+ case X86::CMOVG16rr: case X86::CMOVG32rr: case X86::CMOVG64rr:
+ case X86::CMOVG16rm: case X86::CMOVG32rm: case X86::CMOVG64rm:
+ continue;
+ // Anything else: assume conservatively.
+ default:
+ return false;
+ }
+ }
+ }
+ return true;
+}
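
For illustration only, a standalone sketch (not LLVM code; values are made up)
of why CF accuracy matters for the fold below: an ADD and the equivalent SUB of
the negated immediate produce the same result (and hence the same ZF/SF), but
they generally set CF differently, so the negate-and-flip rewrite is gated on
hasNoCarryFlagUses.

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t X = 5;
      // add x, -1 : 5 + 0xFF carries out of 8 bits, so CF = 1.
      uint16_t Add = uint16_t(X) + uint16_t(uint8_t(-1));
      bool CFAdd = Add > 0xff;
      // sub x, 1 : 5 - 1 needs no borrow, so CF = 0, same 8-bit result.
      bool CFSub = X < 1;
      uint8_t Sub = uint8_t(X - 1);
      std::printf("add: result=%u CF=%d  sub: result=%u CF=%d\n",
                  unsigned(uint8_t(Add)), int(CFAdd), unsigned(Sub), int(CFSub));
      return 0;
    }
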
+/// Check whether or not the chain ending in StoreNode is suitable for doing
+/// the {load; op; store} to modify transformation.
+static bool isFusableLoadOpStorePattern(StoreSDNode *StoreNode,
+ SDValue StoredVal, SelectionDAG *CurDAG,
+ LoadSDNode *&LoadNode,
+ SDValue &InputChain) {
// is the stored value result 0 of the load?
if (StoredVal.getResNo() != 0) return false;
@@ -1916,11 +2081,6 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
// Return LoadNode by reference.
LoadNode = cast<LoadSDNode>(Load);
- // is the size of the value one that we can handle? (i.e. 64, 32, 16, or 8)
- EVT LdVT = LoadNode->getMemoryVT();
- if (LdVT != MVT::i64 && LdVT != MVT::i32 && LdVT != MVT::i16 &&
- LdVT != MVT::i8)
- return false;
// Is store the only read of the loaded value?
if (!Load.hasOneUse())
@@ -1978,22 +2138,294 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
return true;
}
-/// Get the appropriate X86 opcode for an in-memory increment or decrement.
-/// Opc should be X86ISD::DEC or X86ISD::INC.
-static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
- if (Opc == X86ISD::DEC) {
- if (LdVT == MVT::i64) return X86::DEC64m;
- if (LdVT == MVT::i32) return X86::DEC32m;
- if (LdVT == MVT::i16) return X86::DEC16m;
- if (LdVT == MVT::i8) return X86::DEC8m;
+// Change a chain of {load; op; store} of the same value into a simple op
+// through memory of that value, if the uses of the modified value and its
+// address are suitable.
+//
+// The tablegen memory operand pattern is currently not able to match
+// the case where the EFLAGS on the original operation are used.
+//
+// To move this to tablegen, we'll need to improve tablegen to allow flags to
+// be transferred from a node in the pattern to the result node, probably with
+// a new keyword. For example, we have this
+// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+// (implicit EFLAGS)]>;
+// but maybe need something like this
+// def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+// [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+// (transferrable EFLAGS)]>;
+//
+// Until then, we manually fold these and instruction select the operation
+// here.
+bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
+ StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+ SDValue StoredVal = StoreNode->getOperand(1);
+ unsigned Opc = StoredVal->getOpcode();
+
+ // Before we try to select anything, make sure this is a memory operand size
+ // and opcode we can handle. Note that this must match the code below that
+ // actually lowers the opcodes.
+ EVT MemVT = StoreNode->getMemoryVT();
+ if (MemVT != MVT::i64 && MemVT != MVT::i32 && MemVT != MVT::i16 &&
+ MemVT != MVT::i8)
+ return false;
+ switch (Opc) {
+ default:
+ return false;
+ case X86ISD::INC:
+ case X86ISD::DEC:
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::AND:
+ case X86ISD::OR:
+ case X86ISD::XOR:
+ break;
+ }
+
+ LoadSDNode *LoadNode = nullptr;
+ SDValue InputChain;
+ if (!isFusableLoadOpStorePattern(StoreNode, StoredVal, CurDAG, LoadNode,
+ InputChain))
+ return false;
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectAddr(LoadNode, LoadNode->getBasePtr(), Base, Scale, Index, Disp,
+ Segment))
+ return false;
+
+ auto SelectOpcode = [&](unsigned Opc64, unsigned Opc32, unsigned Opc16,
+ unsigned Opc8) {
+ switch (MemVT.getSimpleVT().SimpleTy) {
+ case MVT::i64:
+ return Opc64;
+ case MVT::i32:
+ return Opc32;
+ case MVT::i16:
+ return Opc16;
+ case MVT::i8:
+ return Opc8;
+ default:
+ llvm_unreachable("Invalid size!");
+ }
+ };
+
+ MachineSDNode *Result;
+ switch (Opc) {
+ case X86ISD::INC:
+ case X86ISD::DEC: {
+ unsigned NewOpc =
+ Opc == X86ISD::INC
+ ? SelectOpcode(X86::INC64m, X86::INC32m, X86::INC16m, X86::INC8m)
+ : SelectOpcode(X86::DEC64m, X86::DEC32m, X86::DEC16m, X86::DEC8m);
+ const SDValue Ops[] = {Base, Scale, Index, Disp, Segment, InputChain};
+ Result =
+ CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
+ break;
+ }
+ case X86ISD::ADD:
+ case X86ISD::SUB:
+ case X86ISD::AND:
+ case X86ISD::OR:
+ case X86ISD::XOR: {
+ auto SelectRegOpcode = [SelectOpcode](unsigned Opc) {
+ switch (Opc) {
+ case X86ISD::ADD:
+ return SelectOpcode(X86::ADD64mr, X86::ADD32mr, X86::ADD16mr,
+ X86::ADD8mr);
+ case X86ISD::SUB:
+ return SelectOpcode(X86::SUB64mr, X86::SUB32mr, X86::SUB16mr,
+ X86::SUB8mr);
+ case X86ISD::AND:
+ return SelectOpcode(X86::AND64mr, X86::AND32mr, X86::AND16mr,
+ X86::AND8mr);
+ case X86ISD::OR:
+ return SelectOpcode(X86::OR64mr, X86::OR32mr, X86::OR16mr, X86::OR8mr);
+ case X86ISD::XOR:
+ return SelectOpcode(X86::XOR64mr, X86::XOR32mr, X86::XOR16mr,
+ X86::XOR8mr);
+ default:
+ llvm_unreachable("Invalid opcode!");
+ }
+ };
+ auto SelectImm8Opcode = [SelectOpcode](unsigned Opc) {
+ switch (Opc) {
+ case X86ISD::ADD:
+ return SelectOpcode(X86::ADD64mi8, X86::ADD32mi8, X86::ADD16mi8, 0);
+ case X86ISD::SUB:
+ return SelectOpcode(X86::SUB64mi8, X86::SUB32mi8, X86::SUB16mi8, 0);
+ case X86ISD::AND:
+ return SelectOpcode(X86::AND64mi8, X86::AND32mi8, X86::AND16mi8, 0);
+ case X86ISD::OR:
+ return SelectOpcode(X86::OR64mi8, X86::OR32mi8, X86::OR16mi8, 0);
+ case X86ISD::XOR:
+ return SelectOpcode(X86::XOR64mi8, X86::XOR32mi8, X86::XOR16mi8, 0);
+ default:
+ llvm_unreachable("Invalid opcode!");
+ }
+ };
+ auto SelectImmOpcode = [SelectOpcode](unsigned Opc) {
+ switch (Opc) {
+ case X86ISD::ADD:
+ return SelectOpcode(X86::ADD64mi32, X86::ADD32mi, X86::ADD16mi,
+ X86::ADD8mi);
+ case X86ISD::SUB:
+ return SelectOpcode(X86::SUB64mi32, X86::SUB32mi, X86::SUB16mi,
+ X86::SUB8mi);
+ case X86ISD::AND:
+ return SelectOpcode(X86::AND64mi32, X86::AND32mi, X86::AND16mi,
+ X86::AND8mi);
+ case X86ISD::OR:
+ return SelectOpcode(X86::OR64mi32, X86::OR32mi, X86::OR16mi,
+ X86::OR8mi);
+ case X86ISD::XOR:
+ return SelectOpcode(X86::XOR64mi32, X86::XOR32mi, X86::XOR16mi,
+ X86::XOR8mi);
+ default:
+ llvm_unreachable("Invalid opcode!");
+ }
+ };
+
+ unsigned NewOpc = SelectRegOpcode(Opc);
+ SDValue Operand = StoredVal->getOperand(1);
+
+ // See if the operand is a constant that we can fold into an immediate
+ // operand.
+ if (auto *OperandC = dyn_cast<ConstantSDNode>(Operand)) {
+ auto OperandV = OperandC->getAPIntValue();
+
+ // Check if we can shrink the operand enough to fit in an immediate (or
+ // fit into a smaller immediate) by negating it and switching the
+ // operation.
+ if ((Opc == X86ISD::ADD || Opc == X86ISD::SUB) &&
+ ((MemVT != MVT::i8 && OperandV.getMinSignedBits() > 8 &&
+ (-OperandV).getMinSignedBits() <= 8) ||
+ (MemVT == MVT::i64 && OperandV.getMinSignedBits() > 32 &&
+ (-OperandV).getMinSignedBits() <= 32)) &&
+ hasNoCarryFlagUses(StoredVal.getNode())) {
+ OperandV = -OperandV;
+ Opc = Opc == X86ISD::ADD ? X86ISD::SUB : X86ISD::ADD;
+ }
+
+ // First try to fit this into an Imm8 operand. If it doesn't fit, then try
+ // the larger immediate operand.
+ if (MemVT != MVT::i8 && OperandV.getMinSignedBits() <= 8) {
+ Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
+ NewOpc = SelectImm8Opcode(Opc);
+ } else if (OperandV.getActiveBits() <= MemVT.getSizeInBits() &&
+ (MemVT != MVT::i64 || OperandV.getMinSignedBits() <= 32)) {
+ Operand = CurDAG->getTargetConstant(OperandV, SDLoc(Node), MemVT);
+ NewOpc = SelectImmOpcode(Opc);
+ }
+ }
+
+ const SDValue Ops[] = {Base, Scale, Index, Disp,
+ Segment, Operand, InputChain};
+ Result =
+ CurDAG->getMachineNode(NewOpc, SDLoc(Node), MVT::i32, MVT::Other, Ops);
+ break;
+ }
+ default:
+ llvm_unreachable("Invalid opcode!");
+ }
+
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
+ MemOp[0] = StoreNode->getMemOperand();
+ MemOp[1] = LoadNode->getMemOperand();
+ Result->setMemRefs(MemOp, MemOp + 2);
+
+ ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
+ ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return true;
+}
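
For illustration only, a standalone sketch (not LLVM code; minSignedBits
approximates APInt::getMinSignedBits and the values are made up) of the
immediate-shrinking rule used above: when the constant needs a wide immediate
but its negation fits a smaller one, negate it and flip ADD<->SUB, provided no
user reads CF.

    #include <cstdint>
    #include <cstdio>

    // Smallest number of bits that can hold V as a signed two's-complement value.
    static unsigned minSignedBits(int64_t V) {
      uint64_t U = (V < 0) ? ~uint64_t(V) : uint64_t(V);
      unsigned Bits = 1; // sign bit
      while (U) { ++Bits; U >>= 1; }
      return Bits;
    }

    int main() {
      int64_t OperandV = 128; // add qword ptr [p], 128 would need a 32-bit imm
      bool IsAdd = true;
      if (minSignedBits(OperandV) > 8 && minSignedBits(-OperandV) <= 8) {
        OperandV = -OperandV; // sub qword ptr [p], -128 fits an imm8
        IsAdd = false;
      }
      std::printf("%s qword ptr [p], %lld\n", IsAdd ? "add" : "sub",
                  (long long)OperandV);
      return 0;
    }
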
+
+// See if this is an (X >> C1) & C2 that we can match to BEXTR/BEXTRI.
+bool X86DAGToDAGISel::matchBEXTRFromAnd(SDNode *Node) {
+ MVT NVT = Node->getSimpleValueType(0);
+ SDLoc dl(Node);
+
+ SDValue N0 = Node->getOperand(0);
+ SDValue N1 = Node->getOperand(1);
+
+ if (!Subtarget->hasBMI() && !Subtarget->hasTBM())
+ return false;
+
+ // Must have a shift right.
+ if (N0->getOpcode() != ISD::SRL && N0->getOpcode() != ISD::SRA)
+ return false;
+
+ // Shift can't have additional users.
+ if (!N0->hasOneUse())
+ return false;
+
+ // Only supported for 32 and 64 bits.
+ if (NVT != MVT::i32 && NVT != MVT::i64)
+ return false;
+
+ // Shift amount and RHS of and must be constant.
+ ConstantSDNode *MaskCst = dyn_cast<ConstantSDNode>(N1);
+ ConstantSDNode *ShiftCst = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+ if (!MaskCst || !ShiftCst)
+ return false;
+
+ // And RHS must be a mask.
+ uint64_t Mask = MaskCst->getZExtValue();
+ if (!isMask_64(Mask))
+ return false;
+
+ uint64_t Shift = ShiftCst->getZExtValue();
+ uint64_t MaskSize = countPopulation(Mask);
+
+ // Don't interfere with something that can be handled by extracting AH.
+ // TODO: If we are able to fold a load, BEXTR might still be better than AH.
+ if (Shift == 8 && MaskSize == 8)
+ return false;
+
+ // Make sure we are only using bits that were in the original value, not
+ // shifted in.
+ if (Shift + MaskSize > NVT.getSizeInBits())
+ return false;
+
+ SDValue New = CurDAG->getTargetConstant(Shift | (MaskSize << 8), dl, NVT);
+ unsigned ROpc = NVT == MVT::i64 ? X86::BEXTRI64ri : X86::BEXTRI32ri;
+ unsigned MOpc = NVT == MVT::i64 ? X86::BEXTRI64mi : X86::BEXTRI32mi;
+
+ // BMI requires the immediate to be placed in a register.
+ if (!Subtarget->hasTBM()) {
+ ROpc = NVT == MVT::i64 ? X86::BEXTR64rr : X86::BEXTR32rr;
+ MOpc = NVT == MVT::i64 ? X86::BEXTR64rm : X86::BEXTR32rm;
+ New = SDValue(CurDAG->getMachineNode(X86::MOV32ri, dl, NVT, New), 0);
+ if (NVT == MVT::i64) {
+ New =
+ SDValue(CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), New,
+ CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
+ 0);
+ }
+ }
+
+ MachineSDNode *NewNode;
+ SDValue Input = N0->getOperand(0);
+ SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
+ if (tryFoldLoad(Node, N0.getNode(), Input, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, New, Input.getOperand(0) };
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+ NewNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ // Update the chain.
+ ReplaceUses(Input.getValue(1), SDValue(NewNode, 1));
+ // Record the mem-refs
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<LoadSDNode>(Input)->getMemOperand();
+ NewNode->setMemRefs(MemOp, MemOp + 1);
} else {
- assert(Opc == X86ISD::INC && "unrecognized opcode");
- if (LdVT == MVT::i64) return X86::INC64m;
- if (LdVT == MVT::i32) return X86::INC32m;
- if (LdVT == MVT::i16) return X86::INC16m;
- if (LdVT == MVT::i8) return X86::INC8m;
+ NewNode = CurDAG->getMachineNode(ROpc, dl, NVT, Input, New);
}
- llvm_unreachable("unrecognized size for LdVT");
+
+ ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+ CurDAG->RemoveDeadNode(Node);
+ return true;
}
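
For illustration only, a standalone software model (not LLVM code; the input
value is made up) of the BEXTR control word built above: bits 7:0 hold the
start bit and bits 15:8 hold the extraction length, so (X >> C1) & mask becomes
a single bit-field extract.

    #include <cstdint>
    #include <cstdio>

    // Software model of BEXTR for a 64-bit source.
    static uint64_t bextr(uint64_t Src, uint64_t Control) {
      unsigned Start = Control & 0xff;
      unsigned Len = (Control >> 8) & 0xff;
      if (Start >= 64)
        return 0;
      uint64_t Shifted = Src >> Start;
      return Len >= 64 ? Shifted : Shifted & ((uint64_t(1) << Len) - 1);
    }

    int main() {
      // (X >> 4) & 0xFFF  ->  Shift = 4, MaskSize = 12, control = 4 | (12 << 8).
      uint64_t X = 0x123456789abcdef0;
      uint64_t Control = 4 | (12u << 8);
      std::printf("%#llx\n", (unsigned long long)bextr(X, Control)); // 0xdef
      return 0;
    }
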
void X86DAGToDAGISel::Select(SDNode *Node) {
@@ -2037,20 +2469,27 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, getGlobalBaseReg());
return;
+ case X86ISD::SELECT:
case X86ISD::SHRUNKBLEND: {
- // SHRUNKBLEND selects like a regular VSELECT.
+ // SHRUNKBLEND selects like a regular VSELECT. Same with X86ISD::SELECT.
SDValue VSelect = CurDAG->getNode(
ISD::VSELECT, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
Node->getOperand(1), Node->getOperand(2));
- ReplaceUses(SDValue(Node, 0), VSelect);
+ ReplaceNode(Node, VSelect.getNode());
SelectCode(VSelect.getNode());
// We already called ReplaceUses.
return;
}
case ISD::AND:
+ // Try to match BEXTR/BEXTRI instruction.
+ if (matchBEXTRFromAnd(Node))
+ return;
+
+ LLVM_FALLTHROUGH;
case ISD::OR:
case ISD::XOR: {
+
// For operations of the form (x << C1) op C2, check if we can use a smaller
// encoding for C2 by transforming it into (x op (C2>>C1)) << C1.
SDValue N0 = Node->getOperand(0);
@@ -2157,7 +2596,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
unsigned LoReg;
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: LoReg = X86::AL; Opc = X86::MUL8r; break;
+ // MVT::i8 is handled by X86ISD::UMUL8.
case MVT::i16: LoReg = X86::AX; Opc = X86::MUL16r; break;
case MVT::i32: LoReg = X86::EAX; Opc = X86::MUL32r; break;
case MVT::i64: LoReg = X86::RAX; Opc = X86::MUL64r; break;
@@ -2263,12 +2702,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Update the chain.
ReplaceUses(N1.getValue(1), Chain);
// Record the mem-refs
- LoadSDNode *LoadNode = cast<LoadSDNode>(N1);
- if (LoadNode) {
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
- MemOp[0] = LoadNode->getMemOperand();
- CNode->setMemRefs(MemOp, MemOp + 1);
- }
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<LoadSDNode>(N1)->getMemOperand();
+ CNode->setMemRefs(MemOp, MemOp + 1);
} else {
SDValue Ops[] = { N1, InFlag };
if (Opc == X86::MULX32rr || Opc == X86::MULX64rr) {
@@ -2293,7 +2729,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Get the low part if needed. Don't use getCopyFromReg for aliasing
// registers.
if (!SDValue(Node, 0).use_empty())
- ReplaceUses(SDValue(Node, 1),
+ ReplaceUses(SDValue(Node, 0),
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result));
// Shift AX down 8 bits.
@@ -2328,6 +2764,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG); dbgs() << '\n');
}
+ CurDAG->RemoveDeadNode(Node);
return;
}
@@ -2447,11 +2884,15 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (foldedLoad) {
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
- SDNode *CNode =
+ MachineSDNode *CNode =
CurDAG->getMachineNode(MOpc, dl, MVT::Other, MVT::Glue, Ops);
InFlag = SDValue(CNode, 1);
// Update the chain.
ReplaceUses(N1.getValue(1), SDValue(CNode, 0));
+ // Record the mem-refs
+ MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1);
+ MemOp[0] = cast<LoadSDNode>(N1)->getMemOperand();
+ CNode->setMemRefs(MemOp, MemOp + 1);
} else {
InFlag =
SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
@@ -2476,19 +2917,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (Opcode == X86ISD::UDIVREM8_ZEXT_HREG ||
Opcode == X86ISD::SDIVREM8_SEXT_HREG) {
- if (Node->getValueType(1) == MVT::i64) {
- // It's not possible to directly movsx AH to a 64bit register, because
- // the latter needs the REX prefix, but the former can't have it.
- assert(Opcode != X86ISD::SDIVREM8_SEXT_HREG &&
- "Unexpected i64 sext of h-register");
- Result =
- SDValue(CurDAG->getMachineNode(
- TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
- CurDAG->getTargetConstant(0, dl, MVT::i64), Result,
- CurDAG->getTargetConstant(X86::sub_32bit, dl,
- MVT::i32)),
- 0);
- }
+ assert(Node->getValueType(1) == MVT::i32 && "Unexpected result type!");
} else {
Result =
CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl, MVT::i8, Result);
@@ -2512,6 +2941,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceUses(SDValue(Node, 1), Result);
DEBUG(dbgs() << "=> "; Result.getNode()->dump(CurDAG); dbgs() << '\n');
}
+ CurDAG->RemoveDeadNode(Node);
return;
}
@@ -2531,34 +2961,21 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Look for (X86cmp (and $op, $imm), 0) and see if we can convert it to
// use a smaller encoding.
// Look past the truncate if CMP is the only use of it.
- if ((N0.getNode()->getOpcode() == ISD::AND ||
- (N0.getResNo() == 0 && N0.getNode()->getOpcode() == X86ISD::AND)) &&
+ if ((N0.getOpcode() == ISD::AND ||
+ (N0.getResNo() == 0 && N0.getOpcode() == X86ISD::AND)) &&
N0.getNode()->hasOneUse() &&
N0.getValueType() != MVT::i8 &&
X86::isZeroNode(N1)) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
if (!C) break;
+ uint64_t Mask = C->getZExtValue();
// For example, convert "testl %eax, $8" to "testb %al, $8"
- if ((C->getZExtValue() & ~UINT64_C(0xff)) == 0 &&
- (!(C->getZExtValue() & 0x80) ||
- hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+ if (isUInt<8>(Mask) &&
+ (!(Mask & 0x80) || hasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i8);
SDValue Reg = N0.getOperand(0);
- // On x86-32, only the ABCD registers have 8-bit subregisters.
- if (!Subtarget->is64Bit()) {
- const TargetRegisterClass *TRC;
- switch (N0.getSimpleValueType().SimpleTy) {
- case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
- case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
- default: llvm_unreachable("Unsupported TEST operand type!");
- }
- SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
- Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
- Reg.getValueType(), Reg, RC), 0);
- }
-
// Extract the l-register.
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit, dl,
MVT::i8, Reg);
@@ -2570,30 +2987,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
+ CurDAG->RemoveDeadNode(Node);
return;
}
// For example, "testl %eax, $2048" to "testb %ah, $8".
- if ((C->getZExtValue() & ~UINT64_C(0xff00)) == 0 &&
- (!(C->getZExtValue() & 0x8000) ||
- hasNoSignedComparisonUses(Node))) {
+ if (isShiftedUInt<8, 8>(Mask) &&
+ (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
// Shift the immediate right by 8 bits.
- SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
- dl, MVT::i8);
+ SDValue ShiftedImm = CurDAG->getTargetConstant(Mask >> 8, dl, MVT::i8);
SDValue Reg = N0.getOperand(0);
- // Put the value in an ABCD register.
- const TargetRegisterClass *TRC;
- switch (N0.getSimpleValueType().SimpleTy) {
- case MVT::i64: TRC = &X86::GR64_ABCDRegClass; break;
- case MVT::i32: TRC = &X86::GR32_ABCDRegClass; break;
- case MVT::i16: TRC = &X86::GR16_ABCDRegClass; break;
- default: llvm_unreachable("Unsupported TEST operand type!");
- }
- SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
- Reg = SDValue(CurDAG->getMachineNode(X86::COPY_TO_REGCLASS, dl,
- Reg.getValueType(), Reg, RC), 0);
-
// Extract the h-register.
SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl,
MVT::i8, Reg);
@@ -2607,16 +3011,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
+ CurDAG->RemoveDeadNode(Node);
return;
}
// For example, "testl %eax, $32776" to "testw %ax, $32776".
- if ((C->getZExtValue() & ~UINT64_C(0xffff)) == 0 &&
- N0.getValueType() != MVT::i16 &&
- (!(C->getZExtValue() & 0x8000) ||
- hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
- MVT::i16);
+ // NOTE: We only want to form TESTW instructions if optimizing for
+ // min size. Otherwise we only save one byte and possibly get a length
+ // changing prefix penalty in the decoders.
+ if (OptForMinSize && isUInt<16>(Mask) && N0.getValueType() != MVT::i16 &&
+ (!(Mask & 0x8000) || hasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i16);
SDValue Reg = N0.getOperand(0);
// Extract the 16-bit subregister.
@@ -2630,16 +3035,14 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
+ CurDAG->RemoveDeadNode(Node);
return;
}
// For example, "testq %rax, $268468232" to "testl %eax, $268468232".
- if ((C->getZExtValue() & ~UINT64_C(0xffffffff)) == 0 &&
- N0.getValueType() == MVT::i64 &&
- (!(C->getZExtValue() & 0x80000000) ||
- hasNoSignedComparisonUses(Node))) {
- SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
- MVT::i32);
+ if (isUInt<32>(Mask) && N0.getValueType() == MVT::i64 &&
+ (!(Mask & 0x80000000) || hasNoSignedComparisonUses(Node))) {
+ SDValue Imm = CurDAG->getTargetConstant(Mask, dl, MVT::i32);
SDValue Reg = N0.getOperand(0);
// Extract the 32-bit subregister.
@@ -2653,60 +3056,16 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// one, do not call ReplaceAllUsesWith.
ReplaceUses(SDValue(Node, (Opcode == X86ISD::SUB ? 1 : 0)),
SDValue(NewNode, 0));
+ CurDAG->RemoveDeadNode(Node);
return;
}
}
break;
}
- case ISD::STORE: {
- // Change a chain of {load; incr or dec; store} of the same value into
- // a simple increment or decrement through memory of that value, if the
- // uses of the modified value and its address are suitable.
- // The DEC64m tablegen pattern is currently not able to match the case where
- // the EFLAGS on the original DEC are used. (This also applies to
- // {INC,DEC}X{64,32,16,8}.)
- // We'll need to improve tablegen to allow flags to be transferred from a
- // node in the pattern to the result node. probably with a new keyword
- // for example, we have this
- // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
- // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
- // (implicit EFLAGS)]>;
- // but maybe need something like this
- // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
- // [(store (add (loadi64 addr:$dst), -1), addr:$dst),
- // (transferrable EFLAGS)]>;
-
- StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
- SDValue StoredVal = StoreNode->getOperand(1);
- unsigned Opc = StoredVal->getOpcode();
-
- LoadSDNode *LoadNode = nullptr;
- SDValue InputChain;
- if (!isLoadIncOrDecStore(StoreNode, Opc, StoredVal, CurDAG,
- LoadNode, InputChain))
- break;
-
- SDValue Base, Scale, Index, Disp, Segment;
- if (!selectAddr(LoadNode, LoadNode->getBasePtr(),
- Base, Scale, Index, Disp, Segment))
- break;
-
- MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2);
- MemOp[0] = StoreNode->getMemOperand();
- MemOp[1] = LoadNode->getMemOperand();
- const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain };
- EVT LdVT = LoadNode->getMemoryVT();
- unsigned newOpc = getFusedLdStOpcode(LdVT, Opc);
- MachineSDNode *Result = CurDAG->getMachineNode(newOpc,
- SDLoc(Node),
- MVT::i32, MVT::Other, Ops);
- Result->setMemRefs(MemOp, MemOp + 2);
-
- ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1));
- ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0));
- CurDAG->RemoveDeadNode(Node);
- return;
- }
+ case ISD::STORE:
+ if (foldLoadStoreIntoMemOperand(Node))
+ return;
+ break;
}
SelectCode(Node);
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 607bc4530abb..a72f4daa5e11 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -35,6 +35,7 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
@@ -55,7 +56,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
#include <algorithm>
#include <bitset>
@@ -94,7 +94,7 @@ static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl,
const char *Msg) {
MachineFunction &MF = DAG.getMachineFunction();
DAG.getContext()->diagnose(
- DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc()));
+ DiagnosticInfoUnsupported(MF.getFunction(), Msg, dl.getDebugLoc()));
}
X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
@@ -188,6 +188,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+ // Integer absolute.
+ if (Subtarget.hasCMov()) {
+ setOperationAction(ISD::ABS , MVT::i16 , Custom);
+ setOperationAction(ISD::ABS , MVT::i32 , Custom);
+ if (Subtarget.is64Bit())
+ setOperationAction(ISD::ABS , MVT::i64 , Custom);
+ }
+
// Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
// operation.
setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
@@ -372,8 +380,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Special handling for half-precision floating point conversions.
// If we don't have F16C support, then lower half float conversions
// into library calls.
- if (Subtarget.useSoftFloat() ||
- (!Subtarget.hasF16C() && !Subtarget.hasAVX512())) {
+ if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
@@ -392,7 +399,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::f80, MVT::f16, Expand);
if (Subtarget.hasPOPCNT()) {
- setOperationAction(ISD::CTPOP , MVT::i8 , Promote);
+ setOperationPromotedToType(ISD::CTPOP, MVT::i8, MVT::i32);
} else {
setOperationAction(ISD::CTPOP , MVT::i8 , Expand);
setOperationAction(ISD::CTPOP , MVT::i16 , Expand);
@@ -425,12 +432,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT_CC, MVT::x86mmx, Expand);
setOperationAction(ISD::EH_RETURN , MVT::Other, Custom);
- // NOTE: EH_SJLJ_SETJMP/_LONGJMP supported here is NOT intended to support
- // SjLj exception handling but a light-weight setjmp/longjmp replacement to
- // support continuation, user-level threading, and etc.. As a result, no
- // other SjLj exception interfaces are implemented and please don't build
- // your own exception handling based on them.
- // LLVM/Clang supports zero-cost DWARF exception handling.
+ // NOTE: EH_SJLJ_SETJMP/_LONGJMP are not recommended, since
+ // LLVM/Clang supports zero-cost DWARF and SEH exception handling.
setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom);
setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom);
setOperationAction(ISD::EH_SJLJ_SETUP_DISPATCH, MVT::Other, Custom);
@@ -545,8 +548,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
} else if (UseX87 && X86ScalarSSEf32) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
- addRegisterClass(MVT::f32, Subtarget.hasAVX512() ? &X86::FR32XRegClass
- : &X86::FR32RegClass);
+ addRegisterClass(MVT::f32, &X86::FR32RegClass);
addRegisterClass(MVT::f64, &X86::RFP64RegClass);
// Use ANDPS to simulate FABS.
@@ -573,11 +575,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addLegalFPImmediate(APFloat(-0.0)); // FLD0/FCHS
addLegalFPImmediate(APFloat(-1.0)); // FLD1/FCHS
- if (!TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FSIN , MVT::f64, Expand);
- setOperationAction(ISD::FCOS , MVT::f64, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
- }
+ // Always expand sin/cos functions even though x87 has an instruction.
+ setOperationAction(ISD::FSIN , MVT::f64, Expand);
+ setOperationAction(ISD::FCOS , MVT::f64, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f64, Expand);
} else if (UseX87) {
// f32 and f64 in x87.
// Set up the FP register classes.
@@ -588,11 +589,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::UNDEF, VT, Expand);
setOperationAction(ISD::FCOPYSIGN, VT, Expand);
- if (!TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FSIN , VT, Expand);
- setOperationAction(ISD::FCOS , VT, Expand);
- setOperationAction(ISD::FSINCOS, VT, Expand);
- }
+ // Always expand sin/cos functions even though x87 has an instruction.
+ setOperationAction(ISD::FSIN , VT, Expand);
+ setOperationAction(ISD::FCOS , VT, Expand);
+ setOperationAction(ISD::FSINCOS, VT, Expand);
}
addLegalFPImmediate(APFloat(+0.0)); // FLD0
addLegalFPImmediate(APFloat(+1.0)); // FLD1
@@ -636,11 +636,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addLegalFPImmediate(TmpFlt2); // FLD1/FCHS
}
- if (!TM.Options.UnsafeFPMath) {
- setOperationAction(ISD::FSIN , MVT::f80, Expand);
- setOperationAction(ISD::FCOS , MVT::f80, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
- }
+ // Always expand sin/cos functions even though x87 has an instruction.
+ setOperationAction(ISD::FSIN , MVT::f80, Expand);
+ setOperationAction(ISD::FCOS , MVT::f80, Expand);
+ setOperationAction(ISD::FSINCOS, MVT::f80, Expand);
setOperationAction(ISD::FFLOOR, MVT::f80, Expand);
setOperationAction(ISD::FCEIL, MVT::f80, Expand);
@@ -861,8 +860,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i8, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
@@ -944,6 +941,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
setLoadExtAction(LoadExtOp, MVT::v8i16, MVT::v8i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i8, Legal);
+ setLoadExtAction(LoadExtOp, MVT::v2i32, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i8, Legal);
setLoadExtAction(LoadExtOp, MVT::v4i32, MVT::v4i16, Legal);
setLoadExtAction(LoadExtOp, MVT::v2i64, MVT::v2i16, Legal);
@@ -1002,13 +1000,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::v8i16, Promote);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
setOperationAction(ISD::FP_ROUND, MVT::v4f32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i8, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i16, Custom);
-
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v4f32, Legal);
@@ -1104,7 +1098,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// (result) is 128-bit but the source is 256-bit wide.
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64,
MVT::v4f32, MVT::v2f64 }) {
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
}
// Custom lower several nodes for 256-bit types.
@@ -1131,6 +1125,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::LOAD, VT, MVT::v4i64);
setOperationPromotedToType(ISD::SELECT, VT, MVT::v4i64);
}
+
+ if (HasInt256) {
+ // Custom legalize 2x32 to get a little better code.
+ setOperationAction(ISD::MGATHER, MVT::v2f32, Custom);
+ setOperationAction(ISD::MGATHER, MVT::v2i32, Custom);
+
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
+ setOperationAction(ISD::MGATHER, VT, Custom);
+ }
}
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
@@ -1143,13 +1147,55 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v8i1, &X86::VK8RegClass);
addRegisterClass(MVT::v16i1, &X86::VK16RegClass);
+ setOperationAction(ISD::SELECT, MVT::v1i1, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
+
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
+
+ // Extends of v16i1/v8i1 to 128-bit vectors.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i8, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i8, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i16, Custom);
+
+ for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ }
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
+ for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1,
+ MVT::v16i1, MVT::v32i1, MVT::v64i1 })
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+
for (MVT VT : MVT::fp_vector_valuetypes())
setLoadExtAction(ISD::EXTLOAD, VT, MVT::v8f32, Legal);
- for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+ for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i16, Legal);
- setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
@@ -1173,98 +1219,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i16, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i8, Promote);
setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i8, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i16, Promote);
setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i8, Promote);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i16, Promote);
setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i8, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i16, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i1, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i1, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i1, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i1, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i1, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i1, Custom);
- setOperationAction(ISD::SINT_TO_FP, MVT::v2i1, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v2i1, Custom);
- setOperationAction(ISD::FP_ROUND, MVT::v8f32, Legal);
- setOperationAction(ISD::FP_EXTEND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
- if (Subtarget.hasVLX()){
- setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
- setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
- setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
- setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
- setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
-
- setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
- setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
- setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
- setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
- setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
- } else {
+
+ if (!Subtarget.hasVLX()) {
+ // With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
+ // to 512-bit rather than use the AVX2 instructions so that we can use
+ // k-masks.
for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64}) {
setOperationAction(ISD::MLOAD, VT, Custom);
setOperationAction(ISD::MSTORE, VT, Custom);
}
}
- setOperationAction(ISD::TRUNCATE, MVT::v16i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
-
- if (Subtarget.hasDQI()) {
- for (auto VT : { MVT::v2i64, MVT::v4i64, MVT::v8i64 }) {
- setOperationAction(ISD::SINT_TO_FP, VT, Legal);
- setOperationAction(ISD::UINT_TO_FP, VT, Legal);
- setOperationAction(ISD::FP_TO_SINT, VT, Legal);
- setOperationAction(ISD::FP_TO_UINT, VT, Legal);
- }
- if (Subtarget.hasVLX()) {
- // Fast v2f32 SINT_TO_FP( v2i32 ) custom conversion.
- setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
- }
- }
- if (Subtarget.hasVLX()) {
- setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
- setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
-
- // FIXME. This commands are available on SSE/AVX2, add relevant patterns.
- setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v8i32, MVT::v8i16, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i32, MVT::v4i16, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i16, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v4i64, MVT::v4i32, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i8, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i16, Legal);
- setLoadExtAction(ISD::EXTLOAD, MVT::v2i64, MVT::v2i32, Legal);
- }
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
@@ -1272,9 +1252,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i16, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i16, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
@@ -1295,38 +1272,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i1, Custom);
setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::UMUL_LOHI, MVT::v16i32, Custom);
+ setOperationAction(ISD::SMUL_LOHI, MVT::v16i32, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v16i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
setOperationAction(ISD::SELECT, MVT::v8f64, Custom);
setOperationAction(ISD::SELECT, MVT::v8i64, Custom);
setOperationAction(ISD::SELECT, MVT::v16f32, Custom);
- setOperationAction(ISD::MUL, MVT::v16i32, Legal);
-
- // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
- setOperationAction(ISD::ABS, MVT::v4i64, Legal);
- setOperationAction(ISD::ABS, MVT::v2i64, Legal);
-
- for (auto VT : { MVT::v8i1, MVT::v16i1 }) {
- setOperationAction(ISD::ADD, VT, Custom);
- setOperationAction(ISD::SUB, VT, Custom);
- setOperationAction(ISD::MUL, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
- setOperationAction(ISD::TRUNCATE, VT, Custom);
-
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Expand);
- }
-
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
@@ -1338,11 +1294,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
- }
-
- // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
- for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64, MVT::v4i64,
- MVT::v8i64}) {
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
}
@@ -1354,44 +1305,33 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::OR, MVT::v16i32, MVT::v8i64);
setOperationPromotedToType(ISD::XOR, MVT::v16i32, MVT::v8i64);
+ if (Subtarget.hasDQI()) {
+ setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Legal);
+ }
+
if (Subtarget.hasCDI()) {
// NonVLX sub-targets extend 128/256 vectors to use the 512 version.
- for (auto VT : {MVT::v4i32, MVT::v8i32, MVT::v16i32, MVT::v2i64,
- MVT::v4i64, MVT::v8i64}) {
+ for (auto VT : { MVT::v16i32, MVT::v8i64} ) {
setOperationAction(ISD::CTLZ, VT, Legal);
setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
}
} // Subtarget.hasCDI()
- if (Subtarget.hasDQI()) {
- // NonVLX sub-targets extend 128/256 vectors to use the 512 version.
- setOperationAction(ISD::MUL, MVT::v2i64, Legal);
- setOperationAction(ISD::MUL, MVT::v4i64, Legal);
- setOperationAction(ISD::MUL, MVT::v8i64, Legal);
- }
-
if (Subtarget.hasVPOPCNTDQ()) {
- // VPOPCNTDQ sub-targets extend 128/256 vectors to use the avx512
- // version of popcntd/q.
- for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v8i32, MVT::v4i64,
- MVT::v4i32, MVT::v2i64})
+ for (auto VT : { MVT::v16i32, MVT::v8i64 })
setOperationAction(ISD::CTPOP, VT, Legal);
}
- // Custom lower several nodes.
- for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
- MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 }) {
- setOperationAction(ISD::MGATHER, VT, Custom);
- setOperationAction(ISD::MSCATTER, VT, Custom);
- }
// Extract subvector is special because the value type
// (result) is 256-bit but the source is 512-bit wide.
- // 128-bit was made Custom under AVX1.
+ // 128-bit was made Legal under AVX1.
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64,
- MVT::v8f32, MVT::v4f64, MVT::v1i1 })
- setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
- for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1,
- MVT::v16i1, MVT::v32i1, MVT::v64i1 })
+ MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
@@ -1404,7 +1344,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
- setOperationAction(ISD::MGATHER, VT, Legal);
+ setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32 }) {
@@ -1413,6 +1353,59 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}// has AVX-512
+ if (!Subtarget.useSoftFloat() &&
+ (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
+ // These operations are handled on non-VLX by artificially widening in
+ // isel patterns.
+ // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
+
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+
+ for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::SMAX, VT, Legal);
+ setOperationAction(ISD::UMAX, VT, Legal);
+ setOperationAction(ISD::SMIN, VT, Legal);
+ setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::ABS, VT, Legal);
+ }
+
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::ROTL, VT, Custom);
+ setOperationAction(ISD::ROTR, VT, Custom);
+ }
+
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64,
+ MVT::v4f32, MVT::v8f32, MVT::v2f64, MVT::v4f64 })
+ setOperationAction(ISD::MSCATTER, VT, Custom);
+
+ if (Subtarget.hasDQI()) {
+ for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::SINT_TO_FP, VT, Legal);
+ setOperationAction(ISD::UINT_TO_FP, VT, Legal);
+ setOperationAction(ISD::FP_TO_SINT, VT, Legal);
+ setOperationAction(ISD::FP_TO_UINT, VT, Legal);
+
+ setOperationAction(ISD::MUL, VT, Legal);
+ }
+ }
+
+ if (Subtarget.hasCDI()) {
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 }) {
+ setOperationAction(ISD::CTLZ, VT, Legal);
+ setOperationAction(ISD::CTTZ_ZERO_UNDEF, VT, Custom);
+ }
+ } // Subtarget.hasCDI()
+
+ if (Subtarget.hasVPOPCNTDQ()) {
+ for (auto VT : { MVT::v4i32, MVT::v8i32, MVT::v2i64, MVT::v4i64 })
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ }
+ }
+
if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
@@ -1420,77 +1413,62 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addRegisterClass(MVT::v32i1, &X86::VK32RegClass);
addRegisterClass(MVT::v64i1, &X86::VK64RegClass);
- setOperationAction(ISD::ADD, MVT::v32i1, Custom);
- setOperationAction(ISD::ADD, MVT::v64i1, Custom);
- setOperationAction(ISD::SUB, MVT::v32i1, Custom);
- setOperationAction(ISD::SUB, MVT::v64i1, Custom);
- setOperationAction(ISD::MUL, MVT::v32i1, Custom);
- setOperationAction(ISD::MUL, MVT::v64i1, Custom);
+ for (auto VT : { MVT::v32i1, MVT::v64i1 }) {
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::SUB, VT, Custom);
+ setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ }
+
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
+
+ // Extends from v32i1 masks to 256-bit vectors.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
+ // Extends from v64i1 masks to 512-bit vectors.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::SETCC, MVT::v32i1, Custom);
- setOperationAction(ISD::SETCC, MVT::v64i1, Custom);
setOperationAction(ISD::MUL, MVT::v32i16, Legal);
setOperationAction(ISD::MUL, MVT::v64i8, Custom);
setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
+ setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
- setOperationAction(ISD::SELECT, MVT::v32i1, Custom);
- setOperationAction(ISD::SELECT, MVT::v64i1, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i1, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom);
setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i1, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v32i1, Custom);
- setOperationAction(ISD::BUILD_VECTOR, MVT::v64i1, Custom);
- setOperationAction(ISD::VSELECT, MVT::v32i1, Expand);
- setOperationAction(ISD::VSELECT, MVT::v64i1, Expand);
setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
- if (Subtarget.hasVLX()) {
- setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
- setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
- }
-
- LegalizeAction Action = Subtarget.hasVLX() ? Legal : Custom;
- for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
- setOperationAction(ISD::MLOAD, VT, Action);
- setOperationAction(ISD::MSTORE, VT, Action);
- }
-
- if (Subtarget.hasCDI()) {
- setOperationAction(ISD::CTLZ, MVT::v32i16, Custom);
- setOperationAction(ISD::CTLZ, MVT::v64i8, Custom);
- }
for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
@@ -1503,6 +1481,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
@@ -1513,13 +1492,30 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::XOR, VT, MVT::v8i64);
}
- for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD, ISD::EXTLOAD}) {
+ for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
- if (Subtarget.hasVLX()) {
- // FIXME. This commands are available on SSE/AVX2, add relevant patterns.
- setLoadExtAction(ExtType, MVT::v16i16, MVT::v16i8, Legal);
- setLoadExtAction(ExtType, MVT::v8i16, MVT::v8i8, Legal);
- }
+ }
+
+ if (Subtarget.hasBITALG()) {
+ for (auto VT : { MVT::v64i8, MVT::v32i16 })
+ setOperationAction(ISD::CTPOP, VT, Legal);
+ }
+ }
+
+ if (!Subtarget.useSoftFloat() && Subtarget.hasBWI() &&
+ (Subtarget.hasAVX512() || Subtarget.hasVLX())) {
+ for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
+ setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
+ }
+
+ // These operations are handled on non-VLX by artificially widening in
+ // isel patterns.
+ // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
+
+ if (Subtarget.hasBITALG()) {
+ for (auto VT : { MVT::v16i8, MVT::v32i8, MVT::v8i16, MVT::v16i16 })
+ setOperationAction(ISD::CTPOP, VT, Legal);
}
}
@@ -1542,16 +1538,47 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
}
+ // TODO: v8i1 concat should be legal without VLX to support concats of
+ // v1i1, but we currently don't legalize it correctly without introducing
+ // a v4i1 concat in the middle.
setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i1, Custom);
setOperationAction(ISD::CONCAT_VECTORS, MVT::v4i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v8i1, Custom);
setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v4i1, Custom);
- for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
- setOperationAction(ISD::SMAX, VT, Legal);
- setOperationAction(ISD::UMAX, VT, Legal);
- setOperationAction(ISD::SMIN, VT, Legal);
- setOperationAction(ISD::UMIN, VT, Legal);
+ // Extends from v2i1/v4i1 masks to 128-bit vectors.
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v2i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v2i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v4i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v2i64, Custom);
+
+ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i16, Legal);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i32, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i8, Legal);
+ setTruncStoreAction(MVT::v8i32, MVT::v8i16, Legal);
+
+ setTruncStoreAction(MVT::v2i64, MVT::v2i8, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i16, Legal);
+ setTruncStoreAction(MVT::v2i64, MVT::v2i32, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
+ setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
+
+ if (Subtarget.hasDQI()) {
+ // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
+ // v2f32 UINT_TO_FP is already custom under SSE2.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+ assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
+ "Unexpected operation action!");
+ // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ }
+
+ if (Subtarget.hasBWI()) {
+ setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
}
}
@@ -1592,6 +1619,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallName(RTLIB::SHL_I128, nullptr);
setLibcallName(RTLIB::SRL_I128, nullptr);
setLibcallName(RTLIB::SRA_I128, nullptr);
+ setLibcallName(RTLIB::MUL_I128, nullptr);
}
// Combine sin / cos into one node or libcall if possible.
@@ -1631,6 +1659,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
+ setTargetDAGCombine(ISD::EXTRACT_SUBVECTOR);
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::VSELECT);
setTargetDAGCombine(ISD::SELECT);
@@ -1698,6 +1727,19 @@ bool X86TargetLowering::useLoadStackGuardNode() const {
return Subtarget.isTargetMachO() && Subtarget.is64Bit();
}
+bool X86TargetLowering::useStackGuardXorFP() const {
+ // Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
+ return Subtarget.getTargetTriple().isOSMSVCRT();
+}
+
+SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
+ const SDLoc &DL) const {
+ EVT PtrTy = getPointerTy(DAG.getDataLayout());
+ unsigned XorOp = Subtarget.is64Bit() ? X86::XOR64_FP : X86::XOR32_FP;
+ MachineSDNode *Node = DAG.getMachineNode(XorOp, DL, PtrTy, Val);
+ return SDValue(Node, 0);
+}
+
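The hook above only changes how the guard value is materialized: on MSVC CRTs the guard slot holds the security cookie XORed with the frame pointer, so the load for the check has to XOR with the frame pointer again to recover the cookie. A minimal standalone sketch of that round trip (editorial illustration, not part of this patch; the constant values are made up):

#include <cassert>
#include <cstdint>

int main() {
  uint64_t Cookie = 0x2B992DDFA232ULL;      // stand-in for the CRT's security cookie
  uint64_t FramePtr = 0x7FFD12345670ULL;    // stand-in for the frame pointer
  uint64_t GuardSlot = Cookie ^ FramePtr;   // what the XOR*_FP pseudo produces on entry
  assert((GuardSlot ^ FramePtr) == Cookie); // XORing with FP again recovers the cookie
  return 0;
}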
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(EVT VT) const {
if (ExperimentalVectorWideningLegalization &&
@@ -1714,40 +1756,26 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
if (!VT.isVector())
return MVT::i8;
- if (VT.isSimple()) {
- MVT VVT = VT.getSimpleVT();
- const unsigned NumElts = VVT.getVectorNumElements();
- MVT EltVT = VVT.getVectorElementType();
- if (VVT.is512BitVector()) {
- if (Subtarget.hasAVX512())
- if (EltVT == MVT::i32 || EltVT == MVT::i64 ||
- EltVT == MVT::f32 || EltVT == MVT::f64)
- switch(NumElts) {
- case 8: return MVT::v8i1;
- case 16: return MVT::v16i1;
- }
- if (Subtarget.hasBWI())
- if (EltVT == MVT::i8 || EltVT == MVT::i16)
- switch(NumElts) {
- case 32: return MVT::v32i1;
- case 64: return MVT::v64i1;
- }
- }
+ if (Subtarget.hasAVX512()) {
+ const unsigned NumElts = VT.getVectorNumElements();
- if (Subtarget.hasBWI() && Subtarget.hasVLX())
- return MVT::getVectorVT(MVT::i1, NumElts);
+ // Figure out what this type will be legalized to.
+ EVT LegalVT = VT;
+ while (getTypeAction(Context, LegalVT) != TypeLegal)
+ LegalVT = getTypeToTransformTo(Context, LegalVT);
- if (!isTypeLegal(VT) && getTypeAction(Context, VT) == TypePromoteInteger) {
- EVT LegalVT = getTypeToTransformTo(Context, VT);
- EltVT = LegalVT.getVectorElementType().getSimpleVT();
- }
+ // If we got a 512-bit vector then we'll definitely have a vXi1 compare.
+ if (LegalVT.getSimpleVT().is512BitVector())
+ return EVT::getVectorVT(Context, MVT::i1, NumElts);
- if (Subtarget.hasVLX() && EltVT.getSizeInBits() >= 32)
- switch(NumElts) {
- case 2: return MVT::v2i1;
- case 4: return MVT::v4i1;
- case 8: return MVT::v8i1;
- }
+ if (LegalVT.getSimpleVT().isVector() && Subtarget.hasVLX()) {
+ // If we legalized to less than a 512-bit vector, then we will use a vXi1
+ // compare for vXi32/vXi64 for sure. If we have BWI we will also support
+ // vXi16/vXi8.
+ MVT EltVT = LegalVT.getSimpleVT().getVectorElementType();
+ if (Subtarget.hasBWI() || EltVT.getSizeInBits() >= 32)
+ return EVT::getVectorVT(Context, MVT::i1, NumElts);
+ }
}
return VT.changeVectorElementTypeToInteger();
@@ -1815,8 +1843,8 @@ X86TargetLowering::getOptimalMemOpType(uint64_t Size,
bool IsMemset, bool ZeroMemset,
bool MemcpyStrSrc,
MachineFunction &MF) const {
- const Function *F = MF.getFunction();
- if (!F->hasFnAttribute(Attribute::NoImplicitFloat)) {
+ const Function &F = MF.getFunction();
+ if (!F.hasFnAttribute(Attribute::NoImplicitFloat)) {
if (Size >= 16 &&
(!Subtarget.isUnalignedMem16Slow() ||
((DstAlign == 0 || DstAlign >= 16) &&
@@ -1912,7 +1940,7 @@ void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
if (CC != CallingConv::C && CC != CallingConv::X86_StdCall)
return;
unsigned ParamRegs = 0;
- if (auto *M = MF->getFunction()->getParent())
+ if (auto *M = MF->getFunction().getParent())
ParamRegs = M->getNumberRegisterParameters();
// Mark the first N int arguments as having reg
@@ -2017,7 +2045,7 @@ Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
// sysdeps/{i386,x86_64}/nptl/tls.h)
if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
if (Subtarget.isTargetFuchsia()) {
- // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
+ // <zircon/tls.h> defines ZX_TLS_STACK_GUARD_OFFSET with this value.
return SegmentOffset(IRB, 0x10, getAddressSpace());
} else {
// %fs:0x28, unless we're using a Kernel code model, in which case
@@ -2082,7 +2110,7 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
// Fuchsia is similar.
if (Subtarget.isTargetFuchsia()) {
- // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
+ // <zircon/tls.h> defines ZX_TLS_UNSAFE_SP_OFFSET with this value.
return SegmentOffset(IRB, 0x18, getAddressSpace());
}
@@ -2145,8 +2173,7 @@ static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue Chain, SDValue &Arg,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
- assert((Subtarget.hasBWI() || Subtarget.hasBMI()) &&
- "Expected AVX512BW or AVX512BMI target!");
+ assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
assert(Arg.getValueType() == MVT::i64 && "Expecting 64 bit value");
assert(VA.isRegLoc() && NextVA.isRegLoc() &&
@@ -2180,7 +2207,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// For example, when they are used for argument passing.
bool ShouldDisableCalleeSavedRegister =
CallConv == CallingConv::X86_RegCall ||
- MF.getFunction()->hasFnAttribute("no_caller_saved_registers");
+ MF.getFunction().hasFnAttribute("no_caller_saved_registers");
if (CallConv == CallingConv::X86_INTR && !Outs.empty())
report_fatal_error("X86 interrupts may not return any value");
@@ -2862,8 +2889,8 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
return None;
}
- const Function *Fn = MF.getFunction();
- bool NoImplicitFloatOps = Fn->hasFnAttribute(Attribute::NoImplicitFloat);
+ const Function &F = MF.getFunction();
+ bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool isSoftFloat = Subtarget.useSoftFloat();
assert(!(isSoftFloat && NoImplicitFloatOps) &&
"SSE register cannot be used when SSE is disabled!");
@@ -2896,10 +2923,9 @@ SDValue X86TargetLowering::LowerFormalArguments(
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const Function *Fn = MF.getFunction();
- if (Fn->hasExternalLinkage() &&
- Subtarget.isTargetCygMing() &&
- Fn->getName() == "main")
+ const Function &F = MF.getFunction();
+ if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
+ F.getName() == "main")
FuncInfo->setForceFramePointer(true);
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -3074,7 +3100,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// Figure out if XMM registers are in use.
assert(!(Subtarget.useSoftFloat() &&
- Fn->hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
"SSE register cannot be used when SSE is disabled!");
// 64-bit calling conventions support varargs and register parameters, so we
@@ -3231,7 +3257,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
FuncInfo->setArgumentStackSize(StackSize);
if (WinEHFuncInfo *EHInfo = MF.getWinEHFuncInfo()) {
- EHPersonality Personality = classifyEHPersonality(Fn->getPersonalityFn());
+ EHPersonality Personality = classifyEHPersonality(F.getPersonalityFn());
if (Personality == EHPersonality::CoreCLR) {
assert(Is64Bit);
// TODO: Add a mechanism to frame lowering that will allow us to indicate
@@ -3248,10 +3274,10 @@ SDValue X86TargetLowering::LowerFormalArguments(
}
if (CallConv == CallingConv::X86_RegCall ||
- Fn->hasFnAttribute("no_caller_saved_registers")) {
- const MachineRegisterInfo &MRI = MF.getRegInfo();
- for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
- MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
+ F.hasFnAttribute("no_caller_saved_registers")) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
+ MRI.disableCalleeSavedRegister(Pair.first);
}
return Chain;
@@ -3339,9 +3365,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
StructReturnType SR = callIsStructReturn(Outs, Subtarget.isTargetMCU());
bool IsSibcall = false;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
- auto Attr = MF.getFunction()->getFnAttribute("disable-tail-calls");
- const CallInst *CI =
- CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
+ auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
+ const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
@@ -3365,7 +3390,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
isTailCall = false;
}
- bool IsMustTail = CLI.CS && CLI.CS->isMustTailCall();
+ bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
@@ -3375,7 +3400,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Check if it's really possible to do a tail call.
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv,
isVarArg, SR != NotStructReturn,
- MF.getFunction()->hasStructRetAttr(), CLI.RetTy,
+ MF.getFunction().hasStructRetAttr(), CLI.RetTy,
Outs, OutVals, Ins, DAG);
// Sibcalls are automatically detected tailcalls which do not require
@@ -3721,7 +3746,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
}
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
unsigned char OpFlags =
Subtarget.classifyGlobalFunctionReference(nullptr, *Mod);
@@ -3769,11 +3794,11 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
- if (!Is64Bit && CLI.CS && CLI.CS->isInvoke()) {
- const Function *CallerFn = MF.getFunction();
+ if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
+ const Function &CallerFn = MF.getFunction();
EHPersonality Pers =
- CallerFn->hasPersonalityFn()
- ? classifyEHPersonality(CallerFn->getPersonalityFn())
+ CallerFn.hasPersonalityFn()
+ ? classifyEHPersonality(CallerFn.getPersonalityFn())
: EHPersonality::Unknown;
if (isFuncletEHPersonality(Pers))
Mask = RegInfo->getNoPreservedMask();
@@ -4021,15 +4046,15 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// If -tailcallopt is specified, make fastcc functions tail-callable.
MachineFunction &MF = DAG.getMachineFunction();
- const Function *CallerF = MF.getFunction();
+ const Function &CallerF = MF.getFunction();
// If the function return type is x86_fp80 and the callee return type is not,
// then the FP_EXTEND of the call result is not a nop. It's not safe to
// perform a tailcall optimization here.
- if (CallerF->getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
+ if (CallerF.getReturnType()->isX86_FP80Ty() && !RetTy->isX86_FP80Ty())
return false;
- CallingConv::ID CallerCC = CallerF->getCallingConv();
+ CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
bool IsCalleeWin64 = Subtarget.isCallingConvWin64(CalleeCC);
bool IsCallerWin64 = Subtarget.isCallingConvWin64(CallerCC);
@@ -4243,7 +4268,6 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
case X86ISD::MOVLHPS:
- case X86ISD::MOVLHPD:
case X86ISD::MOVHLPS:
case X86ISD::MOVLPS:
case X86ISD::MOVLPD:
@@ -4491,6 +4515,7 @@ static bool hasFPCMov(unsigned X86CC) {
bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
const CallInst &I,
+ MachineFunction &MF,
unsigned Intrinsic) const {
const IntrinsicData* IntrData = getIntrinsicWithChain(Intrinsic);
@@ -4498,9 +4523,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.readMem = false;
- Info.writeMem = false;
- Info.vol = false;
+ Info.flags = MachineMemOperand::MONone;
Info.offset = 0;
switch (IntrData->Type) {
@@ -4508,14 +4531,14 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.memVT = MVT::getVT(I.getType());
Info.align = 1;
- Info.readMem = true;
+ Info.flags |= MachineMemOperand::MOLoad;
break;
}
case COMPRESS_TO_MEM: {
Info.ptrVal = I.getArgOperand(0);
Info.memVT = MVT::getVT(I.getArgOperand(1)->getType());
Info.align = 1;
- Info.writeMem = true;
+ Info.flags |= MachineMemOperand::MOStore;
break;
}
case TRUNCATE_TO_MEM_VI8:
@@ -4533,7 +4556,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
Info.align = 1;
- Info.writeMem = true;
+ Info.flags |= MachineMemOperand::MOStore;
break;
}
default:
@@ -4578,12 +4601,27 @@ bool X86TargetLowering::shouldConvertConstantLoadToIntImm(const APInt &Imm,
return true;
}
-bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT,
+bool X86TargetLowering::convertSelectOfConstantsToMath(EVT VT) const {
+ // TODO: It might be a win to ease or lift this restriction, but the generic
+ // folds in DAGCombiner conflict with vector folds for an AVX512 target.
+ if (VT.isVector() && Subtarget.hasAVX512())
+ return false;
+
+ return true;
+}
+
+bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
return false;
- return (Index == 0 || Index == ResVT.getVectorNumElements());
+ // Mask vectors support all subregister combinations and operations that
+ // extract half of the vector.
+ if (ResVT.getVectorElementType() == MVT::i1)
+ return Index == 0 || ((ResVT.getSizeInBits() == SrcVT.getSizeInBits()*2) &&
+ (Index == ResVT.getVectorNumElements()));
+
+ return (Index % ResVT.getVectorNumElements()) == 0;
}
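Restated as plain index arithmetic (and ignoring the legality precondition above), the new rule is: i1 mask extracts are cheap only from index 0 or when taking exactly the upper half, while other vectors only need the index to be a multiple of the result's element count. A small self-contained sketch of just that predicate (editorial, not the LLVM API):

#include <cassert>

// ResElts is the result element count; ResBits/SrcBits are total bit widths.
static bool extractIsCheap(bool IsMaskVector, unsigned ResElts,
                           unsigned ResBits, unsigned SrcBits, unsigned Index) {
  if (IsMaskVector)
    return Index == 0 || (ResBits * 2 == SrcBits && Index == ResElts);
  return (Index % ResElts) == 0;
}

int main() {
  // v4i1 out of v8i1: only index 0 or the upper half (index 4) is cheap.
  assert(extractIsCheap(true, 4, 4, 8, 4));
  assert(!extractIsCheap(true, 4, 4, 8, 2));
  // v4i32 out of v16i32: any multiple of 4 is cheap.
  assert(extractIsCheap(false, 4, 128, 512, 8));
  return 0;
}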
bool X86TargetLowering::isCheapToSpeculateCttz() const {
@@ -4596,6 +4634,20 @@ bool X86TargetLowering::isCheapToSpeculateCtlz() const {
return Subtarget.hasLZCNT();
}
+bool X86TargetLowering::canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+ const SelectionDAG &DAG) const {
+ // Do not merge to a float value size (128 bits) if the NoImplicitFloat
+ // attribute is set.
+ bool NoFloat = DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+
+ if (NoFloat) {
+ unsigned MaxIntSize = Subtarget.is64Bit() ? 64 : 32;
+ return (MemVT.getSizeInBits() <= MaxIntSize);
+ }
+ return true;
+}
+
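In other words, under NoImplicitFloat merged stores are capped at the native integer width, 64 bits on x86-64 and 32 bits otherwise, and are unrestricted when vector/FP registers may be used. A tiny sketch of the decision (editorial illustration):

#include <cassert>

static bool canMergeTo(unsigned MergedBits, bool NoImplicitFloat, bool Is64Bit) {
  if (NoImplicitFloat)
    return MergedBits <= (Is64Bit ? 64u : 32u); // integer registers only
  return true;                                  // SSE/AVX stores allowed
}

int main() {
  assert(!canMergeTo(128, /*NoImplicitFloat=*/true, /*Is64Bit=*/true));
  assert(canMergeTo(64, true, true));
  assert(canMergeTo(128, false, false));
  return 0;
}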
bool X86TargetLowering::isCtlzFast() const {
return Subtarget.hasFastLZCNT();
}
@@ -4778,123 +4830,6 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
return true;
}
-/// Helper function to scale a shuffle or target shuffle mask, replacing each
-/// mask index with the scaled sequential indices for an equivalent narrowed
-/// mask. This is the reverse process to canWidenShuffleElements, but can always
-/// succeed.
-static void scaleShuffleMask(int Scale, ArrayRef<int> Mask,
- SmallVectorImpl<int> &ScaledMask) {
- assert(0 < Scale && "Unexpected scaling factor");
- int NumElts = Mask.size();
- ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
-
- for (int i = 0; i != NumElts; ++i) {
- int M = Mask[i];
-
- // Repeat sentinel values in every mask element.
- if (M < 0) {
- for (int s = 0; s != Scale; ++s)
- ScaledMask[(Scale * i) + s] = M;
- continue;
- }
-
- // Scale mask element and increment across each mask element.
- for (int s = 0; s != Scale; ++s)
- ScaledMask[(Scale * i) + s] = (Scale * M) + s;
- }
-}
-
-/// Return true if the specified EXTRACT_SUBVECTOR operand specifies a vector
-/// extract that is suitable for instruction that extract 128 or 256 bit vectors
-static bool isVEXTRACTIndex(SDNode *N, unsigned vecWidth) {
- assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
- if (!isa<ConstantSDNode>(N->getOperand(1).getNode()))
- return false;
-
- // The index should be aligned on a vecWidth-bit boundary.
- uint64_t Index = N->getConstantOperandVal(1);
- MVT VT = N->getSimpleValueType(0);
- unsigned ElSize = VT.getScalarSizeInBits();
- return (Index * ElSize) % vecWidth == 0;
-}
-
-/// Return true if the specified INSERT_SUBVECTOR
-/// operand specifies a subvector insert that is suitable for input to
-/// insertion of 128 or 256-bit subvectors
-static bool isVINSERTIndex(SDNode *N, unsigned vecWidth) {
- assert((vecWidth == 128 || vecWidth == 256) && "Unexpected vector width");
- if (!isa<ConstantSDNode>(N->getOperand(2).getNode()))
- return false;
-
- // The index should be aligned on a vecWidth-bit boundary.
- uint64_t Index = N->getConstantOperandVal(2);
- MVT VT = N->getSimpleValueType(0);
- unsigned ElSize = VT.getScalarSizeInBits();
- return (Index * ElSize) % vecWidth == 0;
-}
-
-bool X86::isVINSERT128Index(SDNode *N) {
- return isVINSERTIndex(N, 128);
-}
-
-bool X86::isVINSERT256Index(SDNode *N) {
- return isVINSERTIndex(N, 256);
-}
-
-bool X86::isVEXTRACT128Index(SDNode *N) {
- return isVEXTRACTIndex(N, 128);
-}
-
-bool X86::isVEXTRACT256Index(SDNode *N) {
- return isVEXTRACTIndex(N, 256);
-}
-
-static unsigned getExtractVEXTRACTImmediate(SDNode *N, unsigned vecWidth) {
- assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
- assert(isa<ConstantSDNode>(N->getOperand(1).getNode()) &&
- "Illegal extract subvector for VEXTRACT");
-
- uint64_t Index = N->getConstantOperandVal(1);
- MVT VecVT = N->getOperand(0).getSimpleValueType();
- unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
- return Index / NumElemsPerChunk;
-}
-
-static unsigned getInsertVINSERTImmediate(SDNode *N, unsigned vecWidth) {
- assert((vecWidth == 128 || vecWidth == 256) && "Unsupported vector width");
- assert(isa<ConstantSDNode>(N->getOperand(2).getNode()) &&
- "Illegal insert subvector for VINSERT");
-
- uint64_t Index = N->getConstantOperandVal(2);
- MVT VecVT = N->getSimpleValueType(0);
- unsigned NumElemsPerChunk = vecWidth / VecVT.getScalarSizeInBits();
- return Index / NumElemsPerChunk;
-}
-
-/// Return the appropriate immediate to extract the specified
-/// EXTRACT_SUBVECTOR index with VEXTRACTF128 and VINSERTI128 instructions.
-unsigned X86::getExtractVEXTRACT128Immediate(SDNode *N) {
- return getExtractVEXTRACTImmediate(N, 128);
-}
-
-/// Return the appropriate immediate to extract the specified
-/// EXTRACT_SUBVECTOR index with VEXTRACTF64x4 and VINSERTI64x4 instructions.
-unsigned X86::getExtractVEXTRACT256Immediate(SDNode *N) {
- return getExtractVEXTRACTImmediate(N, 256);
-}
-
-/// Return the appropriate immediate to insert at the specified
-/// INSERT_SUBVECTOR index with VINSERTF128 and VINSERTI128 instructions.
-unsigned X86::getInsertVINSERT128Immediate(SDNode *N) {
- return getInsertVINSERTImmediate(N, 128);
-}
-
-/// Return the appropriate immediate to insert at the specified
-/// INSERT_SUBVECTOR index with VINSERTF46x4 and VINSERTI64x4 instructions.
-unsigned X86::getInsertVINSERT256Immediate(SDNode *N) {
- return getInsertVINSERTImmediate(N, 256);
-}
-
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
@@ -5018,8 +4953,8 @@ static SDValue extractSubVector(SDValue Vec, unsigned IdxVal, SelectionDAG &DAG,
// If the input is a buildvector just emit a smaller one.
if (Vec.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getBuildVector(
- ResultVT, dl, makeArrayRef(Vec->op_begin() + IdxVal, ElemsPerChunk));
+ return DAG.getBuildVector(ResultVT, dl,
+ Vec->ops().slice(IdxVal, ElemsPerChunk));
SDValue VecIdx = DAG.getIntPtrConstant(IdxVal, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResultVT, Vec, VecIdx);
@@ -5093,10 +5028,13 @@ static bool isMaskedZeroUpperBitsvXi1(unsigned int Opcode) {
switch (Opcode) {
default:
return false;
+ case X86ISD::TESTM:
+ case X86ISD::TESTNM:
case X86ISD::PCMPEQM:
case X86ISD::PCMPGTM:
case X86ISD::CMPM:
case X86ISD::CMPMU:
+ case X86ISD::CMPM_RND:
return true;
}
}
@@ -5113,113 +5051,128 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
if (!isa<ConstantSDNode>(Idx))
return SDValue();
+ // Inserting undef is a nop. We can just return the original vector.
+ if (SubVec.isUndef())
+ return Vec;
+
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
+ if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
- MVT SubVecVT = SubVec.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
+
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
+
+ // Extend to natively supported kshift.
+ MVT WideOpVT = OpVT;
+ if ((!Subtarget.hasDQI() && NumElems == 8) || NumElems < 8)
+ WideOpVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
+
+ // Inserting into the lsbs of a zero vector is legal. ISel will insert shifts
+ // if necessary.
+ if (IdxVal == 0 && ISD::isBuildVectorAllZeros(Vec.getNode())) {
+ // May need to promote to a legal type.
+ Op = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ SubVec, Idx);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ }
+
+ MVT SubVecVT = SubVec.getSimpleValueType();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
- // There are 3 possible cases:
- // 1. Subvector should be inserted in the lower part (IdxVal == 0)
- // 2. Subvector should be inserted in the upper part
- // (IdxVal + SubVecNumElems == NumElems)
- // 3. Subvector should be inserted in the middle (for example v2i1
- // to v16i1, index 2)
-
- // If this node widens - by concatenating zeroes - the type of the result
- // of a node with instruction that zeroes all upper (irrelevant) bits of the
- // output register, mark this node as legal to enable replacing them with
- // the v8i1 version of the previous instruction during instruction selection.
- // For example, VPCMPEQDZ128rr instruction stores its v4i1 result in a k-reg,
- // while zeroing all the upper remaining 60 bits of the register. if the
- // result of such instruction is inserted into an allZeroVector, then we can
- // safely remove insert_vector (in instruction selection) as the cmp instr
- // already zeroed the rest of the register.
- if (ISD::isBuildVectorAllZeros(Vec.getNode()) && IdxVal == 0 &&
- (isMaskedZeroUpperBitsvXi1(SubVec.getOpcode()) ||
- (SubVec.getOpcode() == ISD::AND &&
- (isMaskedZeroUpperBitsvXi1(SubVec.getOperand(0).getOpcode()) ||
- isMaskedZeroUpperBitsvXi1(SubVec.getOperand(1).getOpcode())))))
- return Op;
-
- // extend to natively supported kshift
- MVT MinVT = Subtarget.hasDQI() ? MVT::v8i1 : MVT::v16i1;
- MVT WideOpVT = OpVT;
- if (OpVT.getSizeInBits() < MinVT.getStoreSizeInBits())
- WideOpVT = MinVT;
-
- SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
SDValue Undef = DAG.getUNDEF(WideOpVT);
- SDValue WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- Undef, SubVec, ZeroIdx);
- // Extract sub-vector if require.
- auto ExtractSubVec = [&](SDValue V) {
- return (WideOpVT == OpVT) ? V : DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl,
- OpVT, V, ZeroIdx);
- };
+ if (IdxVal == 0) {
+ // Zero lower bits of the Vec
+ SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec,
+ ZeroIdx);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
+ // Merge them together, SubVec should be zero extended.
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ SubVec, ZeroIdx);
+ Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ }
+
+ SubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ Undef, SubVec, ZeroIdx);
if (Vec.isUndef()) {
- if (IdxVal != 0) {
- SDValue ShiftBits = DAG.getConstant(IdxVal, dl, MVT::i8);
- WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
- ShiftBits);
- }
- return ExtractSubVec(WideSubVec);
+ assert(IdxVal != 0 && "Unexpected index");
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+ assert(IdxVal != 0 && "Unexpected index");
NumElems = WideOpVT.getVectorNumElements();
unsigned ShiftLeft = NumElems - SubVecNumElems;
unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
- Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
- DAG.getConstant(ShiftLeft, dl, MVT::i8));
- Vec = ShiftRight ? DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
- DAG.getConstant(ShiftRight, dl, MVT::i8)) : Vec;
- return ExtractSubVec(Vec);
- }
-
- if (IdxVal == 0) {
- // Zero lower bits of the Vec
- SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
- // Merge them together, SubVec should be zero extended.
- WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
- getZeroVector(WideOpVT, Subtarget, DAG, dl),
- SubVec, ZeroIdx);
- Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
- return ExtractSubVec(Vec);
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ if (ShiftRight != 0)
+ SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+ DAG.getConstant(ShiftRight, dl, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, SubVec, ZeroIdx);
}
// Simple case when we put subvector in the upper part
if (IdxVal + SubVecNumElems == NumElems) {
- // Zero upper bits of the Vec
- WideSubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, WideSubVec,
- DAG.getConstant(IdxVal, dl, MVT::i8));
- SDValue ShiftBits = DAG.getConstant(SubVecNumElems, dl, MVT::i8);
- Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
- Vec = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, WideSubVec);
- return ExtractSubVec(Vec);
- }
- // Subvector should be inserted in the middle - use shuffle
- WideSubVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Undef,
- SubVec, ZeroIdx);
- SmallVector<int, 64> Mask;
- for (unsigned i = 0; i < NumElems; ++i)
- Mask.push_back(i >= IdxVal && i < IdxVal + SubVecNumElems ?
- i : i + NumElems);
- return DAG.getVectorShuffle(OpVT, dl, WideSubVec, Vec, Mask);
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ if (SubVecNumElems * 2 == NumElems) {
+ // Special case, use legal zero extending insert_subvector. This allows
+ // isel to optimize when bits are known zero.
+ Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ getZeroVector(WideOpVT, Subtarget, DAG, dl),
+ Vec, ZeroIdx);
+ } else {
+ // Otherwise use explicit shifts to zero the bits.
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
+ Undef, Vec, ZeroIdx);
+ NumElems = WideOpVT.getVectorNumElements();
+ SDValue ShiftBits = DAG.getConstant(NumElems - IdxVal, dl, MVT::i8);
+ Vec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Vec, ShiftBits);
+ Vec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec, ShiftBits);
+ }
+ Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ }
+
+ // Inserting into the middle is more complicated.
+
+ NumElems = WideOpVT.getVectorNumElements();
+
+ // Widen the vector if needed.
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
+ // Move the current value of the bits to be replaced to the lsbs.
+ Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Vec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ // Xor with the new bits.
+ Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Op, SubVec);
+ // Shift to MSB, filling bottom bits with 0.
+ unsigned ShiftLeft = NumElems - SubVecNumElems;
+ Op = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, Op,
+ DAG.getConstant(ShiftLeft, dl, MVT::i8));
+ // Shift to the final position, filling upper bits with 0.
+ unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+ Op = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, Op,
+ DAG.getConstant(ShiftRight, dl, MVT::i8));
+ // Xor with original vector leaving the new value.
+ Op = DAG.getNode(ISD::XOR, dl, WideOpVT, Vec, Op);
+ // Reduce to original width if needed.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
}
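The middle-insertion path above is a bit-field insert built from k-register shifts and XORs. As a sanity check, the following standalone model (editorial sketch, not part of this patch) treats an 8-bit mask register as a uint8_t and applies the same KSHIFTR / XOR / KSHIFTL / KSHIFTR / XOR sequence:

#include <cassert>
#include <cstdint>

// An 8-bit k-register modeled as a uint8_t; assigning back to the 8-bit type
// after each shift plays the role of the fixed register width.
static uint8_t insertMaskBits(uint8_t Vec, uint8_t Sub, unsigned SubElts,
                              unsigned Idx) {
  const unsigned NumElts = 8;
  uint8_t Tmp = Vec >> Idx;           // KSHIFTR: bits being replaced go to the lsbs
  Tmp ^= Sub;                         // XOR in the new bits
  Tmp <<= (NumElts - SubElts);        // KSHIFTL: only the SubElts low bits survive
  Tmp >>= (NumElts - SubElts - Idx);  // KSHIFTR: move them to their final position
  return Vec ^ Tmp;                   // XOR restores every untouched bit
}

int main() {
  // Insert the 2-bit mask 0b10 into 0b11011011 at element index 4.
  assert(insertMaskBits(0b11011011, 0b10, 2, 4) == 0b11101011);
  return 0;
}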
/// Concat two 128-bit vectors into a 256 bit vector using VINSERTF128
@@ -5273,22 +5226,6 @@ static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
return DAG.getNode(Opc, DL, VT, In);
}
-/// Generate unpacklo/unpackhi shuffle mask.
-static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
- bool Unary) {
- assert(Mask.empty() && "Expected an empty shuffle mask vector");
- int NumElts = VT.getVectorNumElements();
- int NumEltsInLane = 128 / VT.getScalarSizeInBits();
-
- for (int i = 0; i < NumElts; ++i) {
- unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
- int Pos = (i % NumEltsInLane) / 2 + LaneStart;
- Pos += (Unary ? 0 : NumElts * (i % 2));
- Pos += (Lo ? 0 : NumEltsInLane / 2);
- Mask.push_back(Pos);
- }
-}
-
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
@@ -5448,6 +5385,20 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return false;
};
+ // Handle UNDEFs.
+ if (Op.isUndef()) {
+ APInt UndefSrcElts = APInt::getAllOnesValue(NumElts);
+ SmallVector<APInt, 64> SrcEltBits(NumElts, APInt(EltSizeInBits, 0));
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
+ // Extract scalar constant bits.
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ APInt UndefSrcElts = APInt::getNullValue(1);
+ SmallVector<APInt, 64> SrcEltBits(1, Cst->getAPIntValue());
+ return CastBitData(UndefSrcElts, SrcEltBits);
+ }
+
// Extract constant bits from build vector.
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
@@ -5542,6 +5493,24 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
return true;
}
+/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
+/// Note: This ignores saturation, so inputs must be checked first.
+static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
+ bool Unary) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
+ unsigned Offset = Unary ? 0 : NumElts;
+
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
+ Mask.push_back(Elt + (Lane * NumEltsPerLane));
+ for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
+ Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
+ }
+}
+
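The mask produced by createPackShuffleMask is easy to reproduce with plain loops. The standalone sketch below (editorial, not part of this patch) checks the binary 256-bit case, where each 128-bit lane takes the even-numbered result-width elements of the first operand followed by those of the second:

#include <cassert>
#include <vector>

// NumElts elements in the result, VectorBits total bits, per-lane packing.
static std::vector<int> packShuffleMask(unsigned NumElts, unsigned VectorBits,
                                        bool Unary) {
  std::vector<int> Mask;
  unsigned NumLanes = VectorBits / 128;
  unsigned NumEltsPerLane = NumElts / NumLanes;
  unsigned Offset = Unary ? 0 : NumElts;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
      Mask.push_back(Elt + Lane * NumEltsPerLane);
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
      Mask.push_back(Elt + Lane * NumEltsPerLane + Offset);
  }
  return Mask;
}

int main() {
  // Binary PACKSSWB/PACKUSWB on 256-bit vectors (2 x v16i16 -> v32i8).
  std::vector<int> M = packShuffleMask(32, 256, /*Unary=*/false);
  assert(M.size() == 32 && M[0] == 0 && M[8] == 32 && M[16] == 16 && M[24] == 48);
  return 0;
}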
/// Calculates the shuffle mask corresponding to the target-specific opcode.
/// If the mask could be calculated, returns it in \p Mask, returns the shuffle
/// operands in \p Ops, and returns true.
@@ -5562,21 +5531,28 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
bool IsFakeUnary = false;
switch(N->getOpcode()) {
case X86ISD::BLENDI:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeSHUFPMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::EXTRQI:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
if (isa<ConstantSDNode>(N->getOperand(1)) &&
isa<ConstantSDNode>(N->getOperand(2))) {
int BitLen = N->getConstantOperandVal(1);
@@ -5586,6 +5562,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
}
break;
case X86ISD::INSERTQI:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
if (isa<ConstantSDNode>(N->getOperand(2)) &&
isa<ConstantSDNode>(N->getOperand(3))) {
int BitLen = N->getConstantOperandVal(2);
@@ -5595,23 +5573,33 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
}
break;
case X86ISD::UNPCKH:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKHMask(VT, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::UNPCKL:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeUNPCKLMask(VT, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVHLPS:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVHLPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVLHPS:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::PALIGNR:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePALIGNRMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
@@ -5620,33 +5608,39 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
break;
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSLLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands() - 1);
DecodePSRLDQMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePSHUFMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePSHUFHWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodePSHUFLWMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeZeroMoveLowMask(VT, Mask);
IsUnary = true;
break;
@@ -5670,6 +5664,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return false;
}
case X86ISD::VPERMILPV: {
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
unsigned MaskEltSize = VT.getScalarSizeInBits();
@@ -5685,6 +5680,9 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return false;
}
case X86ISD::PSHUFB: {
+ assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
SDValue MaskNode = N->getOperand(1);
SmallVector<uint64_t, 32> RawMask;
@@ -5699,37 +5697,46 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return false;
}
case X86ISD::VPERMI:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERMMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
case X86ISD::MOVSD:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask);
break;
case X86ISD::VPERM2X128:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
ImmN = N->getOperand(N->getNumOperands()-1);
DecodeVPERM2X128Mask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSLDUPMask(VT, Mask);
IsUnary = true;
break;
case X86ISD::MOVSHDUP:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVSHDUPMask(VT, Mask);
IsUnary = true;
break;
case X86ISD::MOVDDUP:
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
DecodeMOVDDUPMask(VT, Mask);
IsUnary = true;
break;
- case X86ISD::MOVLHPD:
case X86ISD::MOVLPD:
case X86ISD::MOVLPS:
// Not yet implemented
return false;
case X86ISD::VPERMIL2: {
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
unsigned MaskEltSize = VT.getScalarSizeInBits();
SDValue MaskNode = N->getOperand(2);
@@ -5749,6 +5756,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return false;
}
case X86ISD::VPPERM: {
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
SDValue MaskNode = N->getOperand(2);
SmallVector<uint64_t, 32> RawMask;
@@ -5763,6 +5772,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return false;
}
case X86ISD::VPERMV: {
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
IsUnary = true;
// Unlike most shuffle nodes, VPERMV's mask operand is operand 0.
Ops.push_back(N->getOperand(1));
@@ -5780,6 +5790,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return false;
}
case X86ISD::VPERMV3: {
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMV3's mask operand is the middle one.
Ops.push_back(N->getOperand(0));
@@ -5793,6 +5805,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return false;
}
case X86ISD::VPERMIV3: {
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(2).getValueType() == VT && "Unexpected value type");
IsUnary = IsFakeUnary = N->getOperand(1) == N->getOperand(2);
// Unlike most shuffle nodes, VPERMIV3's mask operand is the first one.
Ops.push_back(N->getOperand(1));
@@ -5965,19 +5979,13 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
SDValue N0 = N.getOperand(0);
SDValue SrcExtract;
- if (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- N0.getOperand(0).getValueType() == VT) {
+ if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ N0.getOperand(0).getValueType() == VT) ||
+ (N0.getOpcode() == X86ISD::PEXTRW &&
+ N0.getOperand(0).getValueType() == MVT::v8i16) ||
+ (N0.getOpcode() == X86ISD::PEXTRB &&
+ N0.getOperand(0).getValueType() == MVT::v16i8)) {
SrcExtract = N0;
- } else if (N0.getOpcode() == ISD::AssertZext &&
- N0.getOperand(0).getOpcode() == X86ISD::PEXTRW &&
- cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i16) {
- SrcExtract = N0.getOperand(0);
- assert(SrcExtract.getOperand(0).getValueType() == MVT::v8i16);
- } else if (N0.getOpcode() == ISD::AssertZext &&
- N0.getOperand(0).getOpcode() == X86ISD::PEXTRB &&
- cast<VTSDNode>(N0.getOperand(1))->getVT() == MVT::i8) {
- SrcExtract = N0.getOperand(0);
- assert(SrcExtract.getOperand(0).getValueType() == MVT::v16i8);
}
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
@@ -6013,16 +6021,15 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
return true;
}
- // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
+ // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
// TODO: Expand this to support INSERT_VECTOR_ELT/etc.
unsigned ExOp =
(X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
- if (InScl.getOpcode() != ISD::AssertZext ||
- InScl.getOperand(0).getOpcode() != ExOp)
+ if (InScl.getOpcode() != ExOp)
return false;
- SDValue ExVec = InScl.getOperand(0).getOperand(0);
- uint64_t ExIdx = InScl.getOperand(0).getConstantOperandVal(1);
+ SDValue ExVec = InScl.getOperand(0);
+ uint64_t ExIdx = InScl.getConstantOperandVal(1);
assert(ExIdx < NumElts && "Illegal extraction index");
Ops.push_back(InVec);
Ops.push_back(ExVec);
@@ -6030,17 +6037,34 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
return true;
}
- case X86ISD::PACKSS: {
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ assert(N0.getValueType().getVectorNumElements() == (NumElts / 2) &&
+ N1.getValueType().getVectorNumElements() == (NumElts / 2) &&
+ "Unexpected input value type");
+
// If we know input saturation won't happen we can treat this
// as a truncation shuffle.
- if (DAG.ComputeNumSignBits(N.getOperand(0)) <= NumBitsPerElt ||
- DAG.ComputeNumSignBits(N.getOperand(1)) <= NumBitsPerElt)
- return false;
+ if (Opcode == X86ISD::PACKSS) {
+ if ((!N0.isUndef() && DAG.ComputeNumSignBits(N0) <= NumBitsPerElt) ||
+ (!N1.isUndef() && DAG.ComputeNumSignBits(N1) <= NumBitsPerElt))
+ return false;
+ } else {
+ APInt ZeroMask = APInt::getHighBitsSet(2 * NumBitsPerElt, NumBitsPerElt);
+ if ((!N0.isUndef() && !DAG.MaskedValueIsZero(N0, ZeroMask)) ||
+ (!N1.isUndef() && !DAG.MaskedValueIsZero(N1, ZeroMask)))
+ return false;
+ }
- Ops.push_back(N.getOperand(0));
- Ops.push_back(N.getOperand(1));
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i * 2);
+ bool IsUnary = (N0 == N1);
+
+ Ops.push_back(N0);
+ if (!IsUnary)
+ Ops.push_back(N1);
+
+ createPackShuffleMask(VT, Mask, IsUnary);
return true;
}
case X86ISD::VSHLI:
@@ -6099,6 +6123,14 @@ static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
for (int i = 0, e = Inputs.size(); i < e; ++i) {
int lo = UsedInputs.size() * MaskWidth;
int hi = lo + MaskWidth;
+
+ // Strip UNDEF input usage.
+ if (Inputs[i].isUndef())
+ for (int &M : Mask)
+ if ((lo <= M) && (M < hi))
+ M = SM_SentinelUndef;
+
+ // Check for unused inputs.
if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
UsedInputs.push_back(Inputs[i]);
continue;
@@ -6196,6 +6228,49 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
return SDValue();
}
+// Use PINSRB/PINSRW/PINSRD to create a build vector.
+static SDValue LowerBuildVectorAsInsert(SDValue Op, unsigned NonZeros,
+ unsigned NumNonZero, unsigned NumZero,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Op.getSimpleValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(((VT == MVT::v8i16 && Subtarget.hasSSE2()) ||
+ ((VT == MVT::v16i8 || VT == MVT::v4i32) && Subtarget.hasSSE41())) &&
+ "Illegal vector insertion");
+
+ SDLoc dl(Op);
+ SDValue V;
+ bool First = true;
+
+ for (unsigned i = 0; i < NumElts; ++i) {
+ bool IsNonZero = (NonZeros & (1 << i)) != 0;
+ if (!IsNonZero)
+ continue;
+
+ // If the build vector contains zeros or our first insertion is not the
+ // first index, then insert into a zero vector to break any register
+ // dependency; else use SCALAR_TO_VECTOR/VZEXT_MOVL.
+ if (First) {
+ First = false;
+ if (NumZero || 0 != i)
+ V = getZeroVector(VT, Subtarget, DAG, dl);
+ else {
+ assert(0 == i && "Expected insertion into zero-index");
+ V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+ V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+ V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+ V = DAG.getBitcast(VT, V);
+ continue;
+ }
+ }
+ V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, V, Op.getOperand(i),
+ DAG.getIntPtrConstant(i, dl));
+ }
+
+ return V;
+}
+
/// Custom lower build_vector of v16i8.
static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
unsigned NumNonZero, unsigned NumZero,
@@ -6204,39 +6279,15 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
if (NumNonZero > 8 && !Subtarget.hasSSE41())
return SDValue();
+ // SSE4.1 - use PINSRB to insert each byte directly.
+ if (Subtarget.hasSSE41())
+ return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
+ Subtarget);
+
SDLoc dl(Op);
SDValue V;
bool First = true;
- // SSE4.1 - use PINSRB to insert each byte directly.
- if (Subtarget.hasSSE41()) {
- for (unsigned i = 0; i < 16; ++i) {
- bool IsNonZero = (NonZeros & (1 << i)) != 0;
- if (IsNonZero) {
- // If the build vector contains zeros or our first insertion is not the
- // first index then insert into zero vector to break any register
- // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
- if (First) {
- First = false;
- if (NumZero || 0 != i)
- V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
- else {
- assert(0 == i && "Expected insertion into zero-index");
- V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
- V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
- V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
- V = DAG.getBitcast(MVT::v16i8, V);
- continue;
- }
- }
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
- Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
- }
- }
-
- return V;
- }
-
// Pre-SSE4.1 - merge byte pairs and insert with PINSRW.
for (unsigned i = 0; i < 16; ++i) {
bool ThisIsNonZero = (NonZeros & (1 << i)) != 0;
@@ -6292,34 +6343,9 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
if (NumNonZero > 4 && !Subtarget.hasSSE41())
return SDValue();
- SDLoc dl(Op);
- SDValue V;
- bool First = true;
- for (unsigned i = 0; i < 8; ++i) {
- bool IsNonZero = (NonZeros & (1 << i)) != 0;
- if (IsNonZero) {
- // If the build vector contains zeros or our first insertion is not the
- // first index then insert into zero vector to break any register
- // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
- if (First) {
- First = false;
- if (NumZero || 0 != i)
- V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
- else {
- assert(0 == i && "Expected insertion into zero-index");
- V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
- V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
- V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
- V = DAG.getBitcast(MVT::v8i16, V);
- continue;
- }
- }
- V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
- Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
- }
- }
-
- return V;
+ // Use PINSRW to insert each element directly.
+ return LowerBuildVectorAsInsert(Op, NonZeros, NumNonZero, NumZero, DAG,
+ Subtarget);
}
/// Custom lower build_vector of v4i32 or v4f32.
@@ -6589,14 +6615,20 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
}
}
- auto CreateLoad = [&DAG, &DL](EVT VT, LoadSDNode *LDBase) {
+ SmallVector<LoadSDNode *, 8> Loads;
+ for (int i = FirstLoadedElt; i <= LastLoadedElt; ++i)
+ if (LoadMask[i])
+ Loads.push_back(cast<LoadSDNode>(peekThroughBitcasts(Elts[i])));
+
+ auto CreateLoad = [&DAG, &DL, &Loads](EVT VT, LoadSDNode *LDBase) {
auto MMOFlags = LDBase->getMemOperand()->getFlags();
assert(!(MMOFlags & MachineMemOperand::MOVolatile) &&
"Cannot merge volatile loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
- DAG.makeEquivalentMemoryOrdering(LDBase, NewLd);
+ for (auto *LD : Loads)
+ DAG.makeEquivalentMemoryOrdering(LD, NewLd);
return NewLd;
};
@@ -6659,9 +6691,9 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
LDBase->getPointerInfo(),
LDBase->getAlignment(),
- false/*isVolatile*/, true/*ReadMem*/,
- false/*WriteMem*/);
- DAG.makeEquivalentMemoryOrdering(LDBase, ResNode);
+ MachineMemOperand::MOLoad);
+ for (auto *LD : Loads)
+ DAG.makeEquivalentMemoryOrdering(LD, ResNode);
return DAG.getBitcast(VT, ResNode);
}
}
@@ -6702,6 +6734,43 @@ static bool isUseOfShuffle(SDNode *N) {
return false;
}
+// Check if the current node of a build vector is a zero-extended vector.
+// If so, return the value extended.
+// For example: (0,0,0,a,0,0,0,a,0,0,0,a,0,0,0,a) returns a.
+// NumElt - return the number of zero-extended identical values.
+// EltType - return the type of the value including the zero extend.
+static SDValue isSplatZeroExtended(const BuildVectorSDNode *Op,
+ unsigned &NumElt, MVT &EltType) {
+ SDValue ExtValue = Op->getOperand(0);
+ unsigned NumElts = Op->getNumOperands();
+ unsigned Delta = NumElts;
+
+ for (unsigned i = 1; i < NumElts; i++) {
+ if (Op->getOperand(i) == ExtValue) {
+ Delta = i;
+ break;
+ }
+ if (!(Op->getOperand(i).isUndef() || isNullConstant(Op->getOperand(i))))
+ return SDValue();
+ }
+ if (!isPowerOf2_32(Delta) || Delta == 1)
+ return SDValue();
+
+ for (unsigned i = Delta; i < NumElts; i++) {
+ if (i % Delta == 0) {
+ if (Op->getOperand(i) != ExtValue)
+ return SDValue();
+ } else if (!(isNullConstant(Op->getOperand(i)) ||
+ Op->getOperand(i).isUndef()))
+ return SDValue();
+ }
+ unsigned EltSize = Op->getSimpleValueType(0).getScalarSizeInBits();
+ unsigned ExtVTSize = EltSize * Delta;
+ EltType = MVT::getIntegerVT(ExtVTSize);
+ NumElt = NumElts / Delta;
+ return ExtValue;
+}
+
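The detection above amounts to finding a power-of-two period Delta at which operand 0 repeats, with only zero or undef operands in between; the build vector is then a splat of NumElts / Delta values, each zero extended to Delta times the element width. A standalone sketch of the scan over plain integers (editorial; 0 stands in for a zero or undef operand, and the type bookkeeping is omitted):

#include <cassert>
#include <vector>

// Returns the repeated value and sets Delta to its period, or returns 0.
static int splatZeroExtended(const std::vector<int> &Ops, unsigned &Delta) {
  int Ext = Ops[0];
  unsigned NumElts = Ops.size();
  Delta = NumElts;
  for (unsigned i = 1; i < NumElts; ++i) {
    if (Ops[i] == Ext) { Delta = i; break; }
    if (Ops[i] != 0)
      return 0;
  }
  if (Delta == 1 || (Delta & (Delta - 1)) != 0) // need a power of two > 1
    return 0;
  for (unsigned i = Delta; i < NumElts; ++i) {
    if (i % Delta == 0) {
      if (Ops[i] != Ext) return 0;      // value must repeat every Delta elements
    } else if (Ops[i] != 0) {
      return 0;                         // everything in between must be zero/undef
    }
  }
  return Ext;
}

int main() {
  // Operand order (a,0,0,0, a,0,0,0, ...) is a splat of 'a' zero extended by 4.
  unsigned Delta;
  assert(splatZeroExtended({7,0,0,0, 7,0,0,0, 7,0,0,0, 7,0,0,0}, Delta) == 7 &&
         Delta == 4);
  return 0;
}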
/// Attempt to use the vbroadcast instruction to generate a splat value
/// from a splat BUILD_VECTOR which uses:
/// a. A single scalar load, or a constant.
@@ -6727,6 +6796,39 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
BitVector UndefElements;
SDValue Ld = BVOp->getSplatValue(&UndefElements);
+ // Attempt to use VBROADCASTM
+ // From this paterrn:
+ // a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
+ // b. t1 = (build_vector t0 t0)
+ //
+ // Create (VBROADCASTM v2i1 X)
+ if (Subtarget.hasCDI() && (VT.is512BitVector() || Subtarget.hasVLX())) {
+ MVT EltType = VT.getScalarType();
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue BOperand;
+ SDValue ZeroExtended = isSplatZeroExtended(BVOp, NumElts, EltType);
+ if ((ZeroExtended && ZeroExtended.getOpcode() == ISD::BITCAST) ||
+ (Ld && Ld.getOpcode() == ISD::ZERO_EXTEND &&
+ Ld.getOperand(0).getOpcode() == ISD::BITCAST)) {
+ if (ZeroExtended)
+ BOperand = ZeroExtended.getOperand(0);
+ else
+ BOperand = Ld.getOperand(0).getOperand(0);
+ if (BOperand.getValueType().isVector() &&
+ BOperand.getSimpleValueType().getVectorElementType() == MVT::i1) {
+ if ((EltType == MVT::i64 && (VT.getVectorElementType() == MVT::i8 ||
+ NumElts == 8)) || // for broadcastmb2q
+ (EltType == MVT::i32 && (VT.getVectorElementType() == MVT::i16 ||
+ NumElts == 16))) { // for broadcastmw2d
+ SDValue Brdcst =
+ DAG.getNode(X86ISD::VBROADCASTM, dl,
+ MVT::getVectorVT(EltType, NumElts), BOperand);
+ return DAG.getBitcast(VT, Brdcst);
+ }
+ }
+ }
+ }
+
// We need a splat of a single value to use broadcast, and it doesn't
// make any sense if the value is only in one element of the vector.
if (!Ld || (VT.getVectorNumElements() - UndefElements.count()) <= 1) {
@@ -6824,7 +6926,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
- bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
@@ -6902,10 +7004,10 @@ static int getUnderlyingExtractedFromVec(SDValue &ExtractedFromVec,
// For 256-bit vectors, LowerEXTRACT_VECTOR_ELT_SSE4 may have already
// lowered this:
- // (extract_vector_elt (v8f32 %vreg1), Constant<6>)
+ // (extract_vector_elt (v8f32 %1), Constant<6>)
// to:
// (extract_vector_elt (vector_shuffle<2,u,u,u>
- // (extract_subvector (v8f32 %vreg0), Constant<4>),
+ // (extract_subvector (v8f32 %0), Constant<4>),
// undef)
// Constant<0>)
// In this case the vector is the extract_subvector expression and the index
@@ -7020,10 +7122,10 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
if (ISD::isBuildVectorAllZeros(Op.getNode()))
- return DAG.getTargetConstant(0, dl, VT);
+ return Op;
if (ISD::isBuildVectorAllOnes(Op.getNode()))
- return DAG.getTargetConstant(1, dl, VT);
+ return Op;
if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
@@ -7272,7 +7374,8 @@ static SDValue ExpandHorizontalBinOp(const SDValue &V0, const SDValue &V1,
/// are written to the parameters \p Opnd0 and \p Opnd1.
static bool isAddSub(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget, SelectionDAG &DAG,
- SDValue &Opnd0, SDValue &Opnd1) {
+ SDValue &Opnd0, SDValue &Opnd1,
+ unsigned &NumExtracts) {
MVT VT = BV->getSimpleValueType(0);
if ((!Subtarget.hasSSE3() || (VT != MVT::v4f32 && VT != MVT::v2f64)) &&
@@ -7284,6 +7387,8 @@ static bool isAddSub(const BuildVectorSDNode *BV,
SDValue InVec0 = DAG.getUNDEF(VT);
SDValue InVec1 = DAG.getUNDEF(VT);
+ NumExtracts = 0;
+
// Odd-numbered elements in the input build vector are obtained from
// adding two integer/float elements.
// Even-numbered elements in the input build vector are obtained from
@@ -7360,6 +7465,9 @@ static bool isAddSub(const BuildVectorSDNode *BV,
// Update the pair of expected opcodes.
std::swap(ExpectedOpcode, NextExpectedOpcode);
+
+ // Increment the number of extractions done.
+ ++NumExtracts;
}
// Don't try to fold this build_vector into an ADDSUB if the inputs are undef.
@@ -7398,8 +7506,10 @@ static bool isAddSub(const BuildVectorSDNode *BV,
/// is illegal sometimes. E.g. 512-bit ADDSUB is not available, while 512-bit
/// FMADDSUB is.
static bool isFMAddSub(const X86Subtarget &Subtarget, SelectionDAG &DAG,
- SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2) {
- if (Opnd0.getOpcode() != ISD::FMUL || Opnd0->use_size() != 2 ||
+ SDValue &Opnd0, SDValue &Opnd1, SDValue &Opnd2,
+ unsigned ExpectedUses) {
+ if (Opnd0.getOpcode() != ISD::FMUL ||
+ !Opnd0->hasNUsesOfValue(ExpectedUses, 0) ||
!Subtarget.hasAnyFMA())
return false;
@@ -7426,7 +7536,8 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDValue Opnd0, Opnd1;
- if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1))
+ unsigned NumExtracts;
+ if (!isAddSub(BV, Subtarget, DAG, Opnd0, Opnd1, NumExtracts))
return SDValue();
MVT VT = BV->getSimpleValueType(0);
@@ -7434,7 +7545,9 @@ static SDValue lowerToAddSubOrFMAddSub(const BuildVectorSDNode *BV,
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
- if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+ // TODO: According to coverage reports, the FMADDSUB transform is not
+ // triggered by any tests.
+ if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, NumExtracts))
return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
@@ -7658,6 +7771,111 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+// Tries to lower a BUILD_VECTOR composed of extract-extract chains that can be
+// interpreted as a permutation of a source vector by indices in a non-constant vector.
+// (build_vector (extract_elt V, (extract_elt I, 0)),
+// (extract_elt V, (extract_elt I, 1)),
+// ...
+// ->
+// (vpermv I, V)
+//
+// TODO: Handle undefs
+// TODO: Utilize pshufb and zero mask blending to support more efficient
+// construction of vectors with constant-0 elements.
+// TODO: Use smaller-element vectors of the same width, and "interpolate" the indices,
+// when no native operation is available.
+static SDValue
+LowerBUILD_VECTORAsVariablePermute(SDValue V, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Look for VPERMV and PSHUFB opportunities.
+ MVT VT = V.getSimpleValueType();
+ switch (VT.SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v16i8:
+ if (!Subtarget.hasSSE3())
+ return SDValue();
+ break;
+ case MVT::v8f32:
+ case MVT::v8i32:
+ if (!Subtarget.hasAVX2())
+ return SDValue();
+ break;
+ case MVT::v4i64:
+ case MVT::v4f64:
+ if (!Subtarget.hasVLX())
+ return SDValue();
+ break;
+ case MVT::v16f32:
+ case MVT::v8f64:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (!Subtarget.hasAVX512())
+ return SDValue();
+ break;
+ case MVT::v32i16:
+ if (!Subtarget.hasBWI())
+ return SDValue();
+ break;
+ case MVT::v8i16:
+ case MVT::v16i16:
+ if (!Subtarget.hasVLX() || !Subtarget.hasBWI())
+ return SDValue();
+ break;
+ case MVT::v64i8:
+ if (!Subtarget.hasVBMI())
+ return SDValue();
+ break;
+ case MVT::v32i8:
+ if (!Subtarget.hasVLX() || !Subtarget.hasVBMI())
+ return SDValue();
+ break;
+ }
+ SDValue SrcVec, IndicesVec;
+ // Check for a match of the permute source vector and permute index elements.
+ // This is done by checking that the i-th build_vector operand is of the form:
+ // (extract_elt SrcVec, (extract_elt IndicesVec, i)).
+ for (unsigned Idx = 0, E = V.getNumOperands(); Idx != E; ++Idx) {
+ SDValue Op = V.getOperand(Idx);
+ if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // If this is the first extract encountered in V, set the source vector,
+ // otherwise verify the extract is from the previously defined source
+ // vector.
+ if (!SrcVec)
+ SrcVec = Op.getOperand(0);
+ else if (SrcVec != Op.getOperand(0))
+ return SDValue();
+ SDValue ExtractedIndex = Op->getOperand(1);
+ // Peek through extends.
+ if (ExtractedIndex.getOpcode() == ISD::ZERO_EXTEND ||
+ ExtractedIndex.getOpcode() == ISD::SIGN_EXTEND)
+ ExtractedIndex = ExtractedIndex.getOperand(0);
+ if (ExtractedIndex.getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ // If this is the first extract from the index vector candidate, set the
+ // indices vector, otherwise verify the extract is from the previously
+ // defined indices vector.
+ if (!IndicesVec)
+ IndicesVec = ExtractedIndex.getOperand(0);
+ else if (IndicesVec != ExtractedIndex.getOperand(0))
+ return SDValue();
+
+ auto *PermIdx = dyn_cast<ConstantSDNode>(ExtractedIndex.getOperand(1));
+ if (!PermIdx || PermIdx->getZExtValue() != Idx)
+ return SDValue();
+ }
+ MVT IndicesVT = VT;
+ if (VT.isFloatingPoint())
+ IndicesVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()),
+ VT.getVectorNumElements());
+ IndicesVec = DAG.getZExtOrTrunc(IndicesVec, SDLoc(IndicesVec), IndicesVT);
+ return DAG.getNode(VT == MVT::v16i8 ? X86ISD::PSHUFB : X86ISD::VPERMV,
+ SDLoc(V), VT, IndicesVec, SrcVec);
+}
+
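Per element, the node built above computes Result[i] = Src[Indices[i]]; a scalar sketch of that semantics (hypothetical helper, ignoring PSHUFB's zeroing of lanes whose index byte has its top bit set):

    #include <cstdint>
    #include <vector>

    // Scalar model of a variable permute (VPERMV-style): each result element is
    // selected from Src by the corresponding entry of Indices, using only the
    // low bits of the index.
    static std::vector<uint32_t>
    variablePermuteModel(const std::vector<uint32_t> &Src,
                         const std::vector<uint32_t> &Indices) {
      std::vector<uint32_t> Result(Indices.size());
      for (size_t i = 0; i != Indices.size(); ++i)
        Result[i] = Src[Indices[i] % Src.size()];
      return Result;
    }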
SDValue
X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -7674,6 +7892,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return VectorConstant;
BuildVectorSDNode *BV = cast<BuildVectorSDNode>(Op.getNode());
+ // TODO: Support FMSUBADD here if we ever get tests for the FMADDSUB
+ // transform.
if (SDValue AddSub = lowerToAddSubOrFMAddSub(BV, Subtarget, DAG))
return AddSub;
if (SDValue HorizontalOp = LowerToHorizontalOp(BV, Subtarget, DAG))
@@ -7690,14 +7910,16 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
uint64_t NonZeros = 0;
bool IsAllConstants = true;
SmallSet<SDValue, 8> Values;
+ unsigned NumConstants = NumElems;
for (unsigned i = 0; i < NumElems; ++i) {
SDValue Elt = Op.getOperand(i);
if (Elt.isUndef())
continue;
Values.insert(Elt);
- if (Elt.getOpcode() != ISD::Constant &&
- Elt.getOpcode() != ISD::ConstantFP)
+ if (!isa<ConstantSDNode>(Elt) && !isa<ConstantFPSDNode>(Elt)) {
IsAllConstants = false;
+ NumConstants--;
+ }
if (X86::isZeroNode(Elt))
NumZero++;
else {
@@ -7711,6 +7933,52 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (NumNonZero == 0)
return DAG.getUNDEF(VT);
+ // If we are inserting one variable into a vector of non-zero constants, try
+ // to avoid loading each constant element as a scalar. Load the constants as a
+ // vector and then insert the variable scalar element. If insertion is not
+ // supported, we assume that we will fall back to a shuffle to get the scalar
+ // blended with the constants. Insertion into a zero vector is handled as a
+ // special-case somewhere below here.
+ LLVMContext &Context = *DAG.getContext();
+ if (NumConstants == NumElems - 1 && NumNonZero != 1 &&
+ (isOperationLegalOrCustom(ISD::INSERT_VECTOR_ELT, VT) ||
+ isOperationLegalOrCustom(ISD::VECTOR_SHUFFLE, VT))) {
+ // Create an all-constant vector. The variable element in the old
+ // build vector is replaced by undef in the constant vector. Save the
+ // variable scalar element and its index for use in the insertelement.
+ Type *EltType = Op.getValueType().getScalarType().getTypeForEVT(Context);
+ SmallVector<Constant *, 16> ConstVecOps(NumElems, UndefValue::get(EltType));
+ SDValue VarElt;
+ SDValue InsIndex;
+ for (unsigned i = 0; i != NumElems; ++i) {
+ SDValue Elt = Op.getOperand(i);
+ if (auto *C = dyn_cast<ConstantSDNode>(Elt))
+ ConstVecOps[i] = ConstantInt::get(Context, C->getAPIntValue());
+ else if (auto *C = dyn_cast<ConstantFPSDNode>(Elt))
+ ConstVecOps[i] = ConstantFP::get(Context, C->getValueAPF());
+ else if (!Elt.isUndef()) {
+ assert(!VarElt.getNode() && !InsIndex.getNode() &&
+ "Expected one variable element in this vector");
+ VarElt = Elt;
+ InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
+ }
+ }
+ Constant *CV = ConstantVector::get(ConstVecOps);
+ SDValue DAGConstVec = DAG.getConstantPool(CV, VT);
+
+ // The constants we just created may not be legal (e.g., floating point). We
+ // must lower the vector right here because we cannot guarantee that we'll
+ // legalize it before loading it. This is also why we could not just create
+ // a new build vector here. If the build vector contains illegal constants,
+ // it could get split back up into a series of insert elements.
+ // TODO: Improve this by using shorter loads with broadcast/VZEXT_LOAD.
+ SDValue LegalDAGConstVec = LowerConstantPool(DAGConstVec, DAG);
+ MachineFunction &MF = DAG.getMachineFunction();
+ MachinePointerInfo MPI = MachinePointerInfo::getConstantPool(MF);
+ SDValue Ld = DAG.getLoad(VT, dl, DAG.getEntryNode(), LegalDAGConstVec, MPI);
+ return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Ld, VarElt, InsIndex);
+ }
+
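The effect of this path is to turn N-1 scalar constant materializations into one constant-pool vector load plus a single insert; a trivial sketch for a hypothetical v4f32 with one variable lane:

    #include <array>

    // Illustrative only: materialize the constants as one vector, then insert
    // the single variable element, instead of building the vector lane by lane.
    static std::array<float, 4> insertVarIntoConstVec(float Var, unsigned VarIdx) {
      std::array<float, 4> Vec = {1.0f, 2.0f, 3.0f, 4.0f}; // hypothetical constants
      Vec[VarIdx] = Var;                                   // single insert
      return Vec;
    }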
// Special case for single non-zero, non-undef, element.
if (NumNonZero == 1) {
unsigned Idx = countTrailingZeros(NonZeros);
@@ -7825,6 +8093,9 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
if (IsAllConstants)
return SDValue();
+ if (SDValue V = LowerBUILD_VECTORAsVariablePermute(Op, DAG, Subtarget))
+ return V;
+
// See if we can use a vector load to get all of the elements.
if (VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) {
SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
@@ -7836,15 +8107,13 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
// For AVX-length vectors, build the individual 128-bit pieces and use
// shuffles to put them in place.
if (VT.is256BitVector() || VT.is512BitVector()) {
- SmallVector<SDValue, 64> Ops(Op->op_begin(), Op->op_begin() + NumElems);
-
- EVT HVT = EVT::getVectorVT(*DAG.getContext(), ExtVT, NumElems/2);
+ EVT HVT = EVT::getVectorVT(Context, ExtVT, NumElems/2);
// Build both the lower and upper subvector.
SDValue Lower =
- DAG.getBuildVector(HVT, dl, makeArrayRef(&Ops[0], NumElems / 2));
+ DAG.getBuildVector(HVT, dl, Op->ops().slice(0, NumElems / 2));
SDValue Upper = DAG.getBuildVector(
- HVT, dl, makeArrayRef(&Ops[NumElems / 2], NumElems / 2));
+ HVT, dl, Op->ops().slice(NumElems / 2, NumElems / 2));
// Recreate the wider vector with the lower and upper part.
if (VT.is256BitVector())
@@ -7892,8 +8161,8 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
}
for (unsigned i = 0; i < 2; ++i) {
- switch ((NonZeros & (0x3 << i*2)) >> (i*2)) {
- default: break;
+ switch ((NonZeros >> (i*2)) & 0x3) {
+ default: llvm_unreachable("Unexpected NonZero count");
case 0:
Ops[i] = Ops[i*2]; // Must be a zero vector.
break;
@@ -7920,57 +8189,56 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return DAG.getVectorShuffle(VT, dl, Ops[0], Ops[1], MaskVec);
}
- if (Values.size() > 1 && VT.is128BitVector()) {
- // Check for a build vector from mostly shuffle plus few inserting.
- if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
- return Sh;
+ assert(Values.size() > 1 && "Expected non-undef and non-splat vector");
- // For SSE 4.1, use insertps to put the high elements into the low element.
- if (Subtarget.hasSSE41()) {
- SDValue Result;
- if (!Op.getOperand(0).isUndef())
- Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
- else
- Result = DAG.getUNDEF(VT);
+ // Check for a build vector from mostly shuffle plus few inserting.
+ if (SDValue Sh = buildFromShuffleMostly(Op, DAG))
+ return Sh;
- for (unsigned i = 1; i < NumElems; ++i) {
- if (Op.getOperand(i).isUndef()) continue;
- Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
- Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
- }
- return Result;
- }
+ // For SSE 4.1, use insertps to put the high elements into the low element.
+ if (Subtarget.hasSSE41()) {
+ SDValue Result;
+ if (!Op.getOperand(0).isUndef())
+ Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0));
+ else
+ Result = DAG.getUNDEF(VT);
- // Otherwise, expand into a number of unpckl*, start by extending each of
- // our (non-undef) elements to the full vector width with the element in the
- // bottom slot of the vector (which generates no code for SSE).
- SmallVector<SDValue, 8> Ops(NumElems);
- for (unsigned i = 0; i < NumElems; ++i) {
- if (!Op.getOperand(i).isUndef())
- Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
- else
- Ops[i] = DAG.getUNDEF(VT);
+ for (unsigned i = 1; i < NumElems; ++i) {
+ if (Op.getOperand(i).isUndef()) continue;
+ Result = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, Result,
+ Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
}
+ return Result;
+ }
+
+ // Otherwise, expand into a number of unpckl*, start by extending each of
+ // our (non-undef) elements to the full vector width with the element in the
+ // bottom slot of the vector (which generates no code for SSE).
+ SmallVector<SDValue, 8> Ops(NumElems);
+ for (unsigned i = 0; i < NumElems; ++i) {
+ if (!Op.getOperand(i).isUndef())
+ Ops[i] = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(i));
+ else
+ Ops[i] = DAG.getUNDEF(VT);
+ }
- // Next, we iteratively mix elements, e.g. for v4f32:
- // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
- // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
- // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
- for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
- // Generate scaled UNPCKL shuffle mask.
- SmallVector<int, 16> Mask;
- for(unsigned i = 0; i != Scale; ++i)
- Mask.push_back(i);
- for (unsigned i = 0; i != Scale; ++i)
- Mask.push_back(NumElems+i);
- Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
+ // Next, we iteratively mix elements, e.g. for v4f32:
+ // Step 1: unpcklps 0, 1 ==> X: <?, ?, 1, 0>
+ // : unpcklps 2, 3 ==> Y: <?, ?, 3, 2>
+ // Step 2: unpcklpd X, Y ==> <3, 2, 1, 0>
+ for (unsigned Scale = 1; Scale < NumElems; Scale *= 2) {
+ // Generate scaled UNPCKL shuffle mask.
+ SmallVector<int, 16> Mask;
+ for(unsigned i = 0; i != Scale; ++i)
+ Mask.push_back(i);
+ for (unsigned i = 0; i != Scale; ++i)
+ Mask.push_back(NumElems+i);
+ Mask.append(NumElems - Mask.size(), SM_SentinelUndef);
- for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
- Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
- }
- return Ops[0];
+ for (unsigned i = 0, e = NumElems / (2 * Scale); i != e; ++i)
+ Ops[i] = DAG.getVectorShuffle(VT, dl, Ops[2*i], Ops[(2*i)+1], Mask);
}
- return SDValue();
+ return Ops[0];
}
// 256-bit AVX can use the vinsertf128 instruction
@@ -8060,87 +8328,74 @@ static SDValue LowerCONCAT_VECTORSvXi1(SDValue Op,
SelectionDAG & DAG) {
SDLoc dl(Op);
MVT ResVT = Op.getSimpleValueType();
- unsigned NumOfOperands = Op.getNumOperands();
+ unsigned NumOperands = Op.getNumOperands();
- assert(isPowerOf2_32(NumOfOperands) &&
+ assert(NumOperands > 1 && isPowerOf2_32(NumOperands) &&
"Unexpected number of operands in CONCAT_VECTORS");
// If this node promotes - by concatenating zeroes - the type of the result
// of a node with instruction that zeroes all upper (irrelevant) bits of the
// output register, mark it as legal and catch the pattern in instruction
- // selection to avoid emitting extra insturctions (for zeroing upper bits).
+ // selection to avoid emitting extra instructions (for zeroing upper bits).
if (SDValue Promoted = isTypePromotionOfi1ZeroUpBits(Op)) {
- SDValue ZeroC = DAG.getConstant(0, dl, MVT::i64);
- SDValue AllZeros = DAG.getSplatBuildVector(ResVT, dl, ZeroC);
+ SDValue ZeroC = DAG.getIntPtrConstant(0, dl);
+ SDValue AllZeros = getZeroVector(ResVT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, AllZeros, Promoted,
ZeroC);
}
- SDValue Undef = DAG.getUNDEF(ResVT);
- if (NumOfOperands > 2) {
- // Specialize the cases when all, or all but one, of the operands are undef.
- unsigned NumOfDefinedOps = 0;
- unsigned OpIdx = 0;
- for (unsigned i = 0; i < NumOfOperands; i++)
- if (!Op.getOperand(i).isUndef()) {
- NumOfDefinedOps++;
- OpIdx = i;
- }
- if (NumOfDefinedOps == 0)
- return Undef;
- if (NumOfDefinedOps == 1) {
- unsigned SubVecNumElts =
- Op.getOperand(OpIdx).getValueType().getVectorNumElements();
- SDValue IdxVal = DAG.getIntPtrConstant(SubVecNumElts * OpIdx, dl);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef,
- Op.getOperand(OpIdx), IdxVal);
+ unsigned NumZero = 0;
+ unsigned NumNonZero = 0;
+ uint64_t NonZeros = 0;
+ for (unsigned i = 0; i != NumOperands; ++i) {
+ SDValue SubVec = Op.getOperand(i);
+ if (SubVec.isUndef())
+ continue;
+ if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+ ++NumZero;
+ else {
+ assert(i < sizeof(NonZeros) * CHAR_BIT); // Ensure the shift is in range.
+ NonZeros |= (uint64_t)1 << i;
+ ++NumNonZero;
}
+ }
+
+ // If there are zero or one non-zeros we can handle this very simply.
+ if (NumNonZero <= 1) {
+ SDValue Vec = NumZero ? getZeroVector(ResVT, Subtarget, DAG, dl)
+ : DAG.getUNDEF(ResVT);
+ if (!NumNonZero)
+ return Vec;
+ unsigned Idx = countTrailingZeros(NonZeros);
+ SDValue SubVec = Op.getOperand(Idx);
+ unsigned SubVecNumElts = SubVec.getSimpleValueType().getVectorNumElements();
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, SubVec,
+ DAG.getIntPtrConstant(Idx * SubVecNumElts, dl));
+ }
+
+ if (NumOperands > 2) {
MVT HalfVT = MVT::getVectorVT(ResVT.getVectorElementType(),
ResVT.getVectorNumElements()/2);
- SmallVector<SDValue, 2> Ops;
- for (unsigned i = 0; i < NumOfOperands/2; i++)
- Ops.push_back(Op.getOperand(i));
- SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
- Ops.clear();
- for (unsigned i = NumOfOperands/2; i < NumOfOperands; i++)
- Ops.push_back(Op.getOperand(i));
- SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT, Ops);
+ ArrayRef<SDUse> Ops = Op->ops();
+ SDValue Lo = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+ Ops.slice(0, NumOperands/2));
+ SDValue Hi = DAG.getNode(ISD::CONCAT_VECTORS, dl, HalfVT,
+ Ops.slice(NumOperands/2));
return DAG.getNode(ISD::CONCAT_VECTORS, dl, ResVT, Lo, Hi);
}
- // 2 operands
- SDValue V1 = Op.getOperand(0);
- SDValue V2 = Op.getOperand(1);
- unsigned NumElems = ResVT.getVectorNumElements();
- assert(V1.getValueType() == V2.getValueType() &&
- V1.getValueType().getVectorNumElements() == NumElems/2 &&
- "Unexpected operands in CONCAT_VECTORS");
+ assert(NumNonZero == 2 && "Simple cases not handled?");
- if (ResVT.getSizeInBits() >= 16)
+ if (ResVT.getVectorNumElements() >= 16)
return Op; // The operation is legal with KUNPCK
- bool IsZeroV1 = ISD::isBuildVectorAllZeros(V1.getNode());
- bool IsZeroV2 = ISD::isBuildVectorAllZeros(V2.getNode());
- SDValue ZeroVec = getZeroVector(ResVT, Subtarget, DAG, dl);
- if (IsZeroV1 && IsZeroV2)
- return ZeroVec;
-
- SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
- if (V2.isUndef())
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
- if (IsZeroV2)
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V1, ZeroIdx);
-
- SDValue IdxVal = DAG.getIntPtrConstant(NumElems/2, dl);
- if (V1.isUndef())
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V2, IdxVal);
-
- if (IsZeroV1)
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, ZeroVec, V2, IdxVal);
-
- V1 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Undef, V1, ZeroIdx);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, V1, V2, IdxVal);
+ SDValue Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT,
+ DAG.getUNDEF(ResVT), Op.getOperand(0),
+ DAG.getIntPtrConstant(0, dl));
+ unsigned NumElems = ResVT.getVectorNumElements();
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResVT, Vec, Op.getOperand(1),
+ DAG.getIntPtrConstant(NumElems/2, dl));
}
static SDValue LowerCONCAT_VECTORS(SDValue Op,
@@ -8723,6 +8978,76 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
return SDValue();
}
+// X86 has dedicated pack instructions that can handle specific truncation
+// operations: PACKSS and PACKUS.
+static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
+ SDValue &V2, unsigned &PackOpcode,
+ ArrayRef<int> TargetMask,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned BitSize = VT.getScalarSizeInBits();
+ MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
+ MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
+
+ auto MatchPACK = [&](SDValue N1, SDValue N2) {
+ SDValue VV1 = DAG.getBitcast(PackVT, N1);
+ SDValue VV2 = DAG.getBitcast(PackVT, N2);
+ if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
+ (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
+ V1 = VV1;
+ V2 = VV2;
+ SrcVT = PackVT;
+ PackOpcode = X86ISD::PACKSS;
+ return true;
+ }
+
+ if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
+ APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
+ if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
+ (N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
+ V1 = VV1;
+ V2 = VV2;
+ SrcVT = PackVT;
+ PackOpcode = X86ISD::PACKUS;
+ return true;
+ }
+ }
+
+ return false;
+ };
+
+ // Try binary shuffle.
+ SmallVector<int, 32> BinaryMask;
+ createPackShuffleMask(VT, BinaryMask, false);
+ if (isTargetShuffleEquivalent(TargetMask, BinaryMask))
+ if (MatchPACK(V1, V2))
+ return true;
+
+ // Try unary shuffle.
+ SmallVector<int, 32> UnaryMask;
+ createPackShuffleMask(VT, UnaryMask, true);
+ if (isTargetShuffleEquivalent(TargetMask, UnaryMask))
+ if (MatchPACK(V1, V1))
+ return true;
+
+ return false;
+}
+
+static SDValue lowerVectorShuffleWithPACK(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT PackVT;
+ unsigned PackOpcode;
+ if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
+ Subtarget))
+ return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
+ DAG.getBitcast(PackVT, V2));
+
+ return SDValue();
+}
+
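The two MatchPACK checks above verify that truncating with PACKSS/PACKUS would be lossless; on a single scalar the conditions reduce to the following (sketch only, using 32-to-16-bit packing as the example):

    #include <cstdint>

    // PACKSS 32->16 is lossless when the value already fits in int16_t, i.e. it
    // has more than 16 known sign bits; PACKUS 32->16 is lossless when the upper
    // 16 bits are known zero.
    static bool packssIsLossless(int32_t V) {
      return V >= INT16_MIN && V <= INT16_MAX;
    }
    static bool packusIsLossless(uint32_t V) {
      return (V >> 16) == 0;
    }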
/// \brief Try to emit a bitmask instruction for a shuffle.
///
/// This handles cases where we can model a blend exactly as a bitmask due to
@@ -8834,7 +9159,8 @@ static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
return true;
}
-uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
+static uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size,
+ int Scale) {
uint64_t ScaledMask = 0;
for (int i = 0; i != Size; ++i)
if (BlendMask & (1ull << i))
@@ -9869,7 +10195,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
return SDValue();
// Zero-extend directly to i32.
- ExtVT = MVT::v4i32;
+ ExtVT = MVT::getVectorVT(MVT::i32, ExtVT.getSizeInBits() / 32);
V2S = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, V2S);
}
V2 = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, ExtVT, V2S);
@@ -9891,10 +10217,7 @@ static SDValue lowerVectorShuffleAsElementInsertion(
V1Mask[V2Index] = -1;
if (!isNoopShuffleMask(V1Mask))
return SDValue();
- // This is essentially a special case blend operation, but if we have
- // general purpose blend operations, they are always faster. Bail and let
- // the rest of the lowering handle these as blends.
- if (Subtarget.hasSSE41())
+ if (!VT.is128BitVector())
return SDValue();
// Otherwise, use MOVSD or MOVSS.
@@ -10005,7 +10328,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
unsigned NumElts = Mask.size();
- unsigned Opcode = VT == MVT::v2f64 ? X86ISD::MOVDDUP : X86ISD::VBROADCAST;
+ unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
+ ? X86ISD::MOVDDUP
+ : X86ISD::VBROADCAST;
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
@@ -10030,9 +10355,16 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
for (;;) {
switch (V.getOpcode()) {
case ISD::BITCAST: {
+ // Peek through bitcasts as long as BroadcastIdx can be adjusted.
SDValue VSrc = V.getOperand(0);
- MVT SrcVT = VSrc.getSimpleValueType();
- if (VT.getScalarSizeInBits() != SrcVT.getScalarSizeInBits())
+ unsigned NumEltBits = V.getScalarValueSizeInBits();
+ unsigned NumSrcBits = VSrc.getScalarValueSizeInBits();
+ if ((NumEltBits % NumSrcBits) == 0)
+ BroadcastIdx *= (NumEltBits / NumSrcBits);
+ else if ((NumSrcBits % NumEltBits) == 0 &&
+ (BroadcastIdx % (NumSrcBits / NumEltBits)) == 0)
+ BroadcastIdx /= (NumSrcBits / NumEltBits);
+ else
break;
V = VSrc;
continue;
@@ -10064,6 +10396,23 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
break;
}
+ // Ensure the source vector and BroadcastIdx are for a suitable type.
+ if (VT.getScalarSizeInBits() != V.getScalarValueSizeInBits()) {
+ unsigned NumEltBits = VT.getScalarSizeInBits();
+ unsigned NumSrcBits = V.getScalarValueSizeInBits();
+ if ((NumSrcBits % NumEltBits) == 0)
+ BroadcastIdx *= (NumSrcBits / NumEltBits);
+ else if ((NumEltBits % NumSrcBits) == 0 &&
+ (BroadcastIdx % (NumEltBits / NumSrcBits)) == 0)
+ BroadcastIdx /= (NumEltBits / NumSrcBits);
+ else
+ return SDValue();
+
+ unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
+ MVT SrcVT = MVT::getVectorVT(VT.getScalarType(), NumSrcElts);
+ V = DAG.getBitcast(SrcVT, V);
+ }
+
// Check if this is a broadcast of a scalar. We special case lowering
// for scalars so that we can more effectively fold with loads.
// First, look through bitcast: if the original value has a larger element
@@ -10091,7 +10440,9 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// 32-bit targets need to load i64 as a f64 and then bitcast the result.
if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
- Opcode = (BroadcastVT.is128BitVector() ? X86ISD::MOVDDUP : Opcode);
+ Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
+ ? X86ISD::MOVDDUP
+ : Opcode;
}
// If we are broadcasting a load that is only used by the shuffle
@@ -10127,15 +10478,11 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// The shuffle input might have been a bitcast we looked through; look at
// the original input vector. Emit an EXTRACT_SUBVECTOR of that type; we'll
// later bitcast it to BroadcastVT.
- MVT SrcVT = V.getSimpleValueType();
- assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+ assert(V.getScalarValueSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
"Unexpected vector element size");
- assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
+ assert((V.getValueSizeInBits() == 256 || V.getValueSizeInBits() == 512) &&
"Unexpected vector size");
-
- MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
- V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
- DAG.getIntPtrConstant(BroadcastIdx, DL));
+ V = extract128BitVector(V, BroadcastIdx, DAG, DL);
}
if (Opcode == X86ISD::MOVDDUP && !V.getValueType().isVector())
@@ -10165,9 +10512,13 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
- // 128-bits.
- if (SrcVT.getSizeInBits() > 128)
- V = extract128BitVector(V, 0, DAG, DL);
+ // 128-bits, removing as many bitcasts as possible.
+ if (SrcVT.getSizeInBits() > 128) {
+ MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(),
+ 128 / SrcVT.getScalarSizeInBits());
+ V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
+ V = DAG.getBitcast(ExtVT, V);
+ }
return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
@@ -10517,26 +10868,6 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(Mask[0] < 2 && "We sort V1 to be the first input.");
assert(Mask[1] >= 2 && "We sort V2 to be the second input.");
- // If we have a blend of two same-type PACKUS operations and the blend aligns
- // with the low and high halves, we can just merge the PACKUS operations.
- // This is particularly important as it lets us merge shuffles that this
- // routine itself creates.
- auto GetPackNode = [](SDValue V) {
- V = peekThroughBitcasts(V);
- return V.getOpcode() == X86ISD::PACKUS ? V : SDValue();
- };
- if (SDValue V1Pack = GetPackNode(V1))
- if (SDValue V2Pack = GetPackNode(V2)) {
- EVT PackVT = V1Pack.getValueType();
- if (PackVT == V2Pack.getValueType())
- return DAG.getBitcast(MVT::v2i64,
- DAG.getNode(X86ISD::PACKUS, DL, PackVT,
- Mask[0] == 0 ? V1Pack.getOperand(0)
- : V1Pack.getOperand(1),
- Mask[1] == 2 ? V2Pack.getOperand(0)
- : V2Pack.getOperand(1)));
- }
-
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v2i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -10569,10 +10900,16 @@ static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
- if (Subtarget.hasSSSE3())
+ if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasVLX())
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v2i64, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v2i64, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ }
// If we have direct support for blends, we should lower by decomposing into
// a permute. That will be faster than the domain cross.
@@ -10736,6 +11073,15 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Mask, DL, DAG));
}
+ // Use MOVLHPS/MOVHLPS to simulate unary shuffles. These are only valid
+ // in SSE1 because otherwise they are widened to v2f64 and never get here.
+ if (!Subtarget.hasSSE2()) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1}))
+ return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V1);
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 2, 3}))
+ return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V1, V1);
+ }
+
// Otherwise, use a straight shuffle of a single input vector. We pass the
// input vector to both operands to simulate this with a SHUFPS.
return DAG.getNode(X86ISD::SHUFP, DL, MVT::v4f32, V1, V1,
@@ -10768,11 +11114,14 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
return BlendPerm;
}
- // Use low/high mov instructions.
- if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
- return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
- if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
- return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+ // Use low/high mov instructions. These are only valid in SSE1 because
+ // otherwise they are widened to v2f64 and never get here.
+ if (!Subtarget.hasSSE2()) {
+ if (isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5}))
+ return DAG.getNode(X86ISD::MOVLHPS, DL, MVT::v4f32, V1, V2);
+ if (isShuffleEquivalent(V1, V2, Mask, {2, 3, 6, 7}))
+ return DAG.getNode(X86ISD::MOVHLPS, DL, MVT::v4f32, V2, V1);
+ }
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V =
@@ -10857,10 +11206,16 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use byte rotation instructions.
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
- if (Subtarget.hasSSSE3())
+ if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasVLX())
+ if (SDValue Rotate = lowerVectorShuffleAsRotate(DL, MVT::v4i32, V1, V2,
+ Mask, Subtarget, DAG))
+ return Rotate;
+
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v4i32, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ }
// Assume that a single SHUFPS is faster than an alternative sequence of
// multiple instructions (even if the CPU has a domain penalty).
@@ -11449,6 +11804,11 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2,
+ DAG, Subtarget))
+ return V;
+
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(DL, MVT::v8i16, V1, V1,
Mask, Subtarget, DAG))
@@ -11499,6 +11859,11 @@ static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
lowerVectorShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v8i16, Mask, V1, V2, DAG,
+ Subtarget))
+ return V;
+
// Try to use byte rotation instructions.
if (SDValue Rotate = lowerVectorShuffleAsByteRotate(
DL, MVT::v8i16, V1, V2, Mask, Subtarget, DAG))
@@ -11619,6 +11984,11 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v16i8, V1, V2, Mask, Subtarget, DAG))
return Rotate;
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i8, Mask, V1, V2, DAG,
+ Subtarget))
+ return V;
+
// Try to use a zext lowering.
if (SDValue ZExt = lowerVectorShuffleAsZeroOrAnyExtend(
DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
@@ -12105,7 +12475,8 @@ static SDValue lowerVectorShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT,
static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
SDValue V1, SDValue V2,
ArrayRef<int> Mask,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// FIXME: This should probably be generalized for 512-bit vectors as well.
assert(VT.is256BitVector() && "Only for 256-bit vector shuffles!");
int Size = Mask.size();
@@ -12114,12 +12485,21 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
- bool LaneCrossing[2] = {false, false};
- for (int i = 0; i < Size; ++i)
- if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
- LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
- if (!LaneCrossing[0] || !LaneCrossing[1])
- return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ if (!Subtarget.hasAVX2()) {
+ bool LaneCrossing[2] = {false, false};
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+ LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
+ if (!LaneCrossing[0] || !LaneCrossing[1])
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ } else {
+ bool LaneUsed[2] = {false, false};
+ for (int i = 0; i < Size; ++i)
+ if (Mask[i] >= 0)
+ LaneUsed[(Mask[i] / LaneSize)] = true;
+ if (!LaneUsed[0] || !LaneUsed[1])
+ return splitAndLowerVectorShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
assert(V2.isUndef() &&
"This last part of this routine only works on single input shuffles");
@@ -12132,14 +12512,12 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
: Mask[i] % LaneSize +
(i / LaneSize) * LaneSize + Size);
- // Flip the vector, and blend the results which should now be in-lane. The
- // VPERM2X128 mask uses the low 2 bits for the low source and bits 4 and
- // 5 for the high source. The value 3 selects the high half of source 2 and
- // the value 2 selects the low half of source 2. We only use source 2 to
- // allow folding it into a memory operand.
- unsigned PERMMask = 3 | 2 << 4;
- SDValue Flipped = DAG.getNode(X86ISD::VPERM2X128, DL, VT, DAG.getUNDEF(VT),
- V1, DAG.getConstant(PERMMask, DL, MVT::i8));
+ // Flip the vector, and blend the results which should now be in-lane.
+ MVT PVT = VT.isFloatingPoint() ? MVT::v4f64 : MVT::v4i64;
+ SDValue Flipped = DAG.getBitcast(PVT, V1);
+ Flipped = DAG.getVectorShuffle(PVT, DL, Flipped, DAG.getUNDEF(PVT),
+ { 2, 3, 0, 1 });
+ Flipped = DAG.getBitcast(VT, Flipped);
return DAG.getVectorShuffle(VT, DL, V1, Flipped, FlippedBlendMask);
}
@@ -12149,6 +12527,10 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
const APInt &Zeroable,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ // With AVX2, use VPERMQ/VPERMPD for unary shuffles to allow memory folding.
+ if (Subtarget.hasAVX2() && V2.isUndef())
+ return SDValue();
+
SmallVector<int, 4> WidenedMask;
if (!canWidenShuffleElements(Mask, WidenedMask))
return SDValue();
@@ -12162,19 +12544,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
Zeroable, Subtarget, DAG))
return Blend;
- bool IsV1Zero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool IsV2Zero = ISD::isBuildVectorAllZeros(V2.getNode());
+ bool IsLowZero = (Zeroable & 0x3) == 0x3;
+ bool IsHighZero = (Zeroable & 0xc) == 0xc;
// If either input operand is a zero vector, use VPERM2X128 because its mask
// allows us to replace the zero input with an implicit zero.
- if (!IsV1Zero && !IsV2Zero) {
+ if (!IsLowZero && !IsHighZero) {
// Check for patterns which can be matched with a single insert of a 128-bit
// subvector.
bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 0, 1});
if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask, {0, 1, 4, 5})) {
- // With AVX2, use VPERMQ/VPERMPD to allow memory folding.
- if (Subtarget.hasAVX2() && V2.isUndef())
- return SDValue();
// With AVX1, use vperm2f128 (below) to allow load folding. Otherwise,
// this will likely become vinsertf128 which can't fold a 256-bit memop.
@@ -12189,6 +12568,16 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LoV, HiV);
}
}
+
+ // Try to use SHUF128 if possible.
+ if (Subtarget.hasVLX()) {
+ if (WidenedMask[0] < 2 && WidenedMask[1] >= 2) {
+ unsigned PermMask = ((WidenedMask[0] % 2) << 0) |
+ ((WidenedMask[1] % 2) << 1);
+ return DAG.getNode(X86ISD::SHUF128, DL, VT, V1, V2,
+ DAG.getConstant(PermMask, DL, MVT::i8));
+ }
+ }
}
// Otherwise form a 128-bit permutation. After accounting for undefs,
@@ -12204,30 +12593,17 @@ static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
// [6] - ignore
// [7] - zero high half of destination
- int MaskLO = WidenedMask[0] < 0 ? 0 : WidenedMask[0];
- int MaskHI = WidenedMask[1] < 0 ? 0 : WidenedMask[1];
+ assert(WidenedMask[0] >= 0 && WidenedMask[1] >= 0 && "Undef half?");
- unsigned PermMask = MaskLO | (MaskHI << 4);
+ unsigned PermMask = 0;
+ PermMask |= IsLowZero ? 0x08 : (WidenedMask[0] << 0);
+ PermMask |= IsHighZero ? 0x80 : (WidenedMask[1] << 4);
- // If either input is a zero vector, replace it with an undef input.
- // Shuffle mask values < 4 are selecting elements of V1.
- // Shuffle mask values >= 4 are selecting elements of V2.
- // Adjust each half of the permute mask by clearing the half that was
- // selecting the zero vector and setting the zero mask bit.
- if (IsV1Zero) {
+ // Check the immediate mask and replace unused sources with undef.
+ if ((PermMask & 0x0a) != 0x00 && (PermMask & 0xa0) != 0x00)
V1 = DAG.getUNDEF(VT);
- if (MaskLO < 2)
- PermMask = (PermMask & 0xf0) | 0x08;
- if (MaskHI < 2)
- PermMask = (PermMask & 0x0f) | 0x80;
- }
- if (IsV2Zero) {
+ if ((PermMask & 0x0a) != 0x02 && (PermMask & 0xa0) != 0x20)
V2 = DAG.getUNDEF(VT);
- if (MaskLO >= 2)
- PermMask = (PermMask & 0xf0) | 0x08;
- if (MaskHI >= 2)
- PermMask = (PermMask & 0x0f) | 0x80;
- }
return DAG.getNode(X86ISD::VPERM2X128, DL, VT, V1, V2,
DAG.getConstant(PermMask, DL, MVT::i8));
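The immediate built above follows the vperm2f128/vperm2i128 encoding: each destination 128-bit half is described by one nibble, where values 0-3 select a half of the two sources and bit 3 forces that half to zero. A standalone sketch of the selection (model only; names are hypothetical):

    #include <array>
    #include <cstdint>

    // Model of the vperm2x128 immediate: selector 0/1 pick the low/high half of
    // the first source, 2/3 the low/high half of the second source, and bit 3 of
    // the nibble zeroes that destination half instead.
    using Lane128 = std::array<uint8_t, 16>;
    static std::array<Lane128, 2> vperm2x128Model(const std::array<Lane128, 2> &V1,
                                                  const std::array<Lane128, 2> &V2,
                                                  uint8_t Imm) {
      auto Select = [&](uint8_t Nibble) -> Lane128 {
        if (Nibble & 0x8)
          return Lane128{}; // zero this half
        return (Nibble & 0x2) ? V2[Nibble & 0x1] : V1[Nibble & 0x1];
      };
      return {Select(Imm & 0xF), Select(uint8_t(Imm >> 4))};
    }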
@@ -12311,7 +12687,7 @@ static SDValue lowerVectorShuffleByMerging128BitLanes(
return DAG.getVectorShuffle(VT, DL, LaneShuffle, DAG.getUNDEF(VT), NewMask);
}
-/// Lower shuffles where an entire half of a 256-bit vector is UNDEF.
+/// Lower shuffles where an entire half of a 256- or 512-bit vector is UNDEF.
/// This allows for fast cases such as subvector extraction/insertion
/// or shuffling smaller vector types which can lower more efficiently.
static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
@@ -12319,7 +12695,8 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(VT.is256BitVector() && "Expected 256-bit vector");
+ assert((VT.is256BitVector() || VT.is512BitVector()) &&
+ "Expected 256-bit or 512-bit vector");
unsigned NumElts = VT.getVectorNumElements();
unsigned HalfNumElts = NumElts / 2;
@@ -12415,6 +12792,10 @@ static SDValue lowerVectorShuffleWithUndefHalf(const SDLoc &DL, MVT VT,
}
}
+ // AVX512 - XXXXuuuu - always extract lowers.
+ if (VT.is512BitVector() && !(UndefUpper && NumUpperHalves == 0))
+ return SDValue();
+
auto GetHalfVector = [&](int HalfIdx) {
if (HalfIdx < 0)
return DAG.getUNDEF(HalfVT);
@@ -12729,7 +13110,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v4f64, V1, V2, Mask,
- DAG);
+ DAG, Subtarget);
}
// Use dedicated unpack instructions for masks that match their pattern.
@@ -12810,7 +13191,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
SmallVector<int, 2> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
SmallVector<int, 4> PSHUFDMask;
- scaleShuffleMask(2, RepeatedMask, PSHUFDMask);
+ scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
@@ -12932,7 +13313,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Otherwise, fall back.
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v8f32, V1, V2, Mask,
- DAG);
+ DAG, Subtarget);
}
// Try to simplify this by merging 128-bit lanes to enable a lane-based
@@ -13112,6 +13493,11 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
lowerVectorShuffleWithUNPCK(DL, MVT::v16i16, Mask, V1, V2, DAG))
return V;
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v16i16, Mask, V1, V2, DAG,
+ Subtarget))
+ return V;
+
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v16i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -13133,7 +13519,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask))
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v16i16, V1, V2,
- Mask, DAG);
+ Mask, DAG, Subtarget);
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v16i16, Mask, RepeatedMask)) {
@@ -13198,6 +13584,11 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
lowerVectorShuffleWithUNPCK(DL, MVT::v32i8, Mask, V1, V2, DAG))
return V;
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V = lowerVectorShuffleWithPACK(DL, MVT::v32i8, Mask, V1, V2, DAG,
+ Subtarget))
+ return V;
+
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -13218,7 +13609,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask))
return lowerVectorShuffleAsLanePermuteAndBlend(DL, MVT::v32i8, V1, V2, Mask,
- DAG);
+ DAG, Subtarget);
if (SDValue PSHUFB = lowerVectorShuffleWithPSHUFB(
DL, MVT::v32i8, Mask, V1, V2, Zeroable, Subtarget, DAG))
@@ -13485,6 +13876,15 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Otherwise, fall back to a SHUFPS sequence.
return lowerVectorShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
+
+ // If we have a single-input shuffle with different shuffle patterns in the
+ // 128-bit lanes and no lane crossing, use a variable-mask VPERMILPS.
+ if (V2.isUndef() &&
+ !is128BitLaneCrossingShuffleMask(MVT::v16f32, Mask)) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v16i32, DAG, DL, true);
+ return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v16f32, V1, VPermMask);
+ }
+
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerVectorShuffleToEXPAND(DL, MVT::v16f32, Zeroable, Mask,
V1, V2, DAG, Subtarget))
@@ -13503,10 +13903,6 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
assert(V2.getSimpleValueType() == MVT::v8i64 && "Bad operand type!");
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");
- if (SDValue Shuf128 =
- lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
- return Shuf128;
-
if (V2.isUndef()) {
// When the shuffle is mirrored between the 128-bit lanes of the unit, we
// can use lower latency instructions that will operate on all four
@@ -13514,7 +13910,7 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
SmallVector<int, 2> Repeated128Mask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
SmallVector<int, 4> PSHUFDMask;
- scaleShuffleMask(2, Repeated128Mask, PSHUFDMask);
+ scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
return DAG.getBitcast(
MVT::v8i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
@@ -13528,6 +13924,10 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
getV4X86ShuffleImm8ForMask(Repeated256Mask, DL, DAG));
}
+ if (SDValue Shuf128 =
+ lowerV4X128VectorShuffle(DL, MVT::v8i64, Mask, V1, V2, DAG))
+ return Shuf128;
+
// Try to use shift instructions.
if (SDValue Shift = lowerVectorShuffleAsShift(DL, MVT::v8i64, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -13758,6 +14158,11 @@ static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, VT, V1, V2, Mask, Zeroable, Subtarget, DAG))
return Insertion;
+ // Handle special cases where the lower or upper half is UNDEF.
+ if (SDValue V =
+ lowerVectorShuffleWithUndefHalf(DL, VT, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// Check for being able to broadcast a single element.
if (SDValue Broadcast =
lowerVectorShuffleAsBroadcast(DL, VT, V1, V2, Mask, Subtarget, DAG))
@@ -14046,16 +14451,16 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode()))
return SDValue();
- // If this VSELECT has a vector if i1 as a mask, it will be directly matched
- // with patterns on the mask registers on AVX-512.
- if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
- return Op;
-
// Try to lower this to a blend-style vector shuffle. This can handle all
// constant condition cases.
if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG))
return BlendOp;
+ // If this VSELECT has a vector of i1 as a mask, it will be directly matched
+ // with patterns on the mask registers on AVX-512.
+ if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1)
+ return Op;
+
// Variable blends are only legal from SSE4.1 onward.
if (!Subtarget.hasSSE41())
return SDValue();
@@ -14097,10 +14502,6 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
case MVT::v8i16:
case MVT::v16i16:
- // AVX-512 BWI and VLX features support VSELECT with i16 elements.
- if (Subtarget.hasBWI() && Subtarget.hasVLX())
- return Op;
-
// FIXME: We should custom lower this by fixing the condition and using i8
// blends.
return SDValue();
@@ -14117,9 +14518,7 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
if (VT.getSizeInBits() == 8) {
SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
- SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
- DAG.getValueType(VT));
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
if (VT == MVT::f32) {
@@ -14153,8 +14552,8 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
/// Extract one bit from mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
-SDValue
-X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const {
+static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue Vec = Op.getOperand(0);
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
@@ -14171,30 +14570,42 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
// Extending v8i1/v16i1 to 512-bit get better performance on KNL
// than extending to 128/256bit.
unsigned VecSize = (NumElts <= 4 ? 128 : 512);
- MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize / NumElts), NumElts);
SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
ExtVT.getVectorElementType(), Ext, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
+ // Canonicalize result type to MVT::i32.
+ if (EltVT != MVT::i32) {
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ Vec, Idx);
+ return DAG.getAnyExtOrTrunc(Extract, dl, EltVT);
+ }
+
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+
+ // Extracts from element 0 are always allowed.
+ if (IdxVal == 0)
+ return Op;
+
+ // If the kshift instructions of the correct width aren't natively supported
+ // then we need to promote the vector to the native size to get the correct
+ // zeroing behavior.
if ((!Subtarget.hasDQI() && (VecVT.getVectorNumElements() == 8)) ||
(VecVT.getVectorNumElements() < 8)) {
- // Use kshiftlw/rw instruction.
VecVT = MVT::v16i1;
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT,
DAG.getUNDEF(VecVT),
Vec,
DAG.getIntPtrConstant(0, dl));
}
- unsigned MaxSift = VecVT.getVectorNumElements() - 1;
- if (MaxSift - IdxVal)
- Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
- DAG.getConstant(MaxSift - IdxVal, dl, MVT::i8));
+
+ // Use kshiftr instruction to move to the lower element.
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
- DAG.getConstant(MaxSift, dl, MVT::i8));
- return DAG.getNode(X86ISD::VEXTRACT, dl, Op.getSimpleValueType(), Vec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, Vec,
DAG.getIntPtrConstant(0, dl));
}
@@ -14207,7 +14618,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Idx = Op.getOperand(1);
if (VecVT.getVectorElementType() == MVT::i1)
- return ExtractBitFromMaskVector(Op, DAG);
+ return ExtractBitFromMaskVector(Op, DAG, Subtarget);
if (!isa<ConstantSDNode>(Idx)) {
// Its more profitable to go through memory (1 cycles throughput)
@@ -14278,9 +14689,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Transform it so it match pextrw which produces a 32-bit result.
SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
Op.getOperand(0), Op.getOperand(1));
- SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, Extract,
- DAG.getValueType(VT));
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Assert);
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
if (Subtarget.hasSSE41())
@@ -14347,8 +14756,8 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
/// Insert one bit to mask vector, like v16i1 or v8i1.
/// AVX-512 feature.
-SDValue
-X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
+static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
SDValue Elt = Op.getOperand(1);
@@ -14358,8 +14767,10 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
if (!isa<ConstantSDNode>(Idx)) {
// Non constant index. Extend source and destination,
// insert element and then truncate the result.
- MVT ExtVecVT = (VecVT == MVT::v8i1 ? MVT::v8i64 : MVT::v16i32);
- MVT ExtEltVT = (VecVT == MVT::v8i1 ? MVT::i64 : MVT::i32);
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned VecSize = (NumElts <= 4 ? 128 : 512);
+ MVT ExtVecVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+ MVT ExtEltVT = ExtVecVT.getVectorElementType();
SDValue ExtOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, ExtVecVT,
DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVecVT, Vec),
DAG.getNode(ISD::ZERO_EXTEND, dl, ExtEltVT, Elt), Idx);
@@ -14367,10 +14778,24 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
}
unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
unsigned NumElems = VecVT.getVectorNumElements();
- if(Vec.isUndef()) {
+ // If the kshift instructions of the correct width aren't natively supported
+ // then we need to promote the vector to the native size to get the correct
+ // zeroing behavior.
+ if ((!Subtarget.hasDQI() && NumElems == 8) || (NumElems < 8)) {
+ // Need to promote to v16i1, do the insert, then extract back.
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v16i1,
+ DAG.getUNDEF(MVT::v16i1), Vec,
+ DAG.getIntPtrConstant(0, dl));
+ Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i1, Vec, Elt, Idx);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VecVT, Op,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecVT, Elt);
+
+ if (Vec.isUndef()) {
if (IdxVal)
EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
@@ -14393,25 +14818,33 @@ X86TargetLowering::InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
// Insertion of one bit into last position
- if (IdxVal == NumElems -1) {
+ if (IdxVal == NumElems - 1) {
// Move the bit to the last position inside the vector.
EltInVec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, EltInVec,
DAG.getConstant(IdxVal, dl, MVT::i8));
// Clean the last bit in the source vector.
Vec = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Vec,
- DAG.getConstant(1, dl, MVT::i8));
+ DAG.getConstant(1, dl, MVT::i8));
Vec = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
- DAG.getConstant(1 , dl, MVT::i8));
+ DAG.getConstant(1, dl, MVT::i8));
return DAG.getNode(ISD::OR, dl, VecVT, Vec, EltInVec);
}
- // Use shuffle to insert element.
- SmallVector<int, 64> MaskVec(NumElems);
- for (unsigned i = 0; i != NumElems; ++i)
- MaskVec[i] = (i == IdxVal) ? NumElems : i;
-
- return DAG.getVectorShuffle(VecVT, dl, Vec, EltInVec, MaskVec);
+ // Move the current value of the bit to be replaced to bit 0.
+ SDValue Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Vec,
+ DAG.getConstant(IdxVal, dl, MVT::i8));
+ // Xor with the new bit.
+ Merged = DAG.getNode(ISD::XOR, dl, VecVT, Merged, EltInVec);
+ // Shift to MSB, filling bottom bits with 0.
+ Merged = DAG.getNode(X86ISD::KSHIFTL, dl, VecVT, Merged,
+ DAG.getConstant(NumElems - 1, dl, MVT::i8));
+ // Shift to the final position, filling upper bits with 0.
+ Merged = DAG.getNode(X86ISD::KSHIFTR, dl, VecVT, Merged,
+ DAG.getConstant(NumElems - 1 - IdxVal, dl, MVT::i8));
+ // Xor with original vector to cancel out the original bit value that's still
+ // present.
+ return DAG.getNode(ISD::XOR, dl, VecVT, Merged, Vec);
}
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
@@ -14421,7 +14854,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
unsigned NumElts = VT.getVectorNumElements();
if (EltVT == MVT::i1)
- return InsertBitToMaskVector(Op, DAG);
+ return InsertBitToMaskVector(Op, DAG, Subtarget);
SDLoc dl(Op);
SDValue N0 = Op.getOperand(0);
@@ -14444,7 +14877,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
for (unsigned i = 0; i != NumElts; ++i)
BlendMask.push_back(i == IdxVal ? i + NumElts : i);
SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
- : DAG.getConstant(-1, dl, VT);
+ : getOnesVector(VT, DAG, dl);
return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
}
@@ -14513,7 +14946,7 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Bits [3:0] of the constant are the zero mask. The DAG Combiner may
// combine either bitwise AND or insert of float 0.0 to set these bits.
- bool MinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
+ bool MinSize = DAG.getMachineFunction().getFunction().optForMinSize();
if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
// If this is an insertion of 32-bits into the low 32-bits of
// a vector, we prefer to generate a blend with immediate rather
@@ -14574,48 +15007,6 @@ static SDValue LowerSCALAR_TO_VECTOR(SDValue Op, const X86Subtarget &Subtarget,
OpVT, DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, AnyExt));
}
-// Lower a node with an EXTRACT_SUBVECTOR opcode. This may result in
-// a simple subregister reference or explicit instructions to grab
-// upper bits of a vector.
-static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
- assert(Subtarget.hasAVX() && "EXTRACT_SUBVECTOR requires AVX");
-
- SDLoc dl(Op);
- SDValue In = Op.getOperand(0);
- SDValue Idx = Op.getOperand(1);
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- MVT ResVT = Op.getSimpleValueType();
-
- // When v1i1 is legal a scalarization of a vselect with a vXi1 Cond
- // would result with: v1i1 = extract_subvector(vXi1, idx).
- // Lower these into extract_vector_elt which is already selectable.
- if (ResVT == MVT::v1i1) {
- assert(Subtarget.hasAVX512() &&
- "Boolean EXTRACT_SUBVECTOR requires AVX512");
-
- MVT EltVT = ResVT.getVectorElementType();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT LegalVT =
- (TLI.getTypeToTransformTo(*DAG.getContext(), EltVT)).getSimpleVT();
- SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LegalVT, In, Idx);
- return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ResVT, Res);
- }
-
- assert((In.getSimpleValueType().is256BitVector() ||
- In.getSimpleValueType().is512BitVector()) &&
- "Can only extract from 256-bit or 512-bit vectors");
-
- // If the input is a buildvector just emit a smaller one.
- unsigned ElemsPerChunk = ResVT.getVectorNumElements();
- if (In.getOpcode() == ISD::BUILD_VECTOR)
- return DAG.getBuildVector(
- ResVT, dl, makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
-
- // Everything else is legal.
- return Op;
-}
-
// Lower a node with an INSERT_SUBVECTOR opcode. This may result in a
// simple superregister reference or explicit instructions to insert
// the upper bits of a vector.
@@ -14696,7 +15087,7 @@ X86TargetLowering::LowerExternalSymbol(SDValue Op, SelectionDAG &DAG) const {
// In PIC mode (unless we're in RIPRel PIC mode) we add an offset to the
// global base reg.
- const Module *Mod = DAG.getMachineFunction().getFunction()->getParent();
+ const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
unsigned char OpFlag = Subtarget.classifyGlobalReference(nullptr, *Mod);
auto PtrVT = getPointerTy(DAG.getDataLayout());
@@ -15516,24 +15907,12 @@ SDValue X86TargetLowering::lowerUINT_TO_FP_vec(SDValue Op,
switch (SrcVT.SimpleTy) {
default:
llvm_unreachable("Custom UINT_TO_FP is not supported!");
- case MVT::v4i8:
- case MVT::v4i16:
- case MVT::v8i8:
- case MVT::v8i16: {
- MVT NVT = MVT::getVectorVT(MVT::i32, SrcVT.getVectorNumElements());
- return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(),
- DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, N0));
- }
case MVT::v2i32:
return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
case MVT::v4i32:
case MVT::v8i32:
+ assert(!Subtarget.hasAVX512());
return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
- case MVT::v16i8:
- case MVT::v16i16:
- assert(Subtarget.hasAVX512());
- return DAG.getNode(ISD::UINT_TO_FP, dl, Op.getValueType(),
- DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v16i32, N0));
}
}
@@ -15543,12 +15922,6 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
- // Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
- // optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
- // the optimization here.
- if (DAG.SignBitIsZero(N0))
- return DAG.getNode(ISD::SINT_TO_FP, dl, Op.getValueType(), N0);
-
if (Op.getSimpleValueType().isVector())
return lowerUINT_TO_FP_vec(Op, DAG);
@@ -15827,8 +16200,18 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
- if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
- return DAG.getNode(ISD::ZERO_EXTEND, dl, VT, In);
+ if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
+ (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
+ (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
+ (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
+ (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
+ (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
+ (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
+ (VT != MVT::v32i16 || InVT != MVT::v32i8))
+ return SDValue();
+
+ if (Subtarget.hasInt256())
+ return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
// Optimize vectors in AVX mode:
//
@@ -15843,14 +16226,6 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
// Concat upper and lower parts.
//
- if (((VT != MVT::v16i16) || (InVT != MVT::v16i8)) &&
- ((VT != MVT::v8i32) || (InVT != MVT::v8i16)) &&
- ((VT != MVT::v4i64) || (InVT != MVT::v4i32)))
- return SDValue();
-
- if (Subtarget.hasInt256())
- return DAG.getNode(X86ISD::VZEXT, dl, VT, In);
-
SDValue ZeroVec = getZeroVector(InVT, Subtarget, DAG, dl);
SDValue Undef = DAG.getUNDEF(InVT);
bool NeedZero = Op.getOpcode() == ISD::ZERO_EXTEND;
@@ -15866,39 +16241,60 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
}
-static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
- const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+static SDValue LowerZERO_EXTEND_Mask(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
+ assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
SDLoc DL(Op);
unsigned NumElts = VT.getVectorNumElements();
- if (VT.is512BitVector() && InVT.getVectorElementType() != MVT::i1 &&
- (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI()))
- return DAG.getNode(X86ISD::VZEXT, DL, VT, In);
+ // Extend VT if the scalar type is i8/i16 and BWI is not supported.
+ MVT ExtVT = VT;
+ if (!Subtarget.hasBWI() &&
+ (VT.getVectorElementType().getSizeInBits() <= 16))
+ ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
+
+ // Widen to 512-bits if VLX is not supported.
+ MVT WideVT = ExtVT;
+ if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
+ NumElts *= 512 / ExtVT.getSizeInBits();
+ InVT = MVT::getVectorVT(MVT::i1, NumElts);
+ In = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT, DAG.getUNDEF(InVT),
+ In, DAG.getIntPtrConstant(0, DL));
+ WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(),
+ NumElts);
+ }
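+ // For example, without BWI/VLX a v8i1 -> v8i16 zero extend is performed as
+ // a v16i1 -> v16i32 select below, truncated to v16i16, and then the low
+ // v8i16 subvector is extracted.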
- if (InVT.getVectorElementType() != MVT::i1)
- return SDValue();
+ SDValue One = DAG.getConstant(1, DL, WideVT);
+ SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, DL);
- // Extend VT if the target is 256 or 128bit vector and VLX is not supported.
- MVT ExtVT = VT;
- if (!VT.is512BitVector() && !Subtarget.hasVLX())
- ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+ SDValue SelectedVal = DAG.getSelect(DL, WideVT, In, One, Zero);
- SDValue One =
- DAG.getConstant(APInt(ExtVT.getScalarSizeInBits(), 1), DL, ExtVT);
- SDValue Zero =
- DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
+ // Truncate if we had to extend i16/i8 above.
+ if (VT != ExtVT) {
+ WideVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
+ SelectedVal = DAG.getNode(X86ISD::VTRUNC, DL, WideVT, SelectedVal);
+ }
+
+ // Extract back to 128/256-bit if we widened.
+ if (WideVT != VT)
+ SelectedVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SelectedVal,
+ DAG.getIntPtrConstant(0, DL));
- SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
- if (VT == ExtVT)
- return SelectedVal;
- return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
+ return SelectedVal;
}
static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
+ SDValue In = Op->getOperand(0);
+ MVT InVT = In.getSimpleValueType();
+
+ if (InVT.getVectorElementType() == MVT::i1)
+ return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
+
if (Subtarget.hasFp256())
if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
return Res;
@@ -15908,32 +16304,33 @@ static SDValue LowerANY_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
static SDValue LowerZERO_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDLoc DL(Op);
- MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT SVT = In.getSimpleValueType();
- if (VT.is512BitVector() || SVT.getVectorElementType() == MVT::i1)
- return LowerZERO_EXTEND_AVX512(Op, Subtarget, DAG);
+ if (SVT.getVectorElementType() == MVT::i1)
+ return LowerZERO_EXTEND_Mask(Op, Subtarget, DAG);
if (Subtarget.hasFp256())
if (SDValue Res = LowerAVXExtend(Op, DAG, Subtarget))
return Res;
- assert(!VT.is256BitVector() || !SVT.is128BitVector() ||
- VT.getVectorNumElements() != SVT.getVectorNumElements());
+ assert(!Op.getSimpleValueType().is256BitVector() || !SVT.is128BitVector() ||
+ Op.getSimpleValueType().getVectorNumElements() !=
+ SVT.getVectorNumElements());
return SDValue();
}
-/// Helper to recursively truncate vector elements in half with PACKSS.
-/// It makes use of the fact that vector comparison results will be all-zeros
-/// or all-ones to use (vXi8 PACKSS(vYi16, vYi16)) instead of matching types.
-/// AVX2 (Int256) sub-targets require extra shuffling as the PACKSS operates
+/// Helper to recursively truncate vector elements in half with PACKSS/PACKUS.
+/// It makes use of the fact that vectors with enough leading sign/zero bits
+/// prevent the PACKSS/PACKUS from saturating the results.
+/// AVX2 (Int256) sub-targets require extra shuffling as the PACK*S operates
/// within each 128-bit lane.
-static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
- const SDLoc &DL,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
+ const SDLoc &DL, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((Opcode == X86ISD::PACKSS || Opcode == X86ISD::PACKUS) &&
+ "Unexpected PACK opcode");
+
// Requires SSE2 but AVX512 has fast truncate.
if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
return SDValue();
@@ -15946,40 +16343,52 @@ static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
// We only support vector truncation to 128bits or greater from a
// 256bits or greater source.
- if ((DstVT.getSizeInBits() % 128) != 0)
- return SDValue();
- if ((SrcVT.getSizeInBits() % 256) != 0)
+ unsigned DstSizeInBits = DstVT.getSizeInBits();
+ unsigned SrcSizeInBits = SrcVT.getSizeInBits();
+ if ((DstSizeInBits % 128) != 0 || (SrcSizeInBits % 256) != 0)
return SDValue();
+ LLVMContext &Ctx = *DAG.getContext();
unsigned NumElems = SrcVT.getVectorNumElements();
assert(DstVT.getVectorNumElements() == NumElems && "Illegal truncation");
- assert(SrcVT.getSizeInBits() > DstVT.getSizeInBits() && "Illegal truncation");
+ assert(SrcSizeInBits > DstSizeInBits && "Illegal truncation");
- EVT PackedSVT =
- EVT::getIntegerVT(*DAG.getContext(), SrcVT.getScalarSizeInBits() / 2);
+ EVT PackedSVT = EVT::getIntegerVT(Ctx, SrcVT.getScalarSizeInBits() / 2);
// Extract lower/upper subvectors.
unsigned NumSubElts = NumElems / 2;
- unsigned SrcSizeInBits = SrcVT.getSizeInBits();
SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
- // 256bit -> 128bit truncate - PACKSS lower/upper 128-bit subvectors.
+ // Pack to the largest type possible:
+ // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
+ EVT InVT = MVT::i16, OutVT = MVT::i8;
+ if (DstVT.getScalarSizeInBits() > 8 &&
+ (Opcode == X86ISD::PACKSS || Subtarget.hasSSE41())) {
+ InVT = MVT::i32;
+ OutVT = MVT::i16;
+ }
+
+ unsigned SubSizeInBits = SrcSizeInBits / 2;
+ InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
+ OutVT = EVT::getVectorVT(Ctx, OutVT, SubSizeInBits / OutVT.getSizeInBits());
+
+ // 256bit -> 128bit truncate - PACK lower/upper 128-bit subvectors.
if (SrcVT.is256BitVector()) {
- Lo = DAG.getBitcast(MVT::v8i16, Lo);
- Hi = DAG.getBitcast(MVT::v8i16, Hi);
- SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, Lo, Hi);
+ Lo = DAG.getBitcast(InVT, Lo);
+ Hi = DAG.getBitcast(InVT, Hi);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
return DAG.getBitcast(DstVT, Res);
}
- // AVX2: 512bit -> 256bit truncate - PACKSS lower/upper 256-bit subvectors.
- // AVX2: 512bit -> 128bit truncate - PACKSS(PACKSS, PACKSS).
+ // AVX2: 512bit -> 256bit truncate - PACK lower/upper 256-bit subvectors.
+ // AVX2: 512bit -> 128bit truncate - PACK(PACK, PACK).
if (SrcVT.is512BitVector() && Subtarget.hasInt256()) {
- Lo = DAG.getBitcast(MVT::v16i16, Lo);
- Hi = DAG.getBitcast(MVT::v16i16, Hi);
- SDValue Res = DAG.getNode(X86ISD::PACKSS, DL, MVT::v32i8, Lo, Hi);
+ Lo = DAG.getBitcast(InVT, Lo);
+ Hi = DAG.getBitcast(InVT, Hi);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, Lo, Hi);
- // 256-bit PACKSS(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
+ // 256-bit PACK(ARG0, ARG1) leaves us with ((LO0,LO1),(HI0,HI1)),
// so we need to shuffle to get ((LO0,HI0),(LO1,HI1)).
Res = DAG.getBitcast(MVT::v4i64, Res);
Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, {0, 2, 1, 3});
@@ -15988,20 +16397,20 @@ static SDValue truncateVectorCompareWithPACKSS(EVT DstVT, SDValue In,
return DAG.getBitcast(DstVT, Res);
// If 512bit -> 128bit truncate another stage.
- EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
+ EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
Res = DAG.getBitcast(PackedVT, Res);
- return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
+ return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
// Recursively pack lower/upper subvectors, concat result and pack again.
- assert(SrcVT.getSizeInBits() >= 512 && "Expected 512-bit vector or greater");
- EVT PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems / 2);
- Lo = truncateVectorCompareWithPACKSS(PackedVT, Lo, DL, DAG, Subtarget);
- Hi = truncateVectorCompareWithPACKSS(PackedVT, Hi, DL, DAG, Subtarget);
+ assert(SrcSizeInBits >= 512 && "Expected 512-bit vector or greater");
+ EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
+ Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
+ Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
- PackedVT = EVT::getVectorVT(*DAG.getContext(), PackedSVT, NumElems);
+ PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems);
SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, PackedVT, Lo, Hi);
- return truncateVectorCompareWithPACKSS(DstVT, Res, DL, DAG, Subtarget);
+ return truncateVectorWithPACK(Opcode, DstVT, Res, DL, DAG, Subtarget);
}
static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
@@ -16047,15 +16456,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
MVT InVT = In.getSimpleValueType();
+ unsigned InNumEltBits = InVT.getScalarSizeInBits();
- if (VT == MVT::i1) {
- assert((InVT.isInteger() && (InVT.getSizeInBits() <= 64)) &&
- "Invalid scalar TRUNCATE operation");
- if (InVT.getSizeInBits() >= 32)
- return SDValue();
- In = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, In);
- return DAG.getNode(ISD::TRUNCATE, DL, VT, In);
- }
assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Invalid TRUNCATE operation");
@@ -16071,9 +16473,23 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
}
- // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
- if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
- if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
+ // Truncate with PACKSS if we are truncating a vector with sign-bits that
+ // extend all the way to the packed/truncated value.
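+ // For example, a v8i32 -> v8i16 truncation can use PACKSSDW if each element
+ // has at least 17 sign bits: the value then fits in i16, so the signed
+ // saturation in PACKSS returns it unchanged.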
+ unsigned NumPackedBits = std::min<unsigned>(VT.getScalarSizeInBits(), 16);
+ if ((InNumEltBits - NumPackedBits) < DAG.ComputeNumSignBits(In))
+ if (SDValue V =
+ truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget))
+ return V;
+
+ // Truncate with PACKUS if we are truncating a vector with leading zero bits
+ // that extend all the way to the packed/truncated value.
+ // Pre-SSE41 we can only use PACKUSWB.
+ KnownBits Known;
+ DAG.computeKnownBits(In, Known);
+ NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
+ if ((InNumEltBits - NumPackedBits) <= Known.countMinLeadingZeros())
+ if (SDValue V =
+ truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget))
return V;
if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
@@ -16579,16 +16995,11 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
// non-casted variable when we check for possible users.
switch (ArithOp.getOpcode()) {
case ISD::ADD:
- // Due to an isel shortcoming, be conservative if this add is likely to be
- // selected as part of a load-modify-store instruction. When the root node
- // in a match is a store, isel doesn't know how to remap non-chain non-flag
- // uses of other nodes in the match, such as the ADD in this case. This
- // leads to the ADD being left around and reselected, with the result being
- // two adds in the output. Alas, even if none our users are stores, that
- // doesn't prove we're O.K. Ergo, if we have any parents that aren't
- // CopyToReg or SETCC, eschew INC/DEC. A better fix seems to require
- // climbing the DAG back to the root, and it doesn't seem to be worth the
- // effort.
+ // We only want to rewrite this as a target-specific node with attached
+ // flags if there is a reasonable chance of either using that to do custom
+ // instruction selection that can fold some of the memory operands, or if
+ // only the flags are used. If there are other uses, leave the node alone
+ // and emit a test instruction.
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
UE = Op.getNode()->use_end(); UI != UE; ++UI)
if (UI->getOpcode() != ISD::CopyToReg &&
@@ -16596,17 +17007,20 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
UI->getOpcode() != ISD::STORE)
goto default_case;
- if (ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
+ if (auto *C = dyn_cast<ConstantSDNode>(ArithOp.getOperand(1))) {
// An add of one will be selected as an INC.
- if (C->isOne() && !Subtarget.slowIncDec()) {
+ if (C->isOne() &&
+ (!Subtarget.slowIncDec() ||
+ DAG.getMachineFunction().getFunction().optForSize())) {
Opcode = X86ISD::INC;
NumOperands = 1;
break;
}
// An add of negative one (subtract of one) will be selected as a DEC.
- if (C->isAllOnesValue() && !Subtarget.slowIncDec()) {
+ if (C->isAllOnesValue() &&
+ (!Subtarget.slowIncDec() ||
+ DAG.getMachineFunction().getFunction().optForSize())) {
Opcode = X86ISD::DEC;
NumOperands = 1;
break;
@@ -16699,11 +17113,13 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
- // Due to the ISEL shortcoming noted above, be conservative if this op is
- // likely to be selected as part of a load-modify-store instruction.
+ // Similar to ISD::ADD above, check if the uses will preclude useful
+ // lowering of the target-specific node.
for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
UE = Op.getNode()->use_end(); UI != UE; ++UI)
- if (UI->getOpcode() == ISD::STORE)
+ if (UI->getOpcode() != ISD::CopyToReg &&
+ UI->getOpcode() != ISD::SETCC &&
+ UI->getOpcode() != ISD::STORE)
goto default_case;
// Otherwise use a regular EFLAGS-setting instruction.
@@ -16799,7 +17215,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
// with an immediate. 16 bit immediates are to be avoided.
if ((Op0.getValueType() == MVT::i16 &&
(isa<ConstantSDNode>(Op0) || isa<ConstantSDNode>(Op1))) &&
- !DAG.getMachineFunction().getFunction()->optForMinSize() &&
+ !DAG.getMachineFunction().getFunction().optForMinSize() &&
!Subtarget.isAtom()) {
unsigned ExtendOp =
isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
@@ -16808,8 +17224,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
}
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
- SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs,
- Op0, Op1);
+ SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
return SDValue(Sub.getNode(), 1);
}
return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
@@ -16871,8 +17286,11 @@ SDValue X86TargetLowering::getSqrtEstimate(SDValue Op,
// instructions: convert to single, rsqrtss, convert back to double, refine
// (3 steps = at least 13 insts). If an 'rsqrtsd' variant was added to the ISA
// along with FMA, this could be a throughput win.
+ // TODO: SQRT requires SSE2 to prevent the introduction of an illegal v4i32
+ // after legalize types.
if ((VT == MVT::f32 && Subtarget.hasSSE1()) ||
- (VT == MVT::v4f32 && Subtarget.hasSSE1()) ||
+ (VT == MVT::v4f32 && Subtarget.hasSSE1() && Reciprocal) ||
+ (VT == MVT::v4f32 && Subtarget.hasSSE2() && !Reciprocal) ||
(VT == MVT::v8f32 && Subtarget.hasAVX())) {
if (RefinementSteps == ReciprocalEstimate::Unspecified)
RefinementSteps = 1;
@@ -16965,6 +17383,7 @@ static SDValue getBitTestCondition(SDValue Src, SDValue BitNo, ISD::CondCode CC,
/// Result of 'and' is compared against zero. Change to a BT node if possible.
static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
+ assert(And.getOpcode() == ISD::AND && "Expected AND node!");
SDValue Op0 = And.getOperand(0);
SDValue Op1 = And.getOperand(1);
if (Op0.getOpcode() == ISD::TRUNCATE)
@@ -17013,36 +17432,10 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
return SDValue();
}
-// Convert (truncate (srl X, N) to i1) to (bt X, N)
-static SDValue LowerTruncateToBT(SDValue Op, ISD::CondCode CC,
- const SDLoc &dl, SelectionDAG &DAG) {
-
- assert(Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1 &&
- "Expected TRUNCATE to i1 node");
-
- if (Op.getOperand(0).getOpcode() != ISD::SRL)
- return SDValue();
-
- SDValue ShiftRight = Op.getOperand(0);
- return getBitTestCondition(ShiftRight.getOperand(0), ShiftRight.getOperand(1),
- CC, dl, DAG);
-}
-
-/// Result of 'and' or 'trunc to i1' is compared against zero.
-/// Change to a BT node if possible.
-SDValue X86TargetLowering::LowerToBT(SDValue Op, ISD::CondCode CC,
- const SDLoc &dl, SelectionDAG &DAG) const {
- if (Op.getOpcode() == ISD::AND)
- return LowerAndToBT(Op, CC, dl, DAG);
- if (Op.getOpcode() == ISD::TRUNCATE && Op.getValueType() == MVT::i1)
- return LowerTruncateToBT(Op, CC, dl, DAG);
- return SDValue();
-}
-
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
-static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
- SDValue &Op1) {
+static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
+ SDValue &Op1) {
unsigned SSECC;
bool Swap = false;
@@ -17075,8 +17468,8 @@ static int translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
case ISD::SETULT: Swap = true; LLVM_FALLTHROUGH;
case ISD::SETUGT: SSECC = 6; break;
case ISD::SETO: SSECC = 7; break;
- case ISD::SETUEQ:
- case ISD::SETONE: SSECC = 8; break;
+ case ISD::SETUEQ: SSECC = 8; break;
+ case ISD::SETONE: SSECC = 12; break;
}
if (Swap)
std::swap(Op0, Op1);
@@ -17189,6 +17582,20 @@ static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
if (Swap)
std::swap(Op0, Op1);
+
+ // See if it is the case of CMP(EQ|NEQ,AND(A,B),ZERO) and change it to TESTM|NM.
+ if ((!Opc && SSECC == 4) || Opc == X86ISD::PCMPEQM) {
+ SDValue A = peekThroughBitcasts(Op0);
+ if ((A.getOpcode() == ISD::AND || A.getOpcode() == X86ISD::FAND) &&
+ ISD::isBuildVectorAllZeros(Op1.getNode())) {
+ MVT VT0 = Op0.getSimpleValueType();
+ SDValue RHS = DAG.getBitcast(VT0, A.getOperand(0));
+ SDValue LHS = DAG.getBitcast(VT0, A.getOperand(1));
+ return DAG.getNode(Opc == X86ISD::PCMPEQM ? X86ISD::TESTNM : X86ISD::TESTM,
+ dl, VT, RHS, LHS);
+ }
+ }
+
if (Opc)
return DAG.getNode(Opc, dl, VT, Op0, Op1);
Opc = Unsigned ? X86ISD::CMPMU: X86ISD::CMPM;
@@ -17256,25 +17663,21 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
// emit two comparisons and a logic op to tie them together.
- // TODO: This can be avoided if Intel (and only Intel as of 2016) AVX is
- // available.
SDValue Cmp;
unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
- if (SSECC == 8) {
+ if (SSECC >= 8 && !Subtarget.hasAVX()) {
// LLVM predicate is SETUEQ or SETONE.
unsigned CC0, CC1;
unsigned CombineOpc;
if (Cond == ISD::SETUEQ) {
CC0 = 3; // UNORD
CC1 = 0; // EQ
- CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FOR) :
- static_cast<unsigned>(ISD::OR);
+ CombineOpc = X86ISD::FOR;
} else {
assert(Cond == ISD::SETONE);
CC0 = 7; // ORD
CC1 = 4; // NEQ
- CombineOpc = Opc == X86ISD::CMPP ? static_cast<unsigned>(X86ISD::FAND) :
- static_cast<unsigned>(ISD::AND);
+ CombineOpc = X86ISD::FAND;
}
SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
@@ -17379,6 +17782,24 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
DAG.getConstant(CmpMode, dl, MVT::i8));
}
+ // (X & Y) != 0 --> (X & Y) == Y iff Y is power-of-2.
+ // Revert part of the simplifySetCCWithAnd combine, to avoid an invert.
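+ // With a power-of-2 constant Y, X & Y is either 0 or Y, so e.g.
+ // (X & 8) != 0 can be tested as (X & 8) == 8 with a single PCMPEQ and no
+ // extra invert.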
+ if (Cond == ISD::SETNE && ISD::isBuildVectorAllZeros(Op1.getNode())) {
+ SDValue BC0 = peekThroughBitcasts(Op0);
+ if (BC0.getOpcode() == ISD::AND) {
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+ if (getTargetConstantBitsFromNode(BC0.getOperand(1),
+ VT.getScalarSizeInBits(), UndefElts,
+ EltBits, false, false)) {
+ if (llvm::all_of(EltBits, [](APInt &V) { return V.isPowerOf2(); })) {
+ Cond = ISD::SETEQ;
+ Op1 = DAG.getBitcast(VT, BC0.getOperand(1));
+ }
+ }
+ }
+ }
+
// We are handling one of the integer comparisons here. Since SSE only has
// GT and EQ comparisons for integer, swapping operands and multiple
// operations may be required for some comparisons.
@@ -17399,7 +17820,8 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// Special case: Use min/max operations for SETULE/SETUGE
MVT VET = VT.getVectorElementType();
bool HasMinMax =
- (Subtarget.hasSSE41() && (VET >= MVT::i8 && VET <= MVT::i32)) ||
+ (Subtarget.hasAVX512() && VET == MVT::i64) ||
+ (Subtarget.hasSSE41() && (VET == MVT::i16 || VET == MVT::i32)) ||
(Subtarget.hasSSE2() && (VET == MVT::i8));
bool MinMax = false;
if (HasMinMax) {
@@ -17560,14 +17982,10 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
// Lower ((X >>s N) & 1) != 0 to BT(X, N).
- // Lower (trunc (X >> N) to i1) to BT(X, N).
- if (Op0.hasOneUse() && isNullConstant(Op1) &&
+ if (Op0.getOpcode() == ISD::AND && Op0.hasOneUse() && isNullConstant(Op1) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
- if (SDValue NewSetCC = LowerToBT(Op0, CC, dl, DAG)) {
- if (VT == MVT::i1)
- return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, NewSetCC);
+ if (SDValue NewSetCC = LowerAndToBT(Op0, CC, dl, DAG))
return NewSetCC;
- }
}
// Look for X == 0, X == 1, X != 0, or X != 1. We can simplify some forms of
@@ -17584,20 +18002,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
return Op0;
CCode = X86::GetOppositeBranchCondition(CCode);
- SDValue SetCC = getSETCC(CCode, Op0.getOperand(1), dl, DAG);
- if (VT == MVT::i1)
- return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
- return SetCC;
- }
- }
- if (Op0.getValueType() == MVT::i1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
- if (isOneConstant(Op1)) {
- ISD::CondCode NewCC = ISD::getSetCCInverse(CC, true);
- return DAG.getSetCC(dl, VT, Op0, DAG.getConstant(0, dl, MVT::i1), NewCC);
- }
- if (!isNullConstant(Op1)) {
- SDValue Xor = DAG.getNode(ISD::XOR, dl, MVT::i1, Op0, Op1);
- return DAG.getSetCC(dl, VT, Xor, DAG.getConstant(0, dl, MVT::i1), CC);
+ return getSETCC(CCode, Op0.getOperand(1), dl, DAG);
}
}
@@ -17608,10 +18013,7 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
SDValue EFLAGS = EmitCmp(Op0, Op1, X86CC, dl, DAG);
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
- SDValue SetCC = getSETCC(X86CC, EFLAGS, dl, DAG);
- if (VT == MVT::i1)
- return DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, SetCC);
- return SetCC;
+ return getSETCC(X86CC, EFLAGS, dl, DAG);
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
@@ -17632,10 +18034,7 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
- SDValue SetCC = getSETCC(CC, Cmp.getValue(1), DL, DAG);
- if (Op.getSimpleValueType() == MVT::i1)
- return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SetCC);
- return SetCC;
+ return getSETCC(CC, Cmp.getValue(1), DL, DAG);
}
/// Return true if opcode is a X86 logical comparison.
@@ -17646,7 +18045,7 @@ static bool isX86LogicalCmp(SDValue Op) {
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
- Opc == X86ISD::SBB || Opc == X86ISD::SMUL || Opc == X86ISD::UMUL ||
+ Opc == X86ISD::SBB || Opc == X86ISD::SMUL ||
Opc == X86ISD::INC || Opc == X86ISD::DEC || Opc == X86ISD::OR ||
Opc == X86ISD::XOR || Opc == X86ISD::AND))
return true;
@@ -17684,17 +18083,17 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
(Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
- int SSECC = translateX86FSETCC(
+ unsigned SSECC = translateX86FSETCC(
cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
- if (SSECC != 8) {
- if (Subtarget.hasAVX512()) {
- SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
- CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
- return DAG.getNode(VT.isVector() ? X86ISD::SELECT : X86ISD::SELECTS,
- DL, VT, Cmp, Op1, Op2);
- }
+ if (Subtarget.hasAVX512()) {
+ SDValue Cmp = DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CondOp0,
+ CondOp1, DAG.getConstant(SSECC, DL, MVT::i8));
+ assert(!VT.isVector() && "Not a scalar type?");
+ return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
+ }
+ if (SSECC < 8 || Subtarget.hasAVX()) {
SDValue Cmp = DAG.getNode(X86ISD::FSETCC, DL, VT, CondOp0, CondOp1,
DAG.getConstant(SSECC, DL, MVT::i8));
@@ -17941,7 +18340,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// We know the result of AND is compared against zero. Try to match
// it to BT.
if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
- if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, DL, DAG)) {
+ if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, DL, DAG)) {
CC = NewSetCC.getOperand(0);
Cond = NewSetCC.getOperand(1);
AddTest = false;
@@ -17983,66 +18382,68 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (T1.getValueType() == T2.getValueType() &&
// Blacklist CopyFromReg to avoid partial register stalls.
T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
- SDVTList VTs = DAG.getVTList(T1.getValueType(), MVT::Glue);
- SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, VTs, T2, T1, CC, Cond);
+ SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
+ CC, Cond);
return DAG.getNode(ISD::TRUNCATE, DL, Op.getValueType(), Cmov);
}
}
// X86ISD::CMOV means set the result (which is operand 1) to the RHS if
// condition is true.
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Glue);
SDValue Ops[] = { Op2, Op1, CC, Cond };
- return DAG.getNode(X86ISD::CMOV, DL, VTs, Ops);
+ return DAG.getNode(X86ISD::CMOV, DL, Op.getValueType(), Ops);
}
-static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
- const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue LowerSIGN_EXTEND_Mask(SDValue Op,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
MVT VT = Op->getSimpleValueType(0);
SDValue In = Op->getOperand(0);
MVT InVT = In.getSimpleValueType();
+ assert(InVT.getVectorElementType() == MVT::i1 && "Unexpected input type!");
MVT VTElt = VT.getVectorElementType();
- MVT InVTElt = InVT.getVectorElementType();
SDLoc dl(Op);
- // SKX processor
- if ((InVTElt == MVT::i1) &&
- (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
-
- ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
-
- return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
-
unsigned NumElts = VT.getVectorNumElements();
- if (VT.is512BitVector() && InVTElt != MVT::i1 &&
- (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
- if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
- return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
- return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
- }
-
- if (InVTElt != MVT::i1)
- return SDValue();
-
+ // Extend VT if the scalar type is i8/i16 and BWI is not supported.
MVT ExtVT = VT;
- if (!VT.is512BitVector() && !Subtarget.hasVLX())
- ExtVT = MVT::getVectorVT(MVT::getIntegerVT(512/NumElts), NumElts);
+ if (!Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)
+ ExtVT = MVT::getVectorVT(MVT::i32, NumElts);
+
+ // Widen to 512-bits if VLX is not supported.
+ MVT WideVT = ExtVT;
+ if (!ExtVT.is512BitVector() && !Subtarget.hasVLX()) {
+ NumElts *= 512 / ExtVT.getSizeInBits();
+ InVT = MVT::getVectorVT(MVT::i1, NumElts);
+ In = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, InVT, DAG.getUNDEF(InVT),
+ In, DAG.getIntPtrConstant(0, dl));
+ WideVT = MVT::getVectorVT(ExtVT.getVectorElementType(), NumElts);
+ }
SDValue V;
- if (Subtarget.hasDQI()) {
- V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
- assert(!VT.is512BitVector() && "Unexpected vector type");
+ MVT WideEltVT = WideVT.getVectorElementType();
+ if ((Subtarget.hasDQI() && WideEltVT.getSizeInBits() >= 32) ||
+ (Subtarget.hasBWI() && WideEltVT.getSizeInBits() <= 16)) {
+ V = getExtendInVec(X86ISD::VSEXT, dl, WideVT, In, DAG);
} else {
- SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
- SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
- V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
- if (ExtVT == VT)
- return V;
+ SDValue NegOne = getOnesVector(WideVT, DAG, dl);
+ SDValue Zero = getZeroVector(WideVT, Subtarget, DAG, dl);
+ V = DAG.getSelect(dl, WideVT, In, NegOne, Zero);
}
- return DAG.getNode(X86ISD::VTRUNC, dl, VT, V);
+ // Truncate if we had to extend i16/i8 above.
+ if (VT != ExtVT) {
+ WideVT = MVT::getVectorVT(VTElt, NumElts);
+ V = DAG.getNode(X86ISD::VTRUNC, dl, WideVT, V);
+ }
+
+ // Extract back to 128/256-bit if we widened.
+ if (WideVT != VT)
+ V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, V,
+ DAG.getIntPtrConstant(0, dl));
+
+ return V;
}
// Lowering for SIGN_EXTEND_VECTOR_INREG and ZERO_EXTEND_VECTOR_INREG.
@@ -18139,12 +18540,17 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
MVT InVT = In.getSimpleValueType();
SDLoc dl(Op);
- if (VT.is512BitVector() || InVT.getVectorElementType() == MVT::i1)
- return LowerSIGN_EXTEND_AVX512(Op, Subtarget, DAG);
+ if (InVT.getVectorElementType() == MVT::i1)
+ return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
- if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
- (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
- (VT != MVT::v16i16 || InVT != MVT::v16i8))
+ if ((VT != MVT::v4i64 || InVT != MVT::v4i32) &&
+ (VT != MVT::v8i32 || InVT != MVT::v8i16) &&
+ (VT != MVT::v16i16 || InVT != MVT::v16i8) &&
+ (VT != MVT::v8i64 || InVT != MVT::v8i32) &&
+ (VT != MVT::v8i64 || InVT != MVT::v8i16) &&
+ (VT != MVT::v16i32 || InVT != MVT::v16i16) &&
+ (VT != MVT::v16i32 || InVT != MVT::v16i8) &&
+ (VT != MVT::v32i16 || InVT != MVT::v32i8))
return SDValue();
if (Subtarget.hasInt256())
@@ -18311,13 +18717,10 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
assert(VT == MVT::v32i8 && "Unexpected extload type");
- SmallVector<SDValue, 2> Chains;
-
SDValue BasePtr = Ld->getBasePtr();
SDValue LoadLo = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
Ld->getBasePtr(),
Ld->getMemOperand());
- Chains.push_back(LoadLo.getValue(1));
SDValue BasePtrHi =
DAG.getNode(ISD::ADD, dl, BasePtr.getValueType(), BasePtr,
@@ -18326,8 +18729,9 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
SDValue LoadHi = DAG.getLoad(MVT::v16i1, dl, Ld->getChain(),
BasePtrHi,
Ld->getMemOperand());
- Chains.push_back(LoadHi.getValue(1));
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ LoadLo.getValue(1), LoadHi.getValue(1));
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), NewChain);
SDValue Lo = DAG.getNode(ExtOpcode, dl, MVT::v16i8, LoadLo);
@@ -18443,6 +18847,12 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
if (Ext == ISD::SEXTLOAD && RegSz >= 256)
loadRegZize = 128;
+ // If we don't have BWI we won't be able to create the shuffle needed for
+ // v8i8->v8i64.
+ if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
+ MemVT == MVT::v8i8)
+ loadRegZize = 128;
+
// Represent our vector as a sequence of elements which are the
// largest scalar that we can load.
EVT LoadUnitVecVT = EVT::getVectorVT(
@@ -18509,6 +18919,13 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
return Shuff;
}
+ if (Ext == ISD::EXTLOAD && !Subtarget.hasBWI() && RegVT == MVT::v8i64 &&
+ MemVT == MVT::v8i8) {
+ SDValue ZExt = getExtendInVec(X86ISD::VZEXT, dl, RegVT, SlicedVec, DAG);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
+ return ZExt;
+ }
+
// Redistribute the loaded elements into the different locations.
SmallVector<int, 16> ShuffleVec(NumElems * SizeRatio, -1);
for (unsigned i = 0; i != NumElems; ++i)
@@ -18796,9 +19213,10 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (isTruncWithZeroHighBitsInput(Cond, DAG))
Cond = Cond.getOperand(0);
- // We know the result is compared against zero. Try to match it to BT.
- if (Cond.hasOneUse()) {
- if (SDValue NewSetCC = LowerToBT(Cond, ISD::SETNE, dl, DAG)) {
+ // We know the result of AND is compared against zero. Try to match
+ // it to BT.
+ if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
+ if (SDValue NewSetCC = LowerAndToBT(Cond, ISD::SETNE, dl, DAG)) {
CC = NewSetCC.getOperand(0);
Cond = NewSetCC.getOperand(1);
addTest = false;
@@ -18867,8 +19285,8 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
if (Is64Bit) {
// The 64 bit implementation of segmented stacks needs to clobber both r10
// r11. This makes it impossible to use it along with nested parameters.
- const Function *F = MF.getFunction();
- for (const auto &A : F->args()) {
+ const Function &F = MF.getFunction();
+ for (const auto &A : F.args()) {
if (A.hasNestAttr())
report_fatal_error("Cannot use segmented stacks with functions that "
"have nested arguments.");
@@ -18915,7 +19333,7 @@ SDValue X86TargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
if (!Subtarget.is64Bit() ||
- Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv())) {
+ Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv())) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
@@ -18969,7 +19387,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getNumOperands() == 4);
MachineFunction &MF = DAG.getMachineFunction();
- if (Subtarget.isCallingConvWin64(MF.getFunction()->getCallingConv()))
+ if (Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv()))
// The Win64 ABI uses char* instead of a structure.
return DAG.expandVAArg(Op.getNode());
@@ -19000,7 +19418,7 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
if (ArgMode == 2) {
// Sanity Check: Make sure using fp_offset makes sense.
assert(!Subtarget.useSoftFloat() &&
- !(MF.getFunction()->hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ !(MF.getFunction().hasFnAttribute(Attribute::NoImplicitFloat)) &&
Subtarget.hasSSE1());
}
@@ -19010,13 +19428,12 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(ArgMode, dl, MVT::i8),
DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
- SDValue VAARG = DAG.getMemIntrinsicNode(X86ISD::VAARG_64, dl,
- VTs, InstOps, MVT::i64,
- MachinePointerInfo(SV),
- /*Align=*/0,
- /*Volatile=*/false,
- /*ReadMem=*/true,
- /*WriteMem=*/true);
+ SDValue VAARG = DAG.getMemIntrinsicNode(
+ X86ISD::VAARG_64, dl,
+ VTs, InstOps, MVT::i64,
+ MachinePointerInfo(SV),
+ /*Align=*/0,
+ MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
@@ -19029,7 +19446,7 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
// where a va_list is still an i8*.
assert(Subtarget.is64Bit() && "This code only handles 64-bit va_copy!");
if (Subtarget.isCallingConvWin64(
- DAG.getMachineFunction().getFunction()->getCallingConv()))
+ DAG.getMachineFunction().getFunction().getCallingConv()))
// Probably a Win64 va_copy.
return DAG.expandVACopy(Op.getNode());
@@ -19172,8 +19589,8 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
} else {
- SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
- DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
+ SDValue ShOps[4] = {ShAmt, DAG.getConstant(0, dl, SVT),
+ DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
ShAmt = DAG.getBuildVector(MVT::v4i32, dl, ShOps);
}
@@ -19193,9 +19610,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
const SDLoc &dl) {
if (isAllOnesConstant(Mask))
- return DAG.getTargetConstant(1, dl, MaskVT);
+ return DAG.getConstant(1, dl, MaskVT);
if (X86::isZeroNode(Mask))
- return DAG.getTargetConstant(0, dl, MaskVT);
+ return DAG.getConstant(0, dl, MaskVT);
if (MaskVT.bitsGT(Mask.getSimpleValueType())) {
// Mask should be extended
@@ -19255,13 +19672,12 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
switch (Op.getOpcode()) {
default: break;
- case X86ISD::PCMPEQM:
- case X86ISD::PCMPGTM:
case X86ISD::CMPM:
+ case X86ISD::CMPM_RND:
case X86ISD::CMPMU:
+ case X86ISD::VPSHUFBITQMB:
return DAG.getNode(ISD::AND, dl, VT, Op, VMask);
case X86ISD::VFPCLASS:
- case X86ISD::VFPCLASSS:
return DAG.getNode(ISD::OR, dl, VT, Op, VMask);
case X86ISD::VTRUNC:
case X86ISD::VTRUNCS:
@@ -19370,8 +19786,8 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
return DAG.getNode(ISD::SUB, dl, PtrVT, RegNodeBase, ParentFrameOffset);
}
-static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
// Helper to detect if the operand is CUR_DIRECTION rounding mode.
auto isRoundModeCurDirection = [](SDValue Rnd) {
if (!isa<ConstantSDNode>(Rnd))
@@ -19442,14 +19858,36 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue passThru = Op.getOperand(3);
SDValue Mask = Op.getOperand(4);
unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
- if (IntrWithRoundingModeOpcode != 0) {
- SDValue Rnd = Op.getOperand(5);
- if (!isRoundModeCurDirection(Rnd))
+ // There are 2 kinds of intrinsics in this group:
+ // (1) With suppress-all-exceptions (sae) or rounding mode - 6 operands
+ // (2) With rounding mode and sae - 7 operands.
+ bool HasRounding = IntrWithRoundingModeOpcode != 0;
+ if (Op.getNumOperands() == (5U + HasRounding)) {
+ if (HasRounding) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, VT, Src1, Src2, Rnd),
+ Mask, passThru, Subtarget, DAG);
+ }
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+ Src2),
+ Mask, passThru, Subtarget, DAG);
+ }
+
+ assert(Op.getNumOperands() == (6U + HasRounding) &&
+ "Unexpected intrinsic form");
+ SDValue RoundingMode = Op.getOperand(5);
+ if (HasRounding) {
+ SDValue Sae = Op.getOperand(6);
+ if (!isRoundModeCurDirection(Sae))
return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
- dl, VT, Src1, Src2, Rnd),
+ dl, VT, Src1, Src2,
+ RoundingMode, Sae),
Mask, passThru, Subtarget, DAG);
}
- return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
+ return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
+ Src2, RoundingMode),
Mask, passThru, Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK_RM: {
@@ -19518,16 +19956,23 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Src1, Src2, Rnd),
Mask, PassThru, Subtarget, DAG);
}
- case INTR_TYPE_3OP_SCALAR_MASK_RM: {
+ case INTR_TYPE_3OP_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
SDValue Src2 = Op.getOperand(2);
SDValue Src3 = Op.getOperand(3);
SDValue PassThru = Op.getOperand(4);
SDValue Mask = Op.getOperand(5);
- SDValue Sae = Op.getOperand(6);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(6);
+ if (!isRoundModeCurDirection(Rnd))
+ return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+ dl, VT, Src1, Src2, Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1,
- Src2, Src3, Sae),
+ Src2, Src3),
Mask, PassThru, Subtarget, DAG);
}
case INTR_TYPE_3OP_MASK_RM: {
@@ -19664,10 +20109,39 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
else
PassThru = Src1;
- SDValue Rnd = Op.getOperand(5);
+ unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+ if (IntrWithRoundingModeOpcode != 0) {
+ SDValue Rnd = Op.getOperand(5);
+ if (!isRoundModeCurDirection(Rnd))
+ return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode, dl,
+ Op.getValueType(), Src1, Src2,
+ Src3, Rnd),
+ Mask, PassThru, Subtarget, DAG);
+ }
+
return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl,
Op.getValueType(), Src1, Src2,
- Src3, Rnd),
+ Src3),
+ Mask, PassThru, Subtarget, DAG);
+ }
+ case IFMA_OP_MASKZ:
+ case IFMA_OP_MASK: {
+ SDValue Src1 = Op.getOperand(1);
+ SDValue Src2 = Op.getOperand(2);
+ SDValue Src3 = Op.getOperand(3);
+ SDValue Mask = Op.getOperand(4);
+ MVT VT = Op.getSimpleValueType();
+ SDValue PassThru = Src1;
+
+ // Use a zero vector as the PassThru for the maskz variant.
+ if (IntrData->Type == IFMA_OP_MASKZ)
+ PassThru = getZeroVector(VT, Subtarget, DAG, dl);
+
+ // Note: we need to swizzle the operands to pass the multiply operands
+ // first.
+ return getVectorMaskingNode(DAG.getNode(IntrData->Opc0,
+ dl, Op.getValueType(),
+ Src2, Src3, Src1),
Mask, PassThru, Subtarget, DAG);
}
case TERLOG_OP_MASK:
@@ -19726,9 +20200,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
MVT BitcastVT = MVT::getVectorVT(MVT::i1,
Mask.getSimpleValueType().getSizeInBits());
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Imm);
- SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask,
- DAG.getTargetConstant(0, dl, MaskVT),
- Subtarget, DAG);
+ SDValue FPclassMask = getVectorMaskingNode(FPclass, Mask, SDValue(),
+ Subtarget, DAG);
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
DAG.getUNDEF(BitcastVT), FPclassMask,
DAG.getIntPtrConstant(0, dl));
@@ -19739,9 +20212,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
SDValue Imm = Op.getOperand(2);
SDValue Mask = Op.getOperand(3);
SDValue FPclass = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Imm);
- SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask,
- DAG.getTargetConstant(0, dl, MVT::i1), Subtarget, DAG);
- return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, FPclassMask,
+ SDValue FPclassMask = getScalarMaskingNode(FPclass, Mask, SDValue(),
+ Subtarget, DAG);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, FPclassMask,
DAG.getIntPtrConstant(0, dl));
}
case CMP_MASK:
@@ -19783,9 +20256,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Cmp = DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
Op.getOperand(2));
}
- SDValue CmpMask = getVectorMaskingNode(Cmp, Mask,
- DAG.getTargetConstant(0, dl,
- MaskVT),
+ SDValue CmpMask = getVectorMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
SDValue Res = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, BitcastVT,
DAG.getUNDEF(BitcastVT), CmpMask,
@@ -19808,11 +20279,9 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
if(!Cmp.getNode())
Cmp = DAG.getNode(IntrData->Opc0, dl, MVT::v1i1, Src1, Src2, CC);
- SDValue CmpMask = getScalarMaskingNode(Cmp, Mask,
- DAG.getTargetConstant(0, dl,
- MVT::i1),
+ SDValue CmpMask = getScalarMaskingNode(Cmp, Mask, SDValue(),
Subtarget, DAG);
- return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i8, CmpMask,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i8, CmpMask,
DAG.getIntPtrConstant(0, dl));
}
case COMI: { // Comparison intrinsics
@@ -19866,7 +20335,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
else
FCmp = DAG.getNode(X86ISD::FSETCCM_RND, dl, MVT::v1i1, LHS, RHS,
DAG.getConstant(CondVal, dl, MVT::i8), Sae);
- return DAG.getNode(X86ISD::VEXTRACT, dl, MVT::i32, FCmp,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32, FCmp,
DAG.getIntPtrConstant(0, dl));
}
case VSHIFT:
@@ -19891,18 +20360,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
Mask = DAG.getBitcast(MaskVT, Mask);
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Mask);
}
- case KUNPCK: {
- MVT VT = Op.getSimpleValueType();
- MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits()/2);
-
- SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
- SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
- // Arguments should be swapped.
- SDValue Res = DAG.getNode(IntrData->Opc0, dl,
- MVT::getVectorVT(MVT::i1, VT.getSizeInBits()),
- Src2, Src1);
- return DAG.getBitcast(VT, Res);
- }
case MASK_BINOP: {
MVT VT = Op.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
@@ -19953,37 +20410,25 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
DAG.getIntPtrConstant(0, dl));
return DAG.getBitcast(Op.getValueType(), Res);
}
- case BRCST_SUBVEC_TO_VEC: {
- SDValue Src = Op.getOperand(1);
- SDValue Passthru = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
- EVT resVT = Passthru.getValueType();
- SDValue subVec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, resVT,
- DAG.getUNDEF(resVT), Src,
- DAG.getIntPtrConstant(0, dl));
- SDValue immVal;
- if (Src.getSimpleValueType().is256BitVector() && resVT.is512BitVector())
- immVal = DAG.getConstant(0x44, dl, MVT::i8);
- else
- immVal = DAG.getConstant(0, dl, MVT::i8);
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT,
- subVec, subVec, immVal),
- Mask, Passthru, Subtarget, DAG);
- }
- case BRCST32x2_TO_VEC: {
- SDValue Src = Op.getOperand(1);
- SDValue PassThru = Op.getOperand(2);
- SDValue Mask = Op.getOperand(3);
-
- assert((VT.getScalarType() == MVT::i32 ||
- VT.getScalarType() == MVT::f32) && "Unexpected type!");
- //bitcast Src to packed 64
- MVT ScalarVT = VT.getScalarType() == MVT::i32 ? MVT::i64 : MVT::f64;
- MVT BitcastVT = MVT::getVectorVT(ScalarVT, Src.getValueSizeInBits()/64);
- Src = DAG.getBitcast(BitcastVT, Src);
-
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
- Mask, PassThru, Subtarget, DAG);
+ case ROUNDP: {
+ assert(IntrData->Opc0 == X86ISD::VRNDSCALE && "Unexpected opcode");
+ // Clear the upper bits of the rounding immediate so that the legacy
+ // intrinsic can't trigger the scaling behavior of VRNDSCALE.
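+ // (Bits [7:4] of the VRNDSCALE immediate select the scale M, rounding to a
+ // multiple of 2^-M; the legacy ROUND* instructions only define bits [3:0].)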
+ SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
+ Op.getOperand(2),
+ DAG.getConstant(0xf, dl, MVT::i32));
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), RoundingMode);
+ }
+ case ROUNDS: {
+ assert(IntrData->Opc0 == X86ISD::VRNDSCALES && "Unexpected opcode");
+ // Clear the upper bits of the rounding immediate so that the legacy
+ // intrinsic can't trigger the scaling behavior of VRNDSCALE.
+ SDValue RoundingMode = DAG.getNode(ISD::AND, dl, MVT::i32,
+ Op.getOperand(3),
+ DAG.getConstant(0xf, dl, MVT::i32));
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1), Op.getOperand(2), RoundingMode);
}
default:
break;
@@ -20187,7 +20632,8 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
auto &Context = MF.getMMI().getContext();
MCSymbol *S = Context.getOrCreateSymbol(Twine("GCC_except_table") +
Twine(MF.getFunctionNumber()));
- return DAG.getNode(X86ISD::Wrapper, dl, VT, DAG.getMCSymbol(S, PtrVT));
+ return DAG.getNode(getGlobalWrapperKind(), dl, VT,
+ DAG.getMCSymbol(S, PtrVT));
}
case Intrinsic::x86_seh_lsda: {
@@ -20589,18 +21035,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
case RDSEED:
case RDRAND: {
// Emit the node with the right value type.
- SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::Glue, MVT::Other);
+ SDVTList VTs = DAG.getVTList(Op->getValueType(0), MVT::i32, MVT::Other);
SDValue Result = DAG.getNode(IntrData->Opc0, dl, VTs, Op.getOperand(0));
// If the value returned by RDRAND/RDSEED was valid (CF=1), return 1.
// Otherwise return the value from Rand, which is always 0, casted to i32.
SDValue Ops[] = { DAG.getZExtOrTrunc(Result, dl, Op->getValueType(1)),
DAG.getConstant(1, dl, Op->getValueType(1)),
- DAG.getConstant(X86::COND_B, dl, MVT::i32),
+ DAG.getConstant(X86::COND_B, dl, MVT::i8),
SDValue(Result.getNode(), 1) };
- SDValue isValid = DAG.getNode(X86ISD::CMOV, dl,
- DAG.getVTList(Op->getValueType(1), MVT::Glue),
- Ops);
+ SDValue isValid = DAG.getNode(X86ISD::CMOV, dl, Op->getValueType(1), Ops);
// Return { result, isValid, chain }.
return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
@@ -21292,7 +21736,14 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
SDValue NibbleShift = DAG.getConstant(0x4, DL, CurrVT);
SDValue Lo = DAG.getNode(ISD::AND, DL, CurrVT, Op0, NibbleMask);
SDValue Hi = DAG.getNode(ISD::SRL, DL, CurrVT, Op0, NibbleShift);
- SDValue HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+ SDValue HiZ;
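+ // With 512-bit types the compare produces a k-mask, so compare into vXi1
+ // and sign-extend back to the all-ones/all-zeros vector the LUT code needs.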
+ if (CurrVT.is512BitVector()) {
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
+ HiZ = DAG.getSetCC(DL, MaskVT, Hi, Zero, ISD::SETEQ);
+ HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
+ } else {
+ HiZ = DAG.getSetCC(DL, CurrVT, Hi, Zero, ISD::SETEQ);
+ }
Lo = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Lo);
Hi = DAG.getNode(X86ISD::PSHUFB, DL, CurrVT, InRegLUT, Hi);
@@ -21312,8 +21763,15 @@ static SDValue LowerVectorCTLZInRegLUT(SDValue Op, const SDLoc &DL,
SDValue Shift = DAG.getConstant(CurrScalarSizeInBits, DL, NextVT);
// Check if the upper half of the input element is zero.
- SDValue HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
- DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ if (CurrVT.is512BitVector()) {
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, CurrVT.getVectorNumElements());
+ HiZ = DAG.getSetCC(DL, MaskVT, DAG.getBitcast(CurrVT, Op0),
+ DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ HiZ = DAG.getNode(ISD::SIGN_EXTEND, DL, CurrVT, HiZ);
+ } else {
+ HiZ = DAG.getSetCC(DL, CurrVT, DAG.getBitcast(CurrVT, Op0),
+ DAG.getBitcast(CurrVT, Zero), ISD::SETEQ);
+ }
HiZ = DAG.getBitcast(NextVT, HiZ);
// Move the upper/lower halves to the lower bits as we'll be extending to
@@ -21505,6 +21963,19 @@ static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
}
static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
+ MVT VT = Op.getSimpleValueType();
+ if (VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) {
+ // Since X86 does not have CMOV for 8-bit integers, we don't convert
+ // 8-bit integer abs to NEG and CMOV.
+ SDLoc DL(Op);
+ SDValue N0 = Op.getOperand(0);
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
+ DAG.getConstant(0, DL, VT), N0);
+ SDValue Ops[] = {N0, Neg, DAG.getConstant(X86::COND_GE, DL, MVT::i8),
+ SDValue(Neg.getNode(), 1)};
+ return DAG.getNode(X86ISD::CMOV, DL, VT, Ops);
+ }
+
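Illustrative sketch, not part of the patch: for scalar i16/i32/i64 the new LowerABS emits a flag-producing subtract from zero followed by a conditional move. A rough scalar model (helper name invented; INT_MIN wraps, as with any two's-complement abs):

#include <cstdint>

// abs(X) as NEG + CMOV: the subtraction provides both the negated value and
// the flags, and the conditional move picks between X and -X.
static int32_t absViaNegAndCmov(int32_t X) {
  int32_t Neg = int32_t(0u - uint32_t(X)); // X86ISD::SUB, also sets EFLAGS
  return (X <= 0) ? Neg : X;               // the CMOV keyed on those flags
}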
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
@@ -21700,7 +22171,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
return Lower256IntArith(Op, DAG);
// Only i8 vectors should need custom lowering after this.
- assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
+ assert((VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
"Unsupported vector type");
// Lower v16i8/v32i8 as extension to v8i16/v16i16 vector pairs, multiply,
@@ -21712,22 +22184,36 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// and then ashr/lshr the upper bits down to the lower bits before multiply.
unsigned Opcode = Op.getOpcode();
unsigned ExShift = (ISD::MULHU == Opcode ? ISD::SRL : ISD::SRA);
- unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
+ unsigned ExAVX = (ISD::MULHU == Opcode ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
+
+ // For 512-bit vectors, split into 256-bit vectors to allow the
+ // sign-extension to occur.
+ if (VT == MVT::v64i8)
+ return Lower512IntArith(Op, DAG);
// AVX2 implementations - extend xmm subvectors to ymm.
if (Subtarget.hasInt256()) {
+ unsigned NumElems = VT.getVectorNumElements();
SDValue Lo = DAG.getIntPtrConstant(0, dl);
- SDValue Hi = DAG.getIntPtrConstant(VT.getVectorNumElements() / 2, dl);
+ SDValue Hi = DAG.getIntPtrConstant(NumElems / 2, dl);
if (VT == MVT::v32i8) {
- SDValue ALo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Lo);
- SDValue BLo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Lo);
- SDValue AHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, A, Hi);
- SDValue BHi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v16i8, B, Hi);
- ALo = DAG.getNode(ExSSE41, dl, MVT::v16i16, ALo);
- BLo = DAG.getNode(ExSSE41, dl, MVT::v16i16, BLo);
- AHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, AHi);
- BHi = DAG.getNode(ExSSE41, dl, MVT::v16i16, BHi);
+ if (Subtarget.hasBWI()) {
+ SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v32i16, A);
+ SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v32i16, B);
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v32i16, ExA, ExB);
+ Mul = DAG.getNode(ISD::SRL, dl, MVT::v32i16, Mul,
+ DAG.getConstant(8, dl, MVT::v32i16));
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
+ }
+ SDValue ALo = extract128BitVector(A, 0, DAG, dl);
+ SDValue BLo = extract128BitVector(B, 0, DAG, dl);
+ SDValue AHi = extract128BitVector(A, NumElems / 2, DAG, dl);
+ SDValue BHi = extract128BitVector(B, NumElems / 2, DAG, dl);
+ ALo = DAG.getNode(ExAVX, dl, MVT::v16i16, ALo);
+ BLo = DAG.getNode(ExAVX, dl, MVT::v16i16, BLo);
+ AHi = DAG.getNode(ExAVX, dl, MVT::v16i16, AHi);
+ BHi = DAG.getNode(ExAVX, dl, MVT::v16i16, BHi);
Lo = DAG.getNode(ISD::SRL, dl, MVT::v16i16,
DAG.getNode(ISD::MUL, dl, MVT::v16i16, ALo, BLo),
DAG.getConstant(8, dl, MVT::v16i16));
@@ -21745,19 +22231,23 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
}
- SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
- SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
+ SDValue ExA = DAG.getNode(ExAVX, dl, MVT::v16i16, A);
+ SDValue ExB = DAG.getNode(ExAVX, dl, MVT::v16i16, B);
SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
- SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
- DAG.getConstant(8, dl, MVT::v16i16));
- Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Lo);
- Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, MulH, Hi);
+ Mul = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
+ DAG.getConstant(8, dl, MVT::v16i16));
+ // If we have BWI we can use the truncate instruction.
+ if (Subtarget.hasBWI())
+ return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
+ Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Lo);
+ Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v8i16, Mul, Hi);
return DAG.getNode(X86ISD::PACKUS, dl, VT, Lo, Hi);
}
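Illustrative sketch, not part of the patch: per lane, every path above computes the same thing - widen each i8 to i16, multiply, and keep the high byte of the product. With AVX-512BW the whole vector is widened and a truncate finishes the job; otherwise the two halves are recombined with PACKUS. Helper names are invented:

#include <cstdint>

// High half of an unsigned 8x8 multiply (MULHU lane).
static uint8_t mulhu8(uint8_t A, uint8_t B) {
  return uint8_t((uint16_t(A) * uint16_t(B)) >> 8);
}

// High half of a signed 8x8 multiply (MULHS lane); the arithmetic shift
// keeps the sign bits of the 16-bit product.
static int8_t mulhs8(int8_t A, int8_t B) {
  return int8_t((int16_t(A) * int16_t(B)) >> 8);
}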
assert(VT == MVT::v16i8 &&
"Pre-AVX2 support only supports v16i8 multiplication");
MVT ExVT = MVT::v8i16;
+ unsigned ExSSE41 = (ISD::MULHU == Opcode ? X86ISD::VZEXT : X86ISD::VSEXT);
// Extract the lo parts and zero/sign extend to i16.
SDValue ALo, BLo;
@@ -21885,7 +22375,10 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
}
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
- (VT == MVT::v8i32 && Subtarget.hasInt256()));
+ (VT == MVT::v8i32 && Subtarget.hasInt256()) ||
+ (VT == MVT::v16i32 && Subtarget.hasAVX512()));
+
+ int NumElts = VT.getVectorNumElements();
// PMULxD operations multiply each even value (starting at 0) of LHS with
// the related value of RHS and produce a widen result.
@@ -21899,17 +22392,17 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
//
// Place the odd value at an even position (basically, shift all values 1
// step to the left):
- const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1};
+ const int Mask[] = {1, -1, 3, -1, 5, -1, 7, -1, 9, -1, 11, -1, 13, -1, 15, -1};
// <a|b|c|d> => <b|undef|d|undef>
SDValue Odd0 = DAG.getVectorShuffle(VT, dl, Op0, Op0,
- makeArrayRef(&Mask[0], VT.getVectorNumElements()));
+ makeArrayRef(&Mask[0], NumElts));
// <e|f|g|h> => <f|undef|h|undef>
SDValue Odd1 = DAG.getVectorShuffle(VT, dl, Op1, Op1,
- makeArrayRef(&Mask[0], VT.getVectorNumElements()));
+ makeArrayRef(&Mask[0], NumElts));
// Emit two multiplies, one for the lower 2 ints and one for the higher 2
// ints.
- MVT MulVT = VT == MVT::v4i32 ? MVT::v2i64 : MVT::v4i64;
+ MVT MulVT = MVT::getVectorVT(MVT::i64, NumElts / 2);
bool IsSigned = Op->getOpcode() == ISD::SMUL_LOHI;
unsigned Opcode =
(!IsSigned || !Subtarget.hasSSE41()) ? X86ISD::PMULUDQ : X86ISD::PMULDQ;
@@ -21921,19 +22414,16 @@ static SDValue LowerMUL_LOHI(SDValue Op, const X86Subtarget &Subtarget,
SDValue Mul2 = DAG.getBitcast(VT, DAG.getNode(Opcode, dl, MulVT, Odd0, Odd1));
// Shuffle it back into the right order.
- SDValue Highs, Lows;
- if (VT == MVT::v8i32) {
- const int HighMask[] = {1, 9, 3, 11, 5, 13, 7, 15};
- Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
- const int LowMask[] = {0, 8, 2, 10, 4, 12, 6, 14};
- Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
- } else {
- const int HighMask[] = {1, 5, 3, 7};
- Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
- const int LowMask[] = {0, 4, 2, 6};
- Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+ SmallVector<int, 16> HighMask(NumElts);
+ SmallVector<int, 16> LowMask(NumElts);
+ for (int i = 0; i != NumElts; ++i) {
+ HighMask[i] = (i / 2) * 2 + ((i % 2) * NumElts) + 1;
+ LowMask[i] = (i / 2) * 2 + ((i % 2) * NumElts);
}
+ SDValue Highs = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, HighMask);
+ SDValue Lows = DAG.getVectorShuffle(VT, dl, Mul1, Mul2, LowMask);
+
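Illustrative sketch, not part of the patch: the closed-form mask generation above reproduces the previously hard-coded shuffles and extends them to v16i32. The standalone program below prints the masks; NumElts = 4 gives high {1,5,3,7} / low {0,4,2,6} and NumElts = 8 gives {1,9,3,11,5,13,7,15} / {0,8,2,10,4,12,6,14}, matching the deleted tables.

#include <cstdio>

int main() {
  for (int NumElts : {4, 8, 16}) {
    std::printf("NumElts=%2d  high:", NumElts);
    for (int i = 0; i != NumElts; ++i)
      std::printf(" %d", (i / 2) * 2 + (i % 2) * NumElts + 1);
    std::printf("  low:");
    for (int i = 0; i != NumElts; ++i)
      std::printf(" %d", (i / 2) * 2 + (i % 2) * NumElts);
    std::printf("\n");
  }
  return 0;
}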
// If we have a signed multiply but no PMULDQ, fix up the high parts of an
// unsigned multiply.
if (IsSigned && !Subtarget.hasSSE41()) {
@@ -22123,9 +22613,9 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
}
}
- // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+ // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
// TODO: Replace constant extraction with getTargetConstantBitsFromNode.
- if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
+ if (!Subtarget.hasXOP() &&
(VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
(Subtarget.hasAVX512() && VT == MVT::v8i64))) {
@@ -22252,9 +22742,8 @@ static SDValue LowerScalarVariableShift(SDValue Op, SelectionDAG &DAG,
}
}
- // Special case in 32-bit mode, where i64 is expanded into high and low parts.
- if (!Subtarget.is64Bit() && VT == MVT::v2i64 &&
- Amt.getOpcode() == ISD::BITCAST &&
+ // Check cases (mainly 32-bit) where i64 is expanded into high and low parts.
+ if (VT == MVT::v2i64 && Amt.getOpcode() == ISD::BITCAST &&
Amt.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
Amt = Amt.getOperand(0);
unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
@@ -22389,7 +22878,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// the vector shift into four scalar shifts plus four pairs of vector
// insert/extract.
if (ConstantAmt && (VT == MVT::v8i16 || VT == MVT::v4i32)) {
- unsigned TargetOpcode = X86ISD::MOVSS;
+ bool UseMOVSD = false;
bool CanBeSimplified;
// The splat value for the first packed shift (the 'X' from the example).
SDValue Amt1 = Amt->getOperand(0);
@@ -22406,7 +22895,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// Otherwise, check if we can still simplify this node using a MOVSD.
CanBeSimplified = Amt1 == Amt->getOperand(1) &&
Amt->getOperand(2) == Amt->getOperand(3);
- TargetOpcode = X86ISD::MOVSD;
+ UseMOVSD = true;
Amt2 = Amt->getOperand(2);
}
} else {
@@ -22417,7 +22906,7 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
CanBeSimplified = Amt2 == Amt->getOperand(i);
if (!CanBeSimplified) {
- TargetOpcode = X86ISD::MOVSD;
+ UseMOVSD = true;
CanBeSimplified = true;
Amt2 = Amt->getOperand(4);
for (unsigned i=0; i != 4 && CanBeSimplified; ++i)
@@ -22430,19 +22919,18 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
if (CanBeSimplified && isa<ConstantSDNode>(Amt1) &&
isa<ConstantSDNode>(Amt2)) {
// Replace this node with two shifts followed by a MOVSS/MOVSD/PBLEND.
- MVT CastVT = MVT::v4i32;
SDValue Splat1 =
DAG.getConstant(cast<ConstantSDNode>(Amt1)->getAPIntValue(), dl, VT);
SDValue Shift1 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat1);
SDValue Splat2 =
DAG.getConstant(cast<ConstantSDNode>(Amt2)->getAPIntValue(), dl, VT);
SDValue Shift2 = DAG.getNode(Op->getOpcode(), dl, VT, R, Splat2);
- SDValue BitCast1 = DAG.getBitcast(CastVT, Shift1);
- SDValue BitCast2 = DAG.getBitcast(CastVT, Shift2);
- if (TargetOpcode == X86ISD::MOVSD)
- return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
+ SDValue BitCast1 = DAG.getBitcast(MVT::v4i32, Shift1);
+ SDValue BitCast2 = DAG.getBitcast(MVT::v4i32, Shift2);
+ if (UseMOVSD)
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
BitCast2, {0, 1, 6, 7}));
- return DAG.getBitcast(VT, DAG.getVectorShuffle(CastVT, dl, BitCast1,
+ return DAG.getBitcast(VT, DAG.getVectorShuffle(MVT::v4i32, dl, BitCast1,
BitCast2, {0, 5, 6, 7}));
}
}
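Illustrative sketch, not part of the patch: when the per-element shift amounts are constants that fall into the patterns checked above, the lowering emits two splat shifts and blends them - MOVSS keeps lane 0 from the first shift, MOVSD keeps lanes 0-1. A per-lane model for a v4i32 left shift (helper name invented):

#include <array>
#include <cstdint>

// Lane I is shifted by Amt1 if it comes from the first splat shift (lane 0
// for the MOVSS blend, lanes 0-1 for the MOVSD blend) and by Amt2 otherwise.
static std::array<uint32_t, 4> shiftThenBlend(std::array<uint32_t, 4> X,
                                              unsigned Amt1, unsigned Amt2,
                                              bool UseMOVSD) {
  std::array<uint32_t, 4> R;
  for (unsigned I = 0; I != 4; ++I) {
    bool FromShift1 = UseMOVSD ? (I < 2) : (I == 0);
    R[I] = X[I] << (FromShift1 ? Amt1 : Amt2);
  }
  return R;
}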
@@ -22752,7 +23240,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
assert((Opcode == ISD::ROTL) && "Only ROTL supported");
// XOP has 128-bit vector variable + immediate rotates.
- // +ve/-ve Amt = rotate left/right.
+ // +ve/-ve Amt = rotate left/right - just need to handle ISD::ROTL.
// Split 256-bit integers.
if (VT.is256BitVector())
@@ -22765,13 +23253,13 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
if (auto *RotateConst = BVAmt->getConstantSplatNode()) {
uint64_t RotateAmt = RotateConst->getAPIntValue().getZExtValue();
assert(RotateAmt < EltSizeInBits && "Rotation out of range");
- return DAG.getNode(X86ISD::VPROTI, DL, VT, R,
+ return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
DAG.getConstant(RotateAmt, DL, MVT::i8));
}
}
// Use general rotate by variable (per-element).
- return DAG.getNode(X86ISD::VPROT, DL, VT, R, Amt);
+ return Op;
}
static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
@@ -23319,15 +23807,14 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
// TRUNC(CTPOP(ZEXT(X))) to make use of vXi32/vXi64 VPOPCNT instructions.
if (Subtarget.hasVPOPCNTDQ()) {
- if (VT == MVT::v8i16) {
- Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v8i64, Op0);
- Op = DAG.getNode(ISD::CTPOP, DL, MVT::v8i64, Op);
- return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
- }
- if (VT == MVT::v16i8 || VT == MVT::v16i16) {
- Op = DAG.getNode(X86ISD::VZEXT, DL, MVT::v16i32, Op0);
- Op = DAG.getNode(ISD::CTPOP, DL, MVT::v16i32, Op);
- return DAG.getNode(X86ISD::VTRUNC, DL, VT, Op);
+ unsigned NumElems = VT.getVectorNumElements();
+ assert((VT.getVectorElementType() == MVT::i8 ||
+ VT.getVectorElementType() == MVT::i16) && "Unexpected type");
+ if (NumElems <= 16) {
+ MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
+ Op = DAG.getNode(ISD::ZERO_EXTEND, DL, NewVT, Op0);
+ Op = DAG.getNode(ISD::CTPOP, DL, NewVT, Op);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
}
}
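Illustrative sketch, not part of the patch: with VPOPCNTDQ the byte/word popcount is obtained by zero-extending each lane to i32, using the native 32-bit popcount, and truncating back - the count of an 8- or 16-bit lane always fits. Per-lane model (the compiler builtin stands in for VPOPCNTD; helper name invented):

#include <cstdint>

static uint8_t ctpop8ViaWidening(uint8_t X) {
  // ZERO_EXTEND -> CTPOP (i32) -> TRUNCATE
  return uint8_t(__builtin_popcount(uint32_t(X)));
}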
@@ -23402,12 +23889,13 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- if (Subtarget.hasXOP())
+ MVT VT = Op.getSimpleValueType();
+
+ if (Subtarget.hasXOP() && !VT.is512BitVector())
return LowerBITREVERSE_XOP(Op, DAG);
assert(Subtarget.hasSSSE3() && "SSSE3 required for BITREVERSE");
- MVT VT = Op.getSimpleValueType();
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
@@ -23450,7 +23938,9 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::OR, DL, VT, Lo, Hi);
}
-static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
+static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ bool AllowIncDec = true) {
unsigned NewOpc = 0;
switch (N->getOpcode()) {
case ISD::ATOMIC_LOAD_ADD:
@@ -23473,6 +23963,26 @@ static SDValue lowerAtomicArithWithLOCK(SDValue N, SelectionDAG &DAG) {
}
MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
+
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
+ // Convert to inc/dec if they aren't slow or we are optimizing for size.
+ if (AllowIncDec && (!Subtarget.slowIncDec() ||
+ DAG.getMachineFunction().getFunction().optForSize())) {
+ if ((NewOpc == X86ISD::LADD && C->isOne()) ||
+ (NewOpc == X86ISD::LSUB && C->isAllOnesValue()))
+ return DAG.getMemIntrinsicNode(X86ISD::LINC, SDLoc(N),
+ DAG.getVTList(MVT::i32, MVT::Other),
+ {N->getOperand(0), N->getOperand(1)},
+ /*MemVT=*/N->getSimpleValueType(0), MMO);
+ if ((NewOpc == X86ISD::LSUB && C->isOne()) ||
+ (NewOpc == X86ISD::LADD && C->isAllOnesValue()))
+ return DAG.getMemIntrinsicNode(X86ISD::LDEC, SDLoc(N),
+ DAG.getVTList(MVT::i32, MVT::Other),
+ {N->getOperand(0), N->getOperand(1)},
+ /*MemVT=*/N->getSimpleValueType(0), MMO);
+ }
+ }
+
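Illustrative sketch, not part of the patch: the added block rewrites a locked add/sub of the constant +1/-1 into LOCK INC / LOCK DEC, unless inc/dec is flagged as slow on the target and we are not optimizing for size. Decision logic only, with invented enum and parameter names:

enum class LockedOp { Add, Sub, Inc, Dec };

// Mirrors the constant checks above: add 1 or sub -1 becomes INC, sub 1 or
// add -1 becomes DEC; otherwise the original LOCK ADD/SUB is kept.
static LockedOp pickLockedRMWOp(LockedOp Op, long long Imm, bool SlowIncDec,
                                bool OptForSize) {
  if (SlowIncDec && !OptForSize)
    return Op;
  if ((Op == LockedOp::Add && Imm == 1) || (Op == LockedOp::Sub && Imm == -1))
    return LockedOp::Inc;
  if ((Op == LockedOp::Sub && Imm == 1) || (Op == LockedOp::Add && Imm == -1))
    return LockedOp::Dec;
  return Op;
}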
return DAG.getMemIntrinsicNode(
NewOpc, SDLoc(N), DAG.getVTList(MVT::i32, MVT::Other),
{N->getOperand(0), N->getOperand(1), N->getOperand(2)},
@@ -23506,7 +24016,7 @@ static SDValue lowerAtomicArith(SDValue N, SelectionDAG &DAG,
return N;
}
- SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG);
+ SDValue LockOp = lowerAtomicArithWithLOCK(N, DAG, Subtarget);
// RAUW the chain, but don't worry about the result, as it's unused.
assert(!N->hasAnyUseOfValue(0));
DAG.ReplaceAllUsesOfValueWith(N.getValue(1), LockOp.getValue(1));
@@ -23675,19 +24185,12 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
assert(Subtarget.hasAVX512() &&
"MGATHER/MSCATTER are supported on AVX-512 arch only");
- // X86 scatter kills mask register, so its type should be added to
- // the list of return values.
- // If the "scatter" has 2 return values, it is already handled.
- if (Op.getNode()->getNumValues() == 2)
- return Op;
-
MaskedScatterSDNode *N = cast<MaskedScatterSDNode>(Op.getNode());
SDValue Src = N->getValue();
MVT VT = Src.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported scatter op");
SDLoc dl(Op);
- SDValue NewScatter;
SDValue Index = N->getIndex();
SDValue Mask = N->getMask();
SDValue Chain = N->getChain();
@@ -23758,8 +24261,8 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
// The mask is killed by scatter, add it to the values
SDVTList VTs = DAG.getVTList(BitMaskVT, MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index};
- NewScatter = DAG.getMaskedScatter(VTs, N->getMemoryVT(), dl, Ops,
- N->getMemOperand());
+ SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
+ VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
DAG.ReplaceAllUsesWith(Op, SDValue(NewScatter.getNode(), 1));
return SDValue(NewScatter.getNode(), 1);
}
@@ -23874,8 +24377,8 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- assert(Subtarget.hasAVX512() &&
- "MGATHER/MSCATTER are supported on AVX-512 arch only");
+ assert(Subtarget.hasAVX2() &&
+ "MGATHER/MSCATTER are supported on AVX-512/AVX-2 arch only");
MaskedGatherSDNode *N = cast<MaskedGatherSDNode>(Op.getNode());
SDLoc dl(Op);
@@ -23889,17 +24392,22 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
- if (!Subtarget.hasVLX() && !VT.is512BitVector() &&
+ // If the index is v2i32, we're being called by type legalization.
+ if (IndexVT == MVT::v2i32)
+ return SDValue();
+
+ if (Subtarget.hasAVX512() && !Subtarget.hasVLX() && !VT.is512BitVector() &&
!Index.getSimpleValueType().is512BitVector()) {
// AVX512F supports only 512-bit vectors. Either the data or the index
// needs to be 512 bits wide. If both the index and data are 256-bit but
// the vector contains 8 elements, we just sign-extend the index
if (NumElts == 8) {
Index = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v8i64, Index);
- SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
- N->getOperand(3), Index };
- DAG.UpdateNodeOperands(N, Ops);
- return Op;
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+ return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
}
// Minimal number of elements in Gather
@@ -23923,67 +24431,21 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
Src0 = ExtendToType(Src0, NewVT, DAG);
SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
- SDValue NewGather = DAG.getMaskedGather(DAG.getVTList(NewVT, MVT::Other),
- N->getMemoryVT(), dl, Ops,
- N->getMemOperand());
- SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- NewGather.getValue(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue RetOps[] = {Exract, NewGather.getValue(1)};
- return DAG.getMergeValues(RetOps, dl);
- }
- if (N->getMemoryVT() == MVT::v2i32 && Subtarget.hasVLX()) {
- // There is a special case when the return type is v2i32 is illegal and
- // the type legaizer extended it to v2i64. Without this conversion we end up
- // with VPGATHERQQ (reading q-words from the memory) instead of VPGATHERQD.
- // In order to avoid this situation, we'll build an X86 specific Gather node
- // with index v2i64 and value type v4i32.
- assert(VT == MVT::v2i64 && Src0.getValueType() == MVT::v2i64 &&
- "Unexpected type in masked gather");
- Src0 = DAG.getVectorShuffle(MVT::v4i32, dl,
- DAG.getBitcast(MVT::v4i32, Src0),
- DAG.getUNDEF(MVT::v4i32), { 0, 2, -1, -1 });
- // The mask should match the destination type. Extending mask with zeroes
- // is not necessary since instruction itself reads only two values from
- // memory.
- Mask = ExtendToType(Mask, MVT::v4i1, DAG, false);
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(MVT::v4i32, MVT::Other), Ops, dl, N->getMemoryVT(),
- N->getMemOperand());
-
- SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, MVT::v2i64,
- NewGather.getValue(0), DAG);
- SDValue RetOps[] = { Sext, NewGather.getValue(1) };
+ DAG.getVTList(NewVT, MaskBitVT, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+ SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
+ NewGather.getValue(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue RetOps[] = {Extract, NewGather.getValue(2)};
return DAG.getMergeValues(RetOps, dl);
}
- if (N->getMemoryVT() == MVT::v2f32 && Subtarget.hasVLX()) {
- // This transformation is for optimization only.
- // The type legalizer extended mask and index to 4 elements vector
- // in order to match requirements of the common gather node - same
- // vector width of index and value. X86 Gather node allows mismatch
- // of vector width in order to select more optimal instruction at the
- // end.
- assert(VT == MVT::v4f32 && Src0.getValueType() == MVT::v4f32 &&
- "Unexpected type in masked gather");
- if (Mask.getOpcode() == ISD::CONCAT_VECTORS &&
- ISD::isBuildVectorAllZeros(Mask.getOperand(1).getNode()) &&
- Index.getOpcode() == ISD::CONCAT_VECTORS &&
- Index.getOperand(1).isUndef()) {
- Mask = ExtendToType(Mask.getOperand(0), MVT::v4i1, DAG, false);
- Index = Index.getOperand(0);
- } else
- return Op;
- SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(MVT::v4f32, MVT::Other), Ops, dl, N->getMemoryVT(),
- N->getMemOperand());
-
- SDValue RetOps[] = { NewGather.getValue(0), NewGather.getValue(1) };
- return DAG.getMergeValues(RetOps, dl);
- }
- return Op;
+ SDValue Ops[] = { N->getChain(), Src0, Mask, N->getBasePtr(), Index };
+ SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
+ N->getMemOperand());
+ return DAG.getMergeValues({NewGather, NewGather.getValue(2)}, dl);
}
SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
@@ -24049,7 +24511,6 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
- case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op,Subtarget,DAG);
case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, Subtarget,DAG);
case ISD::SCALAR_TO_VECTOR: return LowerSCALAR_TO_VECTOR(Op, Subtarget,DAG);
case ISD::ConstantPool: return LowerConstantPool(Op, DAG);
@@ -24085,7 +24546,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::VAARG: return LowerVAARG(Op, DAG);
case ISD::VACOPY: return LowerVACOPY(Op, Subtarget, DAG);
- case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, Subtarget, DAG);
+ case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN: return LowerINTRINSIC_W_CHAIN(Op, Subtarget, DAG);
case ISD::RETURNADDR: return LowerRETURNADDR(Op, DAG);
@@ -24203,8 +24664,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue InVec1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, RegVT, Ops);
SDValue Res = DAG.getNode(X86ISD::AVG, dl, RegVT, InVec0, InVec1);
- Results.push_back(DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
- DAG.getIntPtrConstant(0, dl)));
+ if (!ExperimentalVectorWideningLegalization)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, InVT, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
return;
}
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
@@ -24242,11 +24705,21 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
SDValue Src = N->getOperand(0);
if (Src.getValueType() == MVT::v2f64) {
- SDValue Idx = DAG.getIntPtrConstant(0, dl);
- SDValue Res = DAG.getNode(IsSigned ? X86ISD::CVTTP2SI
- : X86ISD::CVTTP2UI,
- dl, MVT::v4i32, Src);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
+ MVT ResVT = MVT::v4i32;
+ unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ if (!IsSigned && !Subtarget.hasVLX()) {
+ // Widen to 512-bits.
+ ResVT = MVT::v8i32;
+ Opc = ISD::FP_TO_UINT;
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
+ DAG.getUNDEF(MVT::v8f64),
+ Src, DAG.getIntPtrConstant(0, dl));
+ }
+ SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
+ ResVT = ExperimentalVectorWideningLegalization ? MVT::v4i32
+ : MVT::v2i32;
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, Res,
+ DAG.getIntPtrConstant(0, dl));
Results.push_back(Res);
return;
}
@@ -24256,7 +24729,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getUNDEF(MVT::v2f32));
Res = DAG.getNode(IsSigned ? ISD::FP_TO_SINT
: ISD::FP_TO_UINT, dl, MVT::v4i32, Res);
- Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
+ if (!ExperimentalVectorWideningLegalization)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res, Idx);
Results.push_back(Res);
return;
}
@@ -24345,7 +24819,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
}
case ISD::INTRINSIC_WO_CHAIN: {
- if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), Subtarget, DAG))
+ if (SDValue V = LowerINTRINSIC_WO_CHAIN(SDValue(N, 0), DAG))
Results.push_back(V);
return;
}
@@ -24480,6 +24954,89 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ToVecInt, DAG.getIntPtrConstant(i, dl)));
Results.push_back(DAG.getBuildVector(DstVT, dl, Elts));
+ return;
+ }
+ case ISD::MGATHER: {
+ EVT VT = N->getValueType(0);
+ if (VT == MVT::v2f32 && (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
+ auto *Gather = cast<MaskedGatherSDNode>(N);
+ SDValue Index = Gather->getIndex();
+ if (Index.getValueType() != MVT::v2i64)
+ return;
+ SDValue Mask = Gather->getMask();
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32,
+ Gather->getValue(),
+ DAG.getUNDEF(MVT::v2f32));
+ if (!Subtarget.hasVLX()) {
+ // We need to widen the mask, but the instruction will only use 2
+ // of its elements. So we can use undef.
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getUNDEF(MVT::v2i1));
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
+ }
+ SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
+ Index };
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4f32, Mask.getValueType(), MVT::Other), Ops, dl,
+ Gather->getMemoryVT(), Gather->getMemOperand());
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(2));
+ return;
+ }
+ if (VT == MVT::v2i32) {
+ auto *Gather = cast<MaskedGatherSDNode>(N);
+ SDValue Index = Gather->getIndex();
+ SDValue Mask = Gather->getMask();
+ assert(Mask.getValueType() == MVT::v2i1 && "Unexpected mask type");
+ SDValue Src0 = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32,
+ Gather->getValue(),
+ DAG.getUNDEF(MVT::v2i32));
+ // If the index is v2i64 we can use it directly.
+ if (Index.getValueType() == MVT::v2i64 &&
+ (Subtarget.hasVLX() || !Subtarget.hasAVX512())) {
+ if (!Subtarget.hasVLX()) {
+ // We need to widen the mask, but the instruction will only use 2
+ // of its elements. So we can use undef.
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getUNDEF(MVT::v2i1));
+ Mask = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i32, Mask);
+ }
+ SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
+ Index };
+ SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
+ DAG.getVTList(MVT::v4i32, Mask.getValueType(), MVT::Other), Ops, dl,
+ Gather->getMemoryVT(), Gather->getMemOperand());
+ SDValue Chain = Res.getValue(2);
+ if (!ExperimentalVectorWideningLegalization)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ EVT IndexVT = Index.getValueType();
+ EVT NewIndexVT = EVT::getVectorVT(*DAG.getContext(),
+ IndexVT.getScalarType(), 4);
+ // Otherwise we need to custom widen everything to avoid promotion.
+ Index = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewIndexVT, Index,
+ DAG.getUNDEF(IndexVT));
+ Mask = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i1, Mask,
+ DAG.getConstant(0, dl, MVT::v2i1));
+ SDValue Ops[] = { Gather->getChain(), Src0, Mask, Gather->getBasePtr(),
+ Index };
+ SDValue Res = DAG.getMaskedGather(DAG.getVTList(MVT::v4i32, MVT::Other),
+ Gather->getMemoryVT(), dl, Ops,
+ Gather->getMemOperand());
+ SDValue Chain = Res.getValue(1);
+ if (!ExperimentalVectorWideningLegalization)
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i32, Res,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
+ break;
}
}
}
@@ -24557,9 +25114,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FMAXC: return "X86ISD::FMAXC";
case X86ISD::FMINC: return "X86ISD::FMINC";
case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
- case X86ISD::FRSQRTS: return "X86ISD::FRSQRTS";
case X86ISD::FRCP: return "X86ISD::FRCP";
- case X86ISD::FRCPS: return "X86ISD::FRCPS";
case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
@@ -24585,6 +25140,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::LOR: return "X86ISD::LOR";
case X86ISD::LXOR: return "X86ISD::LXOR";
case X86ISD::LAND: return "X86ISD::LAND";
+ case X86ISD::LINC: return "X86ISD::LINC";
+ case X86ISD::LDEC: return "X86ISD::LDEC";
case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
case X86ISD::VZEXT: return "X86ISD::VZEXT";
@@ -24620,6 +25177,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::PCMPEQM: return "X86ISD::PCMPEQM";
case X86ISD::PCMPGTM: return "X86ISD::PCMPGTM";
+ case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
case X86ISD::ADD: return "X86ISD::ADD";
case X86ISD::SUB: return "X86ISD::SUB";
case X86ISD::ADC: return "X86ISD::ADC";
@@ -24635,7 +25193,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::OR: return "X86ISD::OR";
case X86ISD::XOR: return "X86ISD::XOR";
case X86ISD::AND: return "X86ISD::AND";
- case X86ISD::BEXTR: return "X86ISD::BEXTR";
case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
case X86ISD::PTEST: return "X86ISD::PTEST";
@@ -24650,13 +25207,16 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::PACKUS: return "X86ISD::PACKUS";
case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
case X86ISD::VALIGN: return "X86ISD::VALIGN";
+ case X86ISD::VSHLD: return "X86ISD::VSHLD";
+ case X86ISD::VSHRD: return "X86ISD::VSHRD";
+ case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
+ case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
case X86ISD::SHUFP: return "X86ISD::SHUFP";
case X86ISD::SHUF128: return "X86ISD::SHUF128";
case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
- case X86ISD::MOVLHPD: return "X86ISD::MOVLHPD";
case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
case X86ISD::MOVLPS: return "X86ISD::MOVLPS";
case X86ISD::MOVLPD: return "X86ISD::MOVLPD";
@@ -24670,7 +25230,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
- case X86ISD::VEXTRACT: return "X86ISD::VEXTRACT";
case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
@@ -24680,8 +25239,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPERMI: return "X86ISD::VPERMI";
case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
- case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
+ case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
case X86ISD::VRANGE: return "X86ISD::VRANGE";
+ case X86ISD::VRANGE_RND: return "X86ISD::VRANGE_RND";
+ case X86ISD::VRANGES: return "X86ISD::VRANGES";
+ case X86ISD::VRANGES_RND: return "X86ISD::VRANGES_RND";
case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
case X86ISD::PSADBW: return "X86ISD::PSADBW";
@@ -24697,14 +25259,11 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::RDSEED: return "X86ISD::RDSEED";
case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
- case X86ISD::VPROT: return "X86ISD::VPROT";
- case X86ISD::VPROTI: return "X86ISD::VPROTI";
case X86ISD::VPSHA: return "X86ISD::VPSHA";
case X86ISD::VPSHL: return "X86ISD::VPSHL";
case X86ISD::VPCOM: return "X86ISD::VPCOM";
case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
- case X86ISD::FMADD: return "X86ISD::FMADD";
case X86ISD::FMSUB: return "X86ISD::FMSUB";
case X86ISD::FNMADD: return "X86ISD::FNMADD";
case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
@@ -24716,22 +25275,40 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
+ case X86ISD::FMADDS1: return "X86ISD::FMADDS1";
+ case X86ISD::FNMADDS1: return "X86ISD::FNMADDS1";
+ case X86ISD::FMSUBS1: return "X86ISD::FMSUBS1";
+ case X86ISD::FNMSUBS1: return "X86ISD::FNMSUBS1";
case X86ISD::FMADDS1_RND: return "X86ISD::FMADDS1_RND";
case X86ISD::FNMADDS1_RND: return "X86ISD::FNMADDS1_RND";
case X86ISD::FMSUBS1_RND: return "X86ISD::FMSUBS1_RND";
case X86ISD::FNMSUBS1_RND: return "X86ISD::FNMSUBS1_RND";
+ case X86ISD::FMADDS3: return "X86ISD::FMADDS3";
+ case X86ISD::FNMADDS3: return "X86ISD::FNMADDS3";
+ case X86ISD::FMSUBS3: return "X86ISD::FMSUBS3";
+ case X86ISD::FNMSUBS3: return "X86ISD::FNMSUBS3";
case X86ISD::FMADDS3_RND: return "X86ISD::FMADDS3_RND";
case X86ISD::FNMADDS3_RND: return "X86ISD::FNMADDS3_RND";
case X86ISD::FMSUBS3_RND: return "X86ISD::FMSUBS3_RND";
case X86ISD::FNMSUBS3_RND: return "X86ISD::FNMSUBS3_RND";
+ case X86ISD::FMADD4S: return "X86ISD::FMADD4S";
+ case X86ISD::FNMADD4S: return "X86ISD::FNMADD4S";
+ case X86ISD::FMSUB4S: return "X86ISD::FMSUB4S";
+ case X86ISD::FNMSUB4S: return "X86ISD::FNMSUB4S";
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
+ case X86ISD::VRNDSCALE_RND: return "X86ISD::VRNDSCALE_RND";
case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
+ case X86ISD::VRNDSCALES_RND: return "X86ISD::VRNDSCALES_RND";
case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
+ case X86ISD::VREDUCE_RND: return "X86ISD::VREDUCE_RND";
case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
+ case X86ISD::VREDUCES_RND: return "X86ISD::VREDUCES_RND";
case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
+ case X86ISD::VGETMANT_RND: return "X86ISD::VGETMANT_RND";
case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
+ case X86ISD::VGETMANTS_RND: return "X86ISD::VGETMANTS_RND";
case X86ISD::PCMPESTRI: return "X86ISD::PCMPESTRI";
case X86ISD::PCMPISTRI: return "X86ISD::PCMPISTRI";
case X86ISD::XTEST: return "X86ISD::XTEST";
@@ -24740,9 +25317,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SELECT: return "X86ISD::SELECT";
case X86ISD::SELECTS: return "X86ISD::SELECTS";
case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
+ case X86ISD::RCP14: return "X86ISD::RCP14";
+ case X86ISD::RCP14S: return "X86ISD::RCP14S";
case X86ISD::RCP28: return "X86ISD::RCP28";
case X86ISD::RCP28S: return "X86ISD::RCP28S";
case X86ISD::EXP2: return "X86ISD::EXP2";
+ case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
+ case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
@@ -24780,6 +25361,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
+ case X86ISD::CVTPH2PS_RND: return "X86ISD::CVTPH2PS_RND";
case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
@@ -24788,6 +25370,15 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
case X86ISD::LWPINS: return "X86ISD::LWPINS";
case X86ISD::MGATHER: return "X86ISD::MGATHER";
+ case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
+ case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
+ case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
+ case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
+ case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
+ case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
+ case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
+ case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
+ case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
}
return nullptr;
}
@@ -24796,7 +25387,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
/// target, for a load/store of the specified type.
bool X86TargetLowering::isLegalAddressingMode(const DataLayout &DL,
const AddrMode &AM, Type *Ty,
- unsigned AS) const {
+ unsigned AS,
+ Instruction *I) const {
// X86 supports extremely general addressing modes.
CodeModel::Model M = getTargetMachine().getCodeModel();
@@ -24853,9 +25445,9 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
if (Bits == 8)
return false;
- // On AVX2 there are new vpsllv[dq] instructions (and other shifts), that make
- // variable shifts just as cheap as scalar ones.
- if (Subtarget.hasInt256() && (Bits == 32 || Bits == 64))
+ // AVX2 has vpsllv[dq] instructions (and other shifts) that make variable
+ // shifts just as cheap as scalar ones.
+ if (Subtarget.hasAVX2() && (Bits == 32 || Bits == 64))
return false;
// Otherwise, it's significantly cheaper to shift by a scalar amount than by a
@@ -24968,9 +25560,7 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
-bool
-X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M,
- EVT VT) const {
+bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
if (!VT.isSimple())
return false;
@@ -25522,7 +26112,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
int64_t RegSaveFrameIndex = MI.getOperand(1).getImm();
int64_t VarArgsFPOffset = MI.getOperand(2).getImm();
- if (!Subtarget.isCallingConvWin64(F->getFunction()->getCallingConv())) {
+ if (!Subtarget.isCallingConvWin64(F->getFunction().getCallingConv())) {
// If %al is 0, branch around the XMM save block.
BuildMI(MBB, DL, TII->get(X86::TEST8rr)).addReg(CountReg).addReg(CountReg);
BuildMI(MBB, DL, TII->get(X86::JE_1)).addMBB(EndMBB);
@@ -25627,65 +26217,76 @@ static bool isCMOVPseudo(MachineInstr &MI) {
}
}
-MachineBasicBlock *
-X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
- MachineBasicBlock *BB) const {
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
+// Helper function that inserts PHI nodes into SinkMBB:
+//   %Result(i) = phi [ %FalseValue(i), FalseMBB ], [ %TrueValue(i), TrueMBB ],
+// where %FalseValue(i) and %TrueValue(i) are taken from the consecutive CMOVs
+// in the [MIItBegin, MIItEnd) range. It returns the MachineInstrBuilder for
+// the last PHI node inserted.
+static MachineInstrBuilder createPHIsForCMOVsInSinkBB(
+ MachineBasicBlock::iterator MIItBegin, MachineBasicBlock::iterator MIItEnd,
+ MachineBasicBlock *TrueMBB, MachineBasicBlock *FalseMBB,
+ MachineBasicBlock *SinkMBB) {
+ MachineFunction *MF = TrueMBB->getParent();
+ const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+ DebugLoc DL = MIItBegin->getDebugLoc();
- // To "insert" a SELECT_CC instruction, we actually have to insert the
- // diamond control-flow pattern. The incoming instruction knows the
- // destination vreg to set, the condition code register to branch on, the
- // true/false values to select between, and a branch opcode to use.
- const BasicBlock *LLVM_BB = BB->getBasicBlock();
- MachineFunction::iterator It = ++BB->getIterator();
+ X86::CondCode CC = X86::CondCode(MIItBegin->getOperand(3).getImm());
+ X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
- // thisMBB:
- // ...
- // TrueVal = ...
- // cmpTY ccX, r1, r2
- // bCC copy1MBB
- // fallthrough --> copy0MBB
- MachineBasicBlock *thisMBB = BB;
- MachineFunction *F = BB->getParent();
+ MachineBasicBlock::iterator SinkInsertionPoint = SinkMBB->begin();
- // This code lowers all pseudo-CMOV instructions. Generally it lowers these
- // as described above, by inserting a BB, and then making a PHI at the join
- // point to select the true and false operands of the CMOV in the PHI.
- //
- // The code also handles two different cases of multiple CMOV opcodes
- // in a row.
- //
- // Case 1:
- // In this case, there are multiple CMOVs in a row, all which are based on
- // the same condition setting (or the exact opposite condition setting).
- // In this case we can lower all the CMOVs using a single inserted BB, and
- // then make a number of PHIs at the join point to model the CMOVs. The only
- // trickiness here, is that in a case like:
- //
- // t2 = CMOV cond1 t1, f1
- // t3 = CMOV cond1 t2, f2
- //
- // when rewriting this into PHIs, we have to perform some renaming on the
- // temps since you cannot have a PHI operand refer to a PHI result earlier
- // in the same block. The "simple" but wrong lowering would be:
- //
- // t2 = PHI t1(BB1), f1(BB2)
- // t3 = PHI t2(BB1), f2(BB2)
- //
- // but clearly t2 is not defined in BB1, so that is incorrect. The proper
- // renaming is to note that on the path through BB1, t2 is really just a
- // copy of t1, and do that renaming, properly generating:
- //
- // t2 = PHI t1(BB1), f1(BB2)
- // t3 = PHI t1(BB1), f2(BB2)
- //
- // Case 2, we lower cascaded CMOVs such as
+ // As we are creating the PHIs, we have to be careful if there is more than
+ // one. Later CMOVs may reference the results of earlier CMOVs, but later
+ // PHIs have to reference the individual true/false inputs from earlier PHIs.
+ // That also means that PHI construction must work forward from earlier to
+ // later, and that the code must maintain a mapping from each earlier PHI's
+ // destination register to the registers that went into that PHI.
+ DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
+ MachineInstrBuilder MIB;
+
+ for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
+ unsigned DestReg = MIIt->getOperand(0).getReg();
+ unsigned Op1Reg = MIIt->getOperand(1).getReg();
+ unsigned Op2Reg = MIIt->getOperand(2).getReg();
+
+ // If this CMOV we are generating is the opposite condition from
+ // the jump we generated, then we have to swap the operands for the
+ // PHI that is going to be generated.
+ if (MIIt->getOperand(3).getImm() == OppCC)
+ std::swap(Op1Reg, Op2Reg);
+
+ if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
+ Op1Reg = RegRewriteTable[Op1Reg].first;
+
+ if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
+ Op2Reg = RegRewriteTable[Op2Reg].second;
+
+ MIB = BuildMI(*SinkMBB, SinkInsertionPoint, DL, TII->get(X86::PHI), DestReg)
+ .addReg(Op1Reg)
+ .addMBB(FalseMBB)
+ .addReg(Op2Reg)
+ .addMBB(TrueMBB);
+
+ // Add this PHI to the rewrite table.
+ RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
+ }
+
+ return MIB;
+}
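Illustrative sketch, not part of the patch: the rewrite table above exists because a later CMOV may read the result of an earlier CMOV, but the PHI built for it may not reference a PHI defined in the same block - it has to use the earlier PHI's incoming values instead. A minimal standalone model with invented names (registers are plain ints):

#include <map>
#include <utility>

struct PhiIncoming { int FromFalseMBB, FromTrueMBB; };

// For each CMOV, substitute operands that were defined by an earlier CMOV in
// the group with that CMOV's PHI inputs, then record this PHI's inputs so
// still-later CMOVs can do the same.
static PhiIncoming buildPhiFor(int DestReg, int Op1Reg, int Op2Reg,
                               std::map<int, std::pair<int, int>> &Rewrite) {
  if (auto It = Rewrite.find(Op1Reg); It != Rewrite.end())
    Op1Reg = It->second.first;
  if (auto It = Rewrite.find(Op2Reg); It != Rewrite.end())
    Op2Reg = It->second.second;
  Rewrite[DestReg] = {Op1Reg, Op2Reg};
  return {Op1Reg, Op2Reg};
}

For the t2/t3 example quoted in EmitLoweredSelect below, this yields PHIs reading (t1, f1) and (t1, f2) rather than (t1, f1) and (t2, f2).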
+
+// Lower cascaded selects of the form (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2).
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredCascadedSelect(MachineInstr &FirstCMOV,
+ MachineInstr &SecondCascadedCMOV,
+ MachineBasicBlock *ThisMBB) const {
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = FirstCMOV.getDebugLoc();
+
+ // We lower cascaded CMOVs such as
//
- // (CMOV (CMOV F, T, cc1), T, cc2)
+ // (SecondCascadedCMOV (FirstCMOV F, T, cc1), T, cc2)
//
- // to two successive branches. For that, we look for another CMOV as the
- // following instruction.
+ // to two successive branches.
//
// Without this, we would add a PHI between the two jumps, which ends up
// creating a few copies all around. For instance, for
@@ -25749,10 +26350,145 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
// .LBB5_4:
// retq
//
- MachineInstr *CascadedCMOV = nullptr;
- MachineInstr *LastCMOV = &MI;
+
+ // We lower cascaded CMOV into two successive branches to the same block.
+ // EFLAGS is used by both, so mark it as live in the second.
+ const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
+ MachineFunction *F = ThisMBB->getParent();
+ MachineBasicBlock *FirstInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *SecondInsertedMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator It = ++ThisMBB->getIterator();
+ F->insert(It, FirstInsertedMBB);
+ F->insert(It, SecondInsertedMBB);
+ F->insert(It, SinkMBB);
+
+ // For a cascaded CMOV, we lower it to two successive branches to
+ // the same block (SinkMBB). EFLAGS is used by both, so mark it as live in
+ // the FirstInsertedMBB.
+ FirstInsertedMBB->addLiveIn(X86::EFLAGS);
+
+ // If the EFLAGS register isn't dead in the terminator, then claim that it's
+ // live into the sink and copy blocks.
+ const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+ if (!SecondCascadedCMOV.killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(SecondCascadedCMOV, ThisMBB, TRI)) {
+ SecondInsertedMBB->addLiveIn(X86::EFLAGS);
+ SinkMBB->addLiveIn(X86::EFLAGS);
+ }
+
+ // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
+ SinkMBB->splice(SinkMBB->begin(), ThisMBB,
+ std::next(MachineBasicBlock::iterator(FirstCMOV)),
+ ThisMBB->end());
+ SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
+
+ // Fallthrough block for ThisMBB.
+ ThisMBB->addSuccessor(FirstInsertedMBB);
+ // The true block target of the first branch is always SinkMBB.
+ ThisMBB->addSuccessor(SinkMBB);
+ // Fallthrough block for FirstInsertedMBB.
+ FirstInsertedMBB->addSuccessor(SecondInsertedMBB);
+ // The true block for the branch of FirstInsertedMBB.
+ FirstInsertedMBB->addSuccessor(SinkMBB);
+ // This is fallthrough.
+ SecondInsertedMBB->addSuccessor(SinkMBB);
+
+ // Create the conditional branch instructions.
+ X86::CondCode FirstCC = X86::CondCode(FirstCMOV.getOperand(3).getImm());
+ unsigned Opc = X86::GetCondBranchFromCond(FirstCC);
+ BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
+
+ X86::CondCode SecondCC =
+ X86::CondCode(SecondCascadedCMOV.getOperand(3).getImm());
+ unsigned Opc2 = X86::GetCondBranchFromCond(SecondCC);
+ BuildMI(FirstInsertedMBB, DL, TII->get(Opc2)).addMBB(SinkMBB);
+
+ // SinkMBB:
+ // %Result = phi [ %FalseValue, SecondInsertedMBB ], [ %TrueValue, ThisMBB ]
+ unsigned DestReg = FirstCMOV.getOperand(0).getReg();
+ unsigned Op1Reg = FirstCMOV.getOperand(1).getReg();
+ unsigned Op2Reg = FirstCMOV.getOperand(2).getReg();
+ MachineInstrBuilder MIB =
+ BuildMI(*SinkMBB, SinkMBB->begin(), DL, TII->get(X86::PHI), DestReg)
+ .addReg(Op1Reg)
+ .addMBB(SecondInsertedMBB)
+ .addReg(Op2Reg)
+ .addMBB(ThisMBB);
+
+ // The incoming value coming from FirstInsertedMBB is the same as the one
+ // coming from ThisMBB (the True operand of the SELECT_CC/CMOV nodes).
+ MIB.addReg(FirstCMOV.getOperand(2).getReg()).addMBB(FirstInsertedMBB);
+ // Copy the PHI result to the register defined by the second CMOV.
+ BuildMI(*SinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())), DL,
+ TII->get(TargetOpcode::COPY),
+ SecondCascadedCMOV.getOperand(0).getReg())
+ .addReg(FirstCMOV.getOperand(0).getReg());
+
+ // Now remove the CMOVs.
+ FirstCMOV.eraseFromParent();
+ SecondCascadedCMOV.eraseFromParent();
+
+ return SinkMBB;
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
+ MachineBasicBlock *ThisMBB) const {
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+
+ // To "insert" a SELECT_CC instruction, we actually have to insert the
+ // diamond control-flow pattern. The incoming instruction knows the
+ // destination vreg to set, the condition code register to branch on, the
+ // true/false values to select between and a branch opcode to use.
+
+ // ThisMBB:
+ // ...
+ // TrueVal = ...
+ // cmpTY ccX, r1, r2
+ // bCC copy1MBB
+ // fallthrough --> FalseMBB
+
+ // This code lowers all pseudo-CMOV instructions. Generally it lowers these
+ // as described above, by inserting a BB, and then making a PHI at the join
+ // point to select the true and false operands of the CMOV in the PHI.
+ //
+ // The code also handles two different cases of multiple CMOV opcodes
+ // in a row.
+ //
+ // Case 1:
+ // In this case, there are multiple CMOVs in a row, all which are based on
+ // the same condition setting (or the exact opposite condition setting).
+ // In this case we can lower all the CMOVs using a single inserted BB, and
+ // then make a number of PHIs at the join point to model the CMOVs. The only
+ // trickiness here, is that in a case like:
+ //
+ // t2 = CMOV cond1 t1, f1
+ // t3 = CMOV cond1 t2, f2
+ //
+ // when rewriting this into PHIs, we have to perform some renaming on the
+ // temps since you cannot have a PHI operand refer to a PHI result earlier
+ // in the same block. The "simple" but wrong lowering would be:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t2(BB1), f2(BB2)
+ //
+ // but clearly t2 is not defined in BB1, so that is incorrect. The proper
+ // renaming is to note that on the path through BB1, t2 is really just a
+ // copy of t1, and do that renaming, properly generating:
+ //
+ // t2 = PHI t1(BB1), f1(BB2)
+ // t3 = PHI t1(BB1), f2(BB2)
+ //
+ // Case 2:
+ // CMOV ((CMOV F, T, cc1), T, cc2) is checked here and handled by a separate
+ // function - EmitLoweredCascadedSelect.
+
X86::CondCode CC = X86::CondCode(MI.getOperand(3).getImm());
X86::CondCode OppCC = X86::GetOppositeBranchCondition(CC);
+ MachineInstr *LastCMOV = &MI;
MachineBasicBlock::iterator NextMIIt =
std::next(MachineBasicBlock::iterator(MI));
@@ -25762,7 +26498,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
if (isCMOVPseudo(MI)) {
// See if we have a string of CMOVS with the same condition.
- while (NextMIIt != BB->end() && isCMOVPseudo(*NextMIIt) &&
+ while (NextMIIt != ThisMBB->end() && isCMOVPseudo(*NextMIIt) &&
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
@@ -25772,136 +26508,61 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
// This checks for case 2, but only do this if we didn't already find
// case 1, as indicated by LastCMOV == MI.
- if (LastCMOV == &MI && NextMIIt != BB->end() &&
+ if (LastCMOV == &MI && NextMIIt != ThisMBB->end() &&
NextMIIt->getOpcode() == MI.getOpcode() &&
NextMIIt->getOperand(2).getReg() == MI.getOperand(2).getReg() &&
NextMIIt->getOperand(1).getReg() == MI.getOperand(0).getReg() &&
NextMIIt->getOperand(1).isKill()) {
- CascadedCMOV = &*NextMIIt;
+ return EmitLoweredCascadedSelect(MI, *NextMIIt, ThisMBB);
}
- MachineBasicBlock *jcc1MBB = nullptr;
+ const BasicBlock *LLVM_BB = ThisMBB->getBasicBlock();
+ MachineFunction *F = ThisMBB->getParent();
+ MachineBasicBlock *FalseMBB = F->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *SinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
- // If we have a cascaded CMOV, we lower it to two successive branches to
- // the same block. EFLAGS is used by both, so mark it as live in the second.
- if (CascadedCMOV) {
- jcc1MBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(It, jcc1MBB);
- jcc1MBB->addLiveIn(X86::EFLAGS);
- }
-
- MachineBasicBlock *copy0MBB = F->CreateMachineBasicBlock(LLVM_BB);
- MachineBasicBlock *sinkMBB = F->CreateMachineBasicBlock(LLVM_BB);
- F->insert(It, copy0MBB);
- F->insert(It, sinkMBB);
+ MachineFunction::iterator It = ++ThisMBB->getIterator();
+ F->insert(It, FalseMBB);
+ F->insert(It, SinkMBB);
// If the EFLAGS register isn't dead in the terminator, then claim that it's
// live into the sink and copy blocks.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
-
- MachineInstr *LastEFLAGSUser = CascadedCMOV ? CascadedCMOV : LastCMOV;
- if (!LastEFLAGSUser->killsRegister(X86::EFLAGS) &&
- !checkAndUpdateEFLAGSKill(LastEFLAGSUser, BB, TRI)) {
- copy0MBB->addLiveIn(X86::EFLAGS);
- sinkMBB->addLiveIn(X86::EFLAGS);
+ if (!LastCMOV->killsRegister(X86::EFLAGS) &&
+ !checkAndUpdateEFLAGSKill(LastCMOV, ThisMBB, TRI)) {
+ FalseMBB->addLiveIn(X86::EFLAGS);
+ SinkMBB->addLiveIn(X86::EFLAGS);
}
- // Transfer the remainder of BB and its successor edges to sinkMBB.
- sinkMBB->splice(sinkMBB->begin(), BB,
- std::next(MachineBasicBlock::iterator(LastCMOV)), BB->end());
- sinkMBB->transferSuccessorsAndUpdatePHIs(BB);
-
- // Add the true and fallthrough blocks as its successors.
- if (CascadedCMOV) {
- // The fallthrough block may be jcc1MBB, if we have a cascaded CMOV.
- BB->addSuccessor(jcc1MBB);
-
- // In that case, jcc1MBB will itself fallthrough the copy0MBB, and
- // jump to the sinkMBB.
- jcc1MBB->addSuccessor(copy0MBB);
- jcc1MBB->addSuccessor(sinkMBB);
- } else {
- BB->addSuccessor(copy0MBB);
- }
+ // Transfer the remainder of ThisMBB and its successor edges to SinkMBB.
+ SinkMBB->splice(SinkMBB->begin(), ThisMBB,
+ std::next(MachineBasicBlock::iterator(LastCMOV)),
+ ThisMBB->end());
+ SinkMBB->transferSuccessorsAndUpdatePHIs(ThisMBB);
- // The true block target of the first (or only) branch is always sinkMBB.
- BB->addSuccessor(sinkMBB);
+ // Fallthrough block for ThisMBB.
+ ThisMBB->addSuccessor(FalseMBB);
+ // The true block target of the first (or only) branch is always SinkMBB.
+ ThisMBB->addSuccessor(SinkMBB);
+ // Fallthrough block for FalseMBB.
+ FalseMBB->addSuccessor(SinkMBB);
// Create the conditional branch instruction.
unsigned Opc = X86::GetCondBranchFromCond(CC);
- BuildMI(BB, DL, TII->get(Opc)).addMBB(sinkMBB);
-
- if (CascadedCMOV) {
- unsigned Opc2 = X86::GetCondBranchFromCond(
- (X86::CondCode)CascadedCMOV->getOperand(3).getImm());
- BuildMI(jcc1MBB, DL, TII->get(Opc2)).addMBB(sinkMBB);
- }
+ BuildMI(ThisMBB, DL, TII->get(Opc)).addMBB(SinkMBB);
- // copy0MBB:
- // %FalseValue = ...
- // # fallthrough to sinkMBB
- copy0MBB->addSuccessor(sinkMBB);
-
- // sinkMBB:
- // %Result = phi [ %FalseValue, copy0MBB ], [ %TrueValue, thisMBB ]
+ // SinkMBB:
+ // %Result = phi [ %FalseValue, FalseMBB ], [ %TrueValue, ThisMBB ]
// ...
MachineBasicBlock::iterator MIItBegin = MachineBasicBlock::iterator(MI);
MachineBasicBlock::iterator MIItEnd =
- std::next(MachineBasicBlock::iterator(LastCMOV));
- MachineBasicBlock::iterator SinkInsertionPoint = sinkMBB->begin();
- DenseMap<unsigned, std::pair<unsigned, unsigned>> RegRewriteTable;
- MachineInstrBuilder MIB;
-
- // As we are creating the PHIs, we have to be careful if there is more than
- // one. Later CMOVs may reference the results of earlier CMOVs, but later
- // PHIs have to reference the individual true/false inputs from earlier PHIs.
- // That also means that PHI construction must work forward from earlier to
- // later, and that the code must maintain a mapping from earlier PHI's
- // destination registers, and the registers that went into the PHI.
-
- for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; ++MIIt) {
- unsigned DestReg = MIIt->getOperand(0).getReg();
- unsigned Op1Reg = MIIt->getOperand(1).getReg();
- unsigned Op2Reg = MIIt->getOperand(2).getReg();
-
- // If this CMOV we are generating is the opposite condition from
- // the jump we generated, then we have to swap the operands for the
- // PHI that is going to be generated.
- if (MIIt->getOperand(3).getImm() == OppCC)
- std::swap(Op1Reg, Op2Reg);
-
- if (RegRewriteTable.find(Op1Reg) != RegRewriteTable.end())
- Op1Reg = RegRewriteTable[Op1Reg].first;
-
- if (RegRewriteTable.find(Op2Reg) != RegRewriteTable.end())
- Op2Reg = RegRewriteTable[Op2Reg].second;
-
- MIB = BuildMI(*sinkMBB, SinkInsertionPoint, DL,
- TII->get(X86::PHI), DestReg)
- .addReg(Op1Reg).addMBB(copy0MBB)
- .addReg(Op2Reg).addMBB(thisMBB);
-
- // Add this PHI to the rewrite table.
- RegRewriteTable[DestReg] = std::make_pair(Op1Reg, Op2Reg);
- }
-
- // If we have a cascaded CMOV, the second Jcc provides the same incoming
- // value as the first Jcc (the True operand of the SELECT_CC/CMOV nodes).
- if (CascadedCMOV) {
- MIB.addReg(MI.getOperand(2).getReg()).addMBB(jcc1MBB);
- // Copy the PHI result to the register defined by the second CMOV.
- BuildMI(*sinkMBB, std::next(MachineBasicBlock::iterator(MIB.getInstr())),
- DL, TII->get(TargetOpcode::COPY),
- CascadedCMOV->getOperand(0).getReg())
- .addReg(MI.getOperand(0).getReg());
- CascadedCMOV->eraseFromParent();
- }
+ std::next(MachineBasicBlock::iterator(LastCMOV));
+ createPHIsForCMOVsInSinkBB(MIItBegin, MIItEnd, ThisMBB, FalseMBB, SinkMBB);
// Now remove the CMOV(s).
- for (MachineBasicBlock::iterator MIIt = MIItBegin; MIIt != MIItEnd; )
- (MIIt++)->eraseFromParent();
+ ThisMBB->erase(MIItBegin, MIItEnd);
- return sinkMBB;
+ return SinkMBB;
}
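[Editor's note] The routine above rewrites a straight-line group of CMOV pseudo-instructions into a single branch diamond; each CMOV becomes a PHI in SinkMBB. A minimal scalar sketch of the control flow it builds (ThisMBB/FalseMBB/SinkMBB as in the code, everything else hypothetical, not LLVM code):

// One conditional branch guards a whole group of CMOVs; each CMOV
// "D = CC ? T : F" becomes a PHI in SinkMBB merging the value arriving from
// ThisMBB (branch taken) with the value from FalseMBB (fallthrough).
static void cmovGroup(bool CC, int T0, int F0, int T1, int F1, int &D0, int &D1) {
  if (CC) {   // ThisMBB: single JCC to SinkMBB
    D0 = T0;  // values live on the ThisMBB -> SinkMBB edge
    D1 = T1;
  } else {    // FalseMBB: fallthrough block, then falls into SinkMBB
    D0 = F0;
    D1 = F1;
  }
  // SinkMBB: D0 = phi [T0, ThisMBB], [F0, FalseMBB]
  //          D1 = phi [T1, ThisMBB], [F1, FalseMBB]
}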
MachineBasicBlock *
@@ -26094,7 +26755,7 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
DebugLoc DL = MI.getDebugLoc();
assert(!isAsynchronousEHPersonality(
- classifyEHPersonality(MF->getFunction()->getPersonalityFn())) &&
+ classifyEHPersonality(MF->getFunction().getPersonalityFn())) &&
"SEH does not use catchret!");
// Only 32-bit EH needs to worry about manually restoring stack pointers.
@@ -26121,7 +26782,7 @@ MachineBasicBlock *
X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
MachineBasicBlock *BB) const {
MachineFunction *MF = BB->getParent();
- const Constant *PerFn = MF->getFunction()->getPersonalityFn();
+ const Constant *PerFn = MF->getFunction().getPersonalityFn();
bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
// Only 32-bit SEH requires special handling for catchpad.
if (IsSEH && Subtarget.is32Bit()) {
@@ -26480,7 +27141,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
}
MachineInstrBuilder MIB = BuildMI(*MBB, MI, DL, TII->get(Op));
- addFrameReference(MIB, FI, 36);
+ addFrameReference(MIB, FI, Subtarget.is64Bit() ? 56 : 36);
if (UseImmLabel)
MIB.addMBB(DispatchBB);
else
@@ -26562,8 +27223,8 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
SetupEntryBlockForSjLj(MI, BB, DispatchBB, FI);
// Create the jump table and associated information
- MachineJumpTableInfo *JTI =
- MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
+ unsigned JTE = getJumpTableEncoding();
+ MachineJumpTableInfo *JTI = MF->getOrCreateJumpTableInfo(JTE);
unsigned MJTI = JTI->createJumpTableIndex(LPadList);
const X86RegisterInfo &RI = TII->getRegisterInfo();
@@ -26586,25 +27247,76 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addRegMask(RI.getNoPreservedMask());
}
- unsigned IReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ // IReg is used as an index in a memory operand and therefore can't be SP
+ unsigned IReg = MRI->createVirtualRegister(&X86::GR32_NOSPRegClass);
addFrameReference(BuildMI(DispatchBB, DL, TII->get(X86::MOV32rm), IReg), FI,
- 4);
+ Subtarget.is64Bit() ? 8 : 4);
BuildMI(DispatchBB, DL, TII->get(X86::CMP32ri))
.addReg(IReg)
.addImm(LPadList.size());
- BuildMI(DispatchBB, DL, TII->get(X86::JA_1)).addMBB(TrapBB);
+ BuildMI(DispatchBB, DL, TII->get(X86::JAE_1)).addMBB(TrapBB);
- unsigned JReg = MRI->createVirtualRegister(&X86::GR32RegClass);
- BuildMI(DispContBB, DL, TII->get(X86::SUB32ri), JReg)
- .addReg(IReg)
- .addImm(1);
- BuildMI(DispContBB, DL,
- TII->get(Subtarget.is64Bit() ? X86::JMP64m : X86::JMP32m))
- .addReg(0)
- .addImm(Subtarget.is64Bit() ? 8 : 4)
- .addReg(JReg)
- .addJumpTableIndex(MJTI)
- .addReg(0);
+ if (Subtarget.is64Bit()) {
+ unsigned BReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+ unsigned IReg64 = MRI->createVirtualRegister(&X86::GR64_NOSPRegClass);
+
+ // leaq .LJTI0_0(%rip), BReg
+ BuildMI(DispContBB, DL, TII->get(X86::LEA64r), BReg)
+ .addReg(X86::RIP)
+ .addImm(1)
+ .addReg(0)
+ .addJumpTableIndex(MJTI)
+ .addReg(0);
+ // movzx IReg64, IReg
+ BuildMI(DispContBB, DL, TII->get(TargetOpcode::SUBREG_TO_REG), IReg64)
+ .addImm(0)
+ .addReg(IReg)
+ .addImm(X86::sub_32bit);
+
+ switch (JTE) {
+ case MachineJumpTableInfo::EK_BlockAddress:
+ // jmpq *(BReg,IReg64,8)
+ BuildMI(DispContBB, DL, TII->get(X86::JMP64m))
+ .addReg(BReg)
+ .addImm(8)
+ .addReg(IReg64)
+ .addImm(0)
+ .addReg(0);
+ break;
+ case MachineJumpTableInfo::EK_LabelDifference32: {
+ unsigned OReg = MRI->createVirtualRegister(&X86::GR32RegClass);
+ unsigned OReg64 = MRI->createVirtualRegister(&X86::GR64RegClass);
+ unsigned TReg = MRI->createVirtualRegister(&X86::GR64RegClass);
+
+ // movl (BReg,IReg64,4), OReg
+ BuildMI(DispContBB, DL, TII->get(X86::MOV32rm), OReg)
+ .addReg(BReg)
+ .addImm(4)
+ .addReg(IReg64)
+ .addImm(0)
+ .addReg(0);
+ // movsx OReg64, OReg
+ BuildMI(DispContBB, DL, TII->get(X86::MOVSX64rr32), OReg64).addReg(OReg);
+ // addq BReg, OReg64, TReg
+ BuildMI(DispContBB, DL, TII->get(X86::ADD64rr), TReg)
+ .addReg(OReg64)
+ .addReg(BReg);
+ // jmpq *TReg
+ BuildMI(DispContBB, DL, TII->get(X86::JMP64r)).addReg(TReg);
+ break;
+ }
+ default:
+ llvm_unreachable("Unexpected jump table encoding");
+ }
+ } else {
+ // jmpl *.LJTI0_0(,IReg,4)
+ BuildMI(DispContBB, DL, TII->get(X86::JMP32m))
+ .addReg(0)
+ .addImm(4)
+ .addReg(IReg)
+ .addJumpTableIndex(MJTI)
+ .addReg(0);
+ }
// Add the jump table entries as successors to the MBB.
SmallPtrSet<MachineBasicBlock *, 8> SeenMBBs;
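[Editor's note] For the 64-bit dispatch added above, the EK_LabelDifference32 encoding stores each landing-pad address as a 32-bit offset from the jump table itself, which is what the LEA/MOV/MOVSX/ADD/JMP sequence computes. A stand-alone sketch of that address calculation (hypothetical names, not LLVM code):

#include <cstdint>

static const void *jumpTarget(const int32_t *TableBase, uint32_t Index) {
  intptr_t Offset = TableBase[Index];                               // movl (BReg,IReg64,4), OReg; sign-extended
  intptr_t Target = reinterpret_cast<intptr_t>(TableBase) + Offset; // movsx + addq BReg
  return reinterpret_cast<const void *>(Target);                    // jmpq *TReg
}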
@@ -26975,21 +27687,6 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.resetAll();
switch (Opc) {
default: break;
- case X86ISD::ADD:
- case X86ISD::SUB:
- case X86ISD::ADC:
- case X86ISD::SBB:
- case X86ISD::SMUL:
- case X86ISD::UMUL:
- case X86ISD::INC:
- case X86ISD::DEC:
- case X86ISD::OR:
- case X86ISD::XOR:
- case X86ISD::AND:
- // These nodes' second result is a boolean.
- if (Op.getResNo() == 0)
- break;
- LLVM_FALLTHROUGH;
case X86ISD::SETCC:
Known.Zero.setBitsFrom(1);
break;
@@ -26998,6 +27695,17 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.Zero.setBitsFrom(NumLoBits);
break;
}
+ case X86ISD::PEXTRB:
+ case X86ISD::PEXTRW: {
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
+ Op.getConstantOperandVal(1));
+ DAG.computeKnownBits(Src, Known, DemandedElt, Depth + 1);
+ Known = Known.zextOrTrunc(BitWidth);
+ Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
+ break;
+ }
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
@@ -27006,7 +27714,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
}
- DAG.computeKnownBits(Op.getOperand(0), Known, Depth + 1);
+ DAG.computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1);
unsigned ShAmt = ShiftImm->getZExtValue();
if (Opc == X86ISD::VSHLI) {
Known.Zero <<= ShAmt;
@@ -27023,6 +27731,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
break;
}
case X86ISD::VZEXT: {
+ // TODO: Add DemandedElts support.
SDValue N0 = Op.getOperand(0);
unsigned NumElts = VT.getVectorNumElements();
@@ -27038,6 +27747,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.Zero.setBitsFrom(InBitWidth);
break;
}
+ case X86ISD::CMOV: {
+ DAG.computeKnownBits(Op.getOperand(1), Known, Depth+1);
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ KnownBits Known2;
+ DAG.computeKnownBits(Op.getOperand(0), Known2, Depth+1);
+
+ // Only known if known in both the LHS and RHS.
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ break;
+ }
+ case X86ISD::UDIVREM8_ZEXT_HREG:
+ // TODO: Support more than just the zero extended bits?
+ if (Op.getResNo() != 1)
+ break;
+ // The remainder is zero extended.
+ Known.Zero.setBitsFrom(8);
+ break;
}
}
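[Editor's note] The new X86ISD::CMOV case intersects the known bits of both operands, since either one may be selected at run time. A tiny model of that rule, using a simplified stand-in struct rather than llvm::KnownBits:

#include <cstdint>

struct KnownBitsModel {
  uint64_t Zero = 0; // bits known to be zero
  uint64_t One = 0;  // bits known to be one
};

static KnownBitsModel knownBitsOfCMov(KnownBitsModel T, KnownBitsModel F) {
  KnownBitsModel K;
  K.One = T.One & F.One;    // known one in both arms
  K.Zero = T.Zero & F.Zero; // known zero in both arms
  return K;
}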
@@ -27052,18 +27781,42 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
return VTBits;
case X86ISD::VSEXT: {
+ // TODO: Add DemandedElts support.
SDValue Src = Op.getOperand(0);
unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
Tmp += VTBits - Src.getScalarValueSizeInBits();
return Tmp;
}
- case X86ISD::VSHLI: {
+ case X86ISD::VTRUNC: {
+ // TODO: Add DemandedElts support.
SDValue Src = Op.getOperand(0);
+ unsigned NumSrcBits = Src.getScalarValueSizeInBits();
+ assert(VTBits < NumSrcBits && "Illegal truncation input type");
unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ if (Tmp > (NumSrcBits - VTBits))
+ return Tmp - (NumSrcBits - VTBits);
+ return 1;
+ }
+
+ case X86ISD::PACKSS: {
+ // PACKSS is just a truncation if the sign bits extend to the packed size.
+ // TODO: Add DemandedElts support.
+ unsigned SrcBits = Op.getOperand(0).getScalarValueSizeInBits();
+ unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+ unsigned Tmp = std::min(Tmp0, Tmp1);
+ if (Tmp > (SrcBits - VTBits))
+ return Tmp - (SrcBits - VTBits);
+ return 1;
+ }
+
+ case X86ISD::VSHLI: {
+ SDValue Src = Op.getOperand(0);
APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
if (ShiftVal.uge(VTBits))
return VTBits; // Shifted all bits out --> zero.
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
if (ShiftVal.uge(Tmp))
return 1; // Shifted all sign bits out --> unknown.
return Tmp - ShiftVal.getZExtValue();
@@ -27071,8 +27824,10 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
case X86ISD::VSRAI: {
SDValue Src = Op.getOperand(0);
- unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
APInt ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getAPIntValue();
+ if (ShiftVal.uge(VTBits - 1))
+ return VTBits; // Sign splat.
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedElts, Depth + 1);
ShiftVal += Tmp;
return ShiftVal.uge(VTBits) ? VTBits : ShiftVal.getZExtValue();
}
@@ -27084,12 +27839,31 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
case X86ISD::VPCOMU:
// Vector compares return zero/all-bits result values.
return VTBits;
+
+ case X86ISD::CMOV: {
+ unsigned Tmp0 = DAG.ComputeNumSignBits(Op.getOperand(0), Depth+1);
+ if (Tmp0 == 1) return 1; // Early out.
+ unsigned Tmp1 = DAG.ComputeNumSignBits(Op.getOperand(1), Depth+1);
+ return std::min(Tmp0, Tmp1);
+ }
+ case X86ISD::SDIVREM8_SEXT_HREG:
+ // TODO: Support more than just the sign extended bits?
+ if (Op.getResNo() != 1)
+ break;
+ // The remainder is sign extended.
+ return VTBits - 7;
}
// Fallback case.
return 1;
}
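[Editor's note] The VTRUNC and PACKSS cases added above both reduce to the same arithmetic: a truncation keeps whatever sign-bit copies survive the dropped high bits, with a floor of one. A small sketch of that rule (plain C++, illustrative only):

// Truncating from SrcBits to DstBits discards (SrcBits - DstBits) of the
// leading sign-bit copies; at least one sign bit always remains.
static unsigned signBitsAfterTrunc(unsigned SrcSignBits, unsigned SrcBits,
                                   unsigned DstBits) {
  unsigned Dropped = SrcBits - DstBits;
  return SrcSignBits > Dropped ? SrcSignBits - Dropped : 1;
}
// e.g. a v8i32 source with 20 known sign bits packed to i16 lanes (PACKSS)
// keeps 20 - 16 = 4 known sign bits per lane.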
+SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
+ if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
+ return N->getOperand(0);
+ return N;
+}
+
/// Returns true (and the GlobalValue and the offset) if the node is a
/// GlobalAddress + offset.
bool X86TargetLowering::isGAPlusOffset(SDNode *N,
@@ -27130,13 +27904,18 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
if (Match) {
unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
- SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
- if (SrcVT != MaskVT)
+ MVT ScalarTy = MaskVT.isInteger() ? MaskVT.getScalarType() :
+ MVT::getIntegerVT(MaskEltSize);
+ SrcVT = MVT::getVectorVT(ScalarTy, SrcSize / MaskEltSize);
+
+ if (SrcVT.getSizeInBits() != MaskVT.getSizeInBits()) {
V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
+ Shuffle = unsigned(X86ISD::VZEXT);
+ } else
+ Shuffle = unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
+
DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
DstVT = MVT::getVectorVT(DstVT, NumDstElts);
- Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
- : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
return true;
}
}
@@ -27155,7 +27934,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// instructions are no slower than UNPCKLPD but has the option to
// fold the input operand into even an unaligned memory load.
if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
- if (isTargetShuffleEquivalent(Mask, {0, 0})) {
+ if (!Subtarget.hasAVX2() && isTargetShuffleEquivalent(Mask, {0, 0})) {
Shuffle = X86ISD::MOVDDUP;
SrcVT = DstVT = MVT::v2f64;
return true;
@@ -27290,7 +28069,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Narrow the repeated mask to create 32-bit element permutes.
SmallVector<int, 4> WordMask = RepeatedMask;
if (MaskScalarSizeInBits == 64)
- scaleShuffleMask(2, RepeatedMask, WordMask);
+ scaleShuffleMask<int>(2, RepeatedMask, WordMask);
Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
@@ -27356,7 +28135,7 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
SDValue &V1, SDValue &V2, SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
- unsigned &Shuffle, MVT &ShuffleVT,
+ unsigned &Shuffle, MVT &SrcVT, MVT &DstVT,
bool IsUnary) {
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
@@ -27364,26 +28143,36 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVLHPS;
- ShuffleVT = MVT::v4f32;
+ SrcVT = DstVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
V2 = V1;
Shuffle = X86ISD::MOVHLPS;
- ShuffleVT = MVT::v4f32;
+ SrcVT = DstVT = MVT::v4f32;
return true;
}
if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
std::swap(V1, V2);
Shuffle = X86ISD::MOVSD;
- ShuffleVT = MaskVT;
+ SrcVT = DstVT = MaskVT;
return true;
}
if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
(AllowFloatDomain || !Subtarget.hasSSE41())) {
Shuffle = X86ISD::MOVSS;
- ShuffleVT = MaskVT;
+ SrcVT = DstVT = MaskVT;
+ return true;
+ }
+ }
+
+ // Attempt to match against either a unary or binary PACKSS/PACKUS shuffle.
+ // TODO add support for 256/512-bit types.
+ if ((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) {
+ if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
+ Subtarget)) {
+ DstVT = MaskVT;
return true;
}
}
@@ -27396,9 +28185,9 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
DAG, Subtarget)) {
- ShuffleVT = MaskVT;
- if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
- ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+ SrcVT = DstVT = MaskVT;
+ if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
+ SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
return true;
}
}
@@ -27572,11 +28361,11 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
/// into either a single instruction if there is a special purpose instruction
/// for this operation, or into a PSHUFB instruction which is a fully general
/// instruction but should only be used to replace chains over a certain depth.
-static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
- ArrayRef<int> BaseMask, int Depth,
- bool HasVariableMask, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
+ ArrayRef<int> BaseMask, int Depth,
+ bool HasVariableMask, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
assert(!BaseMask.empty() && "Cannot combine an empty shuffle mask!");
assert((Inputs.size() == 1 || Inputs.size() == 2) &&
"Unexpected number of shuffle inputs!");
@@ -27601,9 +28390,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
unsigned NumBaseMaskElts = BaseMask.size();
if (NumBaseMaskElts == 1) {
assert(BaseMask[0] == 0 && "Invalid shuffle index found!");
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, V1);
}
unsigned RootSizeInBits = RootVT.getSizeInBits();
@@ -27621,16 +28408,19 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
bool IsEVEXShuffle =
RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
if (IsEVEXShuffle && (RootVT.getScalarSizeInBits() != BaseMaskEltSizeInBits))
- return false;
+ return SDValue();
// TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
// Handle 128-bit lane shuffles of 256-bit vectors.
+ // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
+ // we need to use the zeroing feature.
// TODO - this should support binary shuffles.
if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
+ !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
!isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::VPERM2X128)
- return false; // Nothing to do!
+ return SDValue(); // Nothing to do!
MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
unsigned PermMask = 0;
PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
@@ -27642,9 +28432,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG.getUNDEF(ShuffleVT),
DAG.getConstant(PermMask, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
// For masks that have been widened to 128-bit elements or more,
@@ -27653,7 +28441,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
- scaleShuffleMask(MaskScale, BaseMask, Mask);
+ scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
} else {
Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
}
@@ -27669,7 +28457,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Only allow legal mask types.
if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
- return false;
+ return SDValue();
// Attempt to match the mask against known shuffle patterns.
MVT ShuffleSrcVT, ShuffleVT;
@@ -27678,7 +28466,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Which shuffle domains are permitted?
// Permit domain crossing at higher combine depths.
bool AllowFloatDomain = FloatDomain || (Depth > 3);
- bool AllowIntDomain = (!FloatDomain || (Depth > 3)) &&
+ bool AllowIntDomain = (!FloatDomain || (Depth > 3)) && Subtarget.hasSSE2() &&
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
@@ -27697,9 +28485,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, V1),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, V1);
}
}
@@ -27707,52 +28493,46 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
- return false; // Nothing to do!
+ return SDValue(); // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
- return false; // AVX512 Writemask clash.
+ return SDValue(); // AVX512 Writemask clash.
Res = DAG.getBitcast(ShuffleSrcVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res);
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle,
ShuffleVT, PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
- return false; // Nothing to do!
+ return SDValue(); // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
- return false; // AVX512 Writemask clash.
+ return SDValue(); // AVX512 Writemask clash.
Res = DAG.getBitcast(ShuffleVT, V1);
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, Res,
DAG.getConstant(PermuteImm, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
}
if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
- V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
- UnaryShuffle)) {
+ V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+ ShuffleVT, UnaryShuffle)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
- return false; // Nothing to do!
+ return SDValue(); // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
- return false; // AVX512 Writemask clash.
- V1 = DAG.getBitcast(ShuffleVT, V1);
+ return SDValue(); // AVX512 Writemask clash.
+ V1 = DAG.getBitcast(ShuffleSrcVT, V1);
DCI.AddToWorklist(V1.getNode());
- V2 = DAG.getBitcast(ShuffleVT, V2);
+ V2 = DAG.getBitcast(ShuffleSrcVT, V2);
DCI.AddToWorklist(V2.getNode());
Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2);
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
@@ -27760,9 +28540,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Subtarget, Shuffle, ShuffleVT,
PermuteImm)) {
if (Depth == 1 && Root.getOpcode() == Shuffle)
- return false; // Nothing to do!
+ return SDValue(); // Nothing to do!
if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
- return false; // AVX512 Writemask clash.
+ return SDValue(); // AVX512 Writemask clash.
V1 = DAG.getBitcast(ShuffleVT, V1);
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(ShuffleVT, V2);
@@ -27770,9 +28550,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Res = DAG.getNode(Shuffle, DL, ShuffleVT, V1, V2,
DAG.getConstant(PermuteImm, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
// Typically from here on, we need an integer version of MaskVT.
@@ -27785,21 +28563,19 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchVectorShuffleAsEXTRQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx,
Zeroable)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::EXTRQI)
- return false; // Nothing to do!
+ return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
DCI.AddToWorklist(V1.getNode());
Res = DAG.getNode(X86ISD::EXTRQI, DL, IntMaskVT, V1,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
if (matchVectorShuffleAsINSERTQ(IntMaskVT, V1, V2, Mask, BitLen, BitIdx)) {
if (Depth == 1 && Root.getOpcode() == X86ISD::INSERTQI)
- return false; // Nothing to do!
+ return SDValue(); // Nothing to do!
V1 = DAG.getBitcast(IntMaskVT, V1);
DCI.AddToWorklist(V1.getNode());
V2 = DAG.getBitcast(IntMaskVT, V2);
@@ -27808,23 +28584,25 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DAG.getConstant(BitLen, DL, MVT::i8),
DAG.getConstant(BitIdx, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
}
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 2)
- return false;
+ return SDValue();
+
+ // Depth threshold above which we can efficiently use variable mask shuffles.
+ // TODO This should probably be target specific.
+ bool AllowVariableMask = (Depth >= 3) || HasVariableMask;
bool MaskContainsZeros =
any_of(Mask, [](int M) { return M == SM_SentinelZero; });
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
- if (UnaryShuffle && (Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+ if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX2() &&
(MaskVT == MVT::v8f32 || MaskVT == MVT::v8i32)) ||
(Subtarget.hasAVX512() &&
@@ -27840,14 +28618,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERMV, DL, MaskVT, VPermMask, Res);
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
// Lower a unary+zero lane-crossing shuffle as VPERMV3 with a zero
// vector as the second source.
- if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+ if (UnaryShuffle && AllowVariableMask &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
@@ -27871,13 +28647,11 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DCI.AddToWorklist(Zero.getNode());
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, Res, VPermMask, Zero);
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
// If we have a dual input lane-crossing shuffle then lower to VPERMV3.
- if ((Depth >= 3 || HasVariableMask) && !MaskContainsZeros &&
+ if (AllowVariableMask && !MaskContainsZeros &&
((Subtarget.hasAVX512() &&
(MaskVT == MVT::v8f64 || MaskVT == MVT::v8i64 ||
MaskVT == MVT::v16f32 || MaskVT == MVT::v16i32)) ||
@@ -27896,16 +28670,14 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DCI.AddToWorklist(V2.getNode());
Res = DAG.getNode(X86ISD::VPERMV3, DL, MaskVT, V1, VPermMask, V2);
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
- return false;
+ return SDValue();
}
// See if we can combine a single input shuffle with zeros to a bit-mask,
// which is much simpler than any shuffle.
- if (UnaryShuffle && MaskContainsZeros && (Depth >= 3 || HasVariableMask) &&
+ if (UnaryShuffle && MaskContainsZeros && AllowVariableMask &&
isSequentialOrUndefOrZeroInRange(Mask, 0, NumMaskElts, 0) &&
DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
@@ -27930,15 +28702,13 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes, use the variable mask to VPERMILPS.
// TODO Combine other mask types at higher depths.
- if (UnaryShuffle && HasVariableMask && !MaskContainsZeros &&
+ if (UnaryShuffle && AllowVariableMask && !MaskContainsZeros &&
((MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
(MaskVT == MVT::v16f32 && Subtarget.hasAVX512()))) {
SmallVector<SDValue, 16> VPermIdx;
@@ -27953,14 +28723,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DCI.AddToWorklist(Res.getNode());
Res = DAG.getNode(X86ISD::VPERMILPV, DL, MaskVT, Res, VPermMask);
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
// With XOP, binary shuffles of 128/256-bit floating point vectors can combine
// to VPERMIL2PD/VPERMIL2PS.
- if ((Depth >= 3 || HasVariableMask) && Subtarget.hasXOP() &&
+ if (AllowVariableMask && Subtarget.hasXOP() &&
(MaskVT == MVT::v2f64 || MaskVT == MVT::v4f64 || MaskVT == MVT::v4f32 ||
MaskVT == MVT::v8f32)) {
// VPERMIL2 Operation.
@@ -27994,9 +28762,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
Res = DAG.getNode(X86ISD::VPERMIL2, DL, MaskVT, V1, V2, VPerm2MaskOp,
DAG.getConstant(M2ZImm, DL, MVT::i8));
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
// If we have 3 or more shuffle instructions or a chain involving a variable
@@ -28004,7 +28770,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Intel's manuals suggest only using PSHUFB if doing so replacing 5
// instructions, but in practice PSHUFB tends to be *very* fast so we're
// more aggressive.
- if (UnaryShuffle && (Depth >= 3 || HasVariableMask) &&
+ if (UnaryShuffle && AllowVariableMask &&
((RootVT.is128BitVector() && Subtarget.hasSSSE3()) ||
(RootVT.is256BitVector() && Subtarget.hasAVX2()) ||
(RootVT.is512BitVector() && Subtarget.hasBWI()))) {
@@ -28022,7 +28788,7 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
continue;
}
M = Ratio * M + i % Ratio;
- assert ((M / 16) == (i / 16) && "Lane crossing detected");
+ assert((M / 16) == (i / 16) && "Lane crossing detected");
PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8));
}
MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes);
@@ -28032,16 +28798,13 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DCI.AddToWorklist(PSHUFBMaskOp.getNode());
Res = DAG.getNode(X86ISD::PSHUFB, DL, ByteVT, Res, PSHUFBMaskOp);
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
// With XOP, if we have a 128-bit binary input shuffle we can always combine
// to VPPERM. We match the depth requirement of PSHUFB - VPPERM is never
// slower than PSHUFB on targets that support both.
- if ((Depth >= 3 || HasVariableMask) && RootVT.is128BitVector() &&
- Subtarget.hasXOP()) {
+ if (AllowVariableMask && RootVT.is128BitVector() && Subtarget.hasXOP()) {
// VPPERM Mask Operation
// Bits[4:0] - Byte Index (0 - 31)
// Bits[7:5] - Permute Operation (0 - Source byte, 4 - ZERO)
@@ -28070,23 +28833,22 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
DCI.AddToWorklist(VPPERMMaskOp.getNode());
Res = DAG.getNode(X86ISD::VPPERM, DL, ByteVT, V1, V2, VPPERMMaskOp);
DCI.AddToWorklist(Res.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(RootVT, Res),
- /*AddTo*/ true);
- return true;
+ return DAG.getBitcast(RootVT, Res);
}
// Failed to find any combines.
- return false;
+ return SDValue();
}
// Attempt to constant fold all of the constant source ops.
// Returns true if the entire shuffle is folded to a constant.
// TODO: Extend this to merge multiple constant Ops and update the mask.
-static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
- ArrayRef<int> Mask, SDValue Root,
- bool HasVariableMask, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
+ ArrayRef<int> Mask, SDValue Root,
+ bool HasVariableMask,
+ SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
MVT VT = Root.getSimpleValueType();
unsigned SizeInBits = VT.getSizeInBits();
@@ -28103,14 +28865,14 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
OneUseConstantOp |= SrcOp.hasOneUse();
if (!getTargetConstantBitsFromNode(SrcOp, MaskSizeInBits, UndefEltsOps[i],
RawBitsOps[i]))
- return false;
+ return SDValue();
}
// Only fold if at least one of the constants is only used once or
// the combined shuffle has included a variable mask shuffle, this
// is to avoid constant pool bloat.
if (!OneUseConstantOp && !HasVariableMask)
- return false;
+ return SDValue();
// Shuffle the constant bits according to the mask.
APInt UndefElts(NumMaskElts, 0);
@@ -28162,8 +28924,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
SDLoc DL(Root);
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
DCI.AddToWorklist(CstOp.getNode());
- DCI.CombineTo(Root.getNode(), DAG.getBitcast(VT, CstOp));
- return true;
+ return DAG.getBitcast(VT, CstOp);
}
/// \brief Fully generic combining of x86 shuffle instructions.
@@ -28195,18 +28956,15 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
/// would simplify under the threshold for PSHUFB formation because of
/// combine-ordering. To fix this, we should do the redundant instruction
/// combining in this recursive walk.
-static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
- int SrcOpIndex, SDValue Root,
- ArrayRef<int> RootMask,
- ArrayRef<const SDNode*> SrcNodes,
- int Depth, bool HasVariableMask,
- SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineX86ShufflesRecursively(
+ ArrayRef<SDValue> SrcOps, int SrcOpIndex, SDValue Root,
+ ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, int Depth,
+ bool HasVariableMask, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) {
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
if (Depth > 8)
- return false;
+ return SDValue();
// Directly rip through bitcasts to find the underlying operand.
SDValue Op = SrcOps[SrcOpIndex];
@@ -28214,7 +28972,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
MVT VT = Op.getSimpleValueType();
if (!VT.isVector())
- return false; // Bail if we hit a non-vector.
+ return SDValue(); // Bail if we hit a non-vector.
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
@@ -28225,7 +28983,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
SmallVector<int, 64> OpMask;
SmallVector<SDValue, 2> OpInputs;
if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask, DAG))
- return false;
+ return SDValue();
assert(OpInputs.size() <= 2 && "Too many shuffle inputs");
SDValue Input0 = (OpInputs.size() > 0 ? OpInputs[0] : SDValue());
@@ -28334,18 +29092,15 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
}
// Handle the all undef/zero cases early.
- if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; })) {
- DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType()));
- return true;
- }
- if (all_of(Mask, [](int Idx) { return Idx < 0; })) {
- // TODO - should we handle the mixed zero/undef case as well? Just returning
- // a zero mask will lose information on undef elements possibly reducing
- // future combine possibilities.
- DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(),
- Subtarget, DAG, SDLoc(Root)));
- return true;
- }
+ if (all_of(Mask, [](int Idx) { return Idx == SM_SentinelUndef; }))
+ return DAG.getUNDEF(Root.getValueType());
+
+ // TODO - should we handle the mixed zero/undef case as well? Just returning
+  // a zero mask will lose information on undef elements, possibly reducing

+ // future combine possibilities.
+ if (all_of(Mask, [](int Idx) { return Idx < 0; }))
+ return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG,
+ SDLoc(Root));
// Remove unused shuffle source ops.
resolveTargetShuffleInputsAndMask(Ops, Mask);
@@ -28364,19 +29119,19 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
for (int i = 0, e = Ops.size(); i < e; ++i)
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
- if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
- Depth + 1, HasVariableMask, DAG, DCI,
- Subtarget))
- return true;
+ if (SDValue Res = combineX86ShufflesRecursively(
+ Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
+ DAG, DCI, Subtarget))
+ return Res;
// Attempt to constant fold all of the constant source ops.
- if (combineX86ShufflesConstants(Ops, Mask, Root, HasVariableMask, DAG, DCI,
- Subtarget))
- return true;
+ if (SDValue Cst = combineX86ShufflesConstants(
+ Ops, Mask, Root, HasVariableMask, DAG, DCI, Subtarget))
+ return Cst;
// We can only combine unary and binary shuffle mask cases.
if (Ops.size() > 2)
- return false;
+ return SDValue();
// Minor canonicalization of the accumulated shuffle mask to make it easier
// to match below. All this does is detect masks with sequential pairs of
@@ -28395,6 +29150,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
std::swap(Ops[0], Ops[1]);
}
+ // Finally, try to combine into a single shuffle instruction.
return combineX86ShuffleChain(Ops, Root, Mask, Depth, HasVariableMask, DAG,
DCI, Subtarget);
}
@@ -28650,8 +29406,37 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
SDLoc DL(N);
MVT VT = N.getSimpleValueType();
SmallVector<int, 4> Mask;
-
unsigned Opcode = N.getOpcode();
+
+ // Combine binary shuffle of 2 similar 'Horizontal' instructions into a
+ // single instruction.
+ if (VT.getScalarSizeInBits() == 64 &&
+ (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
+ Opcode == X86ISD::UNPCKL)) {
+ auto BC0 = peekThroughBitcasts(N.getOperand(0));
+ auto BC1 = peekThroughBitcasts(N.getOperand(1));
+ EVT VT0 = BC0.getValueType();
+ EVT VT1 = BC1.getValueType();
+ unsigned Opcode0 = BC0.getOpcode();
+ unsigned Opcode1 = BC1.getOpcode();
+ if (Opcode0 == Opcode1 && VT0 == VT1 &&
+ (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+ Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
+ Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
+ SDValue Lo, Hi;
+ if (Opcode == X86ISD::MOVSD) {
+ Lo = BC1.getOperand(0);
+ Hi = BC0.getOperand(1);
+ } else {
+ Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
+ Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
+ }
+ SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
+ DCI.AddToWorklist(Horiz.getNode());
+ return DAG.getBitcast(VT, Horiz);
+ }
+ }
+
switch (Opcode) {
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
@@ -28660,17 +29445,6 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
assert(Mask.size() == 4);
break;
case X86ISD::UNPCKL: {
- auto Op0 = N.getOperand(0);
- auto Op1 = N.getOperand(1);
- unsigned Opcode0 = Op0.getOpcode();
- unsigned Opcode1 = Op1.getOpcode();
-
- // Combine X86ISD::UNPCKL with 2 X86ISD::FHADD inputs into a single
- // X86ISD::FHADD. This is generated by UINT_TO_FP v2f64 scalarization.
- // TODO: Add other horizontal operations as required.
- if (VT == MVT::v2f64 && Opcode0 == Opcode1 && Opcode0 == X86ISD::FHADD)
- return DAG.getNode(Opcode0, DL, VT, Op0.getOperand(0), Op1.getOperand(0));
-
// Combine X86ISD::UNPCKL and ISD::VECTOR_SHUFFLE into X86ISD::UNPCKH, in
// which X86ISD::UNPCKL has a ISD::UNDEF operand, and ISD::VECTOR_SHUFFLE
// moves upper half elements into the lower half part. For example:
@@ -28688,7 +29462,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
if (!VT.is128BitVector())
return SDValue();
- if (Op0.isUndef() && Opcode1 == ISD::VECTOR_SHUFFLE) {
+ auto Op0 = N.getOperand(0);
+ auto Op1 = N.getOperand(1);
+ if (Op0.isUndef() && Op1.getOpcode() == ISD::VECTOR_SHUFFLE) {
ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(Op1.getNode())->getMask();
unsigned NumElts = VT.getVectorNumElements();
@@ -28999,7 +29775,7 @@ static SDValue combineShuffleToAddSubOrFMAddSub(SDNode *N,
// Try to generate X86ISD::FMADDSUB node here.
SDValue Opnd2;
- if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2))
+ if (isFMAddSub(Subtarget, DAG, Opnd0, Opnd1, Opnd2, 2))
return DAG.getNode(X86ISD::FMADDSUB, DL, VT, Opnd0, Opnd1, Opnd2);
// Do not generate X86ISD::ADDSUB node for 512-bit types even though
@@ -29056,6 +29832,40 @@ static SDValue combineShuffleOfConcatUndef(SDNode *N, SelectionDAG &DAG,
return DAG.getVectorShuffle(VT, DL, Concat, DAG.getUNDEF(VT), Mask);
}
+/// Eliminate a redundant shuffle of a horizontal math op.
+static SDValue foldShuffleOfHorizOp(SDNode *N) {
+ if (N->getOpcode() != ISD::VECTOR_SHUFFLE || !N->getOperand(1).isUndef())
+ return SDValue();
+
+ SDValue HOp = N->getOperand(0);
+ if (HOp.getOpcode() != X86ISD::HADD && HOp.getOpcode() != X86ISD::FHADD &&
+ HOp.getOpcode() != X86ISD::HSUB && HOp.getOpcode() != X86ISD::FHSUB)
+ return SDValue();
+
+ // 128-bit horizontal math instructions are defined to operate on adjacent
+ // lanes of each operand as:
+ // v4X32: A[0] + A[1] , A[2] + A[3] , B[0] + B[1] , B[2] + B[3]
+ // ...similarly for v2f64 and v8i16.
+ // TODO: 256-bit is not the same because...x86.
+ if (HOp.getOperand(0) != HOp.getOperand(1) || HOp.getValueSizeInBits() != 128)
+ return SDValue();
+
+ // When the operands of a horizontal math op are identical, the low half of
+ // the result is the same as the high half. If the shuffle is also replicating
+ // low and high halves, we don't need the shuffle.
+ // shuffle (hadd X, X), undef, [low half...high half] --> hadd X, X
+ ArrayRef<int> Mask = cast<ShuffleVectorSDNode>(N)->getMask();
+ // TODO: Other mask possibilities like {1,1} and {1,0} could be added here,
+ // but this should be tied to whatever horizontal op matching and shuffle
+ // canonicalization are producing.
+ if (isTargetShuffleEquivalent(Mask, { 0, 0 }) ||
+ isTargetShuffleEquivalent(Mask, { 0, 1, 0, 1 }) ||
+ isTargetShuffleEquivalent(Mask, { 0, 1, 2, 3, 0, 1, 2, 3 }))
+ return HOp;
+
+ return SDValue();
+}
+
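[Editor's note] foldShuffleOfHorizOp relies on the 128-bit horizontal-op lane layout spelled out in the comments above. A short scalar model of the v4i32 case (illustrative, not LLVM code):

#include <array>

// A 128-bit horizontal add with identical operands already replicates its
// low half into its high half, so a half-duplicating shuffle is redundant.
static std::array<int, 4> hadd_v4i32(const std::array<int, 4> &A,
                                     const std::array<int, 4> &B) {
  return {A[0] + A[1], A[2] + A[3], B[0] + B[1], B[2] + B[3]};
}
// hadd_v4i32(X, X) == {X0+X1, X2+X3, X0+X1, X2+X3}, i.e. the {0,1,0,1}
// shuffle of it is the value itself.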
static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -29064,10 +29874,14 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// If we have legalized the vector types, look for blends of FADD and FSUB
// nodes that we can fuse into an ADDSUB node.
- if (TLI.isTypeLegal(VT))
+ if (TLI.isTypeLegal(VT)) {
if (SDValue AddSub = combineShuffleToAddSubOrFMAddSub(N, Subtarget, DAG))
return AddSub;
+ if (SDValue HAddSub = foldShuffleOfHorizOp(N))
+ return HAddSub;
+ }
+
// During Type Legalization, when promoting illegal vector types,
// the backend might introduce new shuffle dag nodes and bitcasts.
//
@@ -29165,12 +29979,12 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
// specific PSHUF instruction sequences into their minimal form so that we
// can evaluate how many specialized shuffle instructions are involved in
// a particular chain.
- SmallVector<int, 1> NonceMask; // Just a placeholder.
- NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
- /*Depth*/ 1, /*HasVarMask*/ false, DAG,
- DCI, Subtarget))
- return SDValue(); // This routine will use CombineTo to replace N.
+ if (SDValue Res = combineX86ShufflesRecursively(
+ {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
+ DCI.CombineTo(N, Res);
+ return SDValue();
+ }
}
return SDValue();
@@ -29287,6 +30101,53 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
SDValue N0 = BitCast.getOperand(0);
EVT VecVT = N0->getValueType(0);
+ if (VT.isVector() && VecVT.isScalarInteger() && Subtarget.hasAVX512() &&
+ N0->getOpcode() == ISD::OR) {
+ SDValue Op0 = N0->getOperand(0);
+ SDValue Op1 = N0->getOperand(1);
+ MVT TrunckVT;
+ MVT BitcastVT;
+ switch (VT.getSimpleVT().SimpleTy) {
+ default:
+ return SDValue();
+ case MVT::v16i1:
+ TrunckVT = MVT::i8;
+ BitcastVT = MVT::v8i1;
+ break;
+ case MVT::v32i1:
+ TrunckVT = MVT::i16;
+ BitcastVT = MVT::v16i1;
+ break;
+ case MVT::v64i1:
+ TrunckVT = MVT::i32;
+ BitcastVT = MVT::v32i1;
+ break;
+ }
+ bool isArg0UndefRight = Op0->getOpcode() == ISD::SHL;
+ bool isArg0UndefLeft =
+ Op0->getOpcode() == ISD::ZERO_EXTEND || Op0->getOpcode() == ISD::AND;
+ bool isArg1UndefRight = Op1->getOpcode() == ISD::SHL;
+ bool isArg1UndefLeft =
+ Op1->getOpcode() == ISD::ZERO_EXTEND || Op1->getOpcode() == ISD::AND;
+ SDValue OpLeft;
+ SDValue OpRight;
+ if (isArg0UndefRight && isArg1UndefLeft) {
+ OpLeft = Op0;
+ OpRight = Op1;
+ } else if (isArg1UndefRight && isArg0UndefLeft) {
+ OpLeft = Op1;
+ OpRight = Op0;
+ } else
+ return SDValue();
+ SDLoc DL(BitCast);
+ SDValue Shr = OpLeft->getOperand(0);
+ SDValue Trunc1 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, Shr);
+ SDValue Bitcast1 = DAG.getBitcast(BitcastVT, Trunc1);
+ SDValue Trunc2 = DAG.getNode(ISD::TRUNCATE, DL, TrunckVT, OpRight);
+ SDValue Bitcast2 = DAG.getBitcast(BitcastVT, Trunc2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Bitcast1, Bitcast2);
+ }
+
if (!VT.isScalarInteger() || !VecVT.isSimple())
return SDValue();
@@ -29300,7 +30161,7 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// v8i16 and v16i16.
// For these two cases, we can shuffle the upper element bytes to a
// consecutive sequence at the start of the vector and treat the results as
- // v16i8 or v32i8, and for v61i8 this is the preferable solution. However,
+ // v16i8 or v32i8, and for v16i8 this is the preferable solution. However,
// for v16i16 this is not the case, because the shuffle is expensive, so we
// avoid sign-extending to this type entirely.
// For example, t0 := (v8i16 sext(v8i1 x)) needs to be shuffled as:
@@ -29319,9 +30180,8 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
FPCastVT = MVT::v4f32;
// For cases such as (i4 bitcast (v4i1 setcc v4i64 v1, v2))
// sign-extend to a 256-bit operation to avoid truncation.
- if (N0->getOpcode() == ISD::SETCC &&
- N0->getOperand(0)->getValueType(0).is256BitVector() &&
- Subtarget.hasInt256()) {
+ if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
+ N0->getOperand(0)->getValueType(0).is256BitVector()) {
SExtVT = MVT::v4i64;
FPCastVT = MVT::v4f64;
}
@@ -29333,9 +30193,9 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// If the setcc operand is 128-bit, prefer sign-extending to 128-bit over
// 256-bit because the shuffle is cheaper than sign extending the result of
// the compare.
- if (N0->getOpcode() == ISD::SETCC &&
- N0->getOperand(0)->getValueType(0).is256BitVector() &&
- Subtarget.hasInt256()) {
+ if (N0->getOpcode() == ISD::SETCC && Subtarget.hasAVX() &&
+ (N0->getOperand(0)->getValueType(0).is256BitVector() ||
+ N0->getOperand(0)->getValueType(0).is512BitVector())) {
SExtVT = MVT::v8i32;
FPCastVT = MVT::v8f32;
}
@@ -29348,23 +30208,34 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, SDValue BitCast,
// truncating the result of the compare to 128-bits.
break;
case MVT::v32i1:
- // TODO: Handle pre-AVX2 cases by splitting to two v16i1's.
- if (!Subtarget.hasInt256())
- return SDValue();
SExtVT = MVT::v32i8;
break;
};
SDLoc DL(BitCast);
SDValue V = DAG.getSExtOrTrunc(N0, DL, SExtVT);
+
+ if (SExtVT == MVT::v32i8 && !Subtarget.hasInt256()) {
+ // Handle pre-AVX2 cases by splitting to two v16i1's.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT ShiftTy = TLI.getScalarShiftAmountTy(DAG.getDataLayout(), MVT::i32);
+ SDValue Lo = extract128BitVector(V, 0, DAG, DL);
+ SDValue Hi = extract128BitVector(V, 16, DAG, DL);
+ Lo = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Lo);
+ Hi = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Hi);
+ Hi = DAG.getNode(ISD::SHL, DL, MVT::i32, Hi,
+ DAG.getConstant(16, DL, ShiftTy));
+ V = DAG.getNode(ISD::OR, DL, MVT::i32, Lo, Hi);
+ return DAG.getZExtOrTrunc(V, DL, VT);
+ }
+
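[Editor's note] The new pre-AVX2 path splits the v32i8 sign vector into two 128-bit halves and rebuilds the 32-bit mask from two MOVMSK results. A one-line scalar model (hypothetical helper name):

#include <cstdint>

static uint32_t combineMovmskHalves(uint16_t LoMask, uint16_t HiMask) {
  return uint32_t(LoMask) | (uint32_t(HiMask) << 16); // OR(Lo, SHL(Hi, 16))
}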
if (SExtVT == MVT::v8i16) {
- V = DAG.getBitcast(MVT::v16i8, V);
- V = DAG.getVectorShuffle(
- MVT::v16i8, DL, V, DAG.getUNDEF(MVT::v16i8),
- {0, 2, 4, 6, 8, 10, 12, 14, -1, -1, -1, -1, -1, -1, -1, -1});
+ assert(16 == DAG.ComputeNumSignBits(V) && "Expected all/none bit vector");
+ V = DAG.getNode(X86ISD::PACKSS, DL, MVT::v16i8, V,
+ DAG.getUNDEF(MVT::v8i16));
} else
assert(SExtVT.getScalarType() != MVT::i16 &&
- "Vectors of i16 must be shuffled");
+ "Vectors of i16 must be packed");
if (FPCastVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
V = DAG.getBitcast(FPCastVT, V);
V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
@@ -29463,16 +30334,22 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
// the elements of a vector.
// Returns the vector that is being reduced on, or SDValue() if a reduction
// was not matched.
-static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
+static SDValue matchBinOpReduction(SDNode *Extract, unsigned &BinOp,
+ ArrayRef<ISD::NodeType> CandidateBinOps) {
// The pattern must end in an extract from index 0.
if ((Extract->getOpcode() != ISD::EXTRACT_VECTOR_ELT) ||
!isNullConstant(Extract->getOperand(1)))
return SDValue();
- unsigned Stages =
- Log2_32(Extract->getOperand(0).getValueType().getVectorNumElements());
-
SDValue Op = Extract->getOperand(0);
+ unsigned Stages = Log2_32(Op.getValueType().getVectorNumElements());
+
+ // Match against one of the candidate binary ops.
+ if (llvm::none_of(CandidateBinOps, [Op](ISD::NodeType BinOp) {
+ return Op.getOpcode() == unsigned(BinOp);
+ }))
+ return SDValue();
+
// At each stage, we're looking for something that looks like:
// %s = shufflevector <8 x i32> %op, <8 x i32> undef,
// <8 x i32> <i32 2, i32 3, i32 undef, i32 undef,
@@ -29483,8 +30360,9 @@ static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
// <4,5,6,7,u,u,u,u>
// <2,3,u,u,u,u,u,u>
// <1,u,u,u,u,u,u,u>
+ unsigned CandidateBinOp = Op.getOpcode();
for (unsigned i = 0; i < Stages; ++i) {
- if (Op.getOpcode() != BinOp)
+ if (Op.getOpcode() != CandidateBinOp)
return SDValue();
ShuffleVectorSDNode *Shuffle =
@@ -29497,8 +30375,8 @@ static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
}
// The first operand of the shuffle should be the same as the other operand
- // of the add.
- if (!Shuffle || (Shuffle->getOperand(0) != Op))
+ // of the binop.
+ if (!Shuffle || Shuffle->getOperand(0) != Op)
return SDValue();
// Verify the shuffle has the expected (at this stage of the pyramid) mask.
@@ -29507,6 +30385,7 @@ static SDValue matchBinOpReduction(SDNode *Extract, ISD::NodeType BinOp) {
return SDValue();
}
+ BinOp = CandidateBinOp;
return Op;
}
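[Editor's note] matchBinOpReduction now matches the same shuffle pyramid for any candidate binop. A stand-alone model of the pattern it recognizes, log2(N) stages of "fold the high half onto the low half" ending in an extract of element 0 (illustrative only):

#include <array>

template <unsigned N, typename BinOpT>
static int reducePyramid(std::array<int, N> V, BinOpT BinOp) {
  for (unsigned Half = N / 2; Half >= 1; Half /= 2) // one stage per shuffle level
    for (unsigned I = 0; I != Half; ++I)
      V[I] = BinOp(V[I], V[I + Half]);              // binop(x, shuffle-high-half(x))
  return V[0];                                      // extractelement index 0
}
// e.g. reducePyramid<8>({1, 2, 3, 4, 5, 6, 7, 8},
//                       [](int A, int B) { return A + B; }) == 36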
@@ -29552,8 +30431,7 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
// In the SetLT case, the second operand of the comparison can be either 1 or 0.
APInt SplatVal;
if ((CC == ISD::SETLT) &&
- !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal,
- /*AllowShrink*/false) &&
+ !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
SplatVal.isOneValue()) ||
(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
return false;
@@ -29606,6 +30484,66 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
}
+// Attempt to replace a min/max v8i16 horizontal reduction with PHMINPOSUW.
+static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Bail without SSE41.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ EVT ExtractVT = Extract->getValueType(0);
+ if (ExtractVT != MVT::i16)
+ return SDValue();
+
+ // Check for SMAX/SMIN/UMAX/UMIN horizontal reduction patterns.
+ unsigned BinOp;
+ SDValue Src = matchBinOpReduction(
+ Extract, BinOp, {ISD::SMAX, ISD::SMIN, ISD::UMAX, ISD::UMIN});
+ if (!Src)
+ return SDValue();
+
+ EVT SrcVT = Src.getValueType();
+ EVT SrcSVT = SrcVT.getScalarType();
+ if (SrcSVT != MVT::i16 || (SrcVT.getSizeInBits() % 128) != 0)
+ return SDValue();
+
+ SDLoc DL(Extract);
+ SDValue MinPos = Src;
+
+ // First, reduce the source down to 128-bit, applying BinOp to lo/hi.
+ while (SrcVT.getSizeInBits() > 128) {
+ unsigned NumElts = SrcVT.getVectorNumElements();
+ unsigned NumSubElts = NumElts / 2;
+ SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
+ unsigned SubSizeInBits = SrcVT.getSizeInBits();
+ SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
+ SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
+ MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
+ }
+ assert(SrcVT == MVT::v8i16 && "Unexpected value type");
+
+  // PHMINPOSUW applies to UMIN(v8i16); for SMIN/SMAX/UMAX we must apply a mask
+ // to flip the value accordingly.
+ SDValue Mask;
+ if (BinOp == ISD::SMAX)
+ Mask = DAG.getConstant(APInt::getSignedMaxValue(16), DL, SrcVT);
+ else if (BinOp == ISD::SMIN)
+ Mask = DAG.getConstant(APInt::getSignedMinValue(16), DL, SrcVT);
+ else if (BinOp == ISD::UMAX)
+ Mask = DAG.getConstant(APInt::getAllOnesValue(16), DL, SrcVT);
+
+ if (Mask)
+ MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
+
+ MinPos = DAG.getNode(X86ISD::PHMINPOS, DL, SrcVT, MinPos);
+
+ if (Mask)
+ MinPos = DAG.getNode(ISD::XOR, DL, SrcVT, Mask, MinPos);
+
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, MinPos,
+ DAG.getIntPtrConstant(0, DL));
+}
+
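[Editor's note] PHMINPOSUW only implements the unsigned-minimum reduction, so the code above converts the other three reductions into it by XORing every lane with a fixed mask before and after the UMIN (0x8000 for SMIN, 0x7FFF for SMAX, 0xFFFF for UMAX). A stand-alone sketch of the SMAX case (hypothetical helper names, assumes at least one lane):

#include <algorithm>
#include <cstdint>
#include <vector>

static uint16_t uminReduce(const std::vector<uint16_t> &Lanes) {
  return *std::min_element(Lanes.begin(), Lanes.end()); // stands in for PHMINPOSUW
}

static int16_t smaxViaUmin(std::vector<uint16_t> Lanes) {
  const uint16_t Mask = 0x7FFF;             // APInt::getSignedMaxValue(16)
  for (uint16_t &L : Lanes)
    L ^= Mask;                              // pre-flip: reverses signed order
  return int16_t(Mask ^ uminReduce(Lanes)); // post-flip restores the value
}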
// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
static SDValue combineHorizontalPredicateResult(SDNode *Extract,
SelectionDAG &DAG,
@@ -29621,66 +30559,63 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
return SDValue();
// Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
- for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
- SDValue Match = matchBinOpReduction(Extract, Op);
- if (!Match)
- continue;
-
- // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
- // which we can't support here for now.
- if (Match.getScalarValueSizeInBits() != BitWidth)
- continue;
+ unsigned BinOp = 0;
+ SDValue Match = matchBinOpReduction(Extract, BinOp, {ISD::OR, ISD::AND});
+ if (!Match)
+ return SDValue();
- // We require AVX2 for PMOVMSKB for v16i16/v32i8;
- unsigned MatchSizeInBits = Match.getValueSizeInBits();
- if (!(MatchSizeInBits == 128 ||
- (MatchSizeInBits == 256 &&
- ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
- return SDValue();
+ // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
+ // which we can't support here for now.
+ if (Match.getScalarValueSizeInBits() != BitWidth)
+ return SDValue();
- // Don't bother performing this for 2-element vectors.
- if (Match.getValueType().getVectorNumElements() <= 2)
- return SDValue();
+  // We require AVX2 for PMOVMSKB for v16i16/v32i8.
+ unsigned MatchSizeInBits = Match.getValueSizeInBits();
+ if (!(MatchSizeInBits == 128 ||
+ (MatchSizeInBits == 256 &&
+ ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
+ return SDValue();
- // Check that we are extracting a reduction of all sign bits.
- if (DAG.ComputeNumSignBits(Match) != BitWidth)
- return SDValue();
+ // Don't bother performing this for 2-element vectors.
+ if (Match.getValueType().getVectorNumElements() <= 2)
+ return SDValue();
- // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
- MVT MaskVT;
- if (64 == BitWidth || 32 == BitWidth)
- MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
- MatchSizeInBits / BitWidth);
- else
- MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
-
- APInt CompareBits;
- ISD::CondCode CondCode;
- if (Op == ISD::OR) {
- // any_of -> MOVMSK != 0
- CompareBits = APInt::getNullValue(32);
- CondCode = ISD::CondCode::SETNE;
- } else {
- // all_of -> MOVMSK == ((1 << NumElts) - 1)
- CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
- CondCode = ISD::CondCode::SETEQ;
- }
+ // Check that we are extracting a reduction of all sign bits.
+ if (DAG.ComputeNumSignBits(Match) != BitWidth)
+ return SDValue();
- // Perform the select as i32/i64 and then truncate to avoid partial register
- // stalls.
- unsigned ResWidth = std::max(BitWidth, 32u);
- EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
- SDLoc DL(Extract);
- SDValue Zero = DAG.getConstant(0, DL, ResVT);
- SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
- SDValue Res = DAG.getBitcast(MaskVT, Match);
- Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
- Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
- Ones, Zero, CondCode);
- return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+ // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+ MVT MaskVT;
+ if (64 == BitWidth || 32 == BitWidth)
+ MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+ MatchSizeInBits / BitWidth);
+ else
+ MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+ APInt CompareBits;
+ ISD::CondCode CondCode;
+ if (BinOp == ISD::OR) {
+ // any_of -> MOVMSK != 0
+ CompareBits = APInt::getNullValue(32);
+ CondCode = ISD::CondCode::SETNE;
+ } else {
+ // all_of -> MOVMSK == ((1 << NumElts) - 1)
+ CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+ CondCode = ISD::CondCode::SETEQ;
}
- return SDValue();
+ // Perform the select as i32/i64 and then truncate to avoid partial register
+ // stalls.
+ unsigned ResWidth = std::max(BitWidth, 32u);
+ EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
+ SDLoc DL(Extract);
+ SDValue Zero = DAG.getConstant(0, DL, ResVT);
+ SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
+ SDValue Res = DAG.getBitcast(MaskVT, Match);
+ Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
+ Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
+ Ones, Zero, CondCode);
+ return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
}
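// Illustrative sketch (not part of the patch): the any_of/all_of reductions
// this combine forms, expressed with SSE intrinsics for a v4i32 compare result
// (assumes <immintrin.h>; the helper names are hypothetical).
static bool modelAnyOf(__m128i Cmp) {
  return _mm_movemask_ps(_mm_castsi128_ps(Cmp)) != 0;   // MOVMSKPS != 0
}
static bool modelAllOf(__m128i Cmp) {
  return _mm_movemask_ps(_mm_castsi128_ps(Cmp)) == 0xF; // all 4 sign bits set
}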
static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
@@ -29707,7 +30642,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
return SDValue();
// Match shuffle + add pyramid.
- SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
+ unsigned BinOp = 0;
+ SDValue Root = matchBinOpReduction(Extract, BinOp, {ISD::ADD});
// The operand is expected to be zero extended from i8
// (verified in detectZextAbsDiff).
@@ -29758,7 +30694,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
unsigned TypeSizeInBits = Type.getSizeInBits();
// Return the lowest TypeSizeInBits bits.
MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
- SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
+ SAD = DAG.getBitcast(ResVT, SAD);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
Extract->getOperand(1));
}
@@ -29794,7 +30730,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if ((NumSrcElts % Mask.size()) == 0) {
SmallVector<int, 16> ScaledMask;
int Scale = NumSrcElts / Mask.size();
- scaleShuffleMask(Scale, Mask, ScaledMask);
+ scaleShuffleMask<int>(Scale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
} else if ((Mask.size() % NumSrcElts) == 0) {
SmallVector<int, 16> WidenedMask;
@@ -29843,9 +30779,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
DAG.getIntPtrConstant(SrcIdx, dl));
- SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
- DAG.getValueType(SrcSVT));
- return DAG.getZExtOrTrunc(Assert, dl, VT);
+ return DAG.getZExtOrTrunc(ExtOp, dl, VT);
}
return SDValue();
@@ -29858,10 +30792,17 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
+ if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
return NewOp;
- if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+ // TODO - Remove this once we can handle the implicit zero-extension of
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in:
+ // XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+ // combineBasicSADPattern.
+ if (N->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
return NewOp;
SDValue InputVector = N->getOperand(0);
@@ -29910,6 +30851,10 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
return Cmp;
+ // Attempt to replace min/max v8i16 reductions with PHMINPOSUW.
+ if (SDValue MinMax = combineHorizontalMinMaxResult(N, DAG, Subtarget))
+ return MinMax;
+
// Only operate on vectors of 4 elements, where the alternative shuffling
// gets to be more expensive.
if (SrcVT != MVT::v4i32)
@@ -30008,18 +30953,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-// TODO - merge with combineExtractVectorElt once it can handle the implicit
-// zero-extension of X86ISD::PINSRW/X86ISD::PINSRB in:
-// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
-// combineBasicSADPattern.
-static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
- return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
-}
-
/// If a vector select has an operand that is -1 or 0, try to simplify the
/// select to a bitwise logic operation.
+/// TODO: Move to DAGCombiner, possibly using TargetLowering::hasAndNot()?
static SDValue
combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -30037,10 +30973,10 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
assert(CondVT.isVector() && "Vector select expects a vector selector!");
- bool FValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
+ bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
// Check if the first operand is all zeros and Cond type is vXi1.
// This situation only applies to avx512.
- if (FValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
+ if (TValIsAllZeros && Subtarget.hasAVX512() && Cond.hasOneUse() &&
CondVT.getVectorElementType() == MVT::i1) {
// Invert the cond to not(cond) : xor(op,allones)=not(op)
SDValue CondNew = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
@@ -30058,7 +30994,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
return SDValue();
bool TValIsAllOnes = ISD::isBuildVectorAllOnes(LHS.getNode());
- FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
+ bool FValIsAllZeros = ISD::isBuildVectorAllZeros(RHS.getNode());
// Try to invert the condition if true value is not all 1s and false value is
// not all 0s.
@@ -30068,7 +31004,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
// Check if SETCC has already been promoted.
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT) ==
CondVT) {
- bool TValIsAllZeros = ISD::isBuildVectorAllZeros(LHS.getNode());
bool FValIsAllOnes = ISD::isBuildVectorAllOnes(RHS.getNode());
if (TValIsAllZeros || FValIsAllOnes) {
@@ -30084,6 +31019,10 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
}
}
+ // Cond value must be 'sign splat' to be converted to a logical op.
+ if (DAG.ComputeNumSignBits(Cond) != CondVT.getScalarSizeInBits())
+ return SDValue();
+
// vselect Cond, 111..., 000... -> Cond
if (TValIsAllOnes && FValIsAllZeros)
return DAG.getBitcast(VT, Cond);
@@ -30105,6 +31044,15 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(VT, And);
}
+ // vselect Cond, 000..., X -> andn Cond, X
+ if (TValIsAllZeros) {
+ MVT AndNVT = MVT::getVectorVT(MVT::i64, CondVT.getSizeInBits() / 64);
+ SDValue CastCond = DAG.getBitcast(AndNVT, Cond);
+ SDValue CastRHS = DAG.getBitcast(AndNVT, RHS);
+ SDValue AndN = DAG.getNode(X86ISD::ANDNP, DL, AndNVT, CastCond, CastRHS);
+ return DAG.getBitcast(VT, AndN);
+ }
+
return SDValue();
}
@@ -30120,78 +31068,52 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
return SDValue();
// Don't do this for crazy integer types.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS.getValueType()))
+ EVT VT = N->getValueType(0);
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
- // If this is efficiently invertible, canonicalize the LHSC/RHSC values
- // so that TrueC (the true value) is larger than FalseC.
- bool NeedsCondInvert = false;
- if (TrueC->getAPIntValue().ult(FalseC->getAPIntValue()) &&
- // Efficiently invertible.
- (Cond.getOpcode() == ISD::SETCC || // setcc -> invertible.
- (Cond.getOpcode() == ISD::XOR && // xor(X, C) -> invertible.
- isa<ConstantSDNode>(Cond.getOperand(1))))) {
- NeedsCondInvert = true;
- std::swap(TrueC, FalseC);
- }
-
- // Optimize C ? 8 : 0 -> zext(C) << 3. Likewise for any pow2/0.
- if (FalseC->getAPIntValue() == 0 && TrueC->getAPIntValue().isPowerOf2()) {
- if (NeedsCondInvert) // Invert the condition if needed.
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
-
- // Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, LHS.getValueType(), Cond);
+ // We're going to use the condition bit in math or logic ops. We could allow
+ // this with a wider condition value (post-legalization it becomes an i8),
+ // but if nothing is creating selects that late, it doesn't matter.
+ if (Cond.getValueType() != MVT::i1)
+ return SDValue();
- unsigned ShAmt = TrueC->getAPIntValue().logBase2();
- return DAG.getNode(ISD::SHL, DL, LHS.getValueType(), Cond,
- DAG.getConstant(ShAmt, DL, MVT::i8));
- }
+ // A power-of-2 multiply is just a shift. LEA also cheaply handles multiply by
+ // 3, 5, or 9 with i32/i64, so those get transformed too.
+ // TODO: For constants that overflow or do not differ by power-of-2 or small
+ // multiplier, convert to 'and' + 'add'.
+ const APInt &TrueVal = TrueC->getAPIntValue();
+ const APInt &FalseVal = FalseC->getAPIntValue();
+ bool OV;
+ APInt Diff = TrueVal.ssub_ov(FalseVal, OV);
+ if (OV)
+ return SDValue();
- // Optimize cases that will turn into an LEA instruction. This requires
- // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
- if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
- uint64_t Diff = TrueC->getZExtValue() - FalseC->getZExtValue();
- if (N->getValueType(0) == MVT::i32)
- Diff = (unsigned)Diff;
+ APInt AbsDiff = Diff.abs();
+ if (AbsDiff.isPowerOf2() ||
+ ((VT == MVT::i32 || VT == MVT::i64) &&
+ (AbsDiff == 3 || AbsDiff == 5 || AbsDiff == 9))) {
- bool IsFastMultiplier = false;
- if (Diff < 10) {
- switch ((unsigned char)Diff) {
- default:
- break;
- case 1: // result = add base, cond
- case 2: // result = lea base( , cond*2)
- case 3: // result = lea base(cond, cond*2)
- case 4: // result = lea base( , cond*4)
- case 5: // result = lea base(cond, cond*4)
- case 8: // result = lea base( , cond*8)
- case 9: // result = lea base(cond, cond*8)
- IsFastMultiplier = true;
- break;
- }
+ // We need a positive multiplier constant for shift/LEA codegen. The 'not'
+ // of the condition can usually be folded into a compare predicate, but even
+ // without that, the sequence should be cheaper than a CMOV alternative.
+ if (TrueVal.slt(FalseVal)) {
+ Cond = DAG.getNOT(DL, Cond, MVT::i1);
+ std::swap(TrueC, FalseC);
}
- if (IsFastMultiplier) {
- APInt Diff = TrueC->getAPIntValue() - FalseC->getAPIntValue();
- if (NeedsCondInvert) // Invert the condition if needed.
- Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
- DAG.getConstant(1, DL, Cond.getValueType()));
+ // select Cond, TC, FC --> (zext(Cond) * (TC - FC)) + FC
+ SDValue R = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
- // Zero extend the condition if needed.
- Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
- // Scale the condition by the difference.
- if (Diff != 1)
- Cond = DAG.getNode(ISD::MUL, DL, Cond.getValueType(), Cond,
- DAG.getConstant(Diff, DL, Cond.getValueType()));
+ // Multiply condition by the difference if non-one.
+ if (!AbsDiff.isOneValue())
+ R = DAG.getNode(ISD::MUL, DL, VT, R, DAG.getConstant(AbsDiff, DL, VT));
- // Add the base if non-zero.
- if (FalseC->getAPIntValue() != 0)
- Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
- SDValue(FalseC, 0));
- return Cond;
- }
+ // Add the base if non-zero.
+ if (!FalseC->isNullValue())
+ R = DAG.getNode(ISD::ADD, DL, VT, R, SDValue(FalseC, 0));
+
+ return R;
}
return SDValue();
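// Illustrative sketch (not part of the patch): the scalar identity behind the
// rewrite above, with the condition inverted first so the multiplier stays
// positive (assumes <utility>; the helper name is hypothetical).
static int modelSelectOfConstants(bool Cond, int TC, int FC) {
  if (TC < FC) {        // Canonicalize so TC is the larger constant.
    Cond = !Cond;
    std::swap(TC, FC);
  }
  // select Cond, TC, FC  ==  zext(Cond) * (TC - FC) + FC
  return int(Cond) * (TC - FC) + FC;
}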
@@ -30231,26 +31153,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
unsigned Opcode = Op.getOpcode();
switch (Opcode) {
- case X86ISD::PALIGNR:
- // PALIGNR can be converted to VALIGND/Q for 128-bit vectors.
- if (!VT.is128BitVector())
- return false;
- Opcode = X86ISD::VALIGN;
- LLVM_FALLTHROUGH;
- case X86ISD::VALIGN: {
- if (EltVT != MVT::i32 && EltVT != MVT::i64)
- return false;
- uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
- MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
- unsigned ShiftAmt = Imm * OpEltVT.getSizeInBits();
- unsigned EltSize = EltVT.getSizeInBits();
- // Make sure we can represent the same shift with the new VT.
- if ((ShiftAmt % EltSize) != 0)
- return false;
- Imm = ShiftAmt / EltSize;
- return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
- DAG.getConstant(Imm, DL, MVT::i8));
- }
case X86ISD::SHUF128: {
if (EltVT.getSizeInBits() != 32 && EltVT.getSizeInBits() != 64)
return false;
@@ -30260,50 +31162,6 @@ static bool combineBitcastForMaskedOp(SDValue OrigOp, SelectionDAG &DAG,
return BitcastAndCombineShuffle(Opcode, Op.getOperand(0), Op.getOperand(1),
Op.getOperand(2));
}
- case ISD::INSERT_SUBVECTOR: {
- unsigned EltSize = EltVT.getSizeInBits();
- if (EltSize != 32 && EltSize != 64)
- return false;
- MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
- // Only change element size, not type.
- if (EltVT.isInteger() != OpEltVT.isInteger())
- return false;
- uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
- Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
- SDValue Op0 = DAG.getBitcast(VT, Op.getOperand(0));
- DCI.AddToWorklist(Op0.getNode());
- // Op1 needs to be bitcasted to a smaller vector with the same element type.
- SDValue Op1 = Op.getOperand(1);
- MVT Op1VT = MVT::getVectorVT(EltVT,
- Op1.getSimpleValueType().getSizeInBits() / EltSize);
- Op1 = DAG.getBitcast(Op1VT, Op1);
- DCI.AddToWorklist(Op1.getNode());
- DCI.CombineTo(OrigOp.getNode(),
- DAG.getNode(Opcode, DL, VT, Op0, Op1,
- DAG.getIntPtrConstant(Imm, DL)));
- return true;
- }
- case ISD::EXTRACT_SUBVECTOR: {
- unsigned EltSize = EltVT.getSizeInBits();
- if (EltSize != 32 && EltSize != 64)
- return false;
- MVT OpEltVT = Op.getSimpleValueType().getVectorElementType();
- // Only change element size, not type.
- if (EltVT.isInteger() != OpEltVT.isInteger())
- return false;
- uint64_t Imm = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
- Imm = (Imm * OpEltVT.getSizeInBits()) / EltSize;
- // Op0 needs to be bitcasted to a larger vector with the same element type.
- SDValue Op0 = Op.getOperand(0);
- MVT Op0VT = MVT::getVectorVT(EltVT,
- Op0.getSimpleValueType().getSizeInBits() / EltSize);
- Op0 = DAG.getBitcast(Op0VT, Op0);
- DCI.AddToWorklist(Op0.getNode());
- DCI.CombineTo(OrigOp.getNode(),
- DAG.getNode(Opcode, DL, VT, Op0,
- DAG.getIntPtrConstant(Imm, DL)));
- return true;
- }
case X86ISD::SUBV_BROADCAST: {
unsigned EltSize = EltVT.getSizeInBits();
if (EltSize != 32 && EltSize != 64)
@@ -30717,7 +31575,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
/// i.e., reusing the EFLAGS produced by the LOCKed instruction.
/// Note that this is only legal for some op/cc combinations.
static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
// This combine only operates on CMP-like nodes.
if (!(Cmp.getOpcode() == X86ISD::CMP ||
(Cmp.getOpcode() == X86ISD::SUB && !Cmp->hasAnyUseOfValue(0))))
@@ -30747,12 +31606,7 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
if (!CmpLHS.hasOneUse())
return SDValue();
- auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
- if (!CmpRHSC || CmpRHSC->getZExtValue() != 0)
- return SDValue();
-
- const unsigned Opc = CmpLHS.getOpcode();
-
+ unsigned Opc = CmpLHS.getOpcode();
if (Opc != ISD::ATOMIC_LOAD_ADD && Opc != ISD::ATOMIC_LOAD_SUB)
return SDValue();
@@ -30765,6 +31619,44 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
if (Opc == ISD::ATOMIC_LOAD_SUB)
Addend = -Addend;
+ auto *CmpRHSC = dyn_cast<ConstantSDNode>(CmpRHS);
+ if (!CmpRHSC)
+ return SDValue();
+
+ APInt Comparison = CmpRHSC->getAPIntValue();
+
+ // If the addend is the negation of the comparison value, then we can do
+ // a full comparison by emitting the atomic arithmetic as a locked sub.
+ if (Comparison == -Addend) {
+ // The CC is fine, but we need to rewrite the LHS of the comparison as an
+ // atomic sub.
+ auto *AN = cast<AtomicSDNode>(CmpLHS.getNode());
+ auto AtomicSub = DAG.getAtomic(
+ ISD::ATOMIC_LOAD_SUB, SDLoc(CmpLHS), CmpLHS.getValueType(),
+ /*Chain*/ CmpLHS.getOperand(0), /*LHS*/ CmpLHS.getOperand(1),
+ /*RHS*/ DAG.getConstant(-Addend, SDLoc(CmpRHS), CmpRHS.getValueType()),
+ AN->getMemOperand());
+ // If the comparison uses the CF flag we can't use INC/DEC instructions.
+ bool NeedCF = false;
+ switch (CC) {
+ default: break;
+ case X86::COND_A: case X86::COND_AE:
+ case X86::COND_B: case X86::COND_BE:
+ NeedCF = true;
+ break;
+ }
+ auto LockOp = lowerAtomicArithWithLOCK(AtomicSub, DAG, Subtarget, !NeedCF);
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
+ DAG.getUNDEF(CmpLHS.getValueType()));
+ DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
+ return LockOp;
+ }
+
+ // We can handle comparisons with zero in a number of cases by manipulating
+ // the CC used.
+ if (!Comparison.isNullValue())
+ return SDValue();
+
if (CC == X86::COND_S && Addend == 1)
CC = X86::COND_LE;
else if (CC == X86::COND_NS && Addend == 1)
@@ -30776,7 +31668,7 @@ static SDValue combineSetCCAtomicArith(SDValue Cmp, X86::CondCode &CC,
else
return SDValue();
- SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG);
+ SDValue LockOp = lowerAtomicArithWithLOCK(CmpLHS, DAG, Subtarget);
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(0),
DAG.getUNDEF(CmpLHS.getValueType()));
DAG.ReplaceAllUsesOfValueWith(CmpLHS.getValue(1), LockOp.getValue(1));
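// Illustrative sketch (not part of the patch): the kind of source pattern the
// new "Comparison == -Addend" case targets (assumes <atomic>; the helper name
// is hypothetical). After this combine the branch is intended to reuse the
// flags of a single "lock sub $5, X" instead of an XADD plus a separate CMP.
static bool modelAtomicAddCompare(std::atomic<int> &X) {
  return X.fetch_add(5) == -5;
}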
@@ -30983,14 +31875,15 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS) {
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
- SelectionDAG &DAG) {
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
if (CC == X86::COND_B)
if (SDValue Flags = combineCarryThroughADD(EFLAGS))
return Flags;
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
- return combineSetCCAtomicArith(EFLAGS, CC, DAG);
+ return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}
/// Optimize X86ISD::CMOV [LHS, RHS, CONDCODE (e.g. X86::COND_NE), CONDVAL]
@@ -30999,10 +31892,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(N);
- // If the flag operand isn't dead, don't touch this CMOV.
- if (N->getNumValues() == 2 && !SDValue(N, 1).use_empty())
- return SDValue();
-
SDValue FalseOp = N->getOperand(0);
SDValue TrueOp = N->getOperand(1);
X86::CondCode CC = (X86::CondCode)N->getConstantOperandVal(2);
@@ -31021,11 +31910,11 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
- if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG)) {
+ if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getConstant(CC, DL, MVT::i8),
Flags};
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
+ return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
@@ -31054,8 +31943,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
unsigned ShAmt = TrueC->getAPIntValue().logBase2();
Cond = DAG.getNode(ISD::SHL, DL, Cond.getValueType(), Cond,
DAG.getConstant(ShAmt, DL, MVT::i8));
- if (N->getNumValues() == 2) // Dead flag value?
- return DCI.CombineTo(N, Cond, SDValue());
return Cond;
}
@@ -31069,9 +31956,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
FalseC->getValueType(0), Cond);
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
-
- if (N->getNumValues() == 2) // Dead flag value?
- return DCI.CombineTo(N, Cond, SDValue());
return Cond;
}
@@ -31112,8 +31996,6 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
if (FalseC->getAPIntValue() != 0)
Cond = DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
SDValue(FalseC, 0));
- if (N->getNumValues() == 2) // Dead flag value?
- return DCI.CombineTo(N, Cond, SDValue());
return Cond;
}
}
@@ -31153,7 +32035,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
CmpAgainst == dyn_cast<ConstantSDNode>(TrueOp)) {
SDValue Ops[] = { FalseOp, Cond.getOperand(0),
DAG.getConstant(CC, DL, MVT::i8), Cond };
- return DAG.getNode(X86ISD::CMOV, DL, N->getVTList (), Ops);
+ return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
}
}
}
@@ -31188,10 +32070,9 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
SDValue LOps[] = {FalseOp, TrueOp, DAG.getConstant(CC0, DL, MVT::i8),
Flags};
- SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), LOps);
+ SDValue LCMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), LOps);
SDValue Ops[] = {LCMOV, TrueOp, DAG.getConstant(CC1, DL, MVT::i8), Flags};
- SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getVTList(), Ops);
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), SDValue(CMOV.getNode(), 1));
+ SDValue CMOV = DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
return CMOV;
}
}
@@ -31307,7 +32188,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// pmulld is supported since SSE41. It is better to use pmulld
// instead of pmullw+pmulhw, except for subtargets where pmulld is slower than
// the expansion.
- bool OptForMinSize = DAG.getMachineFunction().getFunction()->optForMinSize();
+ bool OptForMinSize = DAG.getMachineFunction().getFunction().optForMinSize();
if (Subtarget.hasSSE41() && (OptForMinSize || !Subtarget.isPMULLDSlow()))
return SDValue();
@@ -31319,15 +32200,19 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getOperand(0).getValueType();
+ unsigned NumElts = VT.getVectorNumElements();
+ if ((NumElts % 2) != 0)
+ return SDValue();
+
unsigned RegSize = 128;
MVT OpsVT = MVT::getVectorVT(MVT::i16, RegSize / 16);
- EVT ReducedVT =
- EVT::getVectorVT(*DAG.getContext(), MVT::i16, VT.getVectorNumElements());
+ EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16, NumElts);
+
// Shrink the operands of mul.
SDValue NewN0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N0);
SDValue NewN1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, N1);
- if (VT.getVectorNumElements() >= OpsVT.getVectorNumElements()) {
+ if (NumElts >= OpsVT.getVectorNumElements()) {
// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
// lower part is needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
@@ -31335,7 +32220,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
DL, VT, MulLo);
} else {
- MVT ResVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
+ MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
@@ -31344,22 +32229,22 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// Repack the lower part and higher part result of mul into a wider
// result.
// Generate shuffle functioning as punpcklwd.
- SmallVector<int, 16> ShuffleMask(VT.getVectorNumElements());
- for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
+ SmallVector<int, 16> ShuffleMask(NumElts);
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
ShuffleMask[2 * i] = i;
- ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements();
+ ShuffleMask[2 * i + 1] = i + NumElts;
}
SDValue ResLo =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
- ResLo = DAG.getNode(ISD::BITCAST, DL, ResVT, ResLo);
+ ResLo = DAG.getBitcast(ResVT, ResLo);
// Generate shuffle functioning as punpckhwd.
- for (unsigned i = 0; i < VT.getVectorNumElements() / 2; i++) {
- ShuffleMask[2 * i] = i + VT.getVectorNumElements() / 2;
- ShuffleMask[2 * i + 1] = i + VT.getVectorNumElements() * 3 / 2;
+ for (unsigned i = 0, e = NumElts / 2; i < e; i++) {
+ ShuffleMask[2 * i] = i + NumElts / 2;
+ ShuffleMask[2 * i + 1] = i + NumElts * 3 / 2;
}
SDValue ResHi =
DAG.getVectorShuffle(ReducedVT, DL, MulLo, MulHi, ShuffleMask);
- ResHi = DAG.getNode(ISD::BITCAST, DL, ResVT, ResHi);
+ ResHi = DAG.getBitcast(ResVT, ResHi);
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ResLo, ResHi);
}
} else {
@@ -31405,8 +32290,8 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// Repack the lower part and higher part result of mul into a wider
// result. Make sure the type of mul result is VT.
MVT ResVT = MVT::getVectorVT(MVT::i32, RegSize / 32);
- SDValue Res = DAG.getNode(X86ISD::UNPCKL, DL, OpsVT, MulLo, MulHi);
- Res = DAG.getNode(ISD::BITCAST, DL, ResVT, Res);
+ SDValue Res = getUnpackl(DAG, DL, OpsVT, MulLo, MulHi);
+ Res = DAG.getBitcast(ResVT, Res);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
DAG.getIntPtrConstant(0, DL));
}
@@ -31496,7 +32381,7 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
if (!MulConstantOptimization)
return SDValue();
// An imul is usually smaller than the alternative sequence.
- if (DAG.getMachineFunction().getFunction()->optForMinSize())
+ if (DAG.getMachineFunction().getFunction().optForMinSize())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
@@ -31653,7 +32538,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
@@ -31706,6 +32591,41 @@ static SDValue combineShiftRightAlgebraic(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N0.getValueType();
+
+ // Try to improve a sequence of srl (and X, C1), C2 by inverting the order.
+ // TODO: This is a generic DAG combine that became an x86-only combine to
+ // avoid shortcomings in other folds such as bswap, bit-test ('bt'), and
+ // and-not ('andn').
+ if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+ return SDValue();
+
+ auto *ShiftC = dyn_cast<ConstantSDNode>(N1);
+ auto *AndC = dyn_cast<ConstantSDNode>(N0.getOperand(1));
+ if (!ShiftC || !AndC)
+ return SDValue();
+
+ // If we can shrink the constant mask below 8 bits or 32 bits, then this
+ // transform should reduce code size. It may also enable secondary transforms
+ // from improved known-bits analysis or instruction selection.
+ APInt MaskVal = AndC->getAPIntValue();
+ APInt NewMaskVal = MaskVal.lshr(ShiftC->getAPIntValue());
+ unsigned OldMaskSize = MaskVal.getMinSignedBits();
+ unsigned NewMaskSize = NewMaskVal.getMinSignedBits();
+ if ((OldMaskSize > 8 && NewMaskSize <= 8) ||
+ (OldMaskSize > 32 && NewMaskSize <= 32)) {
+ // srl (and X, AndC), ShiftC --> and (srl X, ShiftC), (AndC >> ShiftC)
+ SDLoc DL(N);
+ SDValue NewMask = DAG.getConstant(NewMaskVal, DL, VT);
+ SDValue NewShift = DAG.getNode(ISD::SRL, DL, VT, N0.getOperand(0), N1);
+ return DAG.getNode(ISD::AND, DL, VT, NewShift, NewMask);
+ }
+ return SDValue();
+}
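// Illustrative sketch (not part of the patch): a concrete instance of the
// srl (and X, C1), C2 --> and (srl X, C2), (C1 >> C2) reordering above
// (assumes <cstdint>; the helper name is hypothetical).
static uint64_t modelSrlOfAnd(uint64_t X) {
  // Before the fold: (X & 0xFF00000000) >> 32 -- the mask needs its own move.
  // After the fold:  (X >> 32) & 0xFF         -- the mask is an 8-bit immediate.
  return (X >> 32) & 0xFF;
}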
+
/// \brief Returns a vector of 0s if the node in input is a vector logical
/// shift by a constant amount which is known to be bigger than or equal
/// to the vector element size in bits.
@@ -31745,7 +32665,11 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
return V;
if (N->getOpcode() == ISD::SRA)
- if (SDValue V = combineShiftRightAlgebraic(N, DAG))
+ if (SDValue V = combineShiftRightArithmetic(N, DAG))
+ return V;
+
+ if (N->getOpcode() == ISD::SRL)
+ if (SDValue V = combineShiftRightLogical(N, DAG))
return V;
// Try to fold this logical shift into a zero vector.
@@ -31756,6 +32680,90 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
+ "Unexpected shift opcode");
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ unsigned DstBitsPerElt = VT.getScalarSizeInBits();
+ unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
+ assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
+ N1.getScalarValueSizeInBits() == SrcBitsPerElt &&
+ "Unexpected PACKSS/PACKUS input type");
+
+ // Constant Folding.
+ APInt UndefElts0, UndefElts1;
+ SmallVector<APInt, 32> EltBits0, EltBits1;
+ if ((N0->isUndef() || N->isOnlyUserOf(N0.getNode())) &&
+ (N1->isUndef() || N->isOnlyUserOf(N1.getNode())) &&
+ getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
+ getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
+ unsigned NumLanes = VT.getSizeInBits() / 128;
+ unsigned NumDstElts = VT.getVectorNumElements();
+ unsigned NumSrcElts = NumDstElts / 2;
+ unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
+ unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
+ bool IsSigned = (X86ISD::PACKSS == Opcode);
+
+ APInt Undefs(NumDstElts, 0);
+ SmallVector<APInt, 32> Bits(NumDstElts, APInt::getNullValue(DstBitsPerElt));
+ for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+ for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
+ unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
+ auto &UndefElts = (Elt >= NumSrcEltsPerLane ? UndefElts1 : UndefElts0);
+ auto &EltBits = (Elt >= NumSrcEltsPerLane ? EltBits1 : EltBits0);
+
+ if (UndefElts[SrcIdx]) {
+ Undefs.setBit(Lane * NumDstEltsPerLane + Elt);
+ continue;
+ }
+
+ APInt &Val = EltBits[SrcIdx];
+ if (IsSigned) {
+ // PACKSS: Truncate signed value with signed saturation.
+ // Source values less than dst minint are saturated to minint.
+ // Source values greater than dst maxint are saturated to maxint.
+ if (Val.isSignedIntN(DstBitsPerElt))
+ Val = Val.trunc(DstBitsPerElt);
+ else if (Val.isNegative())
+ Val = APInt::getSignedMinValue(DstBitsPerElt);
+ else
+ Val = APInt::getSignedMaxValue(DstBitsPerElt);
+ } else {
+ // PACKUS: Truncate signed value with unsigned saturation.
+ // Source values less than zero are saturated to zero.
+ // Source values greater than dst maxuint are saturated to maxuint.
+ if (Val.isIntN(DstBitsPerElt))
+ Val = Val.trunc(DstBitsPerElt);
+ else if (Val.isNegative())
+ Val = APInt::getNullValue(DstBitsPerElt);
+ else
+ Val = APInt::getAllOnesValue(DstBitsPerElt);
+ }
+ Bits[Lane * NumDstEltsPerLane + Elt] = Val;
+ }
+ }
+
+ return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
+ }
+
+ // Attempt to combine as shuffle.
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(
+ {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
+ DCI.CombineTo(N, Res);
+ return SDValue();
+ }
+
+ return SDValue();
+}
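// Illustrative sketch (not part of the patch): the per-element saturation rules
// the constant folding above applies, written for an i16 -> i8 pack (assumes
// <algorithm>/<cstdint>; the helper name is hypothetical).
static uint8_t modelPackElt(int16_t V, bool IsSigned) {
  if (IsSigned) // PACKSS: clamp to [-128, 127], then truncate.
    return uint8_t(std::max<int16_t>(-128, std::min<int16_t>(127, V)));
  // PACKUS: clamp to [0, 255], then truncate.
  return uint8_t(std::max<int16_t>(0, std::min<int16_t>(255, V)));
}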
+
static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -31796,15 +32804,24 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
N0.getOpcode() == X86ISD::VSRAI)
return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
+ // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
+ if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSHLI &&
+ N1 == N0.getOperand(1)) {
+ SDValue N00 = N0.getOperand(0);
+ unsigned NumSignBits = DAG.ComputeNumSignBits(N00);
+ if (ShiftVal.ult(NumSignBits))
+ return N00;
+ }
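  // Illustrative sketch (not part of the patch): with C1 == 24 on v4i32 lanes,
  // (X << 24) >> 24 (arithmetic) is a sign_extend_inreg from i8; if X is
  // already known to have more than 24 sign bits (e.g. a compare result or an
  // earlier sign extension), the shift pair is a no-op and X is returned
  // unchanged.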
+
// We can decode 'whole byte' logical bit shifts as shuffles.
if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
SDValue Op(N, 0);
- SmallVector<int, 1> NonceMask; // Just a placeholder.
- NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
- /*Depth*/ 1, /*HasVarMask*/ false, DAG,
- DCI, Subtarget))
- return SDValue(); // This routine will use CombineTo to replace N.
+ if (SDValue Res = combineX86ShufflesRecursively(
+ {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
+ DCI.CombineTo(N, Res);
+ return SDValue();
+ }
}
// Constant Folding.
@@ -31840,11 +32857,13 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
// Attempt to combine PINSRB/PINSRW patterns to a shuffle.
SDValue Op(N, 0);
- SmallVector<int, 1> NonceMask; // Just a placeholder.
- NonceMask.push_back(0);
- combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
- /*Depth*/ 1, /*HasVarMask*/ false, DAG,
- DCI, Subtarget);
+ if (SDValue Res = combineX86ShufflesRecursively(
+ {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
+ DCI.CombineTo(N, Res);
+ return SDValue();
+ }
+
return SDValue();
}
@@ -31911,8 +32930,9 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
SDValue FSetCC =
DAG.getNode(X86ISD::FSETCCM, DL, MVT::v1i1, CMP00, CMP01,
DAG.getConstant(x86cc, DL, MVT::i8));
- return DAG.getNode(X86ISD::VEXTRACT, DL, N->getSimpleValueType(0),
- FSetCC, DAG.getIntPtrConstant(0, DL));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+ N->getSimpleValueType(0), FSetCC,
+ DAG.getIntPtrConstant(0, DL));
}
SDValue OnesOrZeroesF = DAG.getNode(X86ISD::FSETCC, DL,
CMP00.getValueType(), CMP00, CMP01,
@@ -32103,8 +33123,7 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
return SDValue();
APInt SplatVal;
- if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal,
- /*AllowShrink*/false) ||
+ if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
!SplatVal.isMask())
return SDValue();
@@ -32122,9 +33141,137 @@ static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(N->getValueType(0), Shift);
}
+// Get the index node from the lowered DAG of a GEP IR instruction with one
+// indexing dimension.
+static SDValue getIndexFromUnindexedLoad(LoadSDNode *Ld) {
+ if (Ld->isIndexed())
+ return SDValue();
+
+ SDValue Base = Ld->getBasePtr();
+
+ if (Base.getOpcode() != ISD::ADD)
+ return SDValue();
+
+ SDValue ShiftedIndex = Base.getOperand(0);
+
+ if (ShiftedIndex.getOpcode() != ISD::SHL)
+ return SDValue();
+
+ return ShiftedIndex.getOperand(0);
+
+}
+
+static bool hasBZHI(const X86Subtarget &Subtarget, MVT VT) {
+ if (Subtarget.hasBMI2() && VT.isScalarInteger()) {
+ switch (VT.getSizeInBits()) {
+ default: return false;
+ case 64: return Subtarget.is64Bit();
+ case 32: return true;
+ }
+ }
+ return false;
+}
+
+// This function recognizes cases where the X86 BZHI instruction can replace an
+// 'and'-with-load sequence.
+// When an integer value is loaded from an array of constants defined as
+//
+// int array[SIZE] = {0x0, 0x1, 0x3, 0x7, 0xF, ..., 2^(SIZE-1) - 1}
+//
+// and the loaded value is then ANDed with another input, the combination is
+// equivalent to performing BZHI (zero high bits) on that input, using the
+// same index as the load.
+static SDValue combineAndLoadToBZHI(SDNode *Node, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ MVT VT = Node->getSimpleValueType(0);
+ SDLoc dl(Node);
+
+ // Check if subtarget has BZHI instruction for the node's type
+ if (!hasBZHI(Subtarget, VT))
+ return SDValue();
+
+ // Try matching the pattern for both operands.
+ for (unsigned i = 0; i < 2; i++) {
+ SDValue N = Node->getOperand(i);
+ LoadSDNode *Ld = dyn_cast<LoadSDNode>(N.getNode());
+
+ // Bail out if the operand is not a load instruction.
+ if (!Ld)
+ return SDValue();
+
+ const Value *MemOp = Ld->getMemOperand()->getValue();
+
+ if (!MemOp)
+ return SDValue();
+
+ if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(MemOp)) {
+ if (GlobalVariable *GV = dyn_cast<GlobalVariable>(GEP->getOperand(0))) {
+ if (GV->isConstant() && GV->hasDefinitiveInitializer()) {
+
+ Constant *Init = GV->getInitializer();
+ Type *Ty = Init->getType();
+ if (!isa<ConstantDataArray>(Init) ||
+ !Ty->getArrayElementType()->isIntegerTy() ||
+ Ty->getArrayElementType()->getScalarSizeInBits() !=
+ VT.getSizeInBits() ||
+ Ty->getArrayNumElements() >
+ Ty->getArrayElementType()->getScalarSizeInBits())
+ continue;
+
+ // Check if the array's constant elements are suitable for our case.
+ uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
+ bool ConstantsMatch = true;
+ for (uint64_t j = 0; j < ArrayElementCount; j++) {
+ ConstantInt *Elem =
+ dyn_cast<ConstantInt>(Init->getAggregateElement(j));
+ if (Elem->getZExtValue() != (((uint64_t)1 << j) - 1)) {
+ ConstantsMatch = false;
+ break;
+ }
+ }
+ if (!ConstantsMatch)
+ continue;
+
+ // Do the transformation (shown for a 32-bit type):
+ // (and (load arr[idx]), inp)
+ // --> (and inp, (srl 0xFFFFFFFF, (sub 32, idx)))
+ // which will then be selected as a single BZHI instruction.
+ SDValue Inp = (i == 0) ? Node->getOperand(1) : Node->getOperand(0);
+ SDValue SizeC = DAG.getConstant(VT.getSizeInBits(), dl, VT);
+
+ // Get the Node which indexes into the array.
+ SDValue Index = getIndexFromUnindexedLoad(Ld);
+ if (!Index)
+ return SDValue();
+ Index = DAG.getZExtOrTrunc(Index, dl, VT);
+
+ SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, SizeC, Index);
+
+ SDValue AllOnes = DAG.getAllOnesConstant(dl, VT);
+ SDValue LShr = DAG.getNode(ISD::SRL, dl, VT, AllOnes, Sub);
+
+ return DAG.getNode(ISD::AND, dl, VT, Inp, LShr);
+ }
+ }
+ }
+ }
+ return SDValue();
+}
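// Illustrative sketch (not part of the patch): the mask-table pattern this
// combine recognizes and the BMI2 form it is intended to become (assumes
// <immintrin.h> on a BMI2 target; MaskTable and the helper names are
// hypothetical). MaskTable[j] == (1u << j) - 1 for every valid j.
extern const uint32_t MaskTable[32];
static uint32_t modelBeforeBZHI(uint32_t In, unsigned Idx) {
  return In & MaskTable[Idx]; // and-load of the constant table
}
static uint32_t modelAfterBZHI(uint32_t In, unsigned Idx) {
  return _bzhi_u32(In, Idx);  // keep the low Idx bits of In
}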
+
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ EVT VT = N->getValueType(0);
+
+ // If this is SSE1 only convert to FAND to avoid scalarization.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
+ return DAG.getBitcast(
+ MVT::v4i32, DAG.getNode(X86ISD::FAND, SDLoc(N), MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
+ DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
+ }
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -32140,45 +33287,59 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
return ShiftRight;
- EVT VT = N->getValueType(0);
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
+ if (SDValue R = combineAndLoadToBZHI(N, DAG, Subtarget))
+ return R;
// Attempt to recursively combine a bitmask AND with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- SmallVector<int, 1> NonceMask; // Just a placeholder.
- NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
- /*Depth*/ 1, /*HasVarMask*/ false, DAG,
- DCI, Subtarget))
- return SDValue(); // This routine will use CombineTo to replace N.
+ if (SDValue Res = combineX86ShufflesRecursively(
+ {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
+ DCI.CombineTo(N, Res);
+ return SDValue();
+ }
}
- // Create BEXTR instructions
- // BEXTR is ((X >> imm) & (2**size-1))
- if (VT != MVT::i32 && VT != MVT::i64)
- return SDValue();
+ // Attempt to combine a scalar bitmask AND with an extracted shuffle.
+ if ((VT.getScalarSizeInBits() % 8) == 0 &&
+ N->getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(N->getOperand(0).getOperand(1))) {
+ SDValue BitMask = N->getOperand(1);
+ SDValue SrcVec = N->getOperand(0).getOperand(0);
+ EVT SrcVecVT = SrcVec.getValueType();
- if (!Subtarget.hasBMI() && !Subtarget.hasTBM())
- return SDValue();
- if (N0.getOpcode() != ISD::SRA && N0.getOpcode() != ISD::SRL)
- return SDValue();
+ // Check that the constant bitmask masks whole bytes.
+ APInt UndefElts;
+ SmallVector<APInt, 64> EltBits;
+ if (VT == SrcVecVT.getScalarType() &&
+ N->getOperand(0)->isOnlyUserOf(SrcVec.getNode()) &&
+ getTargetConstantBitsFromNode(BitMask, 8, UndefElts, EltBits) &&
+ llvm::all_of(EltBits, [](APInt M) {
+ return M.isNullValue() || M.isAllOnesValue();
+ })) {
+ unsigned NumElts = SrcVecVT.getVectorNumElements();
+ unsigned Scale = SrcVecVT.getScalarSizeInBits() / 8;
+ unsigned Idx = N->getOperand(0).getConstantOperandVal(1);
+
+ // Create a root shuffle mask from the byte mask and the extracted index.
+ SmallVector<int, 16> ShuffleMask(NumElts * Scale, SM_SentinelUndef);
+ for (unsigned i = 0; i != Scale; ++i) {
+ if (UndefElts[i])
+ continue;
+ int VecIdx = Scale * Idx + i;
+ ShuffleMask[VecIdx] =
+ EltBits[i].isNullValue() ? SM_SentinelZero : VecIdx;
+ }
- ConstantSDNode *MaskNode = dyn_cast<ConstantSDNode>(N1);
- ConstantSDNode *ShiftNode = dyn_cast<ConstantSDNode>(N0.getOperand(1));
- if (MaskNode && ShiftNode) {
- uint64_t Mask = MaskNode->getZExtValue();
- uint64_t Shift = ShiftNode->getZExtValue();
- if (isMask_64(Mask)) {
- uint64_t MaskSize = countPopulation(Mask);
- if (Shift + MaskSize <= VT.getSizeInBits())
- return DAG.getNode(X86ISD::BEXTR, DL, VT, N0.getOperand(0),
- DAG.getConstant(Shift | (MaskSize << 8), DL,
- VT));
+ if (SDValue Shuffle = combineX86ShufflesRecursively(
+ {SrcVec}, 0, SrcVec, ShuffleMask, {}, /*Depth*/ 2,
+ /*HasVarMask*/ false, DAG, DCI, Subtarget))
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), VT, Shuffle,
+ N->getOperand(0).getOperand(1));
}
}
+
return SDValue();
}
@@ -32411,6 +33572,18 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // If this is SSE1 only convert to FOR to avoid scalarization.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
+ return DAG.getBitcast(MVT::v4i32,
+ DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, N0),
+ DAG.getBitcast(MVT::v4f32, N1)));
+ }
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -32423,15 +33596,11 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- EVT VT = N->getValueType(0);
-
if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- bool OptForSize = DAG.getMachineFunction().getFunction()->optForSize();
+ bool OptForSize = DAG.getMachineFunction().getFunction().optForSize();
// SHLD/SHRD instructions have lower register pressure, but on some
// platforms they have higher latency than the equivalent
@@ -32521,38 +33690,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Generate NEG and CMOV for integer abs.
-static SDValue combineIntegerAbs(SDNode *N, SelectionDAG &DAG) {
- EVT VT = N->getValueType(0);
-
- // Since X86 does not have CMOV for 8-bit integer, we don't convert
- // 8-bit integer abs to NEG and CMOV.
- if (VT.isInteger() && VT.getSizeInBits() == 8)
- return SDValue();
-
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- SDLoc DL(N);
-
- // Check pattern of XOR(ADD(X,Y), Y) where Y is SRA(X, size(X)-1)
- // and change it to SUB and CMOV.
- if (VT.isInteger() && N->getOpcode() == ISD::XOR &&
- N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
- N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0)) {
- auto *Y1C = dyn_cast<ConstantSDNode>(N1.getOperand(1));
- if (Y1C && Y1C->getAPIntValue() == VT.getSizeInBits() - 1) {
- // Generate SUB & CMOV.
- SDValue Neg = DAG.getNode(X86ISD::SUB, DL, DAG.getVTList(VT, MVT::i32),
- DAG.getConstant(0, DL, VT), N0.getOperand(0));
- SDValue Ops[] = {N0.getOperand(0), Neg,
- DAG.getConstant(X86::COND_GE, DL, MVT::i8),
- SDValue(Neg.getNode(), 1)};
- return DAG.getNode(X86ISD::CMOV, DL, DAG.getVTList(VT, MVT::Glue), Ops);
- }
- }
- return SDValue();
-}
-
/// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
@@ -32688,8 +33825,7 @@ static SDValue detectUSatPattern(SDValue In, EVT VT) {
"Unexpected types for truncate operation");
APInt C;
- if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C,
- /*AllowShrink*/false)) {
+ if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
// C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
// the element size of the destination type.
return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
@@ -33081,6 +34217,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
WideSrc0 = DAG.getVectorShuffle(WideVecVT, dl, WideSrc0,
DAG.getUNDEF(WideVecVT), ShuffleVec);
}
+
// Prepare the new mask.
SDValue NewMask;
SDValue Mask = Mld->getMask();
@@ -33103,12 +34240,9 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
WidenNumElts);
unsigned NumConcat = WidenNumElts / MaskNumElts;
- SmallVector<SDValue, 16> Ops(NumConcat);
SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
+ SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
Ops[0] = Mask;
- for (unsigned i = 1; i != NumConcat; ++i)
- Ops[i] = ZeroVal;
-
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
@@ -33154,8 +34288,33 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
if (Mst->isCompressingStore())
return SDValue();
- if (!Mst->isTruncatingStore())
- return reduceMaskedStoreToScalarStore(Mst, DAG);
+ if (!Mst->isTruncatingStore()) {
+ if (SDValue ScalarStore = reduceMaskedStoreToScalarStore(Mst, DAG))
+ return ScalarStore;
+
+ // If the mask is checking (0 > X), we're creating a vector with all-zeros
+ // or all-ones elements based on the sign bits of X. AVX1 masked store only
+ // cares about the sign bit of each mask element, so eliminate the compare:
+ // mstore val, ptr, (pcmpgt 0, X) --> mstore val, ptr, X
+ // Note that by waiting to match an x86-specific PCMPGT node, we're
+ // eliminating potentially more complex matching of a setcc node which has
+ // a full range of predicates.
+ SDValue Mask = Mst->getMask();
+ if (Mask.getOpcode() == X86ISD::PCMPGT &&
+ ISD::isBuildVectorAllZeros(Mask.getOperand(0).getNode())) {
+ assert(Mask.getValueType() == Mask.getOperand(1).getValueType() &&
+ "Unexpected type for PCMPGT");
+ return DAG.getMaskedStore(
+ Mst->getChain(), SDLoc(N), Mst->getValue(), Mst->getBasePtr(),
+ Mask.getOperand(1), Mst->getMemoryVT(), Mst->getMemOperand());
+ }
+
+ // TODO: AVX512 targets should also be able to simplify something like the
+ // pattern above, but that pattern will be different. It will either need to
+ // match setcc more generally or match PCMPGTM later (in tablegen?).
+
+ return SDValue();
+ }
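  // Illustrative sketch (not part of the patch): AVX masked stores (VMASKMOVPS
  // and friends) only test the sign bit of each mask lane, so for a
  // source-level store such as
  //   _mm256_maskstore_ps(Ptr, MaskVec, Val)
  // a mask built as (pcmpgt 0, X) stores exactly the same lanes as X itself,
  // which is why the compare can be dropped above.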
// Resolve truncating stores.
EVT VT = Mst->getValue().getValueType();
@@ -33226,12 +34385,9 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
WidenNumElts);
unsigned NumConcat = WidenNumElts / MaskNumElts;
- SmallVector<SDValue, 16> Ops(NumConcat);
SDValue ZeroVal = DAG.getConstant(0, dl, Mask.getValueType());
+ SmallVector<SDValue, 16> Ops(NumConcat, ZeroVal);
Ops[0] = Mask;
- for (unsigned i = 1; i != NumConcat; ++i)
- Ops[i] = ZeroVal;
-
NewMask = DAG.getNode(ISD::CONCAT_VECTORS, dl, NewMaskVT, Ops);
}
@@ -33384,8 +34540,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (VT.getSizeInBits() != 64)
return SDValue();
- const Function *F = DAG.getMachineFunction().getFunction();
- bool NoImplicitFloatOps = F->hasFnAttribute(Attribute::NoImplicitFloat);
+ const Function &F = DAG.getMachineFunction().getFunction();
+ bool NoImplicitFloatOps = F.hasFnAttribute(Attribute::NoImplicitFloat);
bool F64IsLegal =
!Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2();
if ((VT.isVector() ||
@@ -33393,28 +34549,10 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
isa<LoadSDNode>(St->getValue()) &&
!cast<LoadSDNode>(St->getValue())->isVolatile() &&
St->getChain().hasOneUse() && !St->isVolatile()) {
- SDNode* LdVal = St->getValue().getNode();
- LoadSDNode *Ld = nullptr;
- int TokenFactorIndex = -1;
+ LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
SmallVector<SDValue, 8> Ops;
- SDNode* ChainVal = St->getChain().getNode();
- // Must be a store of a load. We currently handle two cases: the load
- // is a direct child, and it's under an intervening TokenFactor. It is
- // possible to dig deeper under nested TokenFactors.
- if (ChainVal == LdVal)
- Ld = cast<LoadSDNode>(St->getChain());
- else if (St->getValue().hasOneUse() &&
- ChainVal->getOpcode() == ISD::TokenFactor) {
- for (unsigned i = 0, e = ChainVal->getNumOperands(); i != e; ++i) {
- if (ChainVal->getOperand(i).getNode() == LdVal) {
- TokenFactorIndex = i;
- Ld = cast<LoadSDNode>(St->getValue());
- } else
- Ops.push_back(ChainVal->getOperand(i));
- }
- }
- if (!Ld || !ISD::isNormalLoad(Ld))
+ if (!ISD::isNormalLoad(Ld))
return SDValue();
// If this is not the MMX case, i.e. we are just turning i64 load/store
@@ -33431,17 +34569,12 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (Subtarget.is64Bit() || F64IsLegal) {
MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
+ Ld->getMemOperand());
+
// Make sure new load is placed in same chain order.
- SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
- if (TokenFactorIndex >= 0) {
- Ops.push_back(NewChain);
- NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
- }
- return DAG.getStore(NewChain, StDL, NewLd, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
- St->getMemOperand()->getFlags());
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
+ return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
+ St->getMemOperand());
}
// Otherwise, lower to two pairs of 32-bit loads / stores.
@@ -33456,23 +34589,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
MinAlign(Ld->getAlignment(), 4),
Ld->getMemOperand()->getFlags());
// Make sure new loads are placed in same chain order.
- SDValue NewChain = DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
- NewChain = DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
-
- if (TokenFactorIndex >= 0) {
- Ops.push_back(NewChain);
- NewChain = DAG.getNode(ISD::TokenFactor, LdDL, MVT::Other, Ops);
- }
+ DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
+ DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
LoAddr = St->getBasePtr();
HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
SDValue LoSt =
- DAG.getStore(NewChain, StDL, LoLd, LoAddr, St->getPointerInfo(),
+ DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
St->getAlignment(), St->getMemOperand()->getFlags());
- SDValue HiSt = DAG.getStore(
- NewChain, StDL, HiLd, HiAddr, St->getPointerInfo().getWithOffset(4),
- MinAlign(St->getAlignment(), 4), St->getMemOperand()->getFlags());
+ SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
+ St->getPointerInfo().getWithOffset(4),
+ MinAlign(St->getAlignment(), 4),
+ St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
}
@@ -33726,6 +34855,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
case ISD::ADD: {
+ // TODO: ISD::SUB should be here but interferes with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(Opcode, VT) &&
@@ -33882,8 +35012,9 @@ static SDValue combineVectorTruncation(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// This function transforms vector truncation of 'all or none' bits values.
-/// vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32 into X86ISD::PACKSS operations.
+/// This function transforms vector truncations of 'extended sign-bits' or
+/// 'extended zero-bits' values (vXi16/vXi32/vXi64 to vXi8/vXi16/vXi32) into
+/// X86ISD::PACKSS/PACKUS operations.
static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -33904,12 +35035,6 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
MVT InVT = In.getValueType().getSimpleVT();
MVT InSVT = InVT.getScalarType();
- // Use PACKSS if the input is a splatted sign bit.
- // e.g. Comparison result, sext_in_reg, etc.
- unsigned NumSignBits = DAG.ComputeNumSignBits(In);
- if (NumSignBits != InSVT.getSizeInBits())
- return SDValue();
-
// Check we have a truncation suited for PACKSS.
if (!VT.is128BitVector() && !VT.is256BitVector())
return SDValue();
@@ -33918,7 +35043,23 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, SDLoc &DL,
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
- return truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget);
+ // Use PACKSS if the input has sign-bits that extend all the way to the
+ // packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+ unsigned NumPackedBits = std::min<unsigned>(SVT.getSizeInBits(), 16);
+ if (NumSignBits > (InSVT.getSizeInBits() - NumPackedBits))
+ return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
+
+ // Use PACKUS if the input has zero-bits that extend all the way to the
+ // packed/truncated value. e.g. masks, zext_in_reg, etc.
+ KnownBits Known;
+ DAG.computeKnownBits(In, Known);
+ unsigned NumLeadingZeroBits = Known.countMinLeadingZeros();
+ NumPackedBits = Subtarget.hasSSE41() ? NumPackedBits : 8;
+ if (NumLeadingZeroBits >= (InSVT.getSizeInBits() - NumPackedBits))
+ return truncateVectorWithPACK(X86ISD::PACKUS, VT, In, DL, DAG, Subtarget);
+
+ return SDValue();
}
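// Illustrative sketch (not part of the patch): the two kinds of input the
// widened combine now handles, shown for a v8i32 -> v8i16 truncate with SSE
// intrinsics (assumes <immintrin.h>; PACKUSDW needs SSE4.1; helper names are
// hypothetical).
static __m128i modelTruncOfSExtInput(__m128i Lo, __m128i Hi) {
  // Every lane is a sign-extended i16 value, so signed saturation is a no-op.
  return _mm_packs_epi32(Lo, Hi);   // PACKSSDW
}
static __m128i modelTruncOfZExtInput(__m128i Lo, __m128i Hi) {
  // The upper 16 bits of every lane are known zero, so unsigned saturation is
  // a no-op.
  return _mm_packus_epi32(Lo, Hi);  // PACKUSDW
}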
static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
@@ -33947,7 +35088,7 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::MMX_MOVD2W, DL, MVT::i32, BCSrc);
}
- // Try to truncate extended sign bits with PACKSS.
+ // Try to truncate extended sign/zero bits with PACKSS/PACKUS.
if (SDValue V = combineVectorSignBitsTruncation(N, DL, DAG, Subtarget))
return V;
@@ -34038,10 +35179,10 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
unsigned NewOpcode = 0;
if (Arg.hasOneUse()) {
switch (Arg.getOpcode()) {
- case X86ISD::FMADD: NewOpcode = X86ISD::FNMSUB; break;
+ case ISD::FMA: NewOpcode = X86ISD::FNMSUB; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FNMADD; break;
case X86ISD::FNMADD: NewOpcode = X86ISD::FMSUB; break;
- case X86ISD::FNMSUB: NewOpcode = X86ISD::FMADD; break;
+ case X86ISD::FNMSUB: NewOpcode = ISD::FMA; break;
case X86ISD::FMADD_RND: NewOpcode = X86ISD::FNMSUB_RND; break;
case X86ISD::FMSUB_RND: NewOpcode = X86ISD::FNMADD_RND; break;
case X86ISD::FNMADD_RND: NewOpcode = X86ISD::FMSUB_RND; break;
@@ -34083,22 +35224,47 @@ static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+
+/// Fold a xor(setcc cond, val), 1 --> setcc (inverted(cond), val)
+static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
+ if (N->getOpcode() != ISD::XOR)
+ return SDValue();
+
+ SDValue LHS = N->getOperand(0);
+ auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
+ return SDValue();
+
+ X86::CondCode NewCC = X86::GetOppositeBranchCondition(
+ X86::CondCode(LHS->getConstantOperandVal(0)));
+ SDLoc DL(N);
+ return getSETCC(NewCC, LHS->getOperand(1), DL, DAG);
+}
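// Illustrative sketch (not part of the patch): the scalar identity behind the
// fold above (the helper name is hypothetical).
static bool modelXor1SetCC(unsigned A, unsigned B) {
  // (A < B) ^ 1  ==  (A >= B): flip the condition code instead of XORing.
  return A >= B;
}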
+
static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
+ // If this is SSE1 only convert to FXOR to avoid scalarization.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() &&
+ N->getValueType(0) == MVT::v4i32) {
+ return DAG.getBitcast(
+ MVT::v4i32, DAG.getNode(X86ISD::FXOR, SDLoc(N), MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, N->getOperand(0)),
+ DAG.getBitcast(MVT::v4f32, N->getOperand(1))));
+ }
+
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ if (SDValue SetCC = foldXor1SetCC(N, DAG))
+ return SetCC;
+
if (SDValue RV = foldXorTruncShiftIntoCmp(N, DAG))
return RV;
- if (Subtarget.hasCMov())
- if (SDValue RV = combineIntegerAbs(N, DAG))
- return RV;
-
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
@@ -34138,10 +35304,13 @@ static SDValue combineFAndFNotToFAndn(SDNode *N, SelectionDAG &DAG,
// Vector types are handled in combineANDXORWithAllOnesIntoANDNP().
if (!((VT == MVT::f32 && Subtarget.hasSSE1()) ||
- (VT == MVT::f64 && Subtarget.hasSSE2())))
+ (VT == MVT::f64 && Subtarget.hasSSE2()) ||
+ (VT == MVT::v4f32 && Subtarget.hasSSE1() && !Subtarget.hasSSE2())))
return SDValue();
auto isAllOnesConstantFP = [](SDValue V) {
+ if (V.getSimpleValueType().isVector())
+ return ISD::isBuildVectorAllOnes(V.getNode());
auto *C = dyn_cast<ConstantFPSDNode>(V);
return C && C->getConstantFPValue()->isAllOnesValue();
};
@@ -34247,7 +35416,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
// This takes at least 3 instructions, so favor a library call when operating
// on a scalar and minimizing code size.
- if (!VT.isVector() && DAG.getMachineFunction().getFunction()->optForMinSize())
+ if (!VT.isVector() && DAG.getMachineFunction().getFunction().optForMinSize())
return SDValue();
SDValue Op0 = N->getOperand(0);
@@ -34301,12 +35470,12 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
// Attempt to recursively combine a bitmask ANDNP with shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
SDValue Op(N, 0);
- SmallVector<int, 1> NonceMask; // Just a placeholder.
- NonceMask.push_back(0);
- if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
- /*Depth*/ 1, /*HasVarMask*/ false, DAG,
- DCI, Subtarget))
- return SDValue(); // This routine will use CombineTo to replace N.
+ if (SDValue Res = combineX86ShufflesRecursively(
+ {Op}, 0, Op, {0}, {}, /*Depth*/ 1,
+ /*HasVarMask*/ false, DAG, DCI, Subtarget)) {
+ DCI.CombineTo(N, Res);
+ return SDValue();
+ }
}
return SDValue();
@@ -34314,19 +35483,15 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
// BT ignores high bits in the bit index operand.
- SDValue Op1 = N->getOperand(1);
- if (Op1.hasOneUse()) {
- unsigned BitWidth = Op1.getValueSizeInBits();
- APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
- KnownBits Known;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.ShrinkDemandedConstant(Op1, DemandedMask, TLO) ||
- TLI.SimplifyDemandedBits(Op1, DemandedMask, Known, TLO))
- DCI.CommitTargetLoweringOpt(TLO);
- }
+ unsigned BitWidth = N1.getValueSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
+ if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
+ return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
+
return SDValue();
}
@@ -34444,18 +35609,152 @@ static SDValue getDivRem8(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
EVT InVT = N0.getValueType();
- if (N0.getResNo() != 1 || InVT != MVT::i8 || VT != MVT::i32)
+ if (N0.getResNo() != 1 || InVT != MVT::i8 ||
+ !(VT == MVT::i32 || VT == MVT::i64))
return SDValue();
- SDVTList NodeTys = DAG.getVTList(MVT::i8, VT);
+ SDVTList NodeTys = DAG.getVTList(MVT::i8, MVT::i32);
auto DivRemOpcode = OpcodeN0 == ISD::SDIVREM ? X86ISD::SDIVREM8_SEXT_HREG
: X86ISD::UDIVREM8_ZEXT_HREG;
SDValue R = DAG.getNode(DivRemOpcode, SDLoc(N), NodeTys, N0.getOperand(0),
N0.getOperand(1));
DAG.ReplaceAllUsesOfValueWith(N0.getValue(0), R.getValue(0));
+ // If this was a 64-bit extend, complete it.
+ if (VT == MVT::i64)
+ return DAG.getNode(OpcodeN, SDLoc(N), VT, R.getValue(1));
return R.getValue(1);
}
+// If we face an {ANY,SIGN,ZERO}_EXTEND that is applied to a CMOV with constant
+// operands and the result of the CMOV is not used anywhere else, promote the
+// CMOV itself instead of promoting its result. This could be beneficial, because:
+// 1) X86TargetLowering::EmitLoweredSelect later can do merging of two
+// (or more) pseudo-CMOVs only when they go one-after-another and
+// getting rid of result extension code after CMOV will help that.
+// 2) Promotion of constant CMOV arguments is free, hence the
+// {ANY,SIGN,ZERO}_EXTEND will just be deleted.
+// 3) 16-bit CMOV encoding is 4 bytes, 32-bit CMOV is 3 bytes, so this
+//    promotion is also good in terms of code size.
+//    (64-bit CMOV is 4 bytes, which is why we don't do 32-bit => 64-bit
+//    promotion).
+static SDValue combineToExtendCMOV(SDNode *Extend, SelectionDAG &DAG) {
+ SDValue CMovN = Extend->getOperand(0);
+ if (CMovN.getOpcode() != X86ISD::CMOV)
+ return SDValue();
+
+ EVT TargetVT = Extend->getValueType(0);
+ unsigned ExtendOpcode = Extend->getOpcode();
+ SDLoc DL(Extend);
+
+ EVT VT = CMovN.getValueType();
+ SDValue CMovOp0 = CMovN.getOperand(0);
+ SDValue CMovOp1 = CMovN.getOperand(1);
+
+ bool DoPromoteCMOV =
+ (VT == MVT::i16 && (TargetVT == MVT::i32 || TargetVT == MVT::i64)) &&
+ CMovN.hasOneUse() &&
+ (isa<ConstantSDNode>(CMovOp0.getNode()) &&
+ isa<ConstantSDNode>(CMovOp1.getNode()));
+
+ if (!DoPromoteCMOV)
+ return SDValue();
+
+ CMovOp0 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp0);
+ CMovOp1 = DAG.getNode(ExtendOpcode, DL, TargetVT, CMovOp1);
+
+ return DAG.getNode(X86ISD::CMOV, DL, TargetVT, CMovOp0, CMovOp1,
+ CMovN.getOperand(2), CMovN.getOperand(3));
+}
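A small standalone sketch (not part of the patch) of why promoting the CMOV is safe: extending the result of a select with constant arms equals selecting between pre-extended constants. Both helpers are hypothetical scalar models of the before/after DAGs, shown here for the sign-extend case.

#include <cassert>
#include <cstdint>
int32_t ext_of_select(bool c, int16_t a, int16_t b) { return (int32_t)(c ? a : b); }
int32_t select_of_ext(bool c, int16_t a, int16_t b) { return c ? (int32_t)a : (int32_t)b; }
int main() {
  const int16_t K0 = -7, K1 = 12345; // constant CMOV arms
  for (bool c : {false, true})
    assert(ext_of_select(c, K0, K1) == select_of_ext(c, K0, K1));
}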
+
+// Convert (vXiY *ext(vXi1 bitcast(iX))) to extend_in_reg(broadcast(iX)).
+// This is more or less the reverse of combineBitcastvxi1.
+static SDValue
+combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ unsigned Opcode = N->getOpcode();
+ if (Opcode != ISD::SIGN_EXTEND && Opcode != ISD::ZERO_EXTEND &&
+ Opcode != ISD::ANY_EXTEND)
+ return SDValue();
+ if (!DCI.isBeforeLegalizeOps())
+ return SDValue();
+ if (!Subtarget.hasSSE2() || Subtarget.hasAVX512())
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ EVT SVT = VT.getScalarType();
+ EVT InSVT = N0.getValueType().getScalarType();
+ unsigned EltSizeInBits = SVT.getSizeInBits();
+
+ // We must be extending a bool vector (bitcast from a scalar integer) to a
+ // vector of legal integer types.
+ if (!VT.isVector())
+ return SDValue();
+ if (SVT != MVT::i64 && SVT != MVT::i32 && SVT != MVT::i16 && SVT != MVT::i8)
+ return SDValue();
+ if (InSVT != MVT::i1 || N0.getOpcode() != ISD::BITCAST)
+ return SDValue();
+
+ SDValue N00 = N0.getOperand(0);
+ EVT SclVT = N0.getOperand(0).getValueType();
+ if (!SclVT.isScalarInteger())
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue Vec;
+ SmallVector<int, 32> ShuffleMask;
+ unsigned NumElts = VT.getVectorNumElements();
+ assert(NumElts == SclVT.getSizeInBits() && "Unexpected bool vector size");
+
+ // Broadcast the scalar integer to the vector elements.
+ if (NumElts > EltSizeInBits) {
+ // If the scalar integer is greater than the vector element size, then we
+ // must split it down into sub-sections for broadcasting. For example:
+ // i16 -> v16i8 (i16 -> v8i16 -> v16i8) with 2 sub-sections.
+ // i32 -> v32i8 (i32 -> v8i32 -> v32i8) with 4 sub-sections.
+ assert((NumElts % EltSizeInBits) == 0 && "Unexpected integer scale");
+ unsigned Scale = NumElts / EltSizeInBits;
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), SclVT, EltSizeInBits);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+ Vec = DAG.getBitcast(VT, Vec);
+
+ for (unsigned i = 0; i != Scale; ++i)
+ ShuffleMask.append(EltSizeInBits, i);
+ } else {
+ // For a smaller scalar integer, we can simply any-extend it to the vector
+ // element size (we don't care about the upper bits) and broadcast it to all
+ // elements.
+ SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
+ ShuffleMask.append(NumElts, 0);
+ }
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+
+ // Now, mask the relevant bit in each element.
+ SmallVector<SDValue, 32> Bits;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ int BitIdx = (i % EltSizeInBits);
+ APInt Bit = APInt::getBitsSet(EltSizeInBits, BitIdx, BitIdx + 1);
+ Bits.push_back(DAG.getConstant(Bit, DL, SVT));
+ }
+ SDValue BitMask = DAG.getBuildVector(VT, DL, Bits);
+ Vec = DAG.getNode(ISD::AND, DL, VT, Vec, BitMask);
+
+ // Compare against the bitmask and extend the result.
+ EVT CCVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
+ Vec = DAG.getSetCC(DL, CCVT, Vec, BitMask, ISD::SETEQ);
+ Vec = DAG.getSExtOrTrunc(Vec, DL, VT);
+
+ // For SEXT, this is now done, otherwise shift the result down for
+ // zero-extension.
+ if (Opcode == ISD::SIGN_EXTEND)
+ return Vec;
+ return DAG.getNode(ISD::SRL, DL, VT, Vec,
+ DAG.getConstant(EltSizeInBits - 1, DL, VT));
+}
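A standalone sketch (not part of the patch) of the broadcast/mask/compare trick above, modelled on a single i8 "vXi1" mask: each lane is ANDed with a one-bit mask and compared for equality to recover that bit as 0/-1 (sign-extend), then shifted down for zero-extension.

#include <cassert>
#include <cstdint>
int main() {
  uint8_t bits = 0b10110010; // the scalar holding the bool vector
  for (int i = 0; i != 8; ++i) {
    uint8_t lane = bits;                 // "broadcast" the scalar to lane i
    uint8_t bitmask = uint8_t(1u << i);  // per-lane single-bit mask
    // SETEQ against the bitmask after the AND: 0xFF if the bit is set.
    uint8_t sext = (uint8_t)(((lane & bitmask) == bitmask) ? 0xFF : 0x00);
    // Zero-extension: shift the sign-extended value down to a single bit.
    uint8_t zext = uint8_t(sext >> 7);
    assert(zext == ((bits >> i) & 1));
  }
}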
+
/// Convert a SEXT or ZEXT of a vector to a SIGN_EXTEND_VECTOR_INREG or
/// ZERO_EXTEND_VECTOR_INREG, this requires the splitting (or concatenating
/// with UNDEFs) of the input to vectors of the same size as the target type
@@ -34570,6 +35869,9 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (SDValue DivRem8 = getDivRem8(N, DAG))
return DivRem8;
+ if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
+ return NewCMov;
+
if (!DCI.isBeforeLegalizeOps()) {
if (InVT == MVT::i1) {
SDValue Zero = DAG.getConstant(0, DL, VT);
@@ -34592,6 +35894,9 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
+ if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
+ return V;
+
if (Subtarget.hasAVX() && VT.is256BitVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
@@ -34604,6 +35909,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ // TODO: Handle FMSUB/FNMADD/FNMSUB as the starting opcode.
SDLoc dl(N);
EVT VT = N->getValueType(0);
@@ -34629,48 +35935,112 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
// Do not convert the passthru input of scalar intrinsics.
// FIXME: We could allow negations of the lower element only.
- bool NegA = N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
+ bool NegA = N->getOpcode() != X86ISD::FMADDS1 &&
+ N->getOpcode() != X86ISD::FMADDS1_RND && invertIfNegative(A);
bool NegB = invertIfNegative(B);
- bool NegC = N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
+ bool NegC = N->getOpcode() != X86ISD::FMADDS3 &&
+ N->getOpcode() != X86ISD::FMADDS3_RND && invertIfNegative(C);
// Negative multiplication when NegA xor NegB
bool NegMul = (NegA != NegB);
+ bool HasNeg = NegA || NegB || NegC;
unsigned NewOpcode;
if (!NegMul)
- NewOpcode = (!NegC) ? X86ISD::FMADD : X86ISD::FMSUB;
+ NewOpcode = (!NegC) ? unsigned(ISD::FMA) : unsigned(X86ISD::FMSUB);
else
NewOpcode = (!NegC) ? X86ISD::FNMADD : X86ISD::FNMSUB;
+ // For FMA, we risk reconstructing the node we started with.
+ // In order to avoid this, we check for negation or opcode change. If
+ // one of the two happened, then it is a new node and we return it.
+ if (N->getOpcode() == ISD::FMA) {
+ if (HasNeg || NewOpcode != N->getOpcode())
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ return SDValue();
+ }
if (N->getOpcode() == X86ISD::FMADD_RND) {
switch (NewOpcode) {
- case X86ISD::FMADD: NewOpcode = X86ISD::FMADD_RND; break;
+ case ISD::FMA: NewOpcode = X86ISD::FMADD_RND; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB_RND; break;
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD_RND; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB_RND; break;
}
+ } else if (N->getOpcode() == X86ISD::FMADDS1) {
+ switch (NewOpcode) {
+ case ISD::FMA: NewOpcode = X86ISD::FMADDS1; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1; break;
+ }
+ } else if (N->getOpcode() == X86ISD::FMADDS3) {
+ switch (NewOpcode) {
+ case ISD::FMA: NewOpcode = X86ISD::FMADDS3; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3; break;
+ }
} else if (N->getOpcode() == X86ISD::FMADDS1_RND) {
switch (NewOpcode) {
- case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS1_RND; break;
+ case ISD::FMA: NewOpcode = X86ISD::FMADDS1_RND; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS1_RND; break;
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS1_RND; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS1_RND; break;
}
} else if (N->getOpcode() == X86ISD::FMADDS3_RND) {
switch (NewOpcode) {
- case X86ISD::FMADD: NewOpcode = X86ISD::FMADDS3_RND; break;
+ case ISD::FMA: NewOpcode = X86ISD::FMADDS3_RND; break;
case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUBS3_RND; break;
case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADDS3_RND; break;
case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUBS3_RND; break;
}
+ } else if (N->getOpcode() == X86ISD::FMADD4S) {
+ switch (NewOpcode) {
+ case ISD::FMA: NewOpcode = X86ISD::FMADD4S; break;
+ case X86ISD::FMSUB: NewOpcode = X86ISD::FMSUB4S; break;
+ case X86ISD::FNMADD: NewOpcode = X86ISD::FNMADD4S; break;
+ case X86ISD::FNMSUB: NewOpcode = X86ISD::FNMSUB4S; break;
+ }
} else {
- assert((N->getOpcode() == X86ISD::FMADD || N->getOpcode() == ISD::FMA) &&
- "Unexpected opcode!");
+ llvm_unreachable("Unexpected opcode!");
+ }
+
+ // Only return the node if the opcode was changed or one of the
+ // operands was negated. If not, we'll just recreate the same node.
+ if (HasNeg || NewOpcode != N->getOpcode()) {
+ if (N->getNumOperands() == 4)
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
return DAG.getNode(NewOpcode, dl, VT, A, B, C);
}
- return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+ return SDValue();
+}
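A standalone sketch (not part of the patch) of the negation folds combineFMA performs, checked with scalar std::fma on exactly representable values so the equalities hold bit-for-bit.

#include <cassert>
#include <cmath>
int main() {
  double a = 1.5, b = -2.25, c = 0.75; // products and sums are exact
  assert(std::fma(-a, b, c) == -(a * b) + c);  // FMA(-A, B, C)  -> FNMADD
  assert(std::fma(a, b, -c) == (a * b) - c);   // FMA(A, B, -C)  -> FMSUB
  assert(std::fma(-a, b, -c) == -(a * b) - c); // FMA(-A, B, -C) -> FNMSUB
}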
+
+// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
+static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc dl(N);
+ EVT VT = N->getValueType(0);
+
+ SDValue NegVal = isFNEG(N->getOperand(2).getNode());
+ if (!NegVal)
+ return SDValue();
+
+ unsigned NewOpcode;
+ switch (N->getOpcode()) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86ISD::FMADDSUB: NewOpcode = X86ISD::FMSUBADD; break;
+ case X86ISD::FMADDSUB_RND: NewOpcode = X86ISD::FMSUBADD_RND; break;
+ case X86ISD::FMSUBADD: NewOpcode = X86ISD::FMADDSUB; break;
+ case X86ISD::FMSUBADD_RND: NewOpcode = X86ISD::FMADDSUB_RND; break;
+ }
+
+ if (N->getNumOperands() == 4)
+ return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
+ NegVal, N->getOperand(3));
+ return DAG.getNode(NewOpcode, dl, VT, N->getOperand(0), N->getOperand(1),
+ NegVal);
}
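A standalone sketch (not part of the patch) of the combineFMADDSUB fold, assuming the even-subtract/odd-add lane convention: negating the addend of an alternating add/sub FMA simply swaps which lanes add and which subtract.

#include <cassert>
int main() {
  double A[2] = {1.5, -2.0}, B[2] = {4.0, 0.25}, C[2] = {3.0, -1.0};
  for (int i = 0; i != 2; ++i) {
    // FMADDSUB with a negated addend: even lanes subtract, odd lanes add.
    double fmaddsub_negC = (i % 2) ? A[i] * B[i] + (-C[i])
                                   : A[i] * B[i] - (-C[i]);
    // FMSUBADD: even lanes add, odd lanes subtract.
    double fmsubadd = (i % 2) ? A[i] * B[i] - C[i] : A[i] * B[i] + C[i];
    assert(fmaddsub_negC == fmsubadd);
  }
}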
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
@@ -34710,9 +36080,15 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
}
}
+ if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
+ return NewCMov;
+
if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
return V;
+ if (SDValue V = combineToExtendBoolVectorInReg(N, DAG, DCI, Subtarget))
+ return V;
+
if (VT.is256BitVector())
if (SDValue R = WidenMaskArithmetic(N, DAG, DCI, Subtarget))
return R;
@@ -34804,23 +36180,19 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
return V;
}
- if (VT.getScalarType() == MVT::i1 &&
+ if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
- bool IsSEXT0 =
- (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
- (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
- bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
-
- if (!IsSEXT0 || !IsVZero1) {
- // Swap the operands and update the condition code.
+ // Put build_vectors on the right.
+ if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
std::swap(LHS, RHS);
CC = ISD::getSetCCSwappedOperands(CC);
-
- IsSEXT0 = (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
- (LHS.getOperand(0).getValueType().getScalarType() == MVT::i1);
- IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
}
+ bool IsSEXT0 =
+ (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
+ (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
+ bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+
if (IsSEXT0 && IsVZero1) {
assert(VT == LHS.getOperand(0).getValueType() &&
"Uexpected operand type");
@@ -34846,17 +36218,92 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineMOVMSK(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue Src = N->getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+
+ // MOVMSK only uses the MSB from each vector element.
+ KnownBits Known;
+ APInt DemandedMask(APInt::getSignMask(SrcVT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Src, DemandedMask, Known, TLO)) {
+ DCI.AddToWorklist(Src.getNode());
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
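A standalone sketch (not part of the patch) of why only the sign bit of each source element matters to MOVMSK, modelled on a 4-lane PMOVMSKB-style scalar loop; everything below the top bit can be simplified away.

#include <cassert>
#include <cstdint>
int main() {
  uint8_t lanes[4] = {0x80, 0x7F, 0xFF, 0x01};
  unsigned msk = 0;
  for (int i = 0; i != 4; ++i)
    msk |= ((lanes[i] >> 7) & 1u) << i; // only bit 7 of each lane is read
  assert(msk == 0b0101);
}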
+
+static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDLoc DL(N);
+
+ // Pre-shrink oversized index elements to avoid triggering scalarization.
+ if (DCI.isBeforeLegalize()) {
+ SDValue Index = N->getOperand(4);
+ if (Index.getScalarValueSizeInBits() > 64) {
+ EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), MVT::i64,
+ Index.getValueType().getVectorNumElements());
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, IndexVT, Index);
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ NewOps[4] = Trunc;
+ DAG.UpdateNodeOperands(N, NewOps);
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ }
+
+ // Try to remove sign extends from i32 to i64 on the index.
+ // Only do this before legalize in case we are relying on it for
+ // legalization.
+ // TODO: We should maybe remove any sign extend once we learn how to sign
+ // extend narrow index during lowering.
+ if (DCI.isBeforeLegalizeOps()) {
+ SDValue Index = N->getOperand(4);
+ if (Index.getScalarValueSizeInBits() == 64 &&
+ Index.getOpcode() == ISD::SIGN_EXTEND &&
+ Index.getOperand(0).getScalarValueSizeInBits() == 32) {
+ SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
+ NewOps[4] = Index.getOperand(0);
+ DAG.UpdateNodeOperands(N, NewOps);
+ // The original sign extend now has fewer users; add it back to the
+ // worklist in case it needs to be removed.
+ DCI.AddToWorklist(Index.getNode());
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ }
+
// Gather and Scatter instructions use k-registers for masks. The type of
// the masks is v*i1. So the mask will be truncated anyway.
  // The SIGN_EXTEND_INREG may be dropped.
SDValue Mask = N->getOperand(2);
- if (Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
+ if (Subtarget.hasAVX512() && Mask.getOpcode() == ISD::SIGN_EXTEND_INREG) {
SmallVector<SDValue, 5> NewOps(N->op_begin(), N->op_end());
NewOps[2] = Mask.getOperand(0);
DAG.UpdateNodeOperands(N, NewOps);
}
+
+ // With AVX2 we only demand the upper bit of the mask.
+ if (!Subtarget.hasAVX512()) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ KnownBits Known;
+ APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, Known, TLO)) {
+ DCI.AddToWorklist(Mask.getNode());
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+ }
+
return SDValue();
}
@@ -34868,7 +36315,7 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
SDValue EFLAGS = N->getOperand(1);
// Try to simplify the EFLAGS and condition code operands.
- if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
+ if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget))
return getSETCC(CC, Flags, DL, DAG);
return SDValue();
@@ -34884,7 +36331,7 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
// Try to simplify the EFLAGS and condition code operands.
// Make sure to not keep references to operands, as combineSetCCEFLAGS can
// RAUW them under us.
- if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG)) {
+ if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG, Subtarget)) {
SDValue Cond = DAG.getConstant(CC, DL, MVT::i8);
return DAG.getNode(X86ISD::BRCOND, DL, N->getVTList(), N->getOperand(0),
N->getOperand(1), Cond, Flags);
@@ -34945,7 +36392,6 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
EVT InSVT = InVT.getScalarType();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
// UINT_TO_FP(vXi8) -> SINT_TO_FP(ZEXT(vXi8 to vXi32))
// UINT_TO_FP(vXi16) -> SINT_TO_FP(ZEXT(vXi16 to vXi32))
@@ -34955,9 +36401,7 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
- if (TLI.isOperationLegal(ISD::UINT_TO_FP, DstVT))
- return DAG.getNode(ISD::UINT_TO_FP, dl, VT, P);
-
+ // UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
@@ -35049,7 +36493,7 @@ static SDValue combineSBB(SDNode *N, SelectionDAG &DAG) {
// Optimize RES, EFLAGS = X86ISD::ADC LHS, RHS, EFLAGS
static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
- X86TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI) {
// If the LHS and RHS of the ADC node are zero, then it can't overflow and
// the result is either zero or one (depending on the input carry bit).
// Strength reduce this down to a "set on carry" aka SETCC_CARRY&1.
@@ -35260,6 +36704,9 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
SDValue MulOp = N->getOperand(0);
SDValue Phi = N->getOperand(1);
@@ -35305,6 +36752,9 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
SDLoc DL(N);
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
@@ -35362,16 +36812,13 @@ static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
Sad = DAG.getNode(ISD::TRUNCATE, DL, VT, Sad);
if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
- // Update part of elements of the reduction vector. This is done by first
- // extracting a sub-vector from it, updating this sub-vector, and inserting
- // it back.
- SDValue SubPhi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Phi,
- DAG.getIntPtrConstant(0, DL));
- SDValue Res = DAG.getNode(ISD::ADD, DL, ResVT, Sad, SubPhi);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Phi, Res,
- DAG.getIntPtrConstant(0, DL));
- } else
- return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
+ // Fill the upper elements with zero to match the add width.
+ SDValue Zero = DAG.getConstant(0, DL, VT);
+ Sad = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Zero, Sad,
+ DAG.getIntPtrConstant(0, DL));
+ }
+
+ return DAG.getNode(ISD::ADD, DL, VT, Sad, Phi);
}
/// Convert vector increment or decrement to sub/add with an all-ones constant:
@@ -35392,7 +36839,7 @@ static SDValue combineIncDecVector(SDNode *N, SelectionDAG &DAG) {
SDNode *N1 = N->getOperand(1).getNode();
APInt SplatVal;
- if (!ISD::isConstantSplatVector(N1, SplatVal, /*AllowShrink*/false) ||
+ if (!ISD::isConstantSplatVector(N1, SplatVal) ||
!SplatVal.isOneValue())
return SDValue();
@@ -35426,6 +36873,89 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
return combineAddOrSubToADCOrSBB(N, DAG);
}
+static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // PSUBUS is supported starting from SSE2, but the special preprocessing
+ // for v8i32 requires umin, which appears in SSE41.
+ if (!(Subtarget.hasSSE2() && (VT == MVT::v16i8 || VT == MVT::v8i16)) &&
+ !(Subtarget.hasSSE41() && (VT == MVT::v8i32)) &&
+ !(Subtarget.hasAVX2() && (VT == MVT::v32i8 || VT == MVT::v16i16)) &&
+ !(Subtarget.hasAVX512() && Subtarget.hasBWI() &&
+ (VT == MVT::v64i8 || VT == MVT::v32i16 || VT == MVT::v16i32 ||
+ VT == MVT::v8i64)))
+ return SDValue();
+
+ SDValue SubusLHS, SubusRHS;
+ // Try to find umax(a,b) - b or a - umin(a,b) patterns
+ // they may be converted to subus(a,b).
+ // TODO: Need to add IR canonicalization for this code.
+ if (Op0.getOpcode() == ISD::UMAX) {
+ SubusRHS = Op1;
+ SDValue MaxLHS = Op0.getOperand(0);
+ SDValue MaxRHS = Op0.getOperand(1);
+ if (MaxLHS == Op1)
+ SubusLHS = MaxRHS;
+ else if (MaxRHS == Op1)
+ SubusLHS = MaxLHS;
+ else
+ return SDValue();
+ } else if (Op1.getOpcode() == ISD::UMIN) {
+ SubusLHS = Op0;
+ SDValue MinLHS = Op1.getOperand(0);
+ SDValue MinRHS = Op1.getOperand(1);
+ if (MinLHS == Op0)
+ SubusRHS = MinRHS;
+ else if (MinRHS == Op0)
+ SubusRHS = MinLHS;
+ else
+ return SDValue();
+ } else
+ return SDValue();
+
+ // PSUBUS doesn't support v8i32/v8i64/v16i32, but it can be enabled with
+ // special preprocessing in some cases.
+ if (VT != MVT::v8i32 && VT != MVT::v16i32 && VT != MVT::v8i64)
+ return DAG.getNode(X86ISD::SUBUS, SDLoc(N), VT, SubusLHS, SubusRHS);
+
+ // The special preprocessing case can only be applied if the value was
+ // zero-extended from 16 bits, so we require the first 16 bits to be zero
+ // for 32-bit values, or the first 48 bits for 64-bit values.
+ KnownBits Known;
+ DAG.computeKnownBits(SubusLHS, Known);
+ unsigned NumZeros = Known.countMinLeadingZeros();
+ if ((VT == MVT::v8i64 && NumZeros < 48) || NumZeros < 16)
+ return SDValue();
+
+ EVT ExtType = SubusLHS.getValueType();
+ EVT ShrinkedType;
+ if (VT == MVT::v8i32 || VT == MVT::v8i64)
+ ShrinkedType = MVT::v8i16;
+ else
+ ShrinkedType = NumZeros >= 24 ? MVT::v16i8 : MVT::v16i16;
+
+ // If SubusLHS is zero-extended, truncate SubusRHS to its size:
+ // SubusRHS = umin(0xFFF.., SubusRHS).
+ SDValue SaturationConst =
+ DAG.getConstant(APInt::getLowBitsSet(ExtType.getScalarSizeInBits(),
+ ShrinkedType.getScalarSizeInBits()),
+ SDLoc(SubusLHS), ExtType);
+ SDValue UMin = DAG.getNode(ISD::UMIN, SDLoc(SubusLHS), ExtType, SubusRHS,
+ SaturationConst);
+ SDValue NewSubusLHS =
+ DAG.getZExtOrTrunc(SubusLHS, SDLoc(SubusLHS), ShrinkedType);
+ SDValue NewSubusRHS = DAG.getZExtOrTrunc(UMin, SDLoc(SubusRHS), ShrinkedType);
+ SDValue Psubus = DAG.getNode(X86ISD::SUBUS, SDLoc(N), ShrinkedType,
+ NewSubusLHS, NewSubusRHS);
+ // Zero-extend the result; it may be used somewhere as a 32-bit value.
+ // If not, the zext and the following trunc will fold away.
+ return DAG.getZExtOrTrunc(Psubus, SDLoc(N), ExtType);
+}
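A standalone sketch (not part of the patch) of the patterns combineSubToSubus rewrites, exhaustively checked on u8: umax(a,b)-b and a-umin(a,b) both equal the unsigned saturating subtraction subus(a,b).

#include <algorithm>
#include <cassert>
#include <cstdint>
uint8_t subus(uint8_t a, uint8_t b) { return a > b ? uint8_t(a - b) : 0; }
int main() {
  for (unsigned a = 0; a < 256; ++a)
    for (unsigned b = 0; b < 256; ++b) {
      uint8_t x = uint8_t(a), y = uint8_t(b);
      assert(uint8_t(std::max(x, y) - y) == subus(x, y)); // umax(a,b) - b
      assert(uint8_t(x - std::min(x, y)) == subus(x, y)); // a - umin(a,b)
    }
}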
+
static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
@@ -35459,6 +36989,10 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineIncDecVector(N, DAG))
return V;
+ // Try to create PSUBUS if SUB's argument is max/min
+ if (SDValue V = combineSubToSubus(N, DAG, Subtarget))
+ return V;
+
return combineAddOrSubToADCOrSBB(N, DAG);
}
@@ -35554,39 +37088,26 @@ static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
-/// Canonicalize (LSUB p, 1) -> (LADD p, -1).
-static SDValue combineLockSub(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- SDValue Chain = N->getOperand(0);
- SDValue LHS = N->getOperand(1);
- SDValue RHS = N->getOperand(2);
- MVT VT = RHS.getSimpleValueType();
- SDLoc DL(N);
-
- auto *C = dyn_cast<ConstantSDNode>(RHS);
- if (!C || C->getZExtValue() != 1)
- return SDValue();
-
- RHS = DAG.getConstant(-1, DL, VT);
- MachineMemOperand *MMO = cast<MemSDNode>(N)->getMemOperand();
- return DAG.getMemIntrinsicNode(X86ISD::LADD, DL,
- DAG.getVTList(MVT::i32, MVT::Other),
- {Chain, LHS, RHS}, VT, MMO);
-}
-
-// TEST (AND a, b) ,(AND a, b) -> TEST a, b
-static SDValue combineTestM(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineTestM(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
- if (Op0 != Op1 || Op1->getOpcode() != ISD::AND)
- return SDValue();
-
- EVT VT = N->getValueType(0);
+ MVT VT = N->getSimpleValueType(0);
SDLoc DL(N);
- return DAG.getNode(X86ISD::TESTM, DL, VT,
- Op0->getOperand(0), Op0->getOperand(1));
+ // TEST (AND a, b) ,(AND a, b) -> TEST a, b
+ if (Op0 == Op1 && Op1->getOpcode() == ISD::AND)
+ return DAG.getNode(X86ISD::TESTM, DL, VT, Op0->getOperand(0),
+ Op0->getOperand(1));
+
+ // TEST op0, BUILD_VECTOR(all_zero) -> BUILD_VECTOR(all_zero)
+ // TEST BUILD_VECTOR(all_zero), op1 -> BUILD_VECTOR(all_zero)
+ if (ISD::isBuildVectorAllZeros(Op0.getNode()) ||
+ ISD::isBuildVectorAllZeros(Op1.getNode()))
+ return getZeroVector(VT, Subtarget, DAG, DL);
+
+ return SDValue();
}
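A standalone sketch (not part of the patch) of the first TESTM fold: testing (a & b) against itself sets exactly the same mask bits as testing a against b, since (a & b) & (a & b) == a & b.

#include <cassert>
#include <cstdint>
int main() {
  for (uint8_t a = 0; a < 16; ++a)
    for (uint8_t b = 0; b < 16; ++b)
      assert((uint8_t)((a & b) & (a & b)) == (uint8_t)(a & b));
}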
static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
@@ -35610,21 +37131,55 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ MVT OpVT = N->getSimpleValueType(0);
+
+ // Early out for mask vectors.
+ if (OpVT.getVectorElementType() == MVT::i1)
+ return SDValue();
+
SDLoc dl(N);
SDValue Vec = N->getOperand(0);
SDValue SubVec = N->getOperand(1);
- SDValue Idx = N->getOperand(2);
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- MVT OpVT = N->getSimpleValueType(0);
+ unsigned IdxVal = N->getConstantOperandVal(2);
MVT SubVecVT = SubVec.getSimpleValueType();
+ if (ISD::isBuildVectorAllZeros(Vec.getNode())) {
+ // Inserting zeros into zeros is a nop.
+ if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+ return Vec;
+
+ // If we're inserting into a zero vector and then into a larger zero vector,
+ // just insert into the larger zero vector directly.
+ if (SubVec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ ISD::isBuildVectorAllZeros(SubVec.getOperand(0).getNode())) {
+ unsigned Idx2Val = SubVec.getConstantOperandVal(2);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec,
+ SubVec.getOperand(1),
+ DAG.getIntPtrConstant(IdxVal + Idx2Val, dl));
+ }
+
+ // If we're inserting a bitcast into zeros, rewrite the insert and move the
+ // bitcast to the other side. This helps with detecting zero extending
+ // during isel.
+ // TODO: Is this useful for indices other than 0?
+ if (SubVec.getOpcode() == ISD::BITCAST && IdxVal == 0) {
+ MVT CastVT = SubVec.getOperand(0).getSimpleValueType();
+ unsigned NumElems = OpVT.getSizeInBits() / CastVT.getScalarSizeInBits();
+ MVT NewVT = MVT::getVectorVT(CastVT.getVectorElementType(), NumElems);
+ SDValue Insert = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, NewVT,
+ DAG.getBitcast(NewVT, Vec),
+ SubVec.getOperand(0), N->getOperand(2));
+ return DAG.getBitcast(OpVT, Insert);
+ }
+ }
+
// If this is an insert of an extract, combine to a shuffle. Don't do this
- // if the insert or extract can be represented with a subvector operation.
+ // if the insert or extract can be represented with a subregister operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
(IdxVal != 0 || !Vec.isUndef())) {
- int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
+ int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();
int SubVecNumElts = SubVecVT.getVectorNumElements();
@@ -35679,17 +37234,36 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
}
// If lower/upper loads are the same and the only users of the load, then
// lower to a VBROADCASTF128/VBROADCASTI128/etc.
- if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
+ if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2)))
if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
- SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
+ SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode()))
return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
- }
- }
+
// If this is subv_broadcast insert into both halves, use a larger
// subv_broadcast.
- if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
+ if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2)
return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
SubVec.getOperand(0));
+
+ // If we're inserting all zeros into the upper half, change this to
+ // an insert into an all zeros vector. We will match this to a move
+ // with implicit upper bit zeroing during isel.
+ if (ISD::isBuildVectorAllZeros(SubVec.getNode()))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT,
+ getZeroVector(OpVT, Subtarget, DAG, dl), SubVec2,
+ Vec.getOperand(2));
+
+ // If we are inserting into both halves of the vector, the starting
+ // vector should be undef. If it isn't, make it so. Only do this if the
+ // the early insert has no other uses.
+ // TODO: Should this be a generic DAG combine?
+ if (!Vec.getOperand(0).isUndef() && Vec.hasOneUse()) {
+ Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, DAG.getUNDEF(OpVT),
+ SubVec2, Vec.getOperand(2));
+ DCI.AddToWorklist(Vec.getNode());
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, OpVT, Vec, SubVec,
+ N->getOperand(2));
+
}
}
}
@@ -35697,6 +37271,32 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ MVT OpVT = N->getSimpleValueType(0);
+ SDValue InVec = N->getOperand(0);
+ unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+
+ if (ISD::isBuildVectorAllZeros(InVec.getNode()))
+ return getZeroVector(OpVT, Subtarget, DAG, SDLoc(N));
+
+ if (ISD::isBuildVectorAllOnes(InVec.getNode())) {
+ if (OpVT.getScalarType() == MVT::i1)
+ return DAG.getConstant(1, SDLoc(N), OpVT);
+ return getOnesVector(OpVT, DAG, SDLoc(N));
+ }
+
+ if (InVec.getOpcode() == ISD::BUILD_VECTOR)
+ return DAG.getBuildVector(
+ OpVT, SDLoc(N),
+ InVec.getNode()->ops().slice(IdxVal, OpVT.getVectorNumElements()));
+
+ return SDValue();
+}
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
@@ -35704,12 +37304,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
switch (N->getOpcode()) {
default: break;
case ISD::EXTRACT_VECTOR_ELT:
- return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case X86ISD::PEXTRW:
case X86ISD::PEXTRB:
- return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+ return combineExtractVectorElt(N, DAG, DCI, Subtarget);
case ISD::INSERT_SUBVECTOR:
return combineInsertSubvector(N, DAG, DCI, Subtarget);
+ case ISD::EXTRACT_SUBVECTOR:
+ return combineExtractSubvector(N, DAG, DCI, Subtarget);
case ISD::VSELECT:
case ISD::SELECT:
case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
@@ -35753,6 +37354,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::SETCC: return combineSetCC(N, DAG, Subtarget);
case X86ISD::SETCC: return combineX86SetCC(N, DAG, Subtarget);
case X86ISD::BRCOND: return combineBrCond(N, DAG, Subtarget);
+ case X86ISD::PACKSS:
+ case X86ISD::PACKUS: return combineVectorPack(N, DAG, DCI, Subtarget);
case X86ISD::VSHLI:
case X86ISD::VSRAI:
case X86ISD::VSRLI:
@@ -35784,6 +37387,7 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::MOVDDUP:
case X86ISD::MOVSS:
case X86ISD::MOVSD:
+ case X86ISD::VBROADCAST:
case X86ISD::VPPERM:
case X86ISD::VPERMI:
case X86ISD::VPERMV:
@@ -35795,15 +37399,23 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VPERM2X128:
case X86ISD::VZEXT_MOVL:
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
- case X86ISD::FMADD:
case X86ISD::FMADD_RND:
case X86ISD::FMADDS1_RND:
case X86ISD::FMADDS3_RND:
+ case X86ISD::FMADDS1:
+ case X86ISD::FMADDS3:
+ case X86ISD::FMADD4S:
case ISD::FMA: return combineFMA(N, DAG, Subtarget);
+ case X86ISD::FMADDSUB_RND:
+ case X86ISD::FMSUBADD_RND:
+ case X86ISD::FMADDSUB:
+ case X86ISD::FMSUBADD: return combineFMADDSUB(N, DAG, Subtarget);
+ case X86ISD::MOVMSK: return combineMOVMSK(N, DAG, DCI);
+ case X86ISD::MGATHER:
+ case X86ISD::MSCATTER:
case ISD::MGATHER:
- case ISD::MSCATTER: return combineGatherScatter(N, DAG);
- case X86ISD::LSUB: return combineLockSub(N, DAG, Subtarget);
- case X86ISD::TESTM: return combineTestM(N, DAG);
+ case ISD::MSCATTER: return combineGatherScatter(N, DAG, DCI, Subtarget);
+ case X86ISD::TESTM: return combineTestM(N, DAG, Subtarget);
case X86ISD::PCMPEQ:
case X86ISD::PCMPGT: return combineVectorCompare(N, DAG, Subtarget);
}
@@ -35910,6 +37522,27 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
return Promote;
}
+bool X86TargetLowering::
+ isDesirableToCombineBuildVectorToShuffleTruncate(
+ ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
+
+ assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
+ "Element count mismatch");
+ assert(
+ Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
+ "Shuffle Mask expected to be legal");
+
+ // For 32-bit elements VPERMD is better than shuffle+truncate.
+ // TODO: After we improve lowerBuildVector, add an exception for VPERMW.
+ if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
+ return false;
+
+ if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
+ return false;
+
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -36041,8 +37674,8 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
case 'v':
case 'Y':
case 'l':
- return C_RegisterClass;
case 'k': // AVX512 masking registers.
+ return C_RegisterClass;
case 'a':
case 'b':
case 'c':
@@ -36074,8 +37707,15 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
switch (Constraint[1]) {
default:
break;
- case 'k':
+ case 'z':
+ case '0':
return C_Register;
+ case 'i':
+ case 'm':
+ case 'k':
+ case 't':
+ case '2':
+ return C_RegisterClass;
}
}
}
@@ -36123,15 +37763,42 @@ TargetLowering::ConstraintWeight
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
- case 'Y':
- // Other "Y<x>" (e.g. "Yk") constraints should be implemented below.
- if (constraint[1] == 'k') {
- // Support for 'Yk' (similarly to the 'k' variant below).
- weight = CW_SpecificReg;
+ case 'Y': {
+ unsigned Size = StringRef(constraint).size();
+ // Pick 'i' as the next char, since 'Yi' and 'Y' are synonymous when matching 'Y'.
+ char NextChar = Size == 2 ? constraint[1] : 'i';
+ if (Size > 2)
break;
+ switch (NextChar) {
+ default:
+ return CW_Invalid;
+ // XMM0
+ case 'z':
+ case '0':
+ if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
+ return CW_SpecificReg;
+ return CW_Invalid;
+ // Conditional OpMask regs (AVX512)
+ case 'k':
+ if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
+ return CW_Register;
+ return CW_Invalid;
+ // Any MMX reg
+ case 'm':
+ if (type->isX86_MMXTy() && Subtarget.hasMMX())
+ return weight;
+ return CW_Invalid;
+ // Any SSE reg when ISA >= SSE2, same as 'Y'
+ case 'i':
+ case 't':
+ case '2':
+ if (!Subtarget.hasSSE2())
+ return CW_Invalid;
+ break;
}
- // Else fall through (handle "Y" constraint).
+ // Fall through (handle "Y" constraint).
LLVM_FALLTHROUGH;
+ }
case 'v':
if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
weight = CW_Register;
@@ -36143,7 +37810,8 @@ TargetLowering::ConstraintWeight
break;
case 'k':
// Enable conditional vector operations using %k<#> registers.
- weight = CW_SpecificReg;
+ if ((type->getPrimitiveSizeInBits() == 64) && Subtarget.hasAVX512())
+ weight = CW_Register;
break;
case 'I':
if (ConstantInt *C = dyn_cast<ConstantInt>(info.CallOperandVal)) {
@@ -36545,6 +38213,17 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
switch (Constraint[1]) {
default:
break;
+ case 'i':
+ case 't':
+ case '2':
+ return getRegForInlineAsmConstraint(TRI, "Y", VT);
+ case 'm':
+ if (!Subtarget.hasMMX()) break;
+ return std::make_pair(0U, &X86::VR64RegClass);
+ case 'z':
+ case '0':
+ if (!Subtarget.hasSSE1()) break;
+ return std::make_pair(X86::XMM0, &X86::VR128RegClass);
case 'k':
// This register class doesn't allocate k0 for masked vector operation.
if (Subtarget.hasAVX512()) { // Only supported in AVX512.
@@ -36637,12 +38316,14 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Size == 1) Size = 8;
unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
- Res.first = DestReg;
- Res.second = Size == 8 ? &X86::GR8RegClass
- : Size == 16 ? &X86::GR16RegClass
- : Size == 32 ? &X86::GR32RegClass
- : &X86::GR64RegClass;
- assert(Res.second->contains(Res.first) && "Register in register class");
+ bool is64Bit = Subtarget.is64Bit();
+ const TargetRegisterClass *RC =
+ Size == 8 ? (is64Bit ? &X86::GR8RegClass : &X86::GR8_NOREXRegClass)
+ : Size == 16 ? (is64Bit ? &X86::GR16RegClass : &X86::GR16_NOREXRegClass)
+ : Size == 32 ? (is64Bit ? &X86::GR32RegClass : &X86::GR32_NOREXRegClass)
+ : &X86::GR64RegClass;
+ if (RC->contains(DestReg))
+ Res = std::make_pair(DestReg, RC);
} else {
// No register found/type mismatch.
Res.first = 0;
@@ -36750,7 +38431,7 @@ void X86TargetLowering::insertCopiesSplitCSR(
// fine for CXX_FAST_TLS since the C++-style TLS access functions should be
// nounwind. If we want to generalize this later, we may need to emit
// CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction()->hasFnAttribute(
+ assert(Entry->getParent()->getFunction().hasFnAttribute(
Attribute::NoUnwind) &&
"Function should be nounwind in insertCopiesSplitCSR!");
Entry->addLiveIn(*I);
@@ -36773,8 +38454,8 @@ bool X86TargetLowering::supportSwiftError() const {
/// string if not applicable.
StringRef X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
// If the function specifically requests stack probes, emit them.
- if (MF.getFunction()->hasFnAttribute("probe-stack"))
- return MF.getFunction()->getFnAttribute("probe-stack").getValueAsString();
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
// Generally, if we aren't on Windows, the platform ABI does not include
// support for stack probes, so don't emit them.
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index dbbc2bbba6a4..8464081b1b08 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -17,7 +17,7 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/SelectionDAG.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/Target/TargetOptions.h"
namespace llvm {
@@ -214,7 +214,7 @@ namespace llvm {
// FP vector get exponent.
FGETEXP_RND, FGETEXPS_RND,
// Extract Normalized Mantissas.
- VGETMANT, VGETMANTS,
+ VGETMANT, VGETMANT_RND, VGETMANTS, VGETMANTS_RND,
// FP Scale.
SCALEF,
SCALEFS,
@@ -254,7 +254,9 @@ namespace llvm {
/// Note that these typically require refinement
/// in order to obtain suitable precision.
FRSQRT, FRCP,
- FRSQRTS, FRCPS,
+
+ // AVX-512 reciprocal approximations with a little more precision.
+ RSQRT14, RSQRT14S, RCP14, RCP14S,
// Thread Local Storage.
TLSADDR,
@@ -333,6 +335,9 @@ namespace llvm {
// Vector integer comparisons, the result is in a mask vector.
PCMPEQM, PCMPGTM,
+ // v8i16 Horizontal minimum and position.
+ PHMINPOS,
+
MULTISHIFT,
/// Vector comparison generating mask bits for fp and
@@ -346,9 +351,6 @@ namespace llvm {
ADD, SUB, ADC, SBB, SMUL,
INC, DEC, OR, XOR, AND,
- // Bit field extract.
- BEXTR,
-
// LOW, HI, FLAGS = umul LHS, RHS.
UMUL,
@@ -391,13 +393,17 @@ namespace llvm {
PSHUFHW,
PSHUFLW,
SHUFP,
+ // VBMI2 Concat & Shift.
+ VSHLD,
+ VSHRD,
+ VSHLDV,
+ VSHRDV,
//Shuffle Packed Values at 128-bit granularity.
SHUF128,
MOVDDUP,
MOVSHDUP,
MOVSLDUP,
MOVLHPS,
- MOVLHPD,
MOVHLPS,
MOVLPS,
MOVLPD,
@@ -428,11 +434,13 @@ namespace llvm {
VFIXUPIMM,
VFIXUPIMMS,
// Range Restriction Calculation For Packed Pairs of Float32/64 values.
- VRANGE,
+ VRANGE, VRANGE_RND, VRANGES, VRANGES_RND,
// Reduce - Perform Reduction Transformation on scalar\packed FP.
- VREDUCE, VREDUCES,
+ VREDUCE, VREDUCE_RND, VREDUCES, VREDUCES_RND,
// RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
- VRNDSCALE, VRNDSCALES,
+ // Also used by the legacy (V)ROUND intrinsics where we mask out the
+ // scaling part of the immediate.
+ VRNDSCALE, VRNDSCALE_RND, VRNDSCALES, VRNDSCALES_RND,
// Tests Types Of a FP Values for packed types.
VFPCLASS,
// Tests Types Of a FP Values for scalar types.
@@ -445,14 +453,9 @@ namespace llvm {
// Broadcast subvector to vector.
SUBV_BROADCAST,
- // Extract vector element.
- VEXTRACT,
-
/// SSE4A Extraction and Insertion.
EXTRQI, INSERTQI,
- // XOP variable/immediate rotations.
- VPROT, VPROTI,
// XOP arithmetic/logical shifts.
VPSHA, VPSHL,
// XOP signed/unsigned integer comparisons.
@@ -471,10 +474,20 @@ namespace llvm {
// Multiply and Add Packed Integers.
VPMADDUBSW, VPMADDWD,
+
+ // AVX512IFMA multiply and add.
+ // NOTE: These are different from the instruction and perform
+ // op0 x op1 + op2.
VPMADD52L, VPMADD52H,
+ // VNNI
+ VPDPBUSD,
+ VPDPBUSDS,
+ VPDPWSSD,
+ VPDPWSSDS,
+
// FMA nodes.
- FMADD,
+ // We use the target-independent ISD::FMA for the non-inverted case.
FNMADD,
FMSUB,
FNMSUB,
@@ -489,6 +502,15 @@ namespace llvm {
FMADDSUB_RND,
FMSUBADD_RND,
+ // FMA4 specific scalar intrinsics bits that zero the non-scalar bits.
+ FMADD4S, FNMADD4S, FMSUB4S, FNMSUB4S,
+
+ // Scalar intrinsic FMA.
+ FMADDS1, FMADDS3,
+ FNMADDS1, FNMADDS3,
+ FMSUBS1, FMSUBS3,
+ FNMSUBS1, FNMSUBS3,
+
// Scalar intrinsic FMA with rounding mode.
// Two versions, passthru bits on op1 or op3.
FMADDS1_RND, FMADDS3_RND,
@@ -500,6 +522,9 @@ namespace llvm {
COMPRESS,
EXPAND,
+ // Bits shuffle
+ VPSHUFBITQMB,
+
// Convert Unsigned/Integer to Floating-Point Value with rounding mode.
SINT_TO_FP_RND, UINT_TO_FP_RND,
SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
@@ -557,7 +582,10 @@ namespace llvm {
RSQRT28, RSQRT28S, RCP28, RCP28S, EXP2,
// Conversions between float and half-float.
- CVTPS2PH, CVTPH2PS,
+ CVTPS2PH, CVTPH2PS, CVTPH2PS_RND,
+
+ // Galois Field Arithmetic Instructions
+ GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,
// LWP insert record.
LWPINS,
@@ -571,7 +599,7 @@ namespace llvm {
/// LOCK-prefixed arithmetic read-modify-write instructions.
/// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
- LADD, LSUB, LOR, LXOR, LAND,
+ LADD, LSUB, LOR, LXOR, LAND, LINC, LDEC,
// Load, scalar_to_vector, and zero extend.
VZEXT_LOAD,
@@ -617,8 +645,8 @@ namespace llvm {
// Vector truncating masked store with unsigned/signed saturation
VMTRUNCSTOREUS, VMTRUNCSTORES,
- // X86 specific gather
- MGATHER
+ // X86 specific gather and scatter
+ MGATHER, MSCATTER,
// WARNING: Do not add anything in the end unless you want the node to
// have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
@@ -628,46 +656,6 @@ namespace llvm {
/// Define some predicates that are used for node matching.
namespace X86 {
- /// Return true if the specified
- /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
- /// suitable for input to VEXTRACTF128, VEXTRACTI128 instructions.
- bool isVEXTRACT128Index(SDNode *N);
-
- /// Return true if the specified
- /// INSERT_SUBVECTOR operand specifies a subvector insert that is
- /// suitable for input to VINSERTF128, VINSERTI128 instructions.
- bool isVINSERT128Index(SDNode *N);
-
- /// Return true if the specified
- /// EXTRACT_SUBVECTOR operand specifies a vector extract that is
- /// suitable for input to VEXTRACTF64X4, VEXTRACTI64X4 instructions.
- bool isVEXTRACT256Index(SDNode *N);
-
- /// Return true if the specified
- /// INSERT_SUBVECTOR operand specifies a subvector insert that is
- /// suitable for input to VINSERTF64X4, VINSERTI64X4 instructions.
- bool isVINSERT256Index(SDNode *N);
-
- /// Return the appropriate
- /// immediate to extract the specified EXTRACT_SUBVECTOR index
- /// with VEXTRACTF128, VEXTRACTI128 instructions.
- unsigned getExtractVEXTRACT128Immediate(SDNode *N);
-
- /// Return the appropriate
- /// immediate to insert at the specified INSERT_SUBVECTOR index
- /// with VINSERTF128, VINSERT128 instructions.
- unsigned getInsertVINSERT128Immediate(SDNode *N);
-
- /// Return the appropriate
- /// immediate to extract the specified EXTRACT_SUBVECTOR index
- /// with VEXTRACTF64X4, VEXTRACTI64x4 instructions.
- unsigned getExtractVEXTRACT256Immediate(SDNode *N);
-
- /// Return the appropriate
- /// immediate to insert at the specified INSERT_SUBVECTOR index
- /// with VINSERTF64x4, VINSERTI64x4 instructions.
- unsigned getInsertVINSERT256Immediate(SDNode *N);
-
/// Returns true if Elt is a constant zero or floating point constant +0.0.
bool isZeroNode(SDValue Elt);
@@ -696,7 +684,7 @@ namespace llvm {
void markLibCallAttributes(MachineFunction *MF, unsigned CC,
ArgListTy &Args) const override;
- MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT VT) const override {
return MVT::i8;
}
@@ -767,18 +755,18 @@ namespace llvm {
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
- // Return true if it is profitable to combine a BUILD_VECTOR to a TRUNCATE
- // for given operand and result types.
+ // Return true if it is profitable to combine a BUILD_VECTOR with a
+ // stride-pattern to a shuffle and a truncate.
// Example of such a combine:
- // v4i32 build_vector((extract_elt V, 0),
- // (extract_elt V, 2),
- // (extract_elt V, 4),
- // (extract_elt V, 6))
+ // v4i32 build_vector((extract_elt V, 1),
+ // (extract_elt V, 3),
+ // (extract_elt V, 5),
+ // (extract_elt V, 7))
// -->
- // v4i32 truncate (bitcast V to v4i64)
- bool isDesirableToCombineBuildVectorToTruncate() const override {
- return true;
- }
+ // v4i32 truncate (bitcast (shuffle<1,u,3,u,5,u,7,u> V, u) to v4i64)
+ bool isDesirableToCombineBuildVectorToShuffleTruncate(
+ ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override;
/// Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
@@ -799,6 +787,11 @@ namespace llvm {
/// This method returns the name of a target specific DAG node.
const char *getTargetNodeName(unsigned Opcode) const override;
+ bool mergeStoresAfterLegalization() const override { return true; }
+
+ bool canMergeStoresTo(unsigned AddressSpace, EVT MemVT,
+ const SelectionDAG &DAG) const override;
+
bool isCheapToSpeculateCttz() const override;
bool isCheapToSpeculateCtlz() const override;
@@ -854,6 +847,8 @@ namespace llvm {
const SelectionDAG &DAG,
unsigned Depth) const override;
+ SDValue unwrapAddress(SDValue N) const override;
+
bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
int64_t &Offset) const override;
@@ -903,7 +898,8 @@ namespace llvm {
/// Return true if the addressing mode represented
/// by AM is legal for this target, for a load/store of the specified type.
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM,
- Type *Ty, unsigned AS) const override;
+ Type *Ty, unsigned AS,
+ Instruction *I = nullptr) const override;
/// Return true if the specified immediate is legal
/// icmp immediate, that is the target has icmp instructions which can
@@ -966,6 +962,7 @@ namespace llvm {
/// true and stores the intrinsic information into the IntrinsicInfo that was
/// passed to the function.
bool getTgtMemIntrinsic(IntrinsicInfo &Info, const CallInst &I,
+ MachineFunction &MF,
unsigned Intrinsic) const override;
/// Returns true if the target can instruction select the
@@ -977,8 +974,7 @@ namespace llvm {
/// VECTOR_SHUFFLE operations, those with specific masks. By default, if a
/// target supports the VECTOR_SHUFFLE node, all mask values are assumed to
/// be legal.
- bool isShuffleMaskLegal(const SmallVectorImpl<int> &Mask,
- EVT VT) const override;
+ bool isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const override;
/// Similar to isShuffleMaskLegal. This is used by Targets can use this to
/// indicate if there is a suitable VECTOR_SHUFFLE that can be used to
@@ -1013,13 +1009,19 @@ namespace llvm {
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override;
- bool convertSelectOfConstantsToMath() const override {
- return true;
- }
+ bool convertSelectOfConstantsToMath(EVT VT) const override;
/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
/// with this index.
- bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
+ bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+ unsigned Index) const override;
+
+ bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
+ unsigned AddrSpace) const override {
+ // If we can replace more than 2 scalar stores, there will be a reduction
+ // in instructions even after we add a vector constant load.
+ return NumElem > 2;
+ }
/// Intel processors have a unified instruction and data cache
const char * getClearCacheBuiltinName() const override {
@@ -1051,9 +1053,13 @@ namespace llvm {
Value *getIRStackGuard(IRBuilder<> &IRB) const override;
bool useLoadStackGuardNode() const override;
+ bool useStackGuardXorFP() const override;
void insertSSPDeclarations(Module &M) const override;
Value *getSDagStackGuard(const Module &M) const override;
Value *getSSPStackGuardCheck(const Module &M) const override;
+ SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
+ const SDLoc &DL) const override;
+
/// Return true if the target stores SafeStack pointer at a fixed offset in
/// some non-standard address space, and populates the address space and
@@ -1164,8 +1170,6 @@ namespace llvm {
SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const;
- SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
unsigned getGlobalWrapperKind(const GlobalValue *GV = nullptr) const;
@@ -1184,8 +1188,6 @@ namespace llvm {
SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
- SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -1207,6 +1209,7 @@ namespace llvm {
SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -1222,8 +1225,8 @@ namespace llvm {
const SDLoc &dl, SelectionDAG &DAG) const override;
bool supportSplitCSR(MachineFunction *MF) const override {
- return MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
- MF->getFunction()->hasFnAttribute(Attribute::NoUnwind);
+ return MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
}
void initializeSplitCSR(MachineBasicBlock *Entry) const override;
void insertCopiesSplitCSR(
@@ -1268,6 +1271,10 @@ namespace llvm {
EmitVAStartSaveXMMRegsWithCustomInserter(MachineInstr &BInstr,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredCascadedSelect(MachineInstr &MI1,
+ MachineInstr &MI2,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
MachineBasicBlock *BB) const;
@@ -1421,19 +1428,93 @@ namespace llvm {
}
};
- // X86 specific Gather node.
- class X86MaskedGatherSDNode : public MaskedGatherScatterSDNode {
+ // X86 specific Gather/Scatter nodes.
+ // The class has the same order of operands as MaskedGatherScatterSDNode for
+ // convenience.
+ class X86MaskedGatherScatterSDNode : public MemSDNode {
public:
- X86MaskedGatherSDNode(unsigned Order,
- const DebugLoc &dl, SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- : MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT, MMO)
- {}
+ X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order,
+ const DebugLoc &dl, SDVTList VTs, EVT MemVT,
+ MachineMemOperand *MMO)
+ : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {}
+
+ const SDValue &getBasePtr() const { return getOperand(3); }
+ const SDValue &getIndex() const { return getOperand(4); }
+ const SDValue &getMask() const { return getOperand(2); }
+ const SDValue &getValue() const { return getOperand(1); }
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MGATHER ||
+ N->getOpcode() == X86ISD::MSCATTER;
+ }
+ };
+
+ class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
+ public:
+ X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
+ EVT MemVT, MachineMemOperand *MMO)
+ : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT,
+ MMO) {}
+
static bool classof(const SDNode *N) {
return N->getOpcode() == X86ISD::MGATHER;
}
};
+ class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
+ public:
+ X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
+ EVT MemVT, MachineMemOperand *MMO)
+ : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT,
+ MMO) {}
+
+ static bool classof(const SDNode *N) {
+ return N->getOpcode() == X86ISD::MSCATTER;
+ }
+ };
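The classof() hooks above let SelectionDAG code use the usual isa/dyn_cast machinery on the new X86ISD::MGATHER/MSCATTER opcodes. A minimal, hypothetical usage sketch follows (the function name and surrounding combine context are assumptions, not part of this patch; the operand positions come from the accessors defined above):

// Sketch only: pull the common operands out through the shared base class.
// Operand positions follow the accessors above (value = 1, mask = 2,
// base pointer = 3, index = 4).
static void inspectX86GatherScatter(llvm::SDNode *N) {
  using namespace llvm;
  if (auto *GS = dyn_cast<X86MaskedGatherScatterSDNode>(N)) {
    SDValue Mask  = GS->getMask();
    SDValue Base  = GS->getBasePtr();
    SDValue Index = GS->getIndex();
    (void)Mask; (void)Base; (void)Index;
    // isa<> distinguishes the two concrete node kinds when it matters.
    if (isa<X86MaskedScatterSDNode>(GS)) {
      SDValue StoredVal = GS->getValue(); // scatters also carry a stored value
      (void)StoredVal;
    }
  }
}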
+
+ /// Generate unpacklo/unpackhi shuffle mask.
+ template <typename T = int>
+ void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo,
+ bool Unary) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+ for (int i = 0; i < NumElts; ++i) {
+ unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
+ int Pos = (i % NumEltsInLane) / 2 + LaneStart;
+ Pos += (Unary ? 0 : NumElts * (i % 2));
+ Pos += (Lo ? 0 : NumEltsInLane / 2);
+ Mask.push_back(Pos);
+ }
+ }
+
+ /// Helper function to scale a shuffle or target shuffle mask, replacing each
+ /// mask index with the scaled sequential indices for an equivalent narrowed
+ /// mask. This is the reverse process to canWidenShuffleElements, but can
+ /// always succeed.
+ template <typename T>
+ void scaleShuffleMask(int Scale, ArrayRef<T> Mask,
+ SmallVectorImpl<T> &ScaledMask) {
+ assert(0 < Scale && "Unexpected scaling factor");
+ int NumElts = Mask.size();
+ ScaledMask.assign(static_cast<size_t>(NumElts * Scale), -1);
+
+ for (int i = 0; i != NumElts; ++i) {
+ int M = Mask[i];
+
+ // Repeat sentinel values in every mask element.
+ if (M < 0) {
+ for (int s = 0; s != Scale; ++s)
+ ScaledMask[(Scale * i) + s] = M;
+ continue;
+ }
+
+      // Otherwise scale the index and emit the Scale sequential narrowed
+      // indices it expands to.
+ for (int s = 0; s != Scale; ++s)
+ ScaledMask[(Scale * i) + s] = (Scale * M) + s;
+ }
+ }
} // end namespace llvm
#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
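To make the two new inline mask helpers concrete, here is a small standalone sketch in plain C++ (no LLVM dependencies; the element counts and the sample masks are illustrative assumptions) that mirrors the logic of createUnpackShuffleMask and scaleShuffleMask and prints the masks they would produce:

#include <cstdio>
#include <vector>

// Mirrors createUnpackShuffleMask for an 8 x i16 vector: 8 elements with
// 16-bit scalars, so all 8 elements live in a single 128-bit lane.
static std::vector<int> unpackMask(int NumElts, int NumEltsInLane, bool Lo,
                                   bool Unary) {
  std::vector<int> Mask;
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    int Pos = (i % NumEltsInLane) / 2 + LaneStart;
    Pos += (Unary ? 0 : NumElts * (i % 2));
    Pos += (Lo ? 0 : NumEltsInLane / 2);
    Mask.push_back(Pos);
  }
  return Mask;
}

// Mirrors scaleShuffleMask: every index M becomes Scale consecutive indices
// starting at Scale*M; negative sentinel values are simply repeated.
static std::vector<int> scaleMask(int Scale, const std::vector<int> &Mask) {
  std::vector<int> Scaled(Mask.size() * Scale, -1);
  for (int i = 0, e = (int)Mask.size(); i != e; ++i)
    for (int s = 0; s != Scale; ++s)
      Scaled[Scale * i + s] = Mask[i] < 0 ? Mask[i] : Scale * Mask[i] + s;
  return Scaled;
}

int main() {
  // Binary unpacklo of two v8i16 vectors: expect <0,8,1,9,2,10,3,11>,
  // i.e. the PUNPCKLWD interleave pattern.
  for (int M : unpackMask(8, 8, /*Lo=*/true, /*Unary=*/false))
    std::printf("%d ", M);
  std::printf("\n");
  // Scaling the 2-element mask <1, -1> by 2 (e.g. v2i64 -> v4i32):
  // expect <2, 3, -1, -1>.
  for (int M : scaleMask(2, {1, -1}))
    std::printf("%d ", M);
  std::printf("\n");
  return 0;
}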
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index 08b501ff20bf..2acd8d17beb2 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -12,94 +12,123 @@
//
//===----------------------------------------------------------------------===//
-class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat>
- : I<o, F, outs, ins, asm, pat>, TB, Requires<[Has3DNow]> {
+let Sched = WriteFAdd in {
+def I3DNOW_FALU_ITINS : OpndItins<
+ IIC_3DNOW_FALU_RR, IIC_3DNOW_FALU_RM
+>;
}
-class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+let Sched = WriteCvtF2I in {
+def I3DNOW_FCVT_F2I_ITINS : OpndItins<
+ IIC_3DNOW_FCVT_F2I_RR, IIC_3DNOW_FCVT_F2I_RM
+>;
+}
+
+let Sched = WriteCvtI2F in {
+def I3DNOW_FCVT_I2F_ITINS : OpndItins<
+ IIC_3DNOW_FCVT_I2F_RR, IIC_3DNOW_FCVT_I2F_RM
+>;
+}
+
+let Sched = WriteVecIMul in {
+def I3DNOW_MISC_FUNC_ITINS : OpndItins<
+ IIC_3DNOW_MISC_FUNC_REG, IIC_3DNOW_MISC_FUNC_MEM
+>;
+}
+
+let Sched = WriteShuffle in {
+def I3DNOW_PSHUF_ITINS : OpndItins<
+ IIC_MMX_PSHUF, IIC_MMX_PSHUF
+>;
+}
+
+class I3DNow<bits<8> o, Format F, dag outs, dag ins, string asm, list<dag> pat,
+ InstrItinClass itin>
+ : I<o, F, outs, ins, asm, pat, itin>, TB, Requires<[Has3DNow]> {
+}
+
+class I3DNow_binop<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat,
+ InstrItinClass itin>
: I3DNow<o, F, (outs VR64:$dst), ins,
- !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat>,
+ !strconcat(Mnemonic, "\t{$src2, $dst|$dst, $src2}"), pat, itin>,
Has3DNow0F0FOpcode {
// FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
let isAsmParserOnly = 1;
let Constraints = "$src1 = $dst";
}
-class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat>
+class I3DNow_conv<bits<8> o, Format F, dag ins, string Mnemonic, list<dag> pat,
+ InstrItinClass itin>
: I3DNow<o, F, (outs VR64:$dst), ins,
- !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat>,
+ !strconcat(Mnemonic, "\t{$src, $dst|$dst, $src}"), pat, itin>,
Has3DNow0F0FOpcode {
// FIXME: The disassembler doesn't support Has3DNow0F0FOpcode yet.
let isAsmParserOnly = 1;
}
-multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
- def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn, []>;
- def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
-}
-
-multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, bit Commutable = 0,
- string Ver = ""> {
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, OpndItins itins,
+ bit Commutable = 0, string Ver = ""> {
let isCommutable = Commutable in
def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
- !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))],
+ itins.rr>, Sched<[itins.Sched]>;
def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>;
-}
-
-multiclass I3DNow_conv_rm<bits<8> opc, string Mn> {
- def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src1), Mn, []>;
- def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src1), Mn, []>;
+ (bitconvert (load_mmx addr:$src2))))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
-multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, OpndItins itins,
+ string Ver = ""> {
def rr : I3DNow_conv<opc, MRMSrcReg, (ins VR64:$src), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
- !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))]>;
+ !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
def rm : I3DNow_conv<opc, MRMSrcMem, (ins i64mem:$src), Mn,
[(set VR64:$dst, (!cast<Intrinsic>(
!strconcat("int_x86_3dnow", Ver, "_", Mn))
- (bitconvert (load_mmx addr:$src))))]>;
+ (bitconvert (load_mmx addr:$src))))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
-defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", 1>;
-defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id">;
-defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc">;
-defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", 1>;
-defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", 1>;
-defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge">;
-defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
-defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax">;
-defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin">;
-defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", 1>;
-defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp">;
-defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
-defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
-defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
-defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
-defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", 1>;
-defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", 1>;
-defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd">;
-defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", 1>;
-
+defm PAVGUSB : I3DNow_binop_rm_int<0xBF, "pavgusb", I3DNOW_MISC_FUNC_ITINS, 1>;
+defm PF2ID : I3DNow_conv_rm_int<0x1D, "pf2id", I3DNOW_FCVT_F2I_ITINS>;
+defm PFACC : I3DNow_binop_rm_int<0xAE, "pfacc", I3DNOW_FALU_ITINS>;
+defm PFADD : I3DNow_binop_rm_int<0x9E, "pfadd", I3DNOW_FALU_ITINS, 1>;
+defm PFCMPEQ : I3DNow_binop_rm_int<0xB0, "pfcmpeq", I3DNOW_FALU_ITINS, 1>;
+defm PFCMPGE : I3DNow_binop_rm_int<0x90, "pfcmpge", I3DNOW_FALU_ITINS>;
+defm PFCMPGT : I3DNow_binop_rm_int<0xA0, "pfcmpgt", I3DNOW_FALU_ITINS>;
+defm PFMAX : I3DNow_binop_rm_int<0xA4, "pfmax", I3DNOW_FALU_ITINS>;
+defm PFMIN : I3DNow_binop_rm_int<0x94, "pfmin", I3DNOW_FALU_ITINS>;
+defm PFMUL : I3DNow_binop_rm_int<0xB4, "pfmul", I3DNOW_FALU_ITINS, 1>;
+defm PFRCP : I3DNow_conv_rm_int<0x96, "pfrcp", I3DNOW_FALU_ITINS>;
+defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1", I3DNOW_FALU_ITINS>;
+defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2", I3DNOW_FALU_ITINS>;
+defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1", I3DNOW_FALU_ITINS>;
+defm PFRSQRT : I3DNow_conv_rm_int<0x97, "pfrsqrt", I3DNOW_FALU_ITINS>;
+defm PFSUB : I3DNow_binop_rm_int<0x9A, "pfsub", I3DNOW_FALU_ITINS, 1>;
+defm PFSUBR : I3DNow_binop_rm_int<0xAA, "pfsubr", I3DNOW_FALU_ITINS, 1>;
+defm PI2FD : I3DNow_conv_rm_int<0x0D, "pi2fd", I3DNOW_FCVT_I2F_ITINS>;
+defm PMULHRW : I3DNow_binop_rm_int<0xB7, "pmulhrw", I3DNOW_MISC_FUNC_ITINS, 1>;
def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
- [(int_x86_mmx_femms)]>;
+ [(int_x86_mmx_femms)], IIC_MMX_EMMS>;
+let SchedRW = [WriteLoad] in {
def PREFETCH : I3DNow<0x0D, MRM0m, (outs), (ins i8mem:$addr),
"prefetch\t$addr",
- [(prefetch addr:$addr, (i32 0), imm, (i32 1))]>;
-
+ [(prefetch addr:$addr, (i32 0), imm, (i32 1))],
+ IIC_SSE_PREFETCH>;
def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
- [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))]>, TB,
- Requires<[HasPrefetchW]>;
+ [(prefetch addr:$addr, (i32 1), (i32 3), (i32 1))],
+ IIC_SSE_PREFETCH>, TB, Requires<[HasPrefetchW]>;
+}
// "3DNowA" instructions
-defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
-defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
-defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", 0, "a">;
-defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", 0, "a">;
-defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
+defm PF2IW : I3DNow_conv_rm_int<0x1C, "pf2iw", I3DNOW_FCVT_F2I_ITINS, "a">;
+defm PI2FW : I3DNow_conv_rm_int<0x0C, "pi2fw", I3DNOW_FCVT_I2F_ITINS, "a">;
+defm PFNACC : I3DNow_binop_rm_int<0x8A, "pfnacc", I3DNOW_FALU_ITINS, 0, "a">;
+defm PFPNACC : I3DNow_binop_rm_int<0x8E, "pfpnacc", I3DNOW_FALU_ITINS, 0, "a">;
+defm PSWAPD : I3DNow_conv_rm_int<0xBB, "pswapd", I3DNOW_PSHUF_ITINS, "a">;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index 0ae960e7d566..2a2286e42405 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -192,6 +192,7 @@ class X86KVectorVTInfo<RegisterClass _krc, RegisterClass _krcwm,
ValueType KVT = _vt;
}
+def v1i1_info : X86KVectorVTInfo<VK1, VK1WM, v1i1>;
def v2i1_info : X86KVectorVTInfo<VK2, VK2WM, v2i1>;
def v4i1_info : X86KVectorVTInfo<VK4, VK4WM, v4i1>;
def v8i1_info : X86KVectorVTInfo<VK8, VK8WM, v8i1>;
@@ -211,8 +212,8 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
list<dag> Pattern,
list<dag> MaskingPattern,
list<dag> ZeroMaskingPattern,
+ InstrItinClass itin,
string MaskingConstraint = "",
- InstrItinClass itin = NoItinerary,
bit IsCommutable = 0,
bit IsKCommutable = 0> {
let isCommutable = IsCommutable in
@@ -251,9 +252,9 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
+ InstrItinClass itin,
SDNode Select = vselect,
string MaskingConstraint = "",
- InstrItinClass itin = NoItinerary,
bit IsCommutable = 0,
bit IsKCommutable = 0> :
AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
@@ -262,25 +263,30 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
[(set _.RC:$dst, MaskingRHS)],
[(set _.RC:$dst,
(Select _.KRCWM:$mask, RHS, _.ImmAllZerosV))],
- MaskingConstraint, NoItinerary, IsCommutable,
+ itin, MaskingConstraint, IsCommutable,
IsKCommutable>;
-// Similar to AVX512_maskable_common, but with scalar types.
-multiclass AVX512_maskable_fp_common<bits<8> O, Format F, X86VectorVTInfo _,
- dag Outs,
- dag Ins, dag MaskingIns, dag ZeroMaskingIns,
- string OpcodeStr,
- string AttSrcAsm, string IntelSrcAsm,
- SDNode Select = vselect,
- string MaskingConstraint = "",
- InstrItinClass itin = NoItinerary,
- bit IsCommutable = 0,
- bit IsKCommutable = 0> :
- AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
- AttSrcAsm, IntelSrcAsm,
- [], [], [],
- MaskingConstraint, NoItinerary, IsCommutable,
- IsKCommutable>;
+// This multiclass generates the unconditional/non-masking, the masking and
+// the zero-masking variant of the vector instruction. In the masking case, the
+// preserved vector elements come from a new dummy input operand tied to $dst.
+// This version uses a separate dag for non-masking and masking.
+multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag Ins, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskRHS,
+ InstrItinClass itin,
+ bit IsCommutable = 0, bit IsKCommutable = 0,
+ SDNode Select = vselect> :
+ AVX512_maskable_custom<O, F, Outs, Ins,
+ !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
+ !con((ins _.KRCWM:$mask), Ins),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
+ [(set _.RC:$dst,
+ (Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
+ itin, "$src0 = $dst", IsCommutable, IsKCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -289,15 +295,15 @@ multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
- InstrItinClass itin = NoItinerary,
+ InstrItinClass itin,
bit IsCommutable = 0, bit IsKCommutable = 0,
SDNode Select = vselect> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (Select _.KRCWM:$mask, RHS, _.RC:$src0), Select,
- "$src0 = $dst", itin, IsCommutable, IsKCommutable>;
+ (Select _.KRCWM:$mask, RHS, _.RC:$src0), itin,
+ Select, "$src0 = $dst", IsCommutable, IsKCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the scalar instruction.
@@ -305,14 +311,10 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
- InstrItinClass itin = NoItinerary,
+ InstrItinClass itin,
bit IsCommutable = 0> :
- AVX512_maskable_common<O, F, _, Outs, Ins,
- !con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
- !con((ins _.KRCWM:$mask), Ins),
- OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (X86selects _.KRCWM:$mask, RHS, _.RC:$src0),
- X86selects, "$src0 = $dst", itin, IsCommutable>;
+ AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ RHS, itin, IsCommutable, 0, X86selects>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -321,40 +323,42 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, bit IsCommutable = 0,
- bit IsKCommutable = 0> :
+ dag RHS, InstrItinClass itin,
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0,
+ SDNode Select = vselect,
+ bit MaskOnly = 0> :
AVX512_maskable_common<O, F, _, Outs,
!con((ins _.RC:$src1), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
!con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
- OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (vselect _.KRCWM:$mask, RHS, _.RC:$src1),
- vselect, "", NoItinerary, IsCommutable, IsKCommutable>;
+ OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ !if(MaskOnly, (null_frag), RHS),
+ (Select _.KRCWM:$mask, RHS, _.RC:$src1), itin,
+ Select, "", IsCommutable, IsKCommutable>;
multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, bit IsCommutable = 0,
- bit IsKCommutable = 0> :
- AVX512_maskable_common<O, F, _, Outs,
- !con((ins _.RC:$src1), NonTiedIns),
- !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
- !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
- OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (X86selects _.KRCWM:$mask, RHS, _.RC:$src1),
- X86selects, "", NoItinerary, IsCommutable,
- IsKCommutable>;
+ dag RHS, InstrItinClass itin,
+ bit IsCommutable = 0,
+ bit IsKCommutable = 0,
+ bit MaskOnly = 0> :
+ AVX512_maskable_3src<O, F, _, Outs, NonTiedIns, OpcodeStr, AttSrcAsm,
+ IntelSrcAsm, RHS, itin, IsCommutable, IsKCommutable,
+ X86selects, MaskOnly>;
multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- list<dag> Pattern> :
+ list<dag> Pattern,
+ InstrItinClass itin> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, Pattern, [], [],
- "$src0 = $dst">;
+ itin, "$src0 = $dst">;
// Instruction with mask that puts result in mask register,
@@ -366,17 +370,18 @@ multiclass AVX512_maskable_custom_cmp<bits<8> O, Format F,
string AttSrcAsm, string IntelSrcAsm,
list<dag> Pattern,
list<dag> MaskingPattern,
+ InstrItinClass itin,
bit IsCommutable = 0> {
let isCommutable = IsCommutable in
def NAME: AVX512<O, F, Outs, Ins,
OpcodeStr#"\t{"#AttSrcAsm#", $dst|"#
"$dst, "#IntelSrcAsm#"}",
- Pattern, NoItinerary>;
+ Pattern, itin>;
def NAME#k: AVX512<O, F, Outs, MaskingIns,
OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
"$dst {${mask}}, "#IntelSrcAsm#"}",
- MaskingPattern, NoItinerary>, EVEX_K;
+ MaskingPattern, itin>, EVEX_K;
}
multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
@@ -385,27 +390,30 @@ multiclass AVX512_maskable_common_cmp<bits<8> O, Format F, X86VectorVTInfo _,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
+ InstrItinClass itin,
bit IsCommutable = 0> :
AVX512_maskable_custom_cmp<O, F, Outs, Ins, MaskingIns, OpcodeStr,
AttSrcAsm, IntelSrcAsm,
[(set _.KRC:$dst, RHS)],
- [(set _.KRC:$dst, MaskingRHS)], IsCommutable>;
+ [(set _.KRC:$dst, MaskingRHS)], itin, IsCommutable>;
multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
- dag RHS, bit IsCommutable = 0> :
+ dag RHS, InstrItinClass itin,
+ bit IsCommutable = 0> :
AVX512_maskable_common_cmp<O, F, _, Outs, Ins,
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
- (and _.KRCWM:$mask, RHS), IsCommutable>;
+ (and _.KRCWM:$mask, RHS), itin, IsCommutable>;
multiclass AVX512_maskable_cmp_alt<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
- string AttSrcAsm, string IntelSrcAsm> :
+ string AttSrcAsm, string IntelSrcAsm,
+ InstrItinClass itin> :
AVX512_maskable_custom_cmp<O, F, Outs,
Ins, !con((ins _.KRCWM:$mask),Ins), OpcodeStr,
- AttSrcAsm, IntelSrcAsm, [],[]>;
+ AttSrcAsm, IntelSrcAsm, [],[], itin>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
@@ -414,7 +422,7 @@ multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskedRHS,
- InstrItinClass itin = NoItinerary,
+ InstrItinClass itin,
bit IsCommutable = 0, SDNode Select = vselect> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
@@ -426,41 +434,8 @@ multiclass AVX512_maskable_logic<bits<8> O, Format F, X86VectorVTInfo _,
[(set _.RC:$dst,
(Select _.KRCWM:$mask, MaskedRHS,
_.ImmAllZerosV))],
- "$src0 = $dst", itin, IsCommutable>;
-
-// Bitcasts between 512-bit vector types. Return the original type since
-// no instruction is needed for the conversion.
-def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
-def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
-def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>;
-def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>;
-def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
-def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>;
-def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
-def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
-def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>;
-def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>;
-def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
-def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>;
-def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>;
-def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>;
-def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
-def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
-def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
-def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>;
-def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>;
-def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>;
-def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>;
-def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
-def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>;
-def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>;
-def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
-def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
-def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
-def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
-def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
-def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>;
-def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>;
+ itin, "$src0 = $dst", IsCommutable>;
+
// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
@@ -478,7 +453,7 @@ def AVX512_512_SETALLONES : I<0, Pseudo, (outs VR512:$dst), (ins), "",
// Alias instructions that allow VPTERNLOG to be used with a mask to create
// a mix of all ones and all zeros elements. This is done this way to force
// the same register to be used as input for all three sources.
-let isPseudo = 1, Predicates = [HasAVX512] in {
+let isPseudo = 1, Predicates = [HasAVX512], SchedRW = [WriteVecALU] in {
def AVX512_512_SEXT_MASK_32 : I<0, Pseudo, (outs VR512:$dst),
(ins VK16WM:$mask), "",
[(set VR512:$dst, (vselect (v16i1 VK16WM:$mask),
@@ -512,28 +487,49 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
//===----------------------------------------------------------------------===//
// AVX-512 - VECTOR INSERT
//
-multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To,
- PatFrag vinsert_insert> {
- let ExeDomain = To.ExeDomain in {
- defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
+
+// Supports two different pattern operators for masked and unmasked ops. Allows
+// null_frag to be passed for one.
+multiclass vinsert_for_size_split<int Opcode, X86VectorVTInfo From,
+ X86VectorVTInfo To,
+ SDPatternOperator vinsert_insert,
+ SDPatternOperator vinsert_for_mask,
+ OpndItins itins> {
+ let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
+ defm rr : AVX512_maskable_split<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(vinsert_insert:$src3 (To.VT To.RC:$src1),
(From.VT From.RC:$src2),
- (iPTR imm))>, AVX512AIi8Base, EVEX_4V;
-
- defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
+ (iPTR imm)),
+ (vinsert_for_mask:$src3 (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm)), itins.rr>,
+ AVX512AIi8Base, EVEX_4V, Sched<[itins.Sched]>;
+ let mayLoad = 1 in
+ defm rm : AVX512_maskable_split<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
(ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
"vinsert" # From.EltTypeName # "x" # From.NumElts,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(vinsert_insert:$src3 (To.VT To.RC:$src1),
(From.VT (bitconvert (From.LdFrag addr:$src2))),
- (iPTR imm))>, AVX512AIi8Base, EVEX_4V,
- EVEX_CD8<From.EltSize, From.CD8TupleForm>;
+ (iPTR imm)),
+ (vinsert_for_mask:$src3 (To.VT To.RC:$src1),
+ (From.VT (bitconvert (From.LdFrag addr:$src2))),
+ (iPTR imm)), itins.rm>, AVX512AIi8Base, EVEX_4V,
+ EVEX_CD8<From.EltSize, From.CD8TupleForm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
+// Passes the same pattern operator for masked and unmasked ops.
+multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From,
+ X86VectorVTInfo To,
+ SDPatternOperator vinsert_insert,
+ OpndItins itins> :
+ vinsert_for_size_split<Opcode, From, To, vinsert_insert, vinsert_insert, itins>;
+
multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
X86VectorVTInfo To, PatFrag vinsert_insert,
SDNodeXForm INSERT_get_vinsert_imm , list<Predicate> p> {
@@ -555,62 +551,78 @@ multiclass vinsert_for_size_lowering<string InstrStr, X86VectorVTInfo From,
}
multiclass vinsert_for_type<ValueType EltVT32, int Opcode128,
- ValueType EltVT64, int Opcode256> {
+ ValueType EltVT64, int Opcode256,
+ OpndItins itins> {
let Predicates = [HasVLX] in
defm NAME # "32x4Z256" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT32, VR128X>,
X86VectorVTInfo< 8, EltVT32, VR256X>,
- vinsert128_insert>, EVEX_V256;
+ vinsert128_insert, itins>, EVEX_V256;
defm NAME # "32x4Z" : vinsert_for_size<Opcode128,
X86VectorVTInfo< 4, EltVT32, VR128X>,
X86VectorVTInfo<16, EltVT32, VR512>,
- vinsert128_insert>, EVEX_V512;
+ vinsert128_insert, itins>, EVEX_V512;
defm NAME # "64x4Z" : vinsert_for_size<Opcode256,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
- vinsert256_insert>, VEX_W, EVEX_V512;
+ vinsert256_insert, itins>, VEX_W, EVEX_V512;
+ // Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasVLX, HasDQI] in
- defm NAME # "64x2Z256" : vinsert_for_size<Opcode128,
+ defm NAME # "64x2Z256" : vinsert_for_size_split<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 4, EltVT64, VR256X>,
- vinsert128_insert>, VEX_W, EVEX_V256;
+ null_frag, vinsert128_insert, itins>,
+ VEX_W, EVEX_V256;
+ // Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasDQI] in {
- defm NAME # "64x2Z" : vinsert_for_size<Opcode128,
+ defm NAME # "64x2Z" : vinsert_for_size_split<Opcode128,
X86VectorVTInfo< 2, EltVT64, VR128X>,
X86VectorVTInfo< 8, EltVT64, VR512>,
- vinsert128_insert>, VEX_W, EVEX_V512;
+ null_frag, vinsert128_insert, itins>,
+ VEX_W, EVEX_V512;
- defm NAME # "32x8Z" : vinsert_for_size<Opcode256,
+ defm NAME # "32x8Z" : vinsert_for_size_split<Opcode256,
X86VectorVTInfo< 8, EltVT32, VR256X>,
X86VectorVTInfo<16, EltVT32, VR512>,
- vinsert256_insert>, EVEX_V512;
+ null_frag, vinsert256_insert, itins>,
+ EVEX_V512;
}
}
-defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a>;
-defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a>;
+// FIXME: Is there a better scheduler itinerary for VINSERTF/VINSERTI?
+let Sched = WriteFShuffle256 in
+def AVX512_VINSERTF : OpndItins<
+ IIC_SSE_SHUFP, IIC_SSE_SHUFP
+>;
+let Sched = WriteShuffle256 in
+def AVX512_VINSERTI : OpndItins<
+ IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
+>;
+
+defm VINSERTF : vinsert_for_type<f32, 0x18, f64, 0x1a, AVX512_VINSERTF>;
+defm VINSERTI : vinsert_for_type<i32, 0x38, i64, 0x3a, AVX512_VINSERTI>;
// Codegen pattern with the alternative types,
-// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
+// Even with AVX512DQ we'll still use these for unmasked operations.
defm : vinsert_for_size_lowering<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
- vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
- vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX, NoDQI]>;
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasVLX]>;
defm : vinsert_for_size_lowering<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
- vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
- vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512, NoDQI]>;
+ vinsert128_insert, INSERT_get_vinsert128_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
- vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
- vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512, NoDQI]>;
+ vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
// Codegen pattern with the alternative types insert VEC128 into VEC256
defm : vinsert_for_size_lowering<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
@@ -628,48 +640,184 @@ defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
defm : vinsert_for_size_lowering<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
vinsert256_insert, INSERT_get_vinsert256_imm, [HasAVX512]>;
+
+multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, X86VectorVTInfo Cast,
+ PatFrag vinsert_insert,
+ SDNodeXForm INSERT_get_vinsert_imm,
+ list<Predicate> p> {
+let Predicates = p in {
+ def : Pat<(Cast.VT
+ (vselect Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))),
+ Cast.RC:$src0)),
+ (!cast<Instruction>(InstrStr#"rrk")
+ Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins))>;
+ def : Pat<(Cast.VT
+ (vselect Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT
+ (bitconvert
+ (From.LdFrag addr:$src2))),
+ (iPTR imm))),
+ Cast.RC:$src0)),
+ (!cast<Instruction>(InstrStr#"rmk")
+ Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, addr:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins))>;
+
+ def : Pat<(Cast.VT
+ (vselect Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))),
+ Cast.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#"rrkz")
+ Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins))>;
+ def : Pat<(Cast.VT
+ (vselect Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT
+ (bitconvert
+ (From.LdFrag addr:$src2))),
+ (iPTR imm))),
+ Cast.ImmAllZerosV)),
+ (!cast<Instruction>(InstrStr#"rmkz")
+ Cast.KRCWM:$mask, To.RC:$src1, addr:$src2,
+ (INSERT_get_vinsert_imm To.RC:$ins))>;
+}
+}
+
+defm : vinsert_for_mask_cast<"VINSERTF32x4Z256", v2f64x_info, v4f64x_info,
+ v8f32x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v4f32x_info, v8f32x_info,
+ v4f64x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
+
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v2i64x_info, v4i64x_info,
+ v8i32x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v8i16x_info, v16i16x_info,
+ v8i32x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z256", v16i8x_info, v32i8x_info,
+ v8i32x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v4i32x_info, v8i32x_info,
+ v4i64x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v8i16x_info, v16i16x_info,
+ v4i64x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x2Z256", v16i8x_info, v32i8x_info,
+ v4i64x_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI, HasVLX]>;
+
+defm : vinsert_for_mask_cast<"VINSERTF32x4Z", v2f64x_info, v8f64_info,
+ v16f32_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x2Z", v4f32x_info, v16f32_info,
+ v8f64_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI]>;
+
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v2i64x_info, v8i64_info,
+ v16i32_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v8i16x_info, v32i16_info,
+ v16i32_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x4Z", v16i8x_info, v64i8_info,
+ v16i32_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v4i32x_info, v16i32_info,
+ v8i64_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v8i16x_info, v32i16_info,
+ v8i64_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x2Z", v16i8x_info, v64i8_info,
+ v8i64_info, vinsert128_insert,
+ INSERT_get_vinsert128_imm, [HasDQI]>;
+
+defm : vinsert_for_mask_cast<"VINSERTF32x8Z", v4f64x_info, v8f64_info,
+ v16f32_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTF64x4Z", v8f32x_info, v16f32_info,
+ v8f64_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasAVX512]>;
+
+defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v4i64x_info, v8i64_info,
+ v16i32_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v16i16x_info, v32i16_info,
+ v16i32_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTI32x8Z", v32i8x_info, v64i8_info,
+ v16i32_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasDQI]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v8i32x_info, v16i32_info,
+ v8i64_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v16i16x_info, v32i16_info,
+ v8i64_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasAVX512]>;
+defm : vinsert_for_mask_cast<"VINSERTI64x4Z", v32i8x_info, v64i8_info,
+ v8i64_info, vinsert256_insert,
+ INSERT_get_vinsert256_imm, [HasAVX512]>;
+
// vinsertps - insert f32 to XMM
let ExeDomain = SSEPackedSingle in {
def VINSERTPSZrr : AVX512AIi8<0x21, MRMSrcReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))]>,
- EVEX_4V;
+ [(set VR128X:$dst, (X86insertps VR128X:$src1, VR128X:$src2, imm:$src3))],
+ IIC_SSE_INSERTPS_RR>, EVEX_4V, Sched<[WriteFShuffle]>;
def VINSERTPSZrm: AVX512AIi8<0x21, MRMSrcMem, (outs VR128X:$dst),
(ins VR128X:$src1, f32mem:$src2, u8imm:$src3),
"vinsertps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set VR128X:$dst, (X86insertps VR128X:$src1,
(v4f32 (scalar_to_vector (loadf32 addr:$src2))),
- imm:$src3))]>, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+ imm:$src3))], IIC_SSE_INSERTPS_RM>, EVEX_4V,
+ EVEX_CD8<32, CD8VT1>, Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
//===----------------------------------------------------------------------===//
// AVX-512 VECTOR EXTRACT
//---
-multiclass vextract_for_size<int Opcode,
- X86VectorVTInfo From, X86VectorVTInfo To,
- PatFrag vextract_extract,
- SDNodeXForm EXTRACT_get_vextract_imm> {
+// Supports two different pattern operators for masked and unmasked ops. Allows
+// null_frag to be passed for one.
+multiclass vextract_for_size_split<int Opcode,
+ X86VectorVTInfo From, X86VectorVTInfo To,
+ SDPatternOperator vextract_extract,
+ SDPatternOperator vextract_for_mask,
+ OpndItins itins> {
let hasSideEffects = 0, ExeDomain = To.ExeDomain in {
- // use AVX512_maskable_in_asm (AVX512_maskable can't be used due to
- // vextract_extract), we interesting only in patterns without mask,
- // intrinsics pattern match generated bellow.
- defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
+ defm rr : AVX512_maskable_split<Opcode, MRMDestReg, To, (outs To.RC:$dst),
(ins From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts,
"$idx, $src1", "$src1, $idx",
- [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1),
- (iPTR imm)))]>,
- AVX512AIi8Base, EVEX;
+ (vextract_extract:$idx (From.VT From.RC:$src1), (iPTR imm)),
+ (vextract_for_mask:$idx (From.VT From.RC:$src1), (iPTR imm)),
+ itins.rr>, AVX512AIi8Base, EVEX, Sched<[itins.Sched]>;
+
def mr : AVX512AIi8<Opcode, MRMDestMem, (outs),
(ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
"vextract" # To.EltTypeName # "x" # To.NumElts #
"\t{$idx, $src1, $dst|$dst, $src1, $idx}",
[(store (To.VT (vextract_extract:$idx
(From.VT From.RC:$src1), (iPTR imm))),
- addr:$dst)]>, EVEX;
+ addr:$dst)], itins.rm>, EVEX,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
let mayStore = 1, hasSideEffects = 0 in
def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
@@ -678,28 +826,18 @@ multiclass vextract_for_size<int Opcode,
"vextract" # To.EltTypeName # "x" # To.NumElts #
"\t{$idx, $src1, $dst {${mask}}|"
"$dst {${mask}}, $src1, $idx}",
- []>, EVEX_K, EVEX;
+ [], itins.rm>, EVEX_K, EVEX,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
-
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (vextract_extract:$ext (From.VT From.RC:$src1),
- (iPTR imm)),
- To.RC:$src0)),
- (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
- From.ZSuffix # "rrk")
- To.RC:$src0, To.KRCWM:$mask, From.RC:$src1,
- (EXTRACT_get_vextract_imm To.RC:$ext))>;
-
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (vextract_extract:$ext (From.VT From.RC:$src1),
- (iPTR imm)),
- To.ImmAllZerosV)),
- (!cast<Instruction>(NAME # To.EltSize # "x" # To.NumElts #
- From.ZSuffix # "rrkz")
- To.KRCWM:$mask, From.RC:$src1,
- (EXTRACT_get_vextract_imm To.RC:$ext))>;
}
+// Passes the same pattern operator for masked and unmasked ops.
+multiclass vextract_for_size<int Opcode, X86VectorVTInfo From,
+ X86VectorVTInfo To,
+ SDPatternOperator vextract_extract,
+ OpndItins itins> :
+ vextract_for_size_split<Opcode, From, To, vextract_extract, vextract_extract, itins>;
+
// Codegen pattern for the alternative types
multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
X86VectorVTInfo To, PatFrag vextract_extract,
@@ -717,68 +855,79 @@ multiclass vextract_for_size_lowering<string InstrStr, X86VectorVTInfo From,
}
multiclass vextract_for_type<ValueType EltVT32, int Opcode128,
- ValueType EltVT64, int Opcode256> {
- defm NAME # "32x4Z" : vextract_for_size<Opcode128,
- X86VectorVTInfo<16, EltVT32, VR512>,
- X86VectorVTInfo< 4, EltVT32, VR128X>,
- vextract128_extract,
- EXTRACT_get_vextract128_imm>,
- EVEX_V512, EVEX_CD8<32, CD8VT4>;
- defm NAME # "64x4Z" : vextract_for_size<Opcode256,
- X86VectorVTInfo< 8, EltVT64, VR512>,
- X86VectorVTInfo< 4, EltVT64, VR256X>,
- vextract256_extract,
- EXTRACT_get_vextract256_imm>,
- VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
+ ValueType EltVT64, int Opcode256,
+ OpndItins itins> {
+ let Predicates = [HasAVX512] in {
+ defm NAME # "32x4Z" : vextract_for_size<Opcode128,
+ X86VectorVTInfo<16, EltVT32, VR512>,
+ X86VectorVTInfo< 4, EltVT32, VR128X>,
+ vextract128_extract, itins>,
+ EVEX_V512, EVEX_CD8<32, CD8VT4>;
+ defm NAME # "64x4Z" : vextract_for_size<Opcode256,
+ X86VectorVTInfo< 8, EltVT64, VR512>,
+ X86VectorVTInfo< 4, EltVT64, VR256X>,
+ vextract256_extract, itins>,
+ VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT4>;
+ }
let Predicates = [HasVLX] in
defm NAME # "32x4Z256" : vextract_for_size<Opcode128,
X86VectorVTInfo< 8, EltVT32, VR256X>,
X86VectorVTInfo< 4, EltVT32, VR128X>,
- vextract128_extract,
- EXTRACT_get_vextract128_imm>,
+ vextract128_extract, itins>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
+
+ // Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasVLX, HasDQI] in
- defm NAME # "64x2Z256" : vextract_for_size<Opcode128,
+ defm NAME # "64x2Z256" : vextract_for_size_split<Opcode128,
X86VectorVTInfo< 4, EltVT64, VR256X>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
- vextract128_extract,
- EXTRACT_get_vextract128_imm>,
+ null_frag, vextract128_extract, itins>,
VEX_W, EVEX_V256, EVEX_CD8<64, CD8VT2>;
+
+ // Even with DQI we'd like to only use these instructions for masking.
let Predicates = [HasDQI] in {
- defm NAME # "64x2Z" : vextract_for_size<Opcode128,
+ defm NAME # "64x2Z" : vextract_for_size_split<Opcode128,
X86VectorVTInfo< 8, EltVT64, VR512>,
X86VectorVTInfo< 2, EltVT64, VR128X>,
- vextract128_extract,
- EXTRACT_get_vextract128_imm>,
+ null_frag, vextract128_extract, itins>,
VEX_W, EVEX_V512, EVEX_CD8<64, CD8VT2>;
- defm NAME # "32x8Z" : vextract_for_size<Opcode256,
+ defm NAME # "32x8Z" : vextract_for_size_split<Opcode256,
X86VectorVTInfo<16, EltVT32, VR512>,
X86VectorVTInfo< 8, EltVT32, VR256X>,
- vextract256_extract,
- EXTRACT_get_vextract256_imm>,
+ null_frag, vextract256_extract, itins>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
}
}
-defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b>;
-defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b>;
+// FIXME: Is there a better scheduler itinerary for VEXTRACTF/VEXTRACTI?
+let Sched = WriteFShuffle256 in
+def AVX512_VEXTRACTF : OpndItins<
+ IIC_SSE_SHUFP, IIC_SSE_SHUFP
+>;
+let Sched = WriteShuffle256 in
+def AVX512_VEXTRACTI : OpndItins<
+ IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
+>;
+
+defm VEXTRACTF : vextract_for_type<f32, 0x19, f64, 0x1b, AVX512_VEXTRACTF>;
+defm VEXTRACTI : vextract_for_type<i32, 0x39, i64, 0x3b, AVX512_VEXTRACTI>;
// extract_subvector codegen patterns with the alternative types.
-// Only add this if 64x2 and its friends are not supported natively via AVX512DQ.
+// Even with AVX512DQ we'll still use these for unmasked operations.
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
- vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
- vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512, NoDQI]>;
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
- vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
- vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512, NoDQI]>;
+ vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
defm : vextract_for_size_lowering<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
- vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
- vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX, NoDQI]>;
+ vextract128_extract, EXTRACT_get_vextract128_imm, [HasVLX]>;
// Codegen pattern with the alternative types extract VEC128 from VEC256
defm : vextract_for_size_lowering<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,
@@ -797,80 +946,185 @@ defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
defm : vextract_for_size_lowering<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
vextract256_extract, EXTRACT_get_vextract256_imm, [HasAVX512]>;
-// A 128-bit subvector extract from the first 256-bit vector position
-// is a subregister copy that needs no instruction.
-def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
- (v2i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_xmm))>;
-def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
- (v2f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_xmm))>;
-def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
- (v4i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_xmm))>;
-def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
- (v4f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_xmm))>;
-def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))),
- (v8i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_xmm))>;
-def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))),
- (v16i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_xmm))>;
-
-// A 256-bit subvector extract from the first 256-bit vector position
-// is a subregister copy that needs no instruction.
-def : Pat<(v4i64 (extract_subvector (v8i64 VR512:$src), (iPTR 0))),
- (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm))>;
-def : Pat<(v4f64 (extract_subvector (v8f64 VR512:$src), (iPTR 0))),
- (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm))>;
-def : Pat<(v8i32 (extract_subvector (v16i32 VR512:$src), (iPTR 0))),
- (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm))>;
-def : Pat<(v8f32 (extract_subvector (v16f32 VR512:$src), (iPTR 0))),
- (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm))>;
-def : Pat<(v16i16 (extract_subvector (v32i16 VR512:$src), (iPTR 0))),
- (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm))>;
-def : Pat<(v32i8 (extract_subvector (v64i8 VR512:$src), (iPTR 0))),
- (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm))>;
-
-let AddedComplexity = 25 in { // to give priority over vinsertf128rm
-// A 128-bit subvector insert to the first 512-bit vector position
-// is a subregister copy that needs no instruction.
-def : Pat<(v8i64 (insert_subvector undef, (v2i64 VR128X:$src), (iPTR 0))),
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
-def : Pat<(v8f64 (insert_subvector undef, (v2f64 VR128X:$src), (iPTR 0))),
- (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
-def : Pat<(v16i32 (insert_subvector undef, (v4i32 VR128X:$src), (iPTR 0))),
- (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
-def : Pat<(v16f32 (insert_subvector undef, (v4f32 VR128X:$src), (iPTR 0))),
- (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
-def : Pat<(v32i16 (insert_subvector undef, (v8i16 VR128X:$src), (iPTR 0))),
- (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
-def : Pat<(v64i8 (insert_subvector undef, (v16i8 VR128X:$src), (iPTR 0))),
- (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)>;
-
-// A 256-bit subvector insert to the first 512-bit vector position
-// is a subregister copy that needs no instruction.
-def : Pat<(v8i64 (insert_subvector undef, (v4i64 VR256X:$src), (iPTR 0))),
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-def : Pat<(v8f64 (insert_subvector undef, (v4f64 VR256X:$src), (iPTR 0))),
- (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-def : Pat<(v16i32 (insert_subvector undef, (v8i32 VR256X:$src), (iPTR 0))),
- (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-def : Pat<(v16f32 (insert_subvector undef, (v8f32 VR256X:$src), (iPTR 0))),
- (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-def : Pat<(v32i16 (insert_subvector undef, (v16i16 VR256X:$src), (iPTR 0))),
- (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-def : Pat<(v64i8 (insert_subvector undef, (v32i8 VR256X:$src), (iPTR 0))),
- (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)>;
-}
+
+// A 128-bit extract from bits [255:128] of a 512-bit vector should use a
+// smaller extract to enable EVEX->VEX.
+let Predicates = [NoVLX] in {
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
+ (v2i64 (VEXTRACTI128rr
+ (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 2))),
+ (v2f64 (VEXTRACTF128rr
+ (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 4))),
+ (v4i32 (VEXTRACTI128rr
+ (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 4))),
+ (v4f32 (VEXTRACTF128rr
+ (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
+ (v8i16 (VEXTRACTI128rr
+ (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
+ (v16i8 (VEXTRACTI128rr
+ (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+}
+
+// A 128-bit extract from bits [255:128] of a 512-bit vector should use a
+// smaller extract to enable EVEX->VEX.
+let Predicates = [HasVLX] in {
+def : Pat<(v2i64 (extract_subvector (v8i64 VR512:$src), (iPTR 2))),
+ (v2i64 (VEXTRACTI32x4Z256rr
+ (v4i64 (EXTRACT_SUBREG (v8i64 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v2f64 (extract_subvector (v8f64 VR512:$src), (iPTR 2))),
+ (v2f64 (VEXTRACTF32x4Z256rr
+ (v4f64 (EXTRACT_SUBREG (v8f64 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v4i32 (extract_subvector (v16i32 VR512:$src), (iPTR 4))),
+ (v4i32 (VEXTRACTI32x4Z256rr
+ (v8i32 (EXTRACT_SUBREG (v16i32 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v4f32 (extract_subvector (v16f32 VR512:$src), (iPTR 4))),
+ (v4f32 (VEXTRACTF32x4Z256rr
+ (v8f32 (EXTRACT_SUBREG (v16f32 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v8i16 (extract_subvector (v32i16 VR512:$src), (iPTR 8))),
+ (v8i16 (VEXTRACTI32x4Z256rr
+ (v16i16 (EXTRACT_SUBREG (v32i16 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+def : Pat<(v16i8 (extract_subvector (v64i8 VR512:$src), (iPTR 16))),
+ (v16i8 (VEXTRACTI32x4Z256rr
+ (v32i8 (EXTRACT_SUBREG (v64i8 VR512:$src), sub_ymm)),
+ (iPTR 1)))>;
+}
+
+
+// Additional patterns for handling a bitcast between the vselect and the
+// extract_subvector.
+multiclass vextract_for_mask_cast<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, X86VectorVTInfo Cast,
+ PatFrag vextract_extract,
+ SDNodeXForm EXTRACT_get_vextract_imm,
+ list<Predicate> p> {
+let Predicates = p in {
+ def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
+ (bitconvert
+ (To.VT (vextract_extract:$ext
+ (From.VT From.RC:$src), (iPTR imm)))),
+ To.RC:$src0)),
+ (Cast.VT (!cast<Instruction>(InstrStr#"rrk")
+ Cast.RC:$src0, Cast.KRCWM:$mask, From.RC:$src,
+ (EXTRACT_get_vextract_imm To.RC:$ext)))>;
+
+ def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
+ (bitconvert
+ (To.VT (vextract_extract:$ext
+ (From.VT From.RC:$src), (iPTR imm)))),
+ Cast.ImmAllZerosV)),
+ (Cast.VT (!cast<Instruction>(InstrStr#"rrkz")
+ Cast.KRCWM:$mask, From.RC:$src,
+ (EXTRACT_get_vextract_imm To.RC:$ext)))>;
+}
+}
+
+defm : vextract_for_mask_cast<"VEXTRACTF32x4Z256", v4f64x_info, v2f64x_info,
+ v4f32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTF64x2Z256", v8f32x_info, v4f32x_info,
+ v2f64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
+
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v4i64x_info, v2i64x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v16i16x_info, v8i16x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z256", v32i8x_info, v16i8x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v8i32x_info, v4i32x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v16i16x_info, v8i16x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z256", v32i8x_info, v16i8x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI, HasVLX]>;
+
+defm : vextract_for_mask_cast<"VEXTRACTF32x4Z", v8f64_info, v2f64x_info,
+ v4f32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTF64x2Z", v16f32_info, v4f32x_info,
+ v2f64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI]>;
+
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v8i64_info, v2i64x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v32i16_info, v8i16x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x4Z", v64i8_info, v16i8x_info,
+ v4i32x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v16i32_info, v4i32x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v32i16_info, v8i16x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x2Z", v64i8_info, v16i8x_info,
+ v2i64x_info, vextract128_extract,
+ EXTRACT_get_vextract128_imm, [HasDQI]>;
+
+defm : vextract_for_mask_cast<"VEXTRACTF32x8Z", v8f64_info, v4f64x_info,
+ v8f32x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTF64x4Z", v16f32_info, v8f32x_info,
+ v4f64x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasAVX512]>;
+
+defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v8i64_info, v4i64x_info,
+ v8i32x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v32i16_info, v16i16x_info,
+ v8i32x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTI32x8Z", v64i8_info, v32i8x_info,
+ v8i32x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasDQI]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v16i32_info, v8i32x_info,
+ v4i64x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v32i16_info, v16i16x_info,
+ v4i64x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasAVX512]>;
+defm : vextract_for_mask_cast<"VEXTRACTI64x4Z", v64i8_info, v32i8x_info,
+ v4i64x_info, vextract256_extract,
+ EXTRACT_get_vextract256_imm, [HasAVX512]>;
// vextractps - extract 32 bits from XMM
def VEXTRACTPSZrr : AVX512AIi8<0x17, MRMDestReg, (outs GR32:$dst),
(ins VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))]>,
- EVEX;
+ [(set GR32:$dst, (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2))],
+ IIC_SSE_EXTRACTPS_RR>, EVEX, VEX_WIG, Sched<[WriteFShuffle]>;
def VEXTRACTPSZmr : AVX512AIi8<0x17, MRMDestMem, (outs),
(ins f32mem:$dst, VR128X:$src1, u8imm:$src2),
"vextractps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (extractelt (bc_v4i32 (v4f32 VR128X:$src1)), imm:$src2),
- addr:$dst)]>, EVEX, EVEX_CD8<32, CD8VT1>;
+ addr:$dst)], IIC_SSE_EXTRACTPS_RM>,
+ EVEX, VEX_WIG, EVEX_CD8<32, CD8VT1>, Sched<[WriteFShuffleLd]>;
//===---------------------------------------------------------------------===//
// AVX-512 BROADCAST
@@ -894,66 +1148,108 @@ multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
DestInfo.KRCWM:$mask, (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC))>;
}
-multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
- X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
- let ExeDomain = DestInfo.ExeDomain in {
- defm r : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
+// Split version to allow mask and broadcast node to be different types. This
+// helps support the 32x2 broadcasts.
+multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
+ SchedWrite SchedRR, SchedWrite SchedRM,
+ X86VectorVTInfo MaskInfo,
+ X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo,
+ SDPatternOperator UnmaskedOp = X86VBroadcast> {
+ let ExeDomain = DestInfo.ExeDomain, hasSideEffects = 0 in {
+ defm r : AVX512_maskable_split<opc, MRMSrcReg, MaskInfo,
+ (outs MaskInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr, "$src", "$src",
- (DestInfo.VT (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src)))>,
- T8PD, EVEX;
- defm m : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))),
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ NoItinerary>, T8PD, EVEX, Sched<[SchedRR]>;
+ let mayLoad = 1 in
+ defm m : AVX512_maskable_split<opc, MRMSrcMem, MaskInfo,
+ (outs MaskInfo.RC:$dst),
(ins SrcInfo.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
- (DestInfo.VT (X86VBroadcast
- (SrcInfo.ScalarLdFrag addr:$src)))>,
- T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>;
- }
-
- def : Pat<(DestInfo.VT (X86VBroadcast
- (SrcInfo.VT (scalar_to_vector
- (SrcInfo.ScalarLdFrag addr:$src))))),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>;
- def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
- (X86VBroadcast
- (SrcInfo.VT (scalar_to_vector
- (SrcInfo.ScalarLdFrag addr:$src)))),
- DestInfo.RC:$src0)),
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT (UnmaskedOp
+ (SrcInfo.ScalarLdFrag addr:$src))))),
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT (X86VBroadcast
+ (SrcInfo.ScalarLdFrag addr:$src))))),
+ NoItinerary>, T8PD, EVEX, EVEX_CD8<SrcInfo.EltSize, CD8VT1>,
+ Sched<[SchedRM]>;
+ }
+
+ def : Pat<(MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT (UnmaskedOp
+ (SrcInfo.VT (scalar_to_vector
+ (SrcInfo.ScalarLdFrag addr:$src))))))),
+ (!cast<Instruction>(NAME#MaskInfo.ZSuffix#m) addr:$src)>;
+ def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast
+ (SrcInfo.VT (scalar_to_vector
+ (SrcInfo.ScalarLdFrag addr:$src)))))),
+ MaskInfo.RC:$src0)),
(!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
- DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>;
- def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
- (X86VBroadcast
- (SrcInfo.VT (scalar_to_vector
- (SrcInfo.ScalarLdFrag addr:$src)))),
- DestInfo.ImmAllZerosV)),
- (!cast<Instruction>(NAME#DestInfo.ZSuffix#mkz)
- DestInfo.KRCWM:$mask, addr:$src)>;
-}
+ MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask, addr:$src)>;
+ def : Pat<(MaskInfo.VT (vselect MaskInfo.KRCWM:$mask,
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast
+ (SrcInfo.VT (scalar_to_vector
+ (SrcInfo.ScalarLdFrag addr:$src)))))),
+ MaskInfo.ImmAllZerosV)),
+ (!cast<Instruction>(NAME#MaskInfo.ZSuffix#mkz)
+ MaskInfo.KRCWM:$mask, addr:$src)>;
+}
+
+// Helper class to force mask and broadcast result to same type.
+multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
+ SchedWrite SchedRR, SchedWrite SchedRM,
+ X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo> :
+ avx512_broadcast_rm_split<opc, OpcodeStr, SchedRR, SchedRM,
+ DestInfo, DestInfo, SrcInfo>;
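+
+// A rough sketch of how the split form is meant to be used for the 32x2
+// broadcasts defined further down (the VT infos here are illustrative and
+// assume VBROADCASTI32X2 is instantiated with the i32/i64 avx512vl infos):
+//   avx512_broadcast_rm_split<0x59, "vbroadcasti32x2", WriteShuffle256,
+//                             WriteShuffle256Ld,
+//                             v16i32_info,  // MaskInfo: write mask is per i32
+//                             v8i64_info,   // DestInfo: broadcasts i32 pairs
+//                             v2i64x_info,  // SrcInfo: 64-bit (2 x i32) source
+//                             null_frag>;
+// The broadcast result (v8i64) is bitcast to the mask type (v16i32) so the
+// 16-bit write mask can still select individual 32-bit elements.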
multiclass avx512_fp_broadcast_sd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ WriteFShuffle256Ld, _.info512, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
- EVEX_V512;
+ EVEX_V512;
let Predicates = [HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ WriteFShuffle256Ld, _.info256, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
- EVEX_V256;
+ EVEX_V256;
}
}
multiclass avx512_fp_broadcast_ss<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ WriteFShuffle256Ld, _.info512, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info512, _.info128>,
EVEX_V512;
let Predicates = [HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ WriteFShuffle256Ld, _.info256, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info256, _.info128>,
EVEX_V256;
- defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteFShuffle256,
+ WriteFShuffle256Ld, _.info128, _.info128>,
avx512_broadcast_scalar<opc, OpcodeStr, _.info128, _.info128>,
EVEX_V128;
}
@@ -968,26 +1264,27 @@ def : Pat<(int_x86_avx512_vbroadcast_ss_512 addr:$src),
def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
(VBROADCASTSDZm addr:$src)>;
-multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
- SDPatternOperator OpNode,
+multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,
+ X86VectorVTInfo _, SDPatternOperator OpNode,
RegisterClass SrcRC> {
let ExeDomain = _.ExeDomain in
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins SrcRC:$src),
"vpbroadcast"##_.Suffix, "$src", "$src",
- (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX;
+ (_.VT (OpNode SrcRC:$src)), NoItinerary>, T8PD, EVEX,
+ Sched<[SchedRR]>;
}
-multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name,
+multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite SchedRR,
X86VectorVTInfo _, SDPatternOperator OpNode,
RegisterClass SrcRC, SubRegIndex Subreg> {
- let ExeDomain = _.ExeDomain in
+ let hasSideEffects = 0, ExeDomain = _.ExeDomain in
defm r : AVX512_maskable_custom<opc, MRMSrcReg,
(outs _.RC:$dst), (ins GR32:$src),
!con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
!con((ins _.KRCWM:$mask), (ins GR32:$src)),
"vpbroadcast"##_.Suffix, "$src", "$src", [], [], [],
- "$src0 = $dst">, T8PD, EVEX;
+ NoItinerary, "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
def : Pat <(_.VT (OpNode SrcRC:$src)),
(!cast<Instruction>(Name#r)
@@ -1006,13 +1303,13 @@ multiclass avx512_int_broadcastbw_reg_vl<bits<8> opc, string Name,
AVX512VLVectorVTInfo _, SDPatternOperator OpNode,
RegisterClass SrcRC, SubRegIndex Subreg, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_int_broadcastbw_reg<opc, Name#Z, _.info512, OpNode, SrcRC,
- Subreg>, EVEX_V512;
+ defm Z : avx512_int_broadcastbw_reg<opc, Name#Z, WriteShuffle256, _.info512,
+ OpNode, SrcRC, Subreg>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_int_broadcastbw_reg<opc, Name#Z256, _.info256, OpNode,
- SrcRC, Subreg>, EVEX_V256;
- defm Z128 : avx512_int_broadcastbw_reg<opc, Name#Z128, _.info128, OpNode,
- SrcRC, Subreg>, EVEX_V128;
+ defm Z256 : avx512_int_broadcastbw_reg<opc, Name#Z256, WriteShuffle256,
+ _.info256, OpNode, SrcRC, Subreg>, EVEX_V256;
+ defm Z128 : avx512_int_broadcastbw_reg<opc, Name#Z128, WriteShuffle,
+ _.info128, OpNode, SrcRC, Subreg>, EVEX_V128;
}
}
@@ -1020,10 +1317,13 @@ multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
SDPatternOperator OpNode,
RegisterClass SrcRC, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_int_broadcast_reg<opc, _.info512, OpNode, SrcRC>, EVEX_V512;
+ defm Z : avx512_int_broadcast_reg<opc, WriteShuffle256, _.info512, OpNode,
+ SrcRC>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_int_broadcast_reg<opc, _.info256, OpNode, SrcRC>, EVEX_V256;
- defm Z128 : avx512_int_broadcast_reg<opc, _.info128, OpNode, SrcRC>, EVEX_V128;
+ defm Z256 : avx512_int_broadcast_reg<opc, WriteShuffle256, _.info256, OpNode,
+ SrcRC>, EVEX_V256;
+ defm Z128 : avx512_int_broadcast_reg<opc, WriteShuffle, _.info128, OpNode,
+ SrcRC>, EVEX_V128;
}
}
@@ -1054,17 +1354,20 @@ multiclass avx512_int_broadcast_rm_lowering<X86VectorVTInfo DestInfo,
multiclass avx512_int_broadcast_rm_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd> {
let Predicates = [prd] in {
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, _.info512, _.info128>,
+ defm Z : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle256,
+ WriteShuffle256Ld, _.info512, _.info128>,
avx512_int_broadcast_rm_lowering<_.info512, _.info256>,
EVEX_V512;
// Defined separately to avoid redefinition.
defm Z_Alt : avx512_int_broadcast_rm_lowering<_.info512, _.info512>;
}
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _.info256, _.info128>,
+ defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle256,
+ WriteShuffle256Ld, _.info256, _.info128>,
avx512_int_broadcast_rm_lowering<_.info256, _.info256>,
EVEX_V256;
- defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _.info128, _.info128>,
+ defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, WriteShuffle,
+ WriteShuffleLd, _.info128, _.info128>,
EVEX_V128;
}
}
@@ -1083,8 +1386,24 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
(_Dst.VT (X86SubVBroadcast
- (_Src.VT (bitconvert (_Src.LdFrag addr:$src)))))>,
- AVX5128IBase, EVEX;
+ (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))),
+ NoItinerary>, AVX5128IBase, EVEX,
+ Sched<[WriteShuffleLd]>;
+}
+
+// This should be used for the AVX512DQ broadcast instructions. It disables
+// the unmasked patterns so that we only use the DQ instructions when masking
+// is requested.
+multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
+ X86VectorVTInfo _Dst, X86VectorVTInfo _Src> {
+ let hasSideEffects = 0, mayLoad = 1 in
+ defm rm : AVX512_maskable_split<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
+ (ins _Src.MemOp:$src), OpcodeStr, "$src", "$src",
+ (null_frag),
+ (_Dst.VT (X86SubVBroadcast
+ (_Src.VT (bitconvert (_Src.LdFrag addr:$src))))),
+ NoItinerary>, AVX5128IBase, EVEX,
+ Sched<[WriteShuffleLd]>;
}
let Predicates = [HasAVX512] in {
@@ -1093,12 +1412,14 @@ let Predicates = [HasAVX512] in {
(VPBROADCASTQZm addr:$src)>;
}
-let Predicates = [HasVLX, HasBWI] in {
+let Predicates = [HasVLX] in {
// 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
(VPBROADCASTQZ128m addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
(VPBROADCASTQZ256m addr:$src)>;
+}
+let Predicates = [HasVLX, HasBWI] in {
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
// This means we'll encounter truncated i32 loads; match that here.
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
@@ -1131,6 +1452,10 @@ defm VBROADCASTF64X4 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf64x4",
EVEX_V512, EVEX_CD8<64, CD8VT4>;
let Predicates = [HasAVX512] in {
+def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
+ (VBROADCASTF64X4rm addr:$src)>;
+def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
+ (VBROADCASTI64X4rm addr:$src)>;
def : Pat<(v32i16 (X86SubVBroadcast (bc_v16i16 (loadv4i64 addr:$src)))),
(VBROADCASTI64X4rm addr:$src)>;
def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
@@ -1141,9 +1466,15 @@ def : Pat<(v64i8 (X86SubVBroadcast (bc_v32i8 (loadv4i64 addr:$src)))),
def : Pat<(v8f64 (X86SubVBroadcast (v4f64 VR256X:$src))),
(VINSERTF64x4Zrr (INSERT_SUBREG (v8f64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v4f64 VR256X:$src), 1)>;
+def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
+ (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8f32 VR256X:$src), 1)>;
def : Pat<(v8i64 (X86SubVBroadcast (v4i64 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v4i64 VR256X:$src), 1)>;
+def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
+ (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
+ (v8i32 VR256X:$src), 1)>;
def : Pat<(v32i16 (X86SubVBroadcast (v16i16 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v32i16 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v16i16 VR256X:$src), 1)>;
@@ -1151,6 +1482,10 @@ def : Pat<(v64i8 (X86SubVBroadcast (v32i8 VR256X:$src))),
(VINSERTI64x4Zrr (INSERT_SUBREG (v64i8 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
(v32i8 VR256X:$src), 1)>;
+def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+ (VBROADCASTF32X4rm addr:$src)>;
+def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTI32X4rm addr:$src)>;
def : Pat<(v32i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4rm addr:$src)>;
def : Pat<(v64i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
@@ -1165,6 +1500,10 @@ defm VBROADCASTF32X4Z256 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf32x4",
v8f32x_info, v4f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VT4>;
+def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
+ (VBROADCASTF32X4Z256rm addr:$src)>;
+def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
+ (VBROADCASTI32X4Z256rm addr:$src)>;
def : Pat<(v16i16 (X86SubVBroadcast (bc_v8i16 (loadv2i64 addr:$src)))),
(VBROADCASTI32X4Z256rm addr:$src)>;
def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
@@ -1172,9 +1511,15 @@ def : Pat<(v32i8 (X86SubVBroadcast (bc_v16i8 (loadv2i64 addr:$src)))),
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
+def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
+ (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v2f64 VR128X:$src), 1)>;
def : Pat<(v8f32 (X86SubVBroadcast (v4f32 VR128X:$src))),
(VINSERTF32x4Z256rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(v4f32 VR128X:$src), 1)>;
+def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
+ (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
+ (v2i64 VR128X:$src), 1)>;
def : Pat<(v8i32 (X86SubVBroadcast (v4i32 VR128X:$src))),
(VINSERTI32x4Z256rr (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
(v4i32 VR128X:$src), 1)>;
@@ -1187,92 +1532,41 @@ def : Pat<(v32i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
}
let Predicates = [HasVLX, HasDQI] in {
-defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+defm VBROADCASTI64X2Z128 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
v4i64x_info, v2i64x_info>, VEX_W,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
-defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
v4f64x_info, v2f64x_info>, VEX_W,
EVEX_V256, EVEX_CD8<64, CD8VT2>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
- (VINSERTF64x2Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (v2f64 VR128X:$src), 1)>;
-def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
- (VINSERTI64x2Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (v2i64 VR128X:$src), 1)>;
-}
-
-let Predicates = [HasVLX, NoDQI] in {
-def : Pat<(v4f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
- (VBROADCASTF32X4Z256rm addr:$src)>;
-def : Pat<(v4i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
- (VBROADCASTI32X4Z256rm addr:$src)>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v4f64 (X86SubVBroadcast (v2f64 VR128X:$src))),
- (VINSERTF32x4Z256rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (v2f64 VR128X:$src), 1)>;
-def : Pat<(v4i64 (X86SubVBroadcast (v2i64 VR128X:$src))),
- (VINSERTI32x4Z256rr (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm),
- (v2i64 VR128X:$src), 1)>;
-}
-
-let Predicates = [HasAVX512, NoDQI] in {
-def : Pat<(v8f64 (X86SubVBroadcast (loadv2f64 addr:$src))),
- (VBROADCASTF32X4rm addr:$src)>;
-def : Pat<(v8i64 (X86SubVBroadcast (loadv2i64 addr:$src))),
- (VBROADCASTI32X4rm addr:$src)>;
-
-def : Pat<(v16f32 (X86SubVBroadcast (loadv8f32 addr:$src))),
- (VBROADCASTF64X4rm addr:$src)>;
-def : Pat<(v16i32 (X86SubVBroadcast (bc_v8i32 (loadv4i64 addr:$src)))),
- (VBROADCASTI64X4rm addr:$src)>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
- (VINSERTF64x4Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v8f32 VR256X:$src), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
- (VINSERTI64x4Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v8i32 VR256X:$src), 1)>;
}
let Predicates = [HasDQI] in {
-defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm<0x5a, "vbroadcasti64x2",
+defm VBROADCASTI64X2 : avx512_subvec_broadcast_rm_dq<0x5a, "vbroadcasti64x2",
v8i64_info, v2i64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT2>;
-defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm<0x5b, "vbroadcasti32x8",
+defm VBROADCASTI32X8 : avx512_subvec_broadcast_rm_dq<0x5b, "vbroadcasti32x8",
v16i32_info, v8i32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
-defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm<0x1a, "vbroadcastf64x2",
+defm VBROADCASTF64X2 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2",
v8f64_info, v2f64x_info>, VEX_W,
EVEX_V512, EVEX_CD8<64, CD8VT2>;
-defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm<0x1b, "vbroadcastf32x8",
+defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
v16f32_info, v8f32x_info>,
EVEX_V512, EVEX_CD8<32, CD8VT8>;
-
-// Provide fallback in case the load node that is used in the patterns above
-// is used by additional users, which prevents the pattern selection.
-def : Pat<(v16f32 (X86SubVBroadcast (v8f32 VR256X:$src))),
- (VINSERTF32x8Zrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v8f32 VR256X:$src), 1)>;
-def : Pat<(v16i32 (X86SubVBroadcast (v8i32 VR256X:$src))),
- (VINSERTI32x8Zrr (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm),
- (v8i32 VR256X:$src), 1)>;
}
multiclass avx512_common_broadcast_32x2<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _Dst, AVX512VLVectorVTInfo _Src> {
let Predicates = [HasDQI] in
- defm Z : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info512, _Src.info128>,
- EVEX_V512;
+ defm Z : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle256,
+ WriteShuffle256Ld, _Dst.info512,
+ _Src.info512, _Src.info128, null_frag>,
+ EVEX_V512;
let Predicates = [HasDQI, HasVLX] in
- defm Z256 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info256, _Src.info128>,
- EVEX_V256;
+ defm Z256 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle256,
+ WriteShuffle256Ld, _Dst.info256,
+ _Src.info256, _Src.info128, null_frag>,
+ EVEX_V256;
}
multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
@@ -1280,8 +1574,10 @@ multiclass avx512_common_broadcast_i32x2<bits<8> opc, string OpcodeStr,
avx512_common_broadcast_32x2<opc, OpcodeStr, _Dst, _Src> {
let Predicates = [HasDQI, HasVLX] in
- defm Z128 : avx512_broadcast_rm<opc, OpcodeStr, _Dst.info128, _Src.info128>,
- EVEX_V128;
+ defm Z128 : avx512_broadcast_rm_split<opc, OpcodeStr, WriteShuffle,
+ WriteShuffleLd, _Dst.info128,
+ _Src.info128, _Src.info128, null_frag>,
+ EVEX_V128;
}
defm VBROADCASTI32X2 : avx512_common_broadcast_i32x2<0x59, "vbroadcasti32x2",
@@ -1313,7 +1609,8 @@ multiclass avx512_mask_broadcastm<bits<8> opc, string OpcodeStr,
X86VectorVTInfo _, RegisterClass KRC> {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.RC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))]>, EVEX;
+ [(set _.RC:$dst, (_.VT (X86VBroadcastm KRC:$src)))],
+ IIC_SSE_PSHUF_RI>, EVEX, Sched<[WriteShuffle]>;
}
multiclass avx512_mask_broadcast<bits<8> opc, string OpcodeStr,
@@ -1333,7 +1630,19 @@ defm VPBROADCASTMB2Q : avx512_mask_broadcast<0x2A, "vpbroadcastmb2q",
//===----------------------------------------------------------------------===//
// -- VPERMI2 - 3 source operands form --
-multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+
+let Sched = WriteFShuffle256 in
+def AVX512_PERM2_F : OpndItins<
+ IIC_SSE_SHUFP, IIC_SSE_SHUFP
+>;
+
+let Sched = WriteShuffle256 in
+def AVX512_PERM2_I : OpndItins<
+ IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
+>;
+
+multiclass avx512_perm_i<bits<8> opc, string OpcodeStr, OpndItins itins,
+ X86VectorVTInfo _> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
// The index operand in the pattern should really be an integer type. However,
// if we do that and it happens to come from a bitcast, then it becomes
@@ -1343,18 +1652,19 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)), 1>, EVEX_4V,
- AVX5128IBase;
+ (_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2, _.RC:$src3)),
+ itins.rr, 1>, EVEX_4V, AVX5128IBase, Sched<[itins.Sched]>;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermi2X _.RC:$src1, _.RC:$src2,
- (_.VT (bitconvert (_.LdFrag addr:$src3))))), 1>,
- EVEX_4V, AVX5128IBase;
+ (_.VT (bitconvert (_.LdFrag addr:$src3))))), itins.rm, 1>,
+ EVEX_4V, AVX5128IBase, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
+
+multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr, OpndItins itins,
X86VectorVTInfo _> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -1363,66 +1673,68 @@ multiclass avx512_perm_i_mb<bits<8> opc, string OpcodeStr,
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermi2X _.RC:$src1,
_.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
- 1>, AVX5128IBase, EVEX_4V, EVEX_B;
+ itins.rm, 1>, AVX5128IBase, EVEX_4V, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr,
+multiclass avx512_perm_i_sizes<bits<8> opc, string OpcodeStr, OpndItins itins,
AVX512VLVectorVTInfo VTInfo> {
- defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>,
- avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+ defm NAME: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info512>,
+ avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>,
- avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>,
- avx512_perm_i_mb<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info128>,
+ avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info256>,
+ avx512_perm_i_mb<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
}
}
multiclass avx512_perm_i_sizes_bw<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo VTInfo,
- Predicate Prd> {
+ OpndItins itins,
+ AVX512VLVectorVTInfo VTInfo,
+ Predicate Prd> {
let Predicates = [Prd] in
- defm NAME: avx512_perm_i<opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+ defm NAME: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
let Predicates = [Prd, HasVLX] in {
- defm NAME#128: avx512_perm_i<opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_i<opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
+ defm NAME#128: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
+ defm NAME#256: avx512_perm_i<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
}
}
-defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d",
+defm VPERMI2D : avx512_perm_i_sizes<0x76, "vpermi2d", AVX512_PERM2_I,
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q",
+defm VPERMI2Q : avx512_perm_i_sizes<0x76, "vpermi2q", AVX512_PERM2_I,
avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w",
+defm VPERMI2W : avx512_perm_i_sizes_bw<0x75, "vpermi2w", AVX512_PERM2_I,
avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b",
+defm VPERMI2B : avx512_perm_i_sizes_bw<0x75, "vpermi2b", AVX512_PERM2_I,
avx512vl_i8_info, HasVBMI>,
EVEX_CD8<8, CD8VF>;
-defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps",
+defm VPERMI2PS : avx512_perm_i_sizes<0x77, "vpermi2ps", AVX512_PERM2_F,
avx512vl_f32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd",
+defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", AVX512_PERM2_F,
avx512vl_f64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
// VPERMT2
-multiclass avx512_perm_t<bits<8> opc, string OpcodeStr,
+multiclass avx512_perm_t<bits<8> opc, string OpcodeStr, OpndItins itins,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)), 1>,
- EVEX_4V, AVX5128IBase;
+ (_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2, _.RC:$src3)),
+ itins.rr, 1>, EVEX_4V, AVX5128IBase, Sched<[itins.Sched]>;
defm rm: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins IdxVT.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (X86VPermt2 _.RC:$src1, IdxVT.RC:$src2,
- (bitconvert (_.LdFrag addr:$src3)))), 1>,
- EVEX_4V, AVX5128IBase;
+ (bitconvert (_.LdFrag addr:$src3)))), itins.rm, 1>,
+ EVEX_4V, AVX5128IBase, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
+multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr, OpndItins itins,
X86VectorVTInfo _, X86VectorVTInfo IdxVT> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
defm rmb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -1431,147 +1743,165 @@ multiclass avx512_perm_t_mb<bits<8> opc, string OpcodeStr,
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(_.VT (X86VPermt2 _.RC:$src1,
IdxVT.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))),
- 1>, AVX5128IBase, EVEX_4V, EVEX_B;
+ itins.rm, 1>, AVX5128IBase, EVEX_4V, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr,
+multiclass avx512_perm_t_sizes<bits<8> opc, string OpcodeStr, OpndItins itins,
AVX512VLVectorVTInfo VTInfo,
AVX512VLVectorVTInfo ShuffleMask> {
- defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
+ defm NAME: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info512,
ShuffleMask.info512>,
- avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info512,
+ avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info512,
ShuffleMask.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info128,
ShuffleMask.info128>,
- avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info128,
+ avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info128,
ShuffleMask.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info256,
ShuffleMask.info256>,
- avx512_perm_t_mb<opc, OpcodeStr, VTInfo.info256,
+ avx512_perm_t_mb<opc, OpcodeStr, itins, VTInfo.info256,
ShuffleMask.info256>, EVEX_V256;
}
}
-multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr,
+multiclass avx512_perm_t_sizes_bw<bits<8> opc, string OpcodeStr, OpndItins itins,
AVX512VLVectorVTInfo VTInfo,
AVX512VLVectorVTInfo Idx,
Predicate Prd> {
let Predicates = [Prd] in
- defm NAME: avx512_perm_t<opc, OpcodeStr, VTInfo.info512,
+ defm NAME: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info512,
Idx.info512>, EVEX_V512;
let Predicates = [Prd, HasVLX] in {
- defm NAME#128: avx512_perm_t<opc, OpcodeStr, VTInfo.info128,
+ defm NAME#128: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info128,
Idx.info128>, EVEX_V128;
- defm NAME#256: avx512_perm_t<opc, OpcodeStr, VTInfo.info256,
+ defm NAME#256: avx512_perm_t<opc, OpcodeStr, itins, VTInfo.info256,
Idx.info256>, EVEX_V256;
}
}
-defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d",
+defm VPERMT2D : avx512_perm_t_sizes<0x7E, "vpermt2d", AVX512_PERM2_I,
avx512vl_i32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q",
+defm VPERMT2Q : avx512_perm_t_sizes<0x7E, "vpermt2q", AVX512_PERM2_I,
avx512vl_i64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w",
+defm VPERMT2W : avx512_perm_t_sizes_bw<0x7D, "vpermt2w", AVX512_PERM2_I,
avx512vl_i16_info, avx512vl_i16_info, HasBWI>,
VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b",
+defm VPERMT2B : avx512_perm_t_sizes_bw<0x7D, "vpermt2b", AVX512_PERM2_I,
avx512vl_i8_info, avx512vl_i8_info, HasVBMI>,
EVEX_CD8<8, CD8VF>;
-defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps",
+defm VPERMT2PS : avx512_perm_t_sizes<0x7F, "vpermt2ps", AVX512_PERM2_F,
avx512vl_f32_info, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
-defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd",
+defm VPERMT2PD : avx512_perm_t_sizes<0x7F, "vpermt2pd", AVX512_PERM2_F,
avx512vl_f64_info, avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
//===----------------------------------------------------------------------===//
// AVX-512 - BLEND using mask
//
-multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
+
+let Sched = WriteFVarBlend in
+def AVX512_BLENDM : OpndItins<
+ IIC_SSE_ALU_F32P_RR, IIC_SSE_ALU_F32P_RM
+>;
+
+let Sched = WriteVarBlend in
+def AVX512_PBLENDM : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
+multiclass avx512_blendmask<bits<8> opc, string OpcodeStr, OpndItins itins,
+ X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
def rr : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
- []>, EVEX_4V;
+ [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
def rrk : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_K;
+ [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
def rrkz : AVX5128I<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_KZ;
+ [], itins.rr>, EVEX_4V, EVEX_KZ, Sched<[itins.Sched]>;
let mayLoad = 1 in {
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst}|${dst}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ [], itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}}|${dst} {${mask}}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>;
+ [], itins.rm>, EVEX_4V, EVEX_K, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmkz : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, ${dst} {${mask}} {z}|${dst} {${mask}} {z}, $src1, $src2}"),
- []>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>;
+ [], itins.rm>, EVEX_4V, EVEX_KZ, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
}
-multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
-
+multiclass avx512_blendmask_rmb<bits<8> opc, string OpcodeStr, OpndItins itins,
+ X86VectorVTInfo _> {
let mayLoad = 1, hasSideEffects = 0 in {
def rmbk : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, "}"),
- []>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ [], itins.rm>, EVEX_4V, EVEX_K, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmb : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr,
"\t{${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, "}"),
- []>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ [], itins.rm>, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
-multiclass blendmask_dq <bits<8> opc, string OpcodeStr,
+multiclass blendmask_dq <bits<8> opc, string OpcodeStr, OpndItins itins,
AVX512VLVectorVTInfo VTInfo> {
- defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>,
- avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+ defm Z : avx512_blendmask <opc, OpcodeStr, itins, VTInfo.info512>,
+ avx512_blendmask_rmb <opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm Z256 : avx512_blendmask<opc, OpcodeStr, VTInfo.info256>,
- avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_blendmask<opc, OpcodeStr, VTInfo.info128>,
- avx512_blendmask_rmb <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ defm Z256 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info256>,
+ avx512_blendmask_rmb<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info128>,
+ avx512_blendmask_rmb<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
}
}
-multiclass blendmask_bw <bits<8> opc, string OpcodeStr,
+multiclass blendmask_bw <bits<8> opc, string OpcodeStr, OpndItins itins,
AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasBWI] in
- defm Z : avx512_blendmask <opc, OpcodeStr, VTInfo.info512>, EVEX_V512;
+ defm Z : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info512>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
- defm Z256 : avx512_blendmask <opc, OpcodeStr, VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_blendmask <opc, OpcodeStr, VTInfo.info128>, EVEX_V128;
+ defm Z256 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info256>, EVEX_V256;
+ defm Z128 : avx512_blendmask<opc, OpcodeStr, itins, VTInfo.info128>, EVEX_V128;
}
}
-defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", avx512vl_f32_info>;
-defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", avx512vl_f64_info>, VEX_W;
-defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", avx512vl_i32_info>;
-defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", avx512vl_i64_info>, VEX_W;
-defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", avx512vl_i8_info>;
-defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
+defm VBLENDMPS : blendmask_dq <0x65, "vblendmps", AVX512_BLENDM, avx512vl_f32_info>;
+defm VBLENDMPD : blendmask_dq <0x65, "vblendmpd", AVX512_BLENDM, avx512vl_f64_info>, VEX_W;
+defm VPBLENDMD : blendmask_dq <0x64, "vpblendmd", AVX512_PBLENDM, avx512vl_i32_info>;
+defm VPBLENDMQ : blendmask_dq <0x64, "vpblendmq", AVX512_PBLENDM, avx512vl_i64_info>, VEX_W;
+defm VPBLENDMB : blendmask_bw <0x66, "vpblendmb", AVX512_PBLENDM, avx512vl_i8_info>;
+defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", AVX512_PBLENDM, avx512vl_i16_info>, VEX_W;
//===----------------------------------------------------------------------===//
@@ -1580,8 +1910,8 @@ defm VPBLENDMW : blendmask_bw <0x66, "vpblendmw", avx512vl_i16_info>, VEX_W;
// avx512_cmp_scalar - AVX512 CMPSS and CMPSD
-multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>{
-
+multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd,
+ OpndItins itins> {
defm rr_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
@@ -1589,7 +1919,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- imm:$cc)>, EVEX_4V;
+ imm:$cc), itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -1597,7 +1927,8 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
- imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+ imm:$cc), itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
@@ -1607,28 +1938,31 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
(OpNodeRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc,
- (i32 FROUND_NO_EXC))>, EVEX_4V, EVEX_B;
+ (i32 FROUND_NO_EXC)), itins.rr>,
+ EVEX_4V, EVEX_B, Sched<[itins.Sched]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs VK1:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">, EVEX_4V;
+ "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rr>, EVEX_4V,
+ Sched<[itins.Sched]>;
let mayLoad = 1 in
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+ "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rm>,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rrb_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc">,
- EVEX_4V, EVEX_B;
+ "$cc, {sae}, $src2, $src1","$src1, $src2, {sae}, $cc", itins.rr>,
+ EVEX_4V, EVEX_B, Sched<[itins.Sched]>;
}// let isAsmParserOnly = 1, hasSideEffects = 0
let isCodeGenOnly = 1 in {
@@ -1640,7 +1974,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
imm:$cc))],
- IIC_SSE_ALU_F32S_RR>, EVEX_4V;
+ itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
@@ -1649,33 +1983,34 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
(_.ScalarLdFrag addr:$src2),
imm:$cc))],
- IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+ itins.rm>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
let Predicates = [HasAVX512] in {
let ExeDomain = SSEPackedSingle in
- defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>,
- AVX512XSIi8Base;
+ defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd,
+ SSE_ALU_F32S>, AVX512XSIi8Base;
let ExeDomain = SSEPackedDouble in
- defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>,
- AVX512XDIi8Base, VEX_W;
+ defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd,
+ SSE_ALU_F64S>, AVX512XDIi8Base, VEX_W;
}
multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, bit IsCommutable> {
+ OpndItins itins, X86VectorVTInfo _, bit IsCommutable> {
let isCommutable = IsCommutable in
def rr : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))],
- IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
def rm : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2)))))],
- IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
let isCommutable = IsCommutable in
def rrk : AVX512BI<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
@@ -1683,7 +2018,7 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$dst {${mask}}, $src1, $src2}"),
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))))],
- IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+ itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
def rmk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst {${mask}}|",
@@ -1692,19 +2027,19 @@ multiclass avx512_icmp_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert
(_.LdFrag addr:$src2))))))],
- IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+ itins.rm>, EVEX_4V, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, bit IsCommutable> :
- avx512_icmp_packed<opc, OpcodeStr, OpNode, _, IsCommutable> {
+ OpndItins itins, X86VectorVTInfo _, bit IsCommutable> :
+ avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, _, IsCommutable> {
def rmb : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2),
!strconcat(OpcodeStr, "\t{${src2}", _.BroadcastStr, ", $src1, $dst",
"|$dst, $src1, ${src2}", _.BroadcastStr, "}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2))))],
- IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+ itins.rm>, EVEX_4V, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmbk : AVX512BI<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2),
@@ -1715,285 +2050,95 @@ multiclass avx512_icmp_packed_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))],
- IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+ itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_icmp_packed_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, Predicate prd,
- bit IsCommutable = 0> {
+ OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+ Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info512,
+ defm Z : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info512,
IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info256,
+ defm Z256 : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info256,
IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, VTInfo.info128,
+ defm Z128 : avx512_icmp_packed<opc, OpcodeStr, OpNode, itins, VTInfo.info128,
IsCommutable>, EVEX_V128;
}
}
multiclass avx512_icmp_packed_rmb_vl<bits<8> opc, string OpcodeStr,
- SDNode OpNode, AVX512VLVectorVTInfo VTInfo,
- Predicate prd, bit IsCommutable = 0> {
+ SDNode OpNode, OpndItins itins,
+ AVX512VLVectorVTInfo VTInfo,
+ Predicate prd, bit IsCommutable = 0> {
let Predicates = [prd] in
- defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info512,
+ defm Z : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info512,
IsCommutable>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
+ defm Z256 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info256,
IsCommutable>, EVEX_V256;
- defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
+ defm Z128 : avx512_icmp_packed_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info128,
IsCommutable>, EVEX_V128;
}
}
+// FIXME: Is there a better scheduler itinerary for VPCMP?
defm VPCMPEQB : avx512_icmp_packed_vl<0x74, "vpcmpeqb", X86pcmpeqm,
- avx512vl_i8_info, HasBWI, 1>,
- EVEX_CD8<8, CD8VF>;
+ SSE_ALU_F32P, avx512vl_i8_info, HasBWI, 1>,
+ EVEX_CD8<8, CD8VF>, VEX_WIG;
defm VPCMPEQW : avx512_icmp_packed_vl<0x75, "vpcmpeqw", X86pcmpeqm,
- avx512vl_i16_info, HasBWI, 1>,
- EVEX_CD8<16, CD8VF>;
+ SSE_ALU_F32P, avx512vl_i16_info, HasBWI, 1>,
+ EVEX_CD8<16, CD8VF>, VEX_WIG;
defm VPCMPEQD : avx512_icmp_packed_rmb_vl<0x76, "vpcmpeqd", X86pcmpeqm,
- avx512vl_i32_info, HasAVX512, 1>,
+ SSE_ALU_F32P, avx512vl_i32_info, HasAVX512, 1>,
EVEX_CD8<32, CD8VF>;
defm VPCMPEQQ : avx512_icmp_packed_rmb_vl<0x29, "vpcmpeqq", X86pcmpeqm,
- avx512vl_i64_info, HasAVX512, 1>,
+ SSE_ALU_F32P, avx512vl_i64_info, HasAVX512, 1>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VPCMPGTB : avx512_icmp_packed_vl<0x64, "vpcmpgtb", X86pcmpgtm,
- avx512vl_i8_info, HasBWI>,
- EVEX_CD8<8, CD8VF>;
+ SSE_ALU_F32P, avx512vl_i8_info, HasBWI>,
+ EVEX_CD8<8, CD8VF>, VEX_WIG;
defm VPCMPGTW : avx512_icmp_packed_vl<0x65, "vpcmpgtw", X86pcmpgtm,
- avx512vl_i16_info, HasBWI>,
- EVEX_CD8<16, CD8VF>;
+ SSE_ALU_F32P, avx512vl_i16_info, HasBWI>,
+ EVEX_CD8<16, CD8VF>, VEX_WIG;
defm VPCMPGTD : avx512_icmp_packed_rmb_vl<0x66, "vpcmpgtd", X86pcmpgtm,
- avx512vl_i32_info, HasAVX512>,
+ SSE_ALU_F32P, avx512vl_i32_info, HasAVX512>,
EVEX_CD8<32, CD8VF>;
defm VPCMPGTQ : avx512_icmp_packed_rmb_vl<0x37, "vpcmpgtq", X86pcmpgtm,
- avx512vl_i64_info, HasAVX512>,
+ SSE_ALU_F32P, avx512vl_i64_info, HasAVX512>,
T8PD, VEX_W, EVEX_CD8<64, CD8VF>;
-
-multiclass avx512_icmp_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
- SDNode OpNode, string InstrStr,
- list<Predicate> Preds> {
-let Predicates = Preds in {
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rr) _.RC:$src1, _.RC:$src2),
- NewInf.KRC)>;
-
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))))),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rm) _.RC:$src1, addr:$src2),
- NewInf.KRC)>;
-
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)))),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrk) _.KRCWM:$mask,
- _.RC:$src1, _.RC:$src2),
- NewInf.KRC)>;
-
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (and (_.KVT _.KRCWM:$mask),
- (_.KVT (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert
- (_.LdFrag addr:$src2))))))),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmk) _.KRCWM:$mask,
- _.RC:$src1, addr:$src2),
- NewInf.KRC)>;
-}
-}
-
-multiclass avx512_icmp_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
- SDNode OpNode, string InstrStr,
- list<Predicate> Preds>
- : avx512_icmp_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> {
-let Predicates = Preds in {
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmb) _.RC:$src1, addr:$src2),
- NewInf.KRC)>;
-
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (and (_.KVT _.KRCWM:$mask),
- (_.KVT (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)))))),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbk) _.KRCWM:$mask,
- _.RC:$src1, addr:$src2),
- NewInf.KRC)>;
-}
-}
-
-// VPCMPEQB - i8
-defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpeqm,
- "VPCMPEQBZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQBZ128", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQBZ256", [HasBWI, HasVLX]>;
-
-// VPCMPEQW - i16
-defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpeqm,
- "VPCMPEQWZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpeqm,
- "VPCMPEQWZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQWZ128", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpeqm,
- "VPCMPEQWZ256", [HasBWI, HasVLX]>;
-defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQWZ256", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQWZ", [HasBWI]>;
-
-// VPCMPEQD - i32
-defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpeqm,
- "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpeqm,
- "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpeqm,
- "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQDZ128", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpeqm,
- "VPCMPEQDZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpeqm,
- "VPCMPEQDZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQDZ256", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpeqm,
- "VPCMPEQDZ", [HasAVX512]>;
-defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQDZ", [HasAVX512]>;
-
-// VPCMPEQQ - i64
-defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpeqm,
- "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpeqm,
- "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpeqm,
- "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpeqm,
- "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQQZ128", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpeqm,
- "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpeqm,
- "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpeqm,
- "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQQZ256", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpeqm,
- "VPCMPEQQZ", [HasAVX512]>;
-defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpeqm,
- "VPCMPEQQZ", [HasAVX512]>;
-defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpeqm,
- "VPCMPEQQZ", [HasAVX512]>;
-
-// VPCMPGTB - i8
-defm : avx512_icmp_packed_lowering<v16i8x_info, v32i1_info, X86pcmpgtm,
- "VPCMPGTBZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_packed_lowering<v16i8x_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTBZ128", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_packed_lowering<v32i8x_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTBZ256", [HasBWI, HasVLX]>;
-
-// VPCMPGTW - i16
-defm : avx512_icmp_packed_lowering<v8i16x_info, v16i1_info, X86pcmpgtm,
- "VPCMPGTWZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_packed_lowering<v8i16x_info, v32i1_info, X86pcmpgtm,
- "VPCMPGTWZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_packed_lowering<v8i16x_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTWZ128", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_packed_lowering<v16i16x_info, v32i1_info, X86pcmpgtm,
- "VPCMPGTWZ256", [HasBWI, HasVLX]>;
-defm : avx512_icmp_packed_lowering<v16i16x_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTWZ256", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_packed_lowering<v32i16_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTWZ", [HasBWI]>;
-
-// VPCMPGTD - i32
-defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v8i1_info, X86pcmpgtm,
- "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v16i1_info, X86pcmpgtm,
- "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v32i1_info, X86pcmpgtm,
- "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i32x_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTDZ128", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v16i1_info, X86pcmpgtm,
- "VPCMPGTDZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v32i1_info, X86pcmpgtm,
- "VPCMPGTDZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v8i32x_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTDZ256", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v32i1_info, X86pcmpgtm,
- "VPCMPGTDZ", [HasAVX512]>;
-defm : avx512_icmp_packed_rmb_lowering<v16i32_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTDZ", [HasAVX512]>;
-
-// VPCMPGTQ - i64
-defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v4i1_info, X86pcmpgtm,
- "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v8i1_info, X86pcmpgtm,
- "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v16i1_info, X86pcmpgtm,
- "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v32i1_info, X86pcmpgtm,
- "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v2i64x_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTQZ128", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v8i1_info, X86pcmpgtm,
- "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v16i1_info, X86pcmpgtm,
- "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v32i1_info, X86pcmpgtm,
- "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_packed_rmb_lowering<v4i64x_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTQZ256", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v16i1_info, X86pcmpgtm,
- "VPCMPGTQZ", [HasAVX512]>;
-defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v32i1_info, X86pcmpgtm,
- "VPCMPGTQZ", [HasAVX512]>;
-defm : avx512_icmp_packed_rmb_lowering<v8i64_info, v64i1_info, X86pcmpgtm,
- "VPCMPGTQZ", [HasAVX512]>;
+// Transform that swizzles the comparison immediate so that a compare whose
+// memory operand appears in the first source can be commuted and still match.
+def CommutePCMPCC : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue() & 0x7;
+ switch (Imm) {
+ default: llvm_unreachable("Unreachable!");
+ case 0x01: Imm = 0x06; break; // LT -> NLE
+ case 0x02: Imm = 0x05; break; // LE -> NLT
+ case 0x05: Imm = 0x02; break; // NLT -> LE
+ case 0x06: Imm = 0x01; break; // NLE -> LT
+ case 0x00: // EQ
+ case 0x03: // FALSE
+ case 0x04: // NE
+ case 0x07: // TRUE
+ break;
+ }
+ return getI8Imm(Imm, SDLoc(N));
+}]>;
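+// For example, (OpNode (load addr), $src1, 0x01 /*LT*/) can be selected by
+// swapping the operands and rewriting the immediate to 0x06 (NLE), which
+// keeps the load in the second, foldable operand position while preserving
+// the comparison's meaning.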
multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let isCommutable = 1 in
def rri : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, AVX512ICC:$cc),
@@ -2001,7 +2146,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
imm:$cc))],
- IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
def rmi : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, AVX512ICC:$cc),
!strconcat("vpcmp${cc}", Suffix,
@@ -2009,7 +2154,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc))],
- IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
let isCommutable = 1 in
def rrik : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
@@ -2020,7 +2165,7 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
[(set _.KRC:$dst, (and _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
imm:$cc)))],
- IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+ itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
def rmik : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
AVX512ICC:$cc),
@@ -2031,7 +2176,8 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
imm:$cc)))],
- IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+ itins.rm>, EVEX_4V, EVEX_K,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
@@ -2039,20 +2185,20 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
"$dst, $src1, $src2, $cc}"),
- [], IIC_SSE_ALU_F32P_RR>, EVEX_4V;
+ [], itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
let mayLoad = 1 in
def rmi_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix, "\t{$cc, $src2, $src1, $dst|",
"$dst, $src1, $src2, $cc}"),
- [], IIC_SSE_ALU_F32P_RM>, EVEX_4V;
+ [], itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rrik_alt : AVX512AIi8<opc, MRMSrcReg,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2,
u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, $src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2, $cc}"),
- [], IIC_SSE_ALU_F32P_RR>, EVEX_4V, EVEX_K;
+ [], itins.rr>, EVEX_4V, EVEX_K, Sched<[itins.Sched]>;
let mayLoad = 1 in
def rmik_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1, _.MemOp:$src2,
@@ -2060,13 +2206,25 @@ multiclass avx512_icmp_cc<bits<8> opc, string Suffix, SDNode OpNode,
!strconcat("vpcmp", Suffix,
"\t{$cc, $src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2, $cc}"),
- [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K;
+ [], itins.rm>, EVEX_4V, EVEX_K,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
+
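+  // If the load feeds the first operand of the compare, commute it into the
+  // second operand (where it can be folded) and swizzle the immediate.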
+ def : Pat<(OpNode (bitconvert (_.LdFrag addr:$src2)),
+ (_.VT _.RC:$src1), imm:$cc),
+ (!cast<Instruction>(NAME#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
+ (CommutePCMPCC imm:$cc))>;
+
+ def : Pat<(and _.KRCWM:$mask, (OpNode (bitconvert (_.LdFrag addr:$src2)),
+ (_.VT _.RC:$src1), imm:$cc)),
+ (!cast<Instruction>(NAME#_.ZSuffix#"rmik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ (CommutePCMPCC imm:$cc))>;
}
multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
- X86VectorVTInfo _> :
- avx512_icmp_cc<opc, Suffix, OpNode, _> {
+ OpndItins itins, X86VectorVTInfo _> :
+ avx512_icmp_cc<opc, Suffix, OpNode, itins, _> {
def rmib : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.RC:$src1, _.ScalarMemOp:$src2,
AVX512ICC:$cc),
@@ -2076,7 +2234,8 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
[(set _.KRC:$dst, (OpNode (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2)),
imm:$cc))],
- IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+ itins.rm>, EVEX_4V, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmibk : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, AVX512ICC:$cc),
@@ -2087,7 +2246,8 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(X86VBroadcast (_.ScalarLdFrag addr:$src2)),
imm:$cc)))],
- IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+ itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0, mayLoad = 1 in {
@@ -2097,302 +2257,98 @@ multiclass avx512_icmp_cc_rmb<bits<8> opc, string Suffix, SDNode OpNode,
!strconcat("vpcmp", Suffix,
"\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst|",
"$dst, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
- [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_B;
+ [], itins.rm>, EVEX_4V, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmibk_alt : AVX512AIi8<opc, MRMSrcMem,
(outs _.KRC:$dst), (ins _.KRCWM:$mask, _.RC:$src1,
_.ScalarMemOp:$src2, u8imm:$cc),
!strconcat("vpcmp", Suffix,
"\t{$cc, ${src2}", _.BroadcastStr, ", $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, ${src2}", _.BroadcastStr, ", $cc}"),
- [], IIC_SSE_ALU_F32P_RM>, EVEX_4V, EVEX_K, EVEX_B;
+ [], itins.rm>, EVEX_4V, EVEX_K, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
+
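+  // Likewise when a broadcast load feeds the first operand.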
+ def : Pat<(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1), imm:$cc),
+ (!cast<Instruction>(NAME#_.ZSuffix#"rmib") _.RC:$src1, addr:$src2,
+ (CommutePCMPCC imm:$cc))>;
+
+ def : Pat<(and _.KRCWM:$mask, (OpNode (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1), imm:$cc)),
+ (!cast<Instruction>(NAME#_.ZSuffix#"rmibk") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ (CommutePCMPCC imm:$cc))>;
}
multiclass avx512_icmp_cc_vl<bits<8> opc, string Suffix, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+ Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info512>, EVEX_V512;
+ defm Z : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info512>,
+ EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info256>, EVEX_V256;
- defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, VTInfo.info128>, EVEX_V128;
+ defm Z256 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info256>,
+ EVEX_V256;
+ defm Z128 : avx512_icmp_cc<opc, Suffix, OpNode, itins, VTInfo.info128>,
+ EVEX_V128;
}
}
multiclass avx512_icmp_cc_rmb_vl<bits<8> opc, string Suffix, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+ Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info512>,
+ defm Z : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info512>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info256>,
+ defm Z256 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info256>,
EVEX_V256;
- defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, VTInfo.info128>,
+ defm Z128 : avx512_icmp_cc_rmb<opc, Suffix, OpNode, itins, VTInfo.info128>,
EVEX_V128;
}
}
-defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, avx512vl_i8_info,
- HasBWI>, EVEX_CD8<8, CD8VF>;
-defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, avx512vl_i8_info,
- HasBWI>, EVEX_CD8<8, CD8VF>;
-
-defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, avx512vl_i16_info,
- HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
-defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, avx512vl_i16_info,
- HasBWI>, VEX_W, EVEX_CD8<16, CD8VF>;
-
-defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, avx512vl_i32_info,
- HasAVX512>, EVEX_CD8<32, CD8VF>;
-defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, avx512vl_i32_info,
- HasAVX512>, EVEX_CD8<32, CD8VF>;
-
-defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, avx512vl_i64_info,
- HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, avx512vl_i64_info,
- HasAVX512>, VEX_W, EVEX_CD8<64, CD8VF>;
-
-multiclass avx512_icmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
- SDNode OpNode, string InstrStr,
- list<Predicate> Preds> {
-let Predicates = Preds in {
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc)),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1,
- _.RC:$src2,
- imm:$cc),
- NewInf.KRC)>;
-
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- imm:$cc)),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1,
- addr:$src2,
- imm:$cc),
- NewInf.KRC)>;
-
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (and _.KRCWM:$mask,
- (OpNode (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc))),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrik) _.KRCWM:$mask,
- _.RC:$src1,
- _.RC:$src2,
- imm:$cc),
- NewInf.KRC)>;
-
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (and (_.KVT _.KRCWM:$mask),
- (_.KVT (OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert
- (_.LdFrag addr:$src2))),
- imm:$cc)))),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmik) _.KRCWM:$mask,
- _.RC:$src1,
- addr:$src2,
- imm:$cc),
- NewInf.KRC)>;
-}
-}
-
-multiclass avx512_icmp_cc_packed_rmb_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
- SDNode OpNode, string InstrStr,
- list<Predicate> Preds>
- : avx512_icmp_cc_packed_lowering<_, NewInf, OpNode, InstrStr, Preds> {
-let Predicates = Preds in {
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- imm:$cc)),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmib) _.RC:$src1,
- addr:$src2,
- imm:$cc),
- NewInf.KRC)>;
-
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (and (_.KVT _.KRCWM:$mask),
- (_.KVT (OpNode (_.VT _.RC:$src1),
- (X86VBroadcast
- (_.ScalarLdFrag addr:$src2)),
- imm:$cc)))),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmibk) _.KRCWM:$mask,
- _.RC:$src1,
- addr:$src2,
- imm:$cc),
- NewInf.KRC)>;
-}
-}
-
-// VPCMPB - i8
-defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpm,
- "VPCMPBZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpm,
- "VPCMPBZ128", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpm,
- "VPCMPBZ256", [HasBWI, HasVLX]>;
-
-// VPCMPW - i16
-defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpm,
- "VPCMPWZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpm,
- "VPCMPWZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpm,
- "VPCMPWZ128", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpm,
- "VPCMPWZ256", [HasBWI, HasVLX]>;
-defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpm,
- "VPCMPWZ256", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpm,
- "VPCMPWZ", [HasBWI]>;
-
-// VPCMPD - i32
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpm,
- "VPCMPDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpm,
- "VPCMPDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpm,
- "VPCMPDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpm,
- "VPCMPDZ128", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpm,
- "VPCMPDZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpm,
- "VPCMPDZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpm,
- "VPCMPDZ256", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpm,
- "VPCMPDZ", [HasAVX512]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpm,
- "VPCMPDZ", [HasAVX512]>;
-
-// VPCMPQ - i64
-defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpm,
- "VPCMPQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpm,
- "VPCMPQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpm,
- "VPCMPQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpm,
- "VPCMPQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpm,
- "VPCMPQZ128", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpm,
- "VPCMPQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpm,
- "VPCMPQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpm,
- "VPCMPQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpm,
- "VPCMPQZ256", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpm,
- "VPCMPQZ", [HasAVX512]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpm,
- "VPCMPQZ", [HasAVX512]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpm,
- "VPCMPQZ", [HasAVX512]>;
-
-// VPCMPUB - i8
-defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v32i1_info, X86cmpmu,
- "VPCMPUBZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_cc_packed_lowering<v16i8x_info, v64i1_info, X86cmpmu,
- "VPCMPUBZ128", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_lowering<v32i8x_info, v64i1_info, X86cmpmu,
- "VPCMPUBZ256", [HasBWI, HasVLX]>;
-
-// VPCMPUW - i16
-defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v16i1_info, X86cmpmu,
- "VPCMPUWZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v32i1_info, X86cmpmu,
- "VPCMPUWZ128", [HasBWI, HasVLX]>;
-defm : avx512_icmp_cc_packed_lowering<v8i16x_info, v64i1_info, X86cmpmu,
- "VPCMPUWZ128", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v32i1_info, X86cmpmu,
- "VPCMPUWZ256", [HasBWI, HasVLX]>;
-defm : avx512_icmp_cc_packed_lowering<v16i16x_info, v64i1_info, X86cmpmu,
- "VPCMPUWZ256", [HasBWI, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_lowering<v32i16_info, v64i1_info, X86cmpmu,
- "VPCMPUWZ", [HasBWI]>;
-
-// VPCMPUD - i32
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v8i1_info, X86cmpmu,
- "VPCMPUDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v16i1_info, X86cmpmu,
- "VPCMPUDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v32i1_info, X86cmpmu,
- "VPCMPUDZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i32x_info, v64i1_info, X86cmpmu,
- "VPCMPUDZ128", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v16i1_info, X86cmpmu,
- "VPCMPUDZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v32i1_info, X86cmpmu,
- "VPCMPUDZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i32x_info, v64i1_info, X86cmpmu,
- "VPCMPUDZ256", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v32i1_info, X86cmpmu,
- "VPCMPUDZ", [HasAVX512]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v16i32_info, v64i1_info, X86cmpmu,
- "VPCMPUDZ", [HasAVX512]>;
-
-// VPCMPUQ - i64
-defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v4i1_info, X86cmpmu,
- "VPCMPUQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v8i1_info, X86cmpmu,
- "VPCMPUQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v16i1_info, X86cmpmu,
- "VPCMPUQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v32i1_info, X86cmpmu,
- "VPCMPUQZ128", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v2i64x_info, v64i1_info, X86cmpmu,
- "VPCMPUQZ128", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v8i1_info, X86cmpmu,
- "VPCMPUQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v16i1_info, X86cmpmu,
- "VPCMPUQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v32i1_info, X86cmpmu,
- "VPCMPUQZ256", [HasAVX512, HasVLX]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v4i64x_info, v64i1_info, X86cmpmu,
- "VPCMPUQZ256", [HasAVX512, HasVLX]>;
-
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v16i1_info, X86cmpmu,
- "VPCMPUQZ", [HasAVX512]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v32i1_info, X86cmpmu,
- "VPCMPUQZ", [HasAVX512]>;
-defm : avx512_icmp_cc_packed_rmb_lowering<v8i64_info, v64i1_info, X86cmpmu,
- "VPCMPUQZ", [HasAVX512]>;
-
-multiclass avx512_vcmp_common<X86VectorVTInfo _> {
+// FIXME: Is there a better scheduler itinerary for VPCMP/VPCMPU?
+defm VPCMPB : avx512_icmp_cc_vl<0x3F, "b", X86cmpm, SSE_ALU_F32P,
+ avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
+defm VPCMPUB : avx512_icmp_cc_vl<0x3E, "ub", X86cmpmu, SSE_ALU_F32P,
+ avx512vl_i8_info, HasBWI>, EVEX_CD8<8, CD8VF>;
+
+defm VPCMPW : avx512_icmp_cc_vl<0x3F, "w", X86cmpm, SSE_ALU_F32P,
+ avx512vl_i16_info, HasBWI>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+defm VPCMPUW : avx512_icmp_cc_vl<0x3E, "uw", X86cmpmu, SSE_ALU_F32P,
+ avx512vl_i16_info, HasBWI>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+
+defm VPCMPD : avx512_icmp_cc_rmb_vl<0x1F, "d", X86cmpm, SSE_ALU_F32P,
+ avx512vl_i32_info, HasAVX512>,
+ EVEX_CD8<32, CD8VF>;
+defm VPCMPUD : avx512_icmp_cc_rmb_vl<0x1E, "ud", X86cmpmu, SSE_ALU_F32P,
+ avx512vl_i32_info, HasAVX512>,
+ EVEX_CD8<32, CD8VF>;
+
+defm VPCMPQ : avx512_icmp_cc_rmb_vl<0x1F, "q", X86cmpm, SSE_ALU_F32P,
+ avx512vl_i64_info, HasAVX512>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+defm VPCMPUQ : avx512_icmp_cc_rmb_vl<0x1E, "uq", X86cmpmu, SSE_ALU_F32P,
+ avx512vl_i64_info, HasAVX512>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+
+multiclass avx512_vcmp_common<OpndItins itins, X86VectorVTInfo _> {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,AVXCC:$cc),
"vcmp${cc}"#_.Suffix,
"$src2, $src1", "$src1, $src2",
(X86cmpm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- imm:$cc), 1>;
+ imm:$cc), itins.rr, 1>,
+ Sched<[itins.Sched]>;
defm rmi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, AVXCC:$cc),
@@ -2400,7 +2356,8 @@ multiclass avx512_vcmp_common<X86VectorVTInfo _> {
"$src2, $src1", "$src1, $src2",
(X86cmpm (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
- imm:$cc)>;
+ imm:$cc), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -2410,33 +2367,63 @@ multiclass avx512_vcmp_common<X86VectorVTInfo _> {
"$src1, ${src2}"##_.BroadcastStr,
(X86cmpm (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- imm:$cc)>,EVEX_B;
+ imm:$cc), itins.rm>,
+ EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
// Accept explicit immediate argument form instead of comparison code.
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rri_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">;
+ "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rr>,
+ Sched<[itins.Sched]>;
let mayLoad = 1 in {
defm rmi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
- "$cc, $src2, $src1", "$src1, $src2, $cc">;
+ "$cc, $src2, $src1", "$src1, $src2, $cc", itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmbi_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $cc">,EVEX_B;
+ "$src1, ${src2}"##_.BroadcastStr##", $cc", itins.rm>,
+ EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
- }
+ }
+
+  // Patterns for selecting these compares when the load is in the first
+  // (non-foldable) operand.
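+  // Only comparison codes matched by CommutableCMPCC (the operand-symmetric
+  // ones, e.g. EQ/NE/ORD/UNORD) are handled here, since other FP predicates
+  // cannot simply have their operands swapped.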
+ def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1),
+ CommutableCMPCC:$cc),
+ (!cast<Instruction>(NAME#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
+ imm:$cc)>;
+
+ def : Pat<(and _.KRCWM:$mask, (X86cmpm (_.LdFrag addr:$src2),
+ (_.VT _.RC:$src1),
+ CommutableCMPCC:$cc)),
+ (!cast<Instruction>(NAME#_.ZSuffix#"rmik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ imm:$cc)>;
+
+ def : Pat<(X86cmpm (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1), CommutableCMPCC:$cc),
+ (!cast<Instruction>(NAME#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
+ imm:$cc)>;
+
+ def : Pat<(and _.KRCWM:$mask, (X86cmpm (X86VBroadcast
+ (_.ScalarLdFrag addr:$src2)),
+ (_.VT _.RC:$src1),
+ CommutableCMPCC:$cc)),
+ (!cast<Instruction>(NAME#_.ZSuffix#"rmbik") _.KRCWM:$mask,
+ _.RC:$src1, addr:$src2,
+ imm:$cc)>;
}
-multiclass avx512_vcmp_sae<X86VectorVTInfo _> {
+multiclass avx512_vcmp_sae<OpndItins itins, X86VectorVTInfo _> {
// Comparison code form (VCMP[EQ/LT/LE/...]).
defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, AVXCC:$cc),
@@ -2445,7 +2432,8 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> {
(X86cmpmRnd (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
imm:$cc,
- (i32 FROUND_NO_EXC))>, EVEX_B;
+ (i32 FROUND_NO_EXC)), itins.rr>,
+ EVEX_B, Sched<[itins.Sched]>;
let isAsmParserOnly = 1, hasSideEffects = 0 in {
defm rrib_alt : AVX512_maskable_cmp_alt<0xC2, MRMSrcReg, _,
@@ -2453,163 +2441,78 @@ multiclass avx512_vcmp_sae<X86VectorVTInfo _> {
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, {sae}, $src2, $src1",
- "$src1, $src2, {sae}, $cc">, EVEX_B;
+ "$src1, $src2, {sae}, $cc", itins.rr>,
+ EVEX_B, Sched<[itins.Sched]>;
}
}
-multiclass avx512_vcmp<AVX512VLVectorVTInfo _> {
+multiclass avx512_vcmp<OpndItins itins, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcmp_common<_.info512>,
- avx512_vcmp_sae<_.info512>, EVEX_V512;
+ defm Z : avx512_vcmp_common<itins, _.info512>,
+ avx512_vcmp_sae<itins, _.info512>, EVEX_V512;
}
let Predicates = [HasAVX512,HasVLX] in {
- defm Z128 : avx512_vcmp_common<_.info128>, EVEX_V128;
- defm Z256 : avx512_vcmp_common<_.info256>, EVEX_V256;
+ defm Z128 : avx512_vcmp_common<itins, _.info128>, EVEX_V128;
+ defm Z256 : avx512_vcmp_common<itins, _.info256>, EVEX_V256;
}
}
-defm VCMPPD : avx512_vcmp<avx512vl_f64_info>,
+defm VCMPPD : avx512_vcmp<SSE_ALU_F64P, avx512vl_f64_info>,
AVX512PDIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
-defm VCMPPS : avx512_vcmp<avx512vl_f32_info>,
+defm VCMPPS : avx512_vcmp<SSE_ALU_F32P, avx512vl_f32_info>,
AVX512PSIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-multiclass avx512_fcmp_cc_packed_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
- string InstrStr, list<Predicate> Preds> {
-let Predicates = Preds in {
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (X86cmpm (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc)),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rri) _.RC:$src1,
- _.RC:$src2,
- imm:$cc),
- NewInf.KRC)>;
-
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (X86cmpm (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))),
- imm:$cc)),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmi) _.RC:$src1,
- addr:$src2,
- imm:$cc),
- NewInf.KRC)>;
-
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (X86cmpm (_.VT _.RC:$src1),
- (X86VBroadcast (_.ScalarLdFrag addr:$src2)),
- imm:$cc)),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rmbi) _.RC:$src1,
- addr:$src2,
- imm:$cc),
- NewInf.KRC)>;
-}
-}
-
-multiclass avx512_fcmp_cc_packed_sae_lowering<X86VectorVTInfo _, X86KVectorVTInfo NewInf,
- string InstrStr, list<Predicate> Preds>
- : avx512_fcmp_cc_packed_lowering<_, NewInf, InstrStr, Preds> {
-
-let Predicates = Preds in
- def : Pat<(insert_subvector (NewInf.KVT immAllZerosV),
- (_.KVT (X86cmpmRnd (_.VT _.RC:$src1),
- (_.VT _.RC:$src2),
- imm:$cc,
- (i32 FROUND_NO_EXC))),
- (i64 0)),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr##rrib) _.RC:$src1,
- _.RC:$src2,
- imm:$cc),
- NewInf.KRC)>;
-}
-
-
-// VCMPPS - f32
-defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v8i1_info, "VCMPPSZ128",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v16i1_info, "VCMPPSZ128",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v32i1_info, "VCMPPSZ128",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v4f32x_info, v64i1_info, "VCMPPSZ128",
- [HasAVX512, HasVLX]>;
-
-defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v16i1_info, "VCMPPSZ256",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v32i1_info, "VCMPPSZ256",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v8f32x_info, v64i1_info, "VCMPPSZ256",
- [HasAVX512, HasVLX]>;
-
-defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v32i1_info, "VCMPPSZ",
- [HasAVX512]>;
-defm : avx512_fcmp_cc_packed_sae_lowering<v16f32_info, v64i1_info, "VCMPPSZ",
- [HasAVX512]>;
-
-// VCMPPD - f64
-defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v4i1_info, "VCMPPDZ128",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v8i1_info, "VCMPPDZ128",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v16i1_info, "VCMPPDZ128",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v32i1_info, "VCMPPDZ128",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v2f64x_info, v64i1_info, "VCMPPDZ128",
- [HasAVX512, HasVLX]>;
-
-defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v8i1_info, "VCMPPDZ256",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v16i1_info, "VCMPPDZ256",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v32i1_info, "VCMPPDZ256",
- [HasAVX512, HasVLX]>;
-defm : avx512_fcmp_cc_packed_lowering<v4f64x_info, v64i1_info, "VCMPPDZ256",
- [HasAVX512, HasVLX]>;
-
-defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v16i1_info, "VCMPPDZ",
- [HasAVX512]>;
-defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v32i1_info, "VCMPPDZ",
- [HasAVX512]>;
-defm : avx512_fcmp_cc_packed_sae_lowering<v8f64_info, v64i1_info, "VCMPPDZ",
- [HasAVX512]>;
+
+// Patterns to select scalar fp compares when the load is the first operand.
+let Predicates = [HasAVX512] in {
+ def : Pat<(v1i1 (X86cmpms (loadf64 addr:$src2), FR64X:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPSDZrm FR64X:$src1, addr:$src2, imm:$cc)>;
+
+ def : Pat<(v1i1 (X86cmpms (loadf32 addr:$src2), FR32X:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPSSZrm FR32X:$src1, addr:$src2, imm:$cc)>;
+}
// ----------------------------------------------------------------
// FPClass
// Handle the fpclass instruction: mask = op(reg_scalar, imm)
//                                        op(mem_scalar, imm)
multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, Predicate prd> {
- let Predicates = [prd] in {
- def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),//_.KRC:$dst),
+ OpndItins itins, X86VectorVTInfo _,
+ Predicate prd> {
+ let Predicates = [prd], ExeDomain = _.ExeDomain in {
+ def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2)))], NoItinerary>;
+ (i32 imm:$src2)))], itins.rr>,
+ Sched<[itins.Sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(or _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ (i32 imm:$src2))))], itins.rr>,
+ EVEX_K, Sched<[itins.Sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
- (ins _.MemOp:$src1, i32u8imm:$src2),
+ (ins _.IntScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,
- (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2)))], NoItinerary>;
+ (OpNode _.ScalarIntMemCPat:$src1,
+ (i32 imm:$src2)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
- (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+ (ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(or _.KRCWM:$mask,
- (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ (OpNode _.ScalarIntMemCPat:$src1,
+ (i32 imm:$src2))))], itins.rm>,
+ EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
@@ -2617,33 +2520,39 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
// fpclass(reg_vec, mem_vec, imm)
// fpclass(reg_vec, broadcast(eltVt), imm)
multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, string mem, string broadcast>{
+ OpndItins itins, X86VectorVTInfo _,
+ string mem, string broadcast>{
+ let ExeDomain = _.ExeDomain in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2)))], NoItinerary>;
+ (i32 imm:$src2)))], itins.rr>,
+ Sched<[itins.Sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(or _.KRCWM:$mask,
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ (i32 imm:$src2))))], itins.rr>,
+ EVEX_K, Sched<[itins.Sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##mem#
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2)))], NoItinerary>;
+ (i32 imm:$src2)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##mem#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst, (or _.KRCWM:$mask, (OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2))))], NoItinerary>, EVEX_K;
+ (i32 imm:$src2))))], itins.rm>,
+ EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
@@ -2652,7 +2561,8 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst,(OpNode
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2)))], NoItinerary>,EVEX_B;
+ (i32 imm:$src2)))], itins.rm>,
+ EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix##broadcast##"\t{$src2, ${src1}"##
@@ -2661,35 +2571,42 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set _.KRC:$dst,(or _.KRCWM:$mask, (OpNode
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2))))], NoItinerary>,
- EVEX_B, EVEX_K;
+ (i32 imm:$src2))))], itins.rm>,
+ EVEX_B, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
}
-multiclass avx512_vector_fpclass_all<string OpcodeStr,
- AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd,
- string broadcast>{
+multiclass avx512_vector_fpclass_all<string OpcodeStr, AVX512VLVectorVTInfo _,
+ bits<8> opc, SDNode OpNode,
+ OpndItins itins, Predicate prd,
+ string broadcast>{
let Predicates = [prd] in {
- defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info512, "{z}",
- broadcast>, EVEX_V512;
+ defm Z : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins,
+ _.info512, "{z}", broadcast>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info128, "{x}",
- broadcast>, EVEX_V128;
- defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, _.info256, "{y}",
- broadcast>, EVEX_V256;
+ defm Z128 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins,
+ _.info128, "{x}", broadcast>, EVEX_V128;
+ defm Z256 : avx512_vector_fpclass<opc, OpcodeStr, OpNode, itins,
+ _.info256, "{y}", broadcast>, EVEX_V256;
}
}
+// FIXME: Is there a better scheduler itinerary for VFPCLASS?
multiclass avx512_fp_fpclass_all<string OpcodeStr, bits<8> opcVec,
bits<8> opcScalar, SDNode VecOpNode, SDNode ScalarOpNode, Predicate prd>{
defm PS : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f32_info, opcVec,
- VecOpNode, prd, "{l}">, EVEX_CD8<32, CD8VF>;
+ VecOpNode, SSE_ALU_F32P, prd, "{l}">,
+ EVEX_CD8<32, CD8VF>;
defm PD : avx512_vector_fpclass_all<OpcodeStr, avx512vl_f64_info, opcVec,
- VecOpNode, prd, "{q}">,EVEX_CD8<64, CD8VF> , VEX_W;
+ VecOpNode, SSE_ALU_F64P, prd, "{q}">,
+ EVEX_CD8<64, CD8VF> , VEX_W;
defm SS : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
- f32x_info, prd>, EVEX_CD8<32, CD8VT1>;
+ SSE_ALU_F32S, f32x_info, prd>,
+ EVEX_CD8<32, CD8VT1>;
defm SD : avx512_scalar_fpclass<opcScalar, OpcodeStr, ScalarOpNode,
- f64x_info, prd>, EVEX_CD8<64, CD8VT1>, VEX_W;
+ SSE_ALU_F64S, f64x_info, prd>,
+ EVEX_CD8<64, CD8VT1>, VEX_W;
}
defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
@@ -2704,15 +2621,16 @@ defm VFPCLASS : avx512_fp_fpclass_all<"vfpclass", 0x66, 0x67, X86Vfpclass,
multiclass avx512_mask_mov<bits<8> opc_kk, bits<8> opc_km, bits<8> opc_mk,
string OpcodeStr, RegisterClass KRC,
ValueType vvt, X86MemOperand x86memop> {
- let hasSideEffects = 0 in
+ let hasSideEffects = 0, SchedRW = [WriteMove] in
def kk : I<opc_kk, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
+ IIC_SSE_MOVDQ>;
def km : I<opc_km, MRMSrcMem, (outs KRC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set KRC:$dst, (vvt (load addr:$src)))]>;
+ [(set KRC:$dst, (vvt (load addr:$src)))], IIC_SSE_MOVDQ>;
def mk : I<opc_mk, MRMDestMem, (outs), (ins x86memop:$dst, KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(store KRC:$src, addr:$dst)]>;
+ [(store KRC:$src, addr:$dst)], IIC_SSE_MOVDQ>;
}
multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
@@ -2720,9 +2638,11 @@ multiclass avx512_mask_mov_gpr<bits<8> opc_kr, bits<8> opc_rk,
RegisterClass KRC, RegisterClass GRC> {
let hasSideEffects = 0 in {
def kr : I<opc_kr, MRMSrcReg, (outs KRC:$dst), (ins GRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
def rk : I<opc_rk, MRMSrcReg, (outs GRC:$dst), (ins KRC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), []>;
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
+ IIC_SSE_MOVD_ToGP>, Sched<[WriteMove]>;
}
}
@@ -2848,17 +2768,11 @@ let Predicates = [HasAVX512] in {
def : Pat<(maskVT (scalar_to_vector GR32:$src)),
(COPY_TO_REGCLASS GR32:$src, maskRC)>;
- def : Pat<(i32 (X86Vextract maskRC:$src, (iPTR 0))),
+ def : Pat<(i32 (X86kextract maskRC:$src, (iPTR 0))),
(COPY_TO_REGCLASS maskRC:$src, GR32)>;
def : Pat<(maskVT (scalar_to_vector GR8:$src)),
(COPY_TO_REGCLASS (INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$src, sub_8bit), maskRC)>;
-
- def : Pat<(i8 (X86Vextract maskRC:$src, (iPTR 0))),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS maskRC:$src, GR32)), sub_8bit)>;
-
- def : Pat<(i32 (anyext (i8 (X86Vextract maskRC:$src, (iPTR 0))))),
- (COPY_TO_REGCLASS maskRC:$src, GR32)>;
}
defm : operation_gpr_mask_copy_lowering<VK1, v1i1>;
@@ -2888,26 +2802,27 @@ let Predicates = [HasAVX512] in {
// - KNOT
multiclass avx512_mask_unop<bits<8> opc, string OpcodeStr,
RegisterClass KRC, SDPatternOperator OpNode,
- Predicate prd> {
+ OpndItins itins, Predicate prd> {
let Predicates = [prd] in
def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set KRC:$dst, (OpNode KRC:$src))]>;
+ [(set KRC:$dst, (OpNode KRC:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
}
multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode> {
+ SDPatternOperator OpNode, OpndItins itins> {
defm B : avx512_mask_unop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- HasDQI>, VEX, PD;
+ itins, HasDQI>, VEX, PD;
defm W : avx512_mask_unop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- HasAVX512>, VEX, PS;
+ itins, HasAVX512>, VEX, PS;
defm D : avx512_mask_unop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- HasBWI>, VEX, PD, VEX_W;
+ itins, HasBWI>, VEX, PD, VEX_W;
defm Q : avx512_mask_unop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- HasBWI>, VEX, PS, VEX_W;
+ itins, HasBWI>, VEX, PS, VEX_W;
}
-defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>;
+defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot, SSE_BIT_ITINS_P>;
// KNL does not support KMOVB; an 8-bit mask is promoted to 16 bits.
let Predicates = [HasAVX512, NoDQI] in
@@ -2923,25 +2838,26 @@ def : Pat<(vnot VK2:$src),
// - KAND, KANDN, KOR, KXNOR, KXOR
multiclass avx512_mask_binop<bits<8> opc, string OpcodeStr,
RegisterClass KRC, SDPatternOperator OpNode,
- Predicate prd, bit IsCommutable> {
+ OpndItins itins, Predicate prd, bit IsCommutable> {
let Predicates = [prd], isCommutable = IsCommutable in
def rr : I<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))]>;
+ [(set KRC:$dst, (OpNode KRC:$src1, KRC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
}
multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, bit IsCommutable,
- Predicate prdW = HasAVX512> {
+ SDPatternOperator OpNode, OpndItins itins,
+ bit IsCommutable, Predicate prdW = HasAVX512> {
defm B : avx512_mask_binop<opc, !strconcat(OpcodeStr, "b"), VK8, OpNode,
- HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
+ itins, HasDQI, IsCommutable>, VEX_4V, VEX_L, PD;
defm W : avx512_mask_binop<opc, !strconcat(OpcodeStr, "w"), VK16, OpNode,
- prdW, IsCommutable>, VEX_4V, VEX_L, PS;
+ itins, prdW, IsCommutable>, VEX_4V, VEX_L, PS;
defm D : avx512_mask_binop<opc, !strconcat(OpcodeStr, "d"), VK32, OpNode,
- HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
+ itins, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PD;
defm Q : avx512_mask_binop<opc, !strconcat(OpcodeStr, "q"), VK64, OpNode,
- HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
+ itins, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
}
def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
@@ -2950,12 +2866,12 @@ def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
-defm KAND : avx512_mask_binop_all<0x41, "kand", and, 1>;
-defm KOR : avx512_mask_binop_all<0x45, "kor", or, 1>;
-defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, 1>;
-defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, 1>;
-defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>;
-defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, 1, HasDQI>;
+defm KAND : avx512_mask_binop_all<0x41, "kand", and, SSE_BIT_ITINS_P, 1>;
+defm KOR : avx512_mask_binop_all<0x45, "kor", or, SSE_BIT_ITINS_P, 1>;
+defm KXNOR : avx512_mask_binop_all<0x46, "kxnor", vxnor, SSE_BIT_ITINS_P, 1>;
+defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SSE_BIT_ITINS_P, 1>;
+defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SSE_BIT_ITINS_P, 0>;
+defm KADD : avx512_mask_binop_all<0x4A, "kadd", add, SSE_BIT_ITINS_P, 1, HasDQI>;
multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
Instruction Inst> {
@@ -2990,13 +2906,13 @@ defm : avx512_binop_pat<xor, xor, KXORWrr>;
// Mask unpacking
multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
- RegisterClass KRCSrc, Predicate prd> {
+ RegisterClass KRCSrc, OpndItins itins, Predicate prd> {
let Predicates = [prd] in {
let hasSideEffects = 0 in
def rr : I<0x4b, MRMSrcReg, (outs KRC:$dst),
(ins KRC:$src1, KRC:$src2),
- "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- VEX_4V, VEX_L;
+ "kunpck"#Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ itins.rr>, VEX_4V, VEX_L, Sched<[itins.Sched]>;
def : Pat<(VT (concat_vectors KRCSrc:$src1, KRCSrc:$src2)),
(!cast<Instruction>(NAME##rr)
@@ -3005,61 +2921,63 @@ multiclass avx512_mask_unpck<string Suffix,RegisterClass KRC, ValueType VT,
}
}
-defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, HasAVX512>, PD;
-defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, HasBWI>, PS;
-defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, HasBWI>, PS, VEX_W;
+defm KUNPCKBW : avx512_mask_unpck<"bw", VK16, v16i1, VK8, SSE_UNPCK, HasAVX512>, PD;
+defm KUNPCKWD : avx512_mask_unpck<"wd", VK32, v32i1, VK16, SSE_UNPCK, HasBWI>, PS;
+defm KUNPCKDQ : avx512_mask_unpck<"dq", VK64, v64i1, VK32, SSE_UNPCK, HasBWI>, PS, VEX_W;
// Mask bit testing
multiclass avx512_mask_testop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
- SDNode OpNode, Predicate prd> {
+ SDNode OpNode, OpndItins itins, Predicate prd> {
let Predicates = [prd], Defs = [EFLAGS] in
def rr : I<opc, MRMSrcReg, (outs), (ins KRC:$src1, KRC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
- [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))]>;
+ [(set EFLAGS, (OpNode KRC:$src1, KRC:$src2))], itins.rr>,
+ Sched<[itins.Sched]>;
}
multiclass avx512_mask_testop_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
- Predicate prdW = HasAVX512> {
- defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, HasDQI>,
+ OpndItins itins, Predicate prdW = HasAVX512> {
+ defm B : avx512_mask_testop<opc, OpcodeStr#"b", VK8, OpNode, itins, HasDQI>,
VEX, PD;
- defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, prdW>,
+ defm W : avx512_mask_testop<opc, OpcodeStr#"w", VK16, OpNode, itins, prdW>,
VEX, PS;
- defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, HasBWI>,
+ defm Q : avx512_mask_testop<opc, OpcodeStr#"q", VK64, OpNode, itins, HasBWI>,
VEX, PS, VEX_W;
- defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, HasBWI>,
+ defm D : avx512_mask_testop<opc, OpcodeStr#"d", VK32, OpNode, itins, HasBWI>,
VEX, PD, VEX_W;
}
-defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest>;
-defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, HasDQI>;
+defm KORTEST : avx512_mask_testop_w<0x98, "kortest", X86kortest, SSE_PTEST>;
+defm KTEST : avx512_mask_testop_w<0x99, "ktest", X86ktest, SSE_PTEST, HasDQI>;
// Mask shift
multiclass avx512_mask_shiftop<bits<8> opc, string OpcodeStr, RegisterClass KRC,
- SDNode OpNode> {
+ SDNode OpNode, OpndItins itins> {
let Predicates = [HasAVX512] in
def ri : Ii8<opc, MRMSrcReg, (outs KRC:$dst), (ins KRC:$src, u8imm:$imm),
!strconcat(OpcodeStr,
"\t{$imm, $src, $dst|$dst, $src, $imm}"),
- [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))]>;
+ [(set KRC:$dst, (OpNode KRC:$src, (i8 imm:$imm)))],
+ itins.rr>, Sched<[itins.Sched]>;
}
multiclass avx512_mask_shiftop_w<bits<8> opc1, bits<8> opc2, string OpcodeStr,
- SDNode OpNode> {
- defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode>,
- VEX, TAPD, VEX_W;
+ SDNode OpNode, OpndItins itins> {
+ defm W : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "w"), VK16, OpNode,
+ itins>, VEX, TAPD, VEX_W;
let Predicates = [HasDQI] in
- defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode>,
- VEX, TAPD;
+ defm B : avx512_mask_shiftop<opc1, !strconcat(OpcodeStr, "b"), VK8, OpNode,
+ itins>, VEX, TAPD;
let Predicates = [HasBWI] in {
- defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode>,
- VEX, TAPD, VEX_W;
- defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode>,
- VEX, TAPD;
+ defm Q : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "q"), VK64, OpNode,
+ itins>, VEX, TAPD, VEX_W;
+ defm D : avx512_mask_shiftop<opc2, !strconcat(OpcodeStr, "d"), VK32, OpNode,
+ itins>, VEX, TAPD;
}
}
-defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl>;
-defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr>;
+defm KSHIFTL : avx512_mask_shiftop_w<0x32, 0x33, "kshiftl", X86kshiftl, SSE_PSHUF>;
+defm KSHIFTR : avx512_mask_shiftop_w<0x30, 0x31, "kshiftr", X86kshiftr, SSE_PSHUF>;
multiclass axv512_icmp_packed_no_vlx_lowering<SDNode OpNode, string InstStr> {
def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
@@ -3067,23 +2985,14 @@ def : Pat<(v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))), VK8)>;
-def : Pat<(insert_subvector (v16i1 immAllZerosV),
- (v8i1 (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
- (i64 0)),
- (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrr)
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
- (i8 8)), (i8 8))>;
-
-def : Pat<(insert_subvector (v16i1 immAllZerosV),
- (v8i1 (and VK8:$mask,
- (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))),
- (i64 0)),
- (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrk)
- (COPY_TO_REGCLASS VK8:$mask, VK16),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
- (i8 8)), (i8 8))>;
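+// Without VLX, perform the masked compare on 512-bit registers: copy the VK8
+// mask to VK16, widen the 256-bit sources, and copy the low 8 bits of the
+// result back to VK8.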
+def : Pat<(v8i1 (and VK8:$mask,
+ (OpNode (v8i32 VR256X:$src1), (v8i32 VR256X:$src2)))),
+ (COPY_TO_REGCLASS
+ (!cast<Instruction>(InstStr##Zrrk)
+ (COPY_TO_REGCLASS VK8:$mask, VK16),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ VK8)>;
}
multiclass axv512_icmp_packed_cc_no_vlx_lowering<SDNode OpNode, string InstStr,
@@ -3094,25 +3003,13 @@ def : Pat<(v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2)
(_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
imm:$cc), VK8)>;
-def : Pat<(insert_subvector (v16i1 immAllZerosV),
- (v8i1 (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc)),
- (i64 0)),
- (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrri)
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc),
- (i8 8)), (i8 8))>;
-
-def : Pat<(insert_subvector (v16i1 immAllZerosV),
- (v8i1 (and VK8:$mask,
- (OpNode (_.info256.VT VR256X:$src1), (_.info256.VT VR256X:$src2), imm:$cc))),
- (i64 0)),
- (KSHIFTRWri (KSHIFTLWri (!cast<Instruction>(InstStr##Zrrik)
- (COPY_TO_REGCLASS VK8:$mask, VK16),
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
- imm:$cc),
- (i8 8)), (i8 8))>;
+def : Pat<(v8i1 (and VK8:$mask, (OpNode (_.info256.VT VR256X:$src1),
+ (_.info256.VT VR256X:$src2), imm:$cc))),
+ (COPY_TO_REGCLASS (!cast<Instruction>(InstStr##Zrrik)
+ (COPY_TO_REGCLASS VK8:$mask, VK16),
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+ (_.info512.VT (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)),
+ imm:$cc), VK8)>;
}
let Predicates = [HasAVX512, NoVLX] in {
@@ -3127,7 +3024,8 @@ let Predicates = [HasAVX512, NoVLX] in {
// Mask setting all 0s or 1s
multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
let Predicates = [HasAVX512] in
- let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1 in
+ let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1,
+ SchedRW = [WriteZero] in
def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
[(set KRC:$dst, (VT Val))]>;
}
@@ -3189,21 +3087,48 @@ defm : operation_subvector_mask_lowering<VK16, v16i1, VK64, v64i1>;
defm : operation_subvector_mask_lowering<VK32, v32i1, VK64, v64i1>;
-def : Pat<(v2i1 (extract_subvector (v4i1 VK4:$src), (iPTR 2))),
- (v2i1 (COPY_TO_REGCLASS
- (KSHIFTRWri (COPY_TO_REGCLASS VK4:$src, VK16), (i8 2)),
- VK2))>;
-def : Pat<(v4i1 (extract_subvector (v8i1 VK8:$src), (iPTR 4))),
- (v4i1 (COPY_TO_REGCLASS
- (KSHIFTRWri (COPY_TO_REGCLASS VK8:$src, VK16), (i8 4)),
- VK4))>;
-def : Pat<(v8i1 (extract_subvector (v16i1 VK16:$src), (iPTR 8))),
- (v8i1 (COPY_TO_REGCLASS (KSHIFTRWri VK16:$src, (i8 8)), VK8))>;
-def : Pat<(v16i1 (extract_subvector (v32i1 VK32:$src), (iPTR 16))),
- (v16i1 (COPY_TO_REGCLASS (KSHIFTRDri VK32:$src, (i8 16)), VK16))>;
-def : Pat<(v32i1 (extract_subvector (v64i1 VK64:$src), (iPTR 32))),
- (v32i1 (COPY_TO_REGCLASS (KSHIFTRQri VK64:$src, (i8 32)), VK32))>;
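+// Lower extract_subvector of a mask register by shifting the wide mask right
+// so that the requested slice lands in the low bits, then copying the result
+// to the narrower mask register class.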
+multiclass vextract_for_mask_to_mask<string InstrStr, X86KVectorVTInfo From,
+ X86KVectorVTInfo To, Predicate prd> {
+let Predicates = [prd] in
+ def :
+ Pat<(To.KVT(extract_subvector(From.KVT From.KRC:$src), (iPTR imm:$imm8))),
+ (To.KVT(COPY_TO_REGCLASS
+ (!cast<Instruction>(InstrStr#"ri") From.KVT:$src,
+ (i8 imm:$imm8)), To.KRC))>;
+}
+
+multiclass vextract_for_mask_to_mask_legal_w<X86KVectorVTInfo From,
+ X86KVectorVTInfo To> {
+def :
+ Pat<(To.KVT(extract_subvector(From.KVT From.KRC:$src), (iPTR imm:$imm8))),
+ (To.KVT(COPY_TO_REGCLASS
+ (KSHIFTRWri(COPY_TO_REGCLASS From.KRC:$src, VK16),
+ (i8 imm:$imm8)), To.KRC))>;
+}
+
+defm : vextract_for_mask_to_mask_legal_w<v2i1_info, v1i1_info>;
+defm : vextract_for_mask_to_mask_legal_w<v4i1_info, v1i1_info>;
+defm : vextract_for_mask_to_mask_legal_w<v8i1_info, v1i1_info>;
+defm : vextract_for_mask_to_mask_legal_w<v4i1_info, v2i1_info>;
+defm : vextract_for_mask_to_mask_legal_w<v8i1_info, v2i1_info>;
+defm : vextract_for_mask_to_mask_legal_w<v8i1_info, v4i1_info>;
+
+defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v1i1_info, HasAVX512>;
+defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v1i1_info, HasBWI>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v1i1_info, HasBWI>;
+defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v2i1_info, HasAVX512>;
+defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v2i1_info, HasBWI>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v2i1_info, HasBWI>;
+defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v4i1_info, HasAVX512>;
+defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v4i1_info, HasBWI>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v4i1_info, HasBWI>;
+defm : vextract_for_mask_to_mask<"KSHIFTRW", v16i1_info, v8i1_info, HasAVX512>;
+defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v8i1_info, HasBWI>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v8i1_info, HasBWI>;
+defm : vextract_for_mask_to_mask<"KSHIFTRD", v32i1_info, v16i1_info, HasBWI>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v16i1_info, HasBWI>;
+defm : vextract_for_mask_to_mask<"KSHIFTRQ", v64i1_info, v32i1_info, HasBWI>;
// Patterns for kmask shift
multiclass mask_shift_lowering<RegisterClass RC, ValueType VT> {
@@ -3227,39 +3152,40 @@ defm : mask_shift_lowering<VK2, v2i1>, Requires<[HasAVX512]>;
//
-multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- PatFrag ld_frag, PatFrag mload,
- SDPatternOperator SelectOprr = vselect> {
+multiclass avx512_load<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
+ X86VectorVTInfo _, PatFrag ld_frag, PatFrag mload,
+ bit NoRMPattern = 0,
+ SDPatternOperator SelectOprr = vselect> {
let hasSideEffects = 0 in {
def rr : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [],
- _.ExeDomain>, EVEX;
+ _.ExeDomain, itins.rr>, EVEX, Sched<[WriteMove]>;
def rrkz : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
"${dst} {${mask}} {z}, $src}"),
[(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
(_.VT _.RC:$src),
- _.ImmAllZerosV)))], _.ExeDomain>,
- EVEX, EVEX_KZ;
+ _.ImmAllZerosV)))], _.ExeDomain,
+ itins.rr>, EVEX, EVEX_KZ, Sched<[WriteMove]>;
- let canFoldAsLoad = 1, isReMaterializable = 1,
- SchedRW = [WriteLoad] in
+ let mayLoad = 1, canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst), (ins _.MemOp:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _.RC:$dst, (_.VT (bitconvert (ld_frag addr:$src))))],
- _.ExeDomain>, EVEX;
+ !if(NoRMPattern, [],
+ [(set _.RC:$dst,
+ (_.VT (bitconvert (ld_frag addr:$src))))]),
+ _.ExeDomain, itins.rm>, EVEX, Sched<[WriteLoad]>;
let Constraints = "$src0 = $dst", isConvertibleToThreeAddress = 1 in {
- def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
- (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
- !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
- "${dst} {${mask}}, $src1}"),
- [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
- (_.VT _.RC:$src1),
- (_.VT _.RC:$src0))))], _.ExeDomain>,
- EVEX, EVEX_K;
- let SchedRW = [WriteLoad] in
+ def rrk : AVX512PI<opc, MRMSrcReg, (outs _.RC:$dst),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1),
+ !strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src1}"),
+ [(set _.RC:$dst, (_.VT (SelectOprr _.KRCWM:$mask,
+ (_.VT _.RC:$src1),
+ (_.VT _.RC:$src0))))], _.ExeDomain,
+ itins.rr>, EVEX, EVEX_K, Sched<[WriteMove]>;
def rmk : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.MemOp:$src1),
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
@@ -3267,16 +3193,16 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
[(set _.RC:$dst, (_.VT
(vselect _.KRCWM:$mask,
(_.VT (bitconvert (ld_frag addr:$src1))),
- (_.VT _.RC:$src0))))], _.ExeDomain>, EVEX, EVEX_K;
+ (_.VT _.RC:$src0))))], _.ExeDomain, itins.rm>,
+ EVEX, EVEX_K, Sched<[WriteLoad]>;
}
- let SchedRW = [WriteLoad] in
def rmkz : AVX512PI<opc, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src),
OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
"${dst} {${mask}} {z}, $src}",
[(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
(_.VT (bitconvert (ld_frag addr:$src))), _.ImmAllZerosV)))],
- _.ExeDomain>, EVEX, EVEX_KZ;
+ _.ExeDomain, itins.rm>, EVEX, EVEX_KZ, Sched<[WriteLoad]>;
}
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
(!cast<Instruction>(NAME#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
@@ -3293,59 +3219,72 @@ multiclass avx512_alignedload_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _,
Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.AlignedLdFrag,
- masked_load_aligned512>, EVEX_V512;
+ defm Z : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info512,
+ _.info512.AlignedLdFrag, masked_load_aligned512>,
+ EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.AlignedLdFrag,
- masked_load_aligned256>, EVEX_V256;
- defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.AlignedLdFrag,
- masked_load_aligned128>, EVEX_V128;
+ defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info256,
+ _.info256.AlignedLdFrag, masked_load_aligned256>,
+ EVEX_V256;
+ defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVA, _.info128,
+ _.info128.AlignedLdFrag, masked_load_aligned128>,
+ EVEX_V128;
}
}
multiclass avx512_load_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _,
Predicate prd,
+ bit NoRMPattern = 0,
SDPatternOperator SelectOprr = vselect> {
let Predicates = [prd] in
- defm Z : avx512_load<opc, OpcodeStr, _.info512, _.info512.LdFrag,
- masked_load_unaligned, SelectOprr>, EVEX_V512;
+ defm Z : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info512, _.info512.LdFrag,
+ masked_load_unaligned, NoRMPattern,
+ SelectOprr>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_load<opc, OpcodeStr, _.info256, _.info256.LdFrag,
- masked_load_unaligned, SelectOprr>, EVEX_V256;
- defm Z128 : avx512_load<opc, OpcodeStr, _.info128, _.info128.LdFrag,
- masked_load_unaligned, SelectOprr>, EVEX_V128;
+ defm Z256 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info256, _.info256.LdFrag,
+ masked_load_unaligned, NoRMPattern,
+ SelectOprr>, EVEX_V256;
+ defm Z128 : avx512_load<opc, OpcodeStr, SSE_MOVU, _.info128, _.info128.LdFrag,
+ masked_load_unaligned, NoRMPattern,
+ SelectOprr>, EVEX_V128;
}
}
-multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- PatFrag st_frag, PatFrag mstore, string Name> {
-
+multiclass avx512_store<bits<8> opc, string OpcodeStr, MoveLoadStoreItins itins,
+ X86VectorVTInfo _, PatFrag st_frag, PatFrag mstore,
+ string Name, bit NoMRPattern = 0> {
let hasSideEffects = 0 in {
def rr_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst), (ins _.RC:$src),
OpcodeStr # ".s\t{$src, $dst|$dst, $src}",
- [], _.ExeDomain>, EVEX, FoldGenData<Name#rr>;
+ [], _.ExeDomain, itins.rr>, EVEX, FoldGenData<Name#rr>,
+ Sched<[WriteMove]>;
def rrk_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # ".s\t{$src, ${dst} {${mask}}|"#
"${dst} {${mask}}, $src}",
- [], _.ExeDomain>, EVEX, EVEX_K, FoldGenData<Name#rrk>;
+ [], _.ExeDomain, itins.rr>, EVEX, EVEX_K,
+ FoldGenData<Name#rrk>, Sched<[WriteMove]>;
def rrkz_REV : AVX512PI<opc, MRMDestReg, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.RC:$src),
OpcodeStr # ".s\t{$src, ${dst} {${mask}} {z}|" #
"${dst} {${mask}} {z}, $src}",
- [], _.ExeDomain>, EVEX, EVEX_KZ, FoldGenData<Name#rrkz>;
+ [], _.ExeDomain, itins.rr>, EVEX, EVEX_KZ,
+ FoldGenData<Name#rrkz>, Sched<[WriteMove]>;
}
+ let hasSideEffects = 0, mayStore = 1 in
def mr : AVX512PI<opc, MRMDestMem, (outs), (ins _.MemOp:$dst, _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(st_frag (_.VT _.RC:$src), addr:$dst)], _.ExeDomain>, EVEX;
+ !if(NoMRPattern, [],
+ [(st_frag (_.VT _.RC:$src), addr:$dst)]),
+ _.ExeDomain, itins.mr>, EVEX, Sched<[WriteStore]>;
def mrk : AVX512PI<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}",
- [], _.ExeDomain>, EVEX, EVEX_K;
+ [], _.ExeDomain, itins.mr>, EVEX, EVEX_K, Sched<[WriteStore]>;
def: Pat<(mstore addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src)),
(!cast<Instruction>(NAME#_.ZSuffix##mrk) addr:$ptr,
@@ -3355,16 +3294,18 @@ multiclass avx512_store<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
multiclass avx512_store_vl< bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,
- string Name> {
+ string Name, bit NoMRPattern = 0> {
let Predicates = [prd] in
- defm Z : avx512_store<opc, OpcodeStr, _.info512, store,
- masked_store_unaligned, Name#Z>, EVEX_V512;
+ defm Z : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info512, store,
+ masked_store_unaligned, Name#Z, NoMRPattern>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_store<opc, OpcodeStr, _.info256, store,
- masked_store_unaligned, Name#Z256>, EVEX_V256;
- defm Z128 : avx512_store<opc, OpcodeStr, _.info128, store,
- masked_store_unaligned, Name#Z128>, EVEX_V128;
+ defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info256, store,
+ masked_store_unaligned, Name#Z256,
+ NoMRPattern>, EVEX_V256;
+ defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVU, _.info128, store,
+ masked_store_unaligned, Name#Z128,
+ NoMRPattern>, EVEX_V128;
}
}
@@ -3372,13 +3313,13 @@ multiclass avx512_alignedstore_vl<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _, Predicate prd,
string Name> {
let Predicates = [prd] in
- defm Z : avx512_store<opc, OpcodeStr, _.info512, alignedstore512,
+ defm Z : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info512, alignedstore,
masked_store_aligned512, Name#Z>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_store<opc, OpcodeStr, _.info256, alignedstore256,
+ defm Z256 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info256, alignedstore,
masked_store_aligned256, Name#Z256>, EVEX_V256;
- defm Z128 : avx512_store<opc, OpcodeStr, _.info128, alignedstore,
+ defm Z128 : avx512_store<opc, OpcodeStr, SSE_MOVA, _.info128, alignedstore,
masked_store_aligned128, Name#Z128>, EVEX_V128;
}
}
@@ -3396,13 +3337,13 @@ defm VMOVAPD : avx512_alignedload_vl<0x28, "vmovapd", avx512vl_f64_info,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VMOVUPS : avx512_load_vl<0x10, "vmovups", avx512vl_f32_info, HasAVX512,
- null_frag>,
+ 0, null_frag>,
avx512_store_vl<0x11, "vmovups", avx512vl_f32_info, HasAVX512,
"VMOVUPS">,
PS, EVEX_CD8<32, CD8VF>;
defm VMOVUPD : avx512_load_vl<0x10, "vmovupd", avx512vl_f64_info, HasAVX512,
- null_frag>,
+ 0, null_frag>,
avx512_store_vl<0x11, "vmovupd", avx512vl_f64_info, HasAVX512,
"VMOVUPD">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
@@ -3419,24 +3360,24 @@ defm VMOVDQA64 : avx512_alignedload_vl<0x6F, "vmovdqa64", avx512vl_i64_info,
HasAVX512, "VMOVDQA64">,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI>,
+defm VMOVDQU8 : avx512_load_vl<0x6F, "vmovdqu8", avx512vl_i8_info, HasBWI, 1>,
avx512_store_vl<0x7F, "vmovdqu8", avx512vl_i8_info,
- HasBWI, "VMOVDQU8">,
+ HasBWI, "VMOVDQU8", 1>,
XD, EVEX_CD8<8, CD8VF>;
-defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI>,
+defm VMOVDQU16 : avx512_load_vl<0x6F, "vmovdqu16", avx512vl_i16_info, HasBWI, 1>,
avx512_store_vl<0x7F, "vmovdqu16", avx512vl_i16_info,
- HasBWI, "VMOVDQU16">,
+ HasBWI, "VMOVDQU16", 1>,
XD, VEX_W, EVEX_CD8<16, CD8VF>;
defm VMOVDQU32 : avx512_load_vl<0x6F, "vmovdqu32", avx512vl_i32_info, HasAVX512,
- null_frag>,
+ 0, null_frag>,
avx512_store_vl<0x7F, "vmovdqu32", avx512vl_i32_info,
HasAVX512, "VMOVDQU32">,
XS, EVEX_CD8<32, CD8VF>;
defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
- null_frag>,
+ 0, null_frag>,
avx512_store_vl<0x7F, "vmovdqu64", avx512vl_i64_info,
HasAVX512, "VMOVDQU64">,
XS, VEX_W, EVEX_CD8<64, CD8VF>;
@@ -3447,24 +3388,24 @@ defm VMOVDQU64 : avx512_load_vl<0x6F, "vmovdqu64", avx512vl_i64_info, HasAVX512,
let isReMaterializable = 1, canFoldAsLoad = 1,
isPseudo = 1, SchedRW = [WriteLoad], mayLoad = 1, hasSideEffects = 0 in {
def VMOVAPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
- "", []>;
+ "", [], IIC_SSE_MOVA_P_RM>;
def VMOVAPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
- "", []>;
+ "", [], IIC_SSE_MOVA_P_RM>;
def VMOVUPSZ128rm_NOVLX : I<0, Pseudo, (outs VR128X:$dst), (ins f128mem:$src),
- "", []>;
+ "", [], IIC_SSE_MOVA_P_RM>;
def VMOVUPSZ256rm_NOVLX : I<0, Pseudo, (outs VR256X:$dst), (ins f256mem:$src),
- "", []>;
+ "", [], IIC_SSE_MOVA_P_RM>;
}
-let isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+let isPseudo = 1, SchedRW = [WriteStore], mayStore = 1, hasSideEffects = 0 in {
def VMOVAPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
- "", []>;
+ "", [], IIC_SSE_MOVA_P_MR>;
def VMOVAPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
- "", []>;
+ "", [], IIC_SSE_MOVA_P_MR>;
def VMOVUPSZ128mr_NOVLX : I<0, Pseudo, (outs), (ins f128mem:$dst, VR128X:$src),
- "", []>;
+ "", [], IIC_SSE_MOVA_P_MR>;
def VMOVUPSZ256mr_NOVLX : I<0, Pseudo, (outs), (ins f256mem:$dst, VR256X:$src),
- "", []>;
+ "", [], IIC_SSE_MOVA_P_MR>;
}
def : Pat<(v8i64 (vselect VK8WM:$mask, (bc_v8i64 (v16i32 immAllZerosV)),
@@ -3511,8 +3452,20 @@ def : Pat<(v8i32 (vselect (v8i1 VK8WM:$mask), (v8i32 VR256X:$src1),
sub_ymm)>;
}
-let Predicates = [HasVLX, NoBWI] in {
- // 128-bit load/store without BWI.
+let Predicates = [HasAVX512] in {
+ // 512-bit store.
+ def : Pat<(alignedstore (v32i16 VR512:$src), addr:$dst),
+ (VMOVDQA32Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(alignedstore (v64i8 VR512:$src), addr:$dst),
+ (VMOVDQA32Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v32i16 VR512:$src), addr:$dst),
+ (VMOVDQU32Zmr addr:$dst, VR512:$src)>;
+ def : Pat<(store (v64i8 VR512:$src), addr:$dst),
+ (VMOVDQU32Zmr addr:$dst, VR512:$src)>;
+}
+
+let Predicates = [HasVLX] in {
+ // 128-bit store.
def : Pat<(alignedstore (v8i16 VR128X:$src), addr:$dst),
(VMOVDQA32Z128mr addr:$dst, VR128X:$src)>;
def : Pat<(alignedstore (v16i8 VR128X:$src), addr:$dst),
@@ -3522,10 +3475,10 @@ let Predicates = [HasVLX, NoBWI] in {
def : Pat<(store (v16i8 VR128X:$src), addr:$dst),
(VMOVDQU32Z128mr addr:$dst, VR128X:$src)>;
- // 256-bit load/store without BWI.
- def : Pat<(alignedstore256 (v16i16 VR256X:$src), addr:$dst),
+ // 256-bit store.
+ def : Pat<(alignedstore (v16i16 VR256X:$src), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
- def : Pat<(alignedstore256 (v32i8 VR256X:$src), addr:$dst),
+ def : Pat<(alignedstore (v32i8 VR256X:$src), addr:$dst),
(VMOVDQA32Z256mr addr:$dst, VR256X:$src)>;
def : Pat<(store (v16i16 VR256X:$src), addr:$dst),
(VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
@@ -3533,129 +3486,75 @@ let Predicates = [HasVLX, NoBWI] in {
(VMOVDQU32Z256mr addr:$dst, VR256X:$src)>;
}
-let Predicates = [HasVLX] in {
- // Special patterns for storing subvector extracts of lower 128-bits of 256.
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- def : Pat<(alignedstore (v2f64 (extract_subvector
- (v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v4f32 (extract_subvector
- (v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v2i64 (extract_subvector
- (v4i64 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v4i32 (extract_subvector
- (v8i32 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v8i16 (extract_subvector
- (v16i16 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v16i8 (extract_subvector
- (v32i8 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
-
- def : Pat<(store (v2f64 (extract_subvector
- (v4f64 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
- def : Pat<(store (v4f32 (extract_subvector
- (v8f32 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
- def : Pat<(store (v2i64 (extract_subvector
- (v4i64 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
- def : Pat<(store (v4i32 (extract_subvector
- (v8i32 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
- def : Pat<(store (v8i16 (extract_subvector
- (v16i16 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
- def : Pat<(store (v16i8 (extract_subvector
- (v32i8 VR256X:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256X:$src,sub_xmm)))>;
-
- // Special patterns for storing subvector extracts of lower 128-bits of 512.
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- def : Pat<(alignedstore (v2f64 (extract_subvector
- (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVAPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v4f32 (extract_subvector
- (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVAPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v2i64 (extract_subvector
- (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v4i32 (extract_subvector
- (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v8i16 (extract_subvector
- (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v16i8 (extract_subvector
- (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
-
- def : Pat<(store (v2f64 (extract_subvector
- (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVUPDZ128mr addr:$dst, (v2f64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
- def : Pat<(store (v4f32 (extract_subvector
- (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVUPSZ128mr addr:$dst, (v4f32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
- def : Pat<(store (v2i64 (extract_subvector
- (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU64Z128mr addr:$dst, (v2i64 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
- def : Pat<(store (v4i32 (extract_subvector
- (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU32Z128mr addr:$dst, (v4i32 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
- def : Pat<(store (v8i16 (extract_subvector
- (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU32Z128mr addr:$dst, (v8i16 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
- def : Pat<(store (v16i8 (extract_subvector
- (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU32Z128mr addr:$dst, (v16i8 (EXTRACT_SUBREG VR512:$src,sub_xmm)))>;
-
- // Special patterns for storing subvector extracts of lower 256-bits of 512.
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- def : Pat<(alignedstore256 (v4f64 (extract_subvector
- (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVAPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(alignedstore256 (v8f32 (extract_subvector
- (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVAPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(alignedstore256 (v4i64 (extract_subvector
- (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(alignedstore256 (v8i32 (extract_subvector
- (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(alignedstore256 (v16i16 (extract_subvector
- (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(alignedstore256 (v32i8 (extract_subvector
- (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQA32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
-
- def : Pat<(store (v4f64 (extract_subvector
- (v8f64 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVUPDZ256mr addr:$dst, (v4f64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(store (v8f32 (extract_subvector
- (v16f32 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVUPSZ256mr addr:$dst, (v8f32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(store (v4i64 (extract_subvector
- (v8i64 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU64Z256mr addr:$dst, (v4i64 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(store (v8i32 (extract_subvector
- (v16i32 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU32Z256mr addr:$dst, (v8i32 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(store (v16i16 (extract_subvector
- (v32i16 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU32Z256mr addr:$dst, (v16i16 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
- def : Pat<(store (v32i8 (extract_subvector
- (v64i8 VR512:$src), (iPTR 0))), addr:$dst),
- (VMOVDQU32Z256mr addr:$dst, (v32i8 (EXTRACT_SUBREG VR512:$src,sub_ymm)))>;
+multiclass masked_move_for_extract<string InstrStr, X86VectorVTInfo From,
+ X86VectorVTInfo To, X86VectorVTInfo Cast> {
+ def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
+ (bitconvert
+ (To.VT (extract_subvector
+ (From.VT From.RC:$src), (iPTR 0)))),
+ To.RC:$src0)),
+ (Cast.VT (!cast<Instruction>(InstrStr#"rrk")
+ Cast.RC:$src0, Cast.KRCWM:$mask,
+ (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx)))>;
+
+ def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
+ (bitconvert
+ (To.VT (extract_subvector
+ (From.VT From.RC:$src), (iPTR 0)))),
+ Cast.ImmAllZerosV)),
+ (Cast.VT (!cast<Instruction>(InstrStr#"rrkz")
+ Cast.KRCWM:$mask,
+ (EXTRACT_SUBREG From.RC:$src, To.SubRegIdx)))>;
}
+let Predicates = [HasVLX] in {
+// A masked extract from the first 128 bits of a 256-bit vector can be
+// implemented with a masked move (see the C sketch after this block).
+defm : masked_move_for_extract<"VMOVDQA64Z128", v4i64x_info, v2i64x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v8i32x_info, v4i32x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v16i16x_info, v8i16x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v32i8x_info, v16i8x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v4i64x_info, v2i64x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v8i32x_info, v4i32x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v16i16x_info, v8i16x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v32i8x_info, v16i8x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVAPDZ128", v4f64x_info, v2f64x_info, v2f64x_info>;
+defm : masked_move_for_extract<"VMOVAPDZ128", v8f32x_info, v4f32x_info, v2f64x_info>;
+defm : masked_move_for_extract<"VMOVAPSZ128", v4f64x_info, v2f64x_info, v4f32x_info>;
+defm : masked_move_for_extract<"VMOVAPSZ128", v8f32x_info, v4f32x_info, v4f32x_info>;
+
+// A masked extract from the first 128 bits of a 512-bit vector can be
+// implemented with a masked move.
+defm : masked_move_for_extract<"VMOVDQA64Z128", v8i64_info, v2i64x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v16i32_info, v4i32x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v32i16_info, v8i16x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z128", v64i8_info, v16i8x_info, v2i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v8i64_info, v2i64x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v16i32_info, v4i32x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v32i16_info, v8i16x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z128", v64i8_info, v16i8x_info, v4i32x_info>;
+defm : masked_move_for_extract<"VMOVAPDZ128", v8f64_info, v2f64x_info, v2f64x_info>;
+defm : masked_move_for_extract<"VMOVAPDZ128", v16f32_info, v4f32x_info, v2f64x_info>;
+defm : masked_move_for_extract<"VMOVAPSZ128", v8f64_info, v2f64x_info, v4f32x_info>;
+defm : masked_move_for_extract<"VMOVAPSZ128", v16f32_info, v4f32x_info, v4f32x_info>;
+
+// A masked extract from the first 256 bits of a 512-bit vector can be
+// implemented with a masked move.
+defm : masked_move_for_extract<"VMOVDQA64Z256", v8i64_info, v4i64x_info, v4i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z256", v16i32_info, v8i32x_info, v4i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z256", v32i16_info, v16i16x_info, v4i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA64Z256", v64i8_info, v32i8x_info, v4i64x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z256", v8i64_info, v4i64x_info, v8i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z256", v16i32_info, v8i32x_info, v8i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z256", v32i16_info, v16i16x_info, v8i32x_info>;
+defm : masked_move_for_extract<"VMOVDQA32Z256", v64i8_info, v32i8x_info, v8i32x_info>;
+defm : masked_move_for_extract<"VMOVAPDZ256", v8f64_info, v4f64x_info, v4f64x_info>;
+defm : masked_move_for_extract<"VMOVAPDZ256", v16f32_info, v8f32x_info, v4f64x_info>;
+defm : masked_move_for_extract<"VMOVAPSZ256", v8f64_info, v4f64x_info, v8f32x_info>;
+defm : masked_move_for_extract<"VMOVAPSZ256", v16f32_info, v8f32x_info, v8f32x_info>;
+}
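The masked_move_for_extract patterns above fold a masked blend of the low 128 (or 256) bits of a wider vector into a single merge-masked register move. A minimal C sketch of the same idea, assuming AVX-512VL intrinsics and an illustrative function name (not part of this patch):

#include <immintrin.h>

// vselect(k, extract_subvector(v256, 0), src0) on 32-bit elements becomes a
// single masked VMOVDQA32: the cast is free (sub_xmm subregister copy), and
// the merge with $src0 is done by the {k} masking of the move itself.
static __m128i masked_low128_i32(__m256i v256, __mmask8 k, __m128i src0) {
  __m128i lo = _mm256_castsi256_si128(v256); // extract_subvector ... (iPTR 0)
  return _mm_mask_mov_epi32(src0, k, lo);    // VMOVDQA32Z128rrk
}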
+
// Move Int Doubleword to Packed Double Int
//
let ExeDomain = SSEPackedInt in {
@@ -3663,22 +3562,22 @@ def VMOVDI2PDIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR32:$src
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v4i32 (scalar_to_vector GR32:$src)))], IIC_SSE_MOVDQ>,
- EVEX;
+ EVEX, Sched<[WriteMove]>;
def VMOVDI2PDIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst), (ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v4i32 (scalar_to_vector (loadi32 addr:$src))))],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteLoad]>;
def VMOV64toPQIZrr : AVX512BI<0x6E, MRMSrcReg, (outs VR128X:$dst), (ins GR64:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector GR64:$src)))],
- IIC_SSE_MOVDQ>, EVEX, VEX_W;
+ IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayLoad = 1 in
def VMOV64toPQIZrm : AVX512BI<0x6E, MRMSrcMem, (outs VR128X:$dst),
(ins i64mem:$src),
- "vmovq\t{$src, $dst|$dst, $src}", []>,
- EVEX, VEX_W, EVEX_CD8<64, CD8VT1>;
+ "vmovq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>,
+ EVEX, VEX_W, EVEX_CD8<64, CD8VT1>, Sched<[WriteLoad]>;
let isCodeGenOnly = 1 in {
def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src),
"vmovq\t{$src, $dst|$dst, $src}",
@@ -3687,7 +3586,7 @@ def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src)
def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteLoad]>;
def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (bitconvert FR64X:$src))],
@@ -3706,12 +3605,12 @@ let ExeDomain = SSEPackedInt, isCodeGenOnly = 1 in {
def VMOVDI2SSZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR32X:$dst), (ins GR32:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert GR32:$src))],
- IIC_SSE_MOVDQ>, EVEX;
+ IIC_SSE_MOVDQ>, EVEX, Sched<[WriteMove]>;
def VMOVDI2SSZrm : AVX512BI<0x6E, MRMSrcMem, (outs FR32X:$dst), (ins i32mem:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set FR32X:$dst, (bitconvert (loadi32 addr:$src)))],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteLoad]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move doubleword from xmm register to r/m32
@@ -3721,13 +3620,13 @@ def VMOVPDI2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst), (ins VR128X:$s
"vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (extractelt (v4i32 VR128X:$src),
(iPTR 0)))], IIC_SSE_MOVD_ToGP>,
- EVEX;
+ EVEX, Sched<[WriteMove]>;
def VMOVPDI2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, VR128X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(store (i32 (extractelt (v4i32 VR128X:$src),
(iPTR 0))), addr:$dst)], IIC_SSE_MOVDQ>,
- EVEX, EVEX_CD8<32, CD8VT1>;
+ EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt
// Move quadword from xmm1 register to r/m64
@@ -3737,13 +3636,13 @@ def VMOVPQIto64Zrr : I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (extractelt (v2i64 VR128X:$src),
(iPTR 0)))],
- IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
+ IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Sched<[WriteMove]>,
Requires<[HasAVX512, In64BitMode]>;
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
def VMOVPQIto64Zmr : I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128X:$src),
"vmovq\t{$src, $dst|$dst, $src}",
- [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W,
+ [], IIC_SSE_MOVD_ToGP>, PD, EVEX, VEX_W, Sched<[WriteStore]>,
Requires<[HasAVX512, In64BitMode]>;
def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
@@ -3757,8 +3656,8 @@ def VMOVPQI2QIZmr : I<0xD6, MRMDestMem, (outs),
let hasSideEffects = 0 in
def VMOVPQI2QIZrr : AVX512BI<0xD6, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src),
- "vmovq.s\t{$src, $dst|$dst, $src}",[]>,
- EVEX, VEX_W;
+ "vmovq.s\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVDQ>,
+ EVEX, VEX_W, Sched<[WriteMove]>;
} // ExeDomain = SSEPackedInt
// Move Scalar Single to Double Int
@@ -3768,12 +3667,12 @@ def VMOVSS2DIZrr : AVX512BI<0x7E, MRMDestReg, (outs GR32:$dst),
(ins FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (bitconvert FR32X:$src))],
- IIC_SSE_MOVD_ToGP>, EVEX;
+ IIC_SSE_MOVD_ToGP>, EVEX, Sched<[WriteMove]>;
def VMOVSS2DIZmr : AVX512BI<0x7E, MRMDestMem, (outs),
(ins i32mem:$dst, FR32X:$src),
"vmovd\t{$src, $dst|$dst, $src}",
[(store (i32 (bitconvert FR32X:$src)), addr:$dst)],
- IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>;
+ IIC_SSE_MOVDQ>, EVEX, EVEX_CD8<32, CD8VT1>, Sched<[WriteStore]>;
} // ExeDomain = SSEPackedInt, isCodeGenOnly = 1
// Move Quadword Int to Packed Quadword Int
@@ -3784,7 +3683,7 @@ def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
"vmovq\t{$src, $dst|$dst, $src}",
[(set VR128X:$dst,
(v2i64 (scalar_to_vector (loadi64 addr:$src))))]>,
- EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
+ EVEX, VEX_W, EVEX_CD8<8, CD8VT8>, Sched<[WriteLoad]>;
} // ExeDomain = SSEPackedInt
//===----------------------------------------------------------------------===//
@@ -3794,57 +3693,54 @@ def VMOVQI2PQIZrm : AVX512XSI<0x7E, MRMSrcMem, (outs VR128X:$dst),
multiclass avx512_move_scalar<string asm, SDNode OpNode,
X86VectorVTInfo _> {
def rr : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
- (ins _.RC:$src1, _.FRC:$src2),
+ (ins _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1,
- (scalar_to_vector _.FRC:$src2))))],
- _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
+ [(set _.RC:$dst, (_.VT (OpNode _.RC:$src1, _.RC:$src2)))],
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, Sched<[WriteMove]>;
def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
- (ins _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2),
+ (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
"$dst {${mask}} {z}, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
- (_.VT (OpNode _.RC:$src1,
- (scalar_to_vector _.FRC:$src2))),
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
_.ImmAllZerosV)))],
- _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ;
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ, Sched<[WriteMove]>;
let Constraints = "$src0 = $dst" in
def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
- (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2),
+ (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
!strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|",
"$dst {${mask}}, $src1, $src2}"),
[(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
- (_.VT (OpNode _.RC:$src1,
- (scalar_to_vector _.FRC:$src2))),
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
(_.VT _.RC:$src0))))],
- _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K;
+ _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K, Sched<[WriteMove]>;
let canFoldAsLoad = 1, isReMaterializable = 1 in
def rm : AVX512PI<0x10, MRMSrcMem, (outs _.FRC:$dst), (ins _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set _.FRC:$dst, (_.ScalarLdFrag addr:$src))],
- _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX;
+ _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, Sched<[WriteLoad]>;
let mayLoad = 1, hasSideEffects = 0 in {
let Constraints = "$src0 = $dst" in
def rmk : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
(ins _.RC:$src0, _.KRCWM:$mask, _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|",
"$dst {${mask}}, $src}"),
- [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K;
+ [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_K, Sched<[WriteLoad]>;
def rmkz : AVX512PI<0x10, MRMSrcMem, (outs _.RC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src),
!strconcat(asm, "\t{$src, $dst {${mask}} {z}|",
"$dst {${mask}} {z}, $src}"),
- [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ;
+ [], _.ExeDomain, IIC_SSE_MOV_S_RM>, EVEX, EVEX_KZ, Sched<[WriteLoad]>;
}
def mr: AVX512PI<0x11, MRMDestMem, (outs), (ins _.ScalarMemOp:$dst, _.FRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(store _.FRC:$src, addr:$dst)], _.ExeDomain, IIC_SSE_MOV_S_MR>,
- EVEX;
+ EVEX, Sched<[WriteStore]>;
let mayStore = 1, hasSideEffects = 0 in
def mrk: AVX512PI<0x11, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, VK1WM:$mask, _.FRC:$src),
!strconcat(asm, "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}"),
- [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K;
+ [], _.ExeDomain, IIC_SSE_MOV_S_MR>, EVEX, EVEX_K, Sched<[WriteStore]>;
}
defm VMOVSSZ : avx512_move_scalar<"vmovss", X86Movss, f32x_info>,
@@ -3862,21 +3758,21 @@ def : Pat<(_.VT (OpNode _.RC:$src0,
(_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
(_.EltVT _.FRC:$src1),
(_.EltVT _.FRC:$src2))))))),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk)
- (COPY_TO_REGCLASS _.FRC:$src2, _.RC),
- (COPY_TO_REGCLASS GR32:$mask, VK1WM),
- (_.VT _.RC:$src0), _.FRC:$src1),
- _.RC)>;
+ (!cast<Instruction>(InstrStr#rrk)
+ (COPY_TO_REGCLASS _.FRC:$src2, _.RC),
+ (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+ (_.VT _.RC:$src0),
+ (COPY_TO_REGCLASS _.FRC:$src1, _.RC))>;
def : Pat<(_.VT (OpNode _.RC:$src0,
(_.VT (scalar_to_vector
(_.EltVT (X86selects (scalar_to_vector (and (i8 (trunc GR32:$mask)), (i8 1))),
(_.EltVT _.FRC:$src1),
(_.EltVT ZeroFP))))))),
- (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz)
- (COPY_TO_REGCLASS GR32:$mask, VK1WM),
- (_.VT _.RC:$src0), _.FRC:$src1),
- _.RC)>;
+ (!cast<Instruction>(InstrStr#rrkz)
+ (COPY_TO_REGCLASS GR32:$mask, VK1WM),
+ (_.VT _.RC:$src0),
+ (COPY_TO_REGCLASS _.FRC:$src1, _.RC))>;
}
multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
@@ -3982,13 +3878,33 @@ defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
(v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
+def : Pat<(f32 (X86selects (scalar_to_vector (and GR8:$mask, (i8 1))),
+ (f32 FR32X:$src1), (f32 FR32X:$src2))),
+ (COPY_TO_REGCLASS
+ (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ GR8:$mask, sub_8bit)), VK1WM),
+ (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src1, VR128X)),
+ FR32X)>;
+
def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
(COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
- VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>;
+ VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
+
+def : Pat<(f64 (X86selects (scalar_to_vector (and GR8:$mask, (i8 1))),
+ (f64 FR64X:$src1), (f64 FR64X:$src2))),
+ (COPY_TO_REGCLASS
+ (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
+ (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF),
+ GR8:$mask, sub_8bit)), VK1WM),
+ (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)),
+ FR64X)>;
def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
(COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
- VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
+ VK1WM:$mask, (v2f64 (IMPLICIT_DEF)),
+ (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
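The scalar-select patterns above (both the GR8-mask and the VK1WM forms) lower an f32/f64 select into a merge-masked VMOVSS/VMOVSD whose pass-through operand carries the "false" value. A rough C equivalent, assuming the AVX-512F scalar mask-move intrinsic and an illustrative wrapper name:

#include <immintrin.h>

// The low lane of the result is (k & 1) ? a : b, mirroring
// (X86selects mask, src1, src2); only that lane is read back out, so the
// upper lanes (the IMPLICIT_DEF $src1 in the selected VMOVSSZrrk) don't matter.
static float select_f32(unsigned char k, float a, float b) {
  __m128 va = _mm_set_ss(a), vb = _mm_set_ss(b);
  __m128 r  = _mm_mask_move_ss(vb, (__mmask8)k, va, va); // VMOVSSZrrk-style merge
  return _mm_cvtss_f32(r);
}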
def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
(VMOVSSZmrk addr:$dst, (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM),
@@ -3996,63 +3912,60 @@ def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
let hasSideEffects = 0 in {
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
- (ins VR128X:$src1, FR32X:$src2),
+ (ins VR128X:$src1, VR128X:$src2),
"vmovss.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], NoItinerary>, XS, EVEX_4V, VEX_LIG,
- FoldGenData<"VMOVSSZrr">;
+ [], IIC_SSE_MOV_S_RR>, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrr">, Sched<[WriteMove]>;
let Constraints = "$src0 = $dst" in
def VMOVSSZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f32x_info.RC:$src0, f32x_info.KRCWM:$mask,
- VR128X:$src1, FR32X:$src2),
+ VR128X:$src1, VR128X:$src2),
"vmovss.s\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- [], NoItinerary>, EVEX_K, XS, EVEX_4V, VEX_LIG,
- FoldGenData<"VMOVSSZrrk">;
+ [], IIC_SSE_MOV_S_RR>, EVEX_K, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrk">, Sched<[WriteMove]>;
def VMOVSSZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
- (ins f32x_info.KRCWM:$mask, VR128X:$src1, FR32X:$src2),
+ (ins f32x_info.KRCWM:$mask, VR128X:$src1, VR128X:$src2),
"vmovss.s\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- [], NoItinerary>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
- FoldGenData<"VMOVSSZrrkz">;
+ [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XS, EVEX_4V, VEX_LIG,
+ FoldGenData<"VMOVSSZrrkz">, Sched<[WriteMove]>;
def VMOVSDZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
- (ins VR128X:$src1, FR64X:$src2),
+ (ins VR128X:$src1, VR128X:$src2),
"vmovsd.s\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], NoItinerary>, XD, EVEX_4V, VEX_LIG, VEX_W,
- FoldGenData<"VMOVSDZrr">;
+ [], IIC_SSE_MOV_S_RR>, XD, EVEX_4V, VEX_LIG, VEX_W,
+ FoldGenData<"VMOVSDZrr">, Sched<[WriteMove]>;
let Constraints = "$src0 = $dst" in
def VMOVSDZrrk_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f64x_info.RC:$src0, f64x_info.KRCWM:$mask,
- VR128X:$src1, FR64X:$src2),
+ VR128X:$src1, VR128X:$src2),
"vmovsd.s\t{$src2, $src1, $dst {${mask}}|"#
"$dst {${mask}}, $src1, $src2}",
- [], NoItinerary>, EVEX_K, XD, EVEX_4V, VEX_LIG,
- VEX_W, FoldGenData<"VMOVSDZrrk">;
+ [], IIC_SSE_MOV_S_RR>, EVEX_K, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrk">, Sched<[WriteMove]>;
def VMOVSDZrrkz_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins f64x_info.KRCWM:$mask, VR128X:$src1,
- FR64X:$src2),
+ VR128X:$src2),
"vmovsd.s\t{$src2, $src1, $dst {${mask}} {z}|"#
"$dst {${mask}} {z}, $src1, $src2}",
- [], NoItinerary>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
- VEX_W, FoldGenData<"VMOVSDZrrkz">;
+ [], IIC_SSE_MOV_S_RR>, EVEX_KZ, XD, EVEX_4V, VEX_LIG,
+ VEX_W, FoldGenData<"VMOVSDZrrkz">, Sched<[WriteMove]>;
}
let Predicates = [HasAVX512] in {
let AddedComplexity = 15 in {
- // Move scalar to XMM zero-extended, zeroing a VR128X then do a
- // MOVS{S,D} to the lower bits.
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32X:$src)))),
- (VMOVSSZrr (v4f32 (AVX512_128_SET0)), FR32X:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128X:$src))),
- (VMOVSSZrr (v4f32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ (VMOVSSZrr (v4f32 (AVX512_128_SET0)), VR128X:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128X:$src))),
- (VMOVSSZrr (v4i32 (AVX512_128_SET0)), (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
+ (VMOVSSZrr (v4i32 (AVX512_128_SET0)), VR128X:$src)>;
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64X:$src)))),
- (VMOVSDZrr (v2f64 (AVX512_128_SET0)), FR64X:$src)>;
+ (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
+ (COPY_TO_REGCLASS FR64X:$src, VR128))>;
}
// Move low f32 and clear high bits.
@@ -4130,14 +4043,6 @@ let Predicates = [HasAVX512] in {
def : Pat<(v8f64 (X86vzload addr:$src)),
(SUBREG_TO_REG (i32 0), (VMOVSDZrm addr:$src), sub_xmm)>;
}
- def : Pat<(v8f32 (X86vzmovl (insert_subvector undef,
- (v4f32 (scalar_to_vector FR32X:$src)), (iPTR 0)))),
- (SUBREG_TO_REG (i32 0), (v4f32 (VMOVSSZrr (v4f32 (AVX512_128_SET0)),
- FR32X:$src)), sub_xmm)>;
- def : Pat<(v4f64 (X86vzmovl (insert_subvector undef,
- (v2f64 (scalar_to_vector FR64X:$src)), (iPTR 0)))),
- (SUBREG_TO_REG (i64 0), (v2f64 (VMOVSDZrr (v2f64 (AVX512_128_SET0)),
- FR64X:$src)), sub_xmm)>;
def : Pat<(v4i64 (X86vzmovl (insert_subvector undef,
(v2i64 (scalar_to_vector (loadi64 addr:$src))), (iPTR 0)))),
(SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
@@ -4166,50 +4071,23 @@ let Predicates = [HasAVX512] in {
// Shuffle with VMOVSS
def : Pat<(v4i32 (X86Movss VR128X:$src1, VR128X:$src2)),
- (VMOVSSZrr (v4i32 VR128X:$src1),
- (COPY_TO_REGCLASS (v4i32 VR128X:$src2), FR32X))>;
- def : Pat<(v4f32 (X86Movss VR128X:$src1, VR128X:$src2)),
- (VMOVSSZrr (v4f32 VR128X:$src1),
- (COPY_TO_REGCLASS (v4f32 VR128X:$src2), FR32X))>;
-
- // 256-bit variants
- def : Pat<(v8i32 (X86Movss VR256X:$src1, VR256X:$src2)),
- (SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (EXTRACT_SUBREG (v8i32 VR256X:$src1), sub_xmm),
- (EXTRACT_SUBREG (v8i32 VR256X:$src2), sub_xmm)),
- sub_xmm)>;
- def : Pat<(v8f32 (X86Movss VR256X:$src1, VR256X:$src2)),
- (SUBREG_TO_REG (i32 0),
- (VMOVSSZrr (EXTRACT_SUBREG (v8f32 VR256X:$src1), sub_xmm),
- (EXTRACT_SUBREG (v8f32 VR256X:$src2), sub_xmm)),
- sub_xmm)>;
+ (VMOVSSZrr (v4i32 VR128X:$src1), VR128X:$src2)>;
+
+ def : Pat<(v4f32 (X86Movss VR128X:$src1, (scalar_to_vector FR32X:$src2))),
+ (VMOVSSZrr VR128X:$src1,
+ (COPY_TO_REGCLASS FR32X:$src2, VR128X))>;
// Shuffle with VMOVSD
def : Pat<(v2i64 (X86Movsd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
- def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
- // 256-bit variants
- def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
- (SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (EXTRACT_SUBREG (v4i64 VR256X:$src1), sub_xmm),
- (EXTRACT_SUBREG (v4i64 VR256X:$src2), sub_xmm)),
- sub_xmm)>;
- def : Pat<(v4f64 (X86Movsd VR256X:$src1, VR256X:$src2)),
- (SUBREG_TO_REG (i32 0),
- (VMOVSDZrr (EXTRACT_SUBREG (v4f64 VR256X:$src1), sub_xmm),
- (EXTRACT_SUBREG (v4f64 VR256X:$src2), sub_xmm)),
- sub_xmm)>;
+ def : Pat<(v2f64 (X86Movsd VR128X:$src1, (scalar_to_vector FR64X:$src2))),
+ (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS FR64X:$src2, VR128X))>;
def : Pat<(v2f64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
- def : Pat<(v2i64 (X86Movlpd VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
def : Pat<(v4f32 (X86Movlps VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
- def : Pat<(v4i32 (X86Movlps VR128X:$src1, VR128X:$src2)),
- (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
+ (VMOVSDZrr VR128X:$src1, VR128X:$src2)>;
}
let AddedComplexity = 15 in
@@ -4337,12 +4215,6 @@ let Predicates = [HasAVX512], AddedComplexity = 400 in {
(VMOVNTDQAZrm addr:$src)>;
def : Pat<(v8i64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZrm addr:$src)>;
- def : Pat<(v16i32 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
- (VMOVNTDQAZrm addr:$src)>;
- def : Pat<(v32i16 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
- (VMOVNTDQAZrm addr:$src)>;
- def : Pat<(v64i8 (bitconvert (v8i64 (alignednontemporalload addr:$src)))),
- (VMOVNTDQAZrm addr:$src)>;
}
let Predicates = [HasVLX], AddedComplexity = 400 in {
@@ -4359,12 +4231,6 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
(VMOVNTDQAZ256rm addr:$src)>;
def : Pat<(v4i64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZ256rm addr:$src)>;
- def : Pat<(v8i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
- (VMOVNTDQAZ256rm addr:$src)>;
- def : Pat<(v16i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
- (VMOVNTDQAZ256rm addr:$src)>;
- def : Pat<(v32i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
- (VMOVNTDQAZ256rm addr:$src)>;
def : Pat<(alignednontemporalstore (v4i32 VR128X:$src), addr:$dst),
(VMOVNTDQZ128mr addr:$dst, VR128X:$src)>;
@@ -4379,12 +4245,6 @@ let Predicates = [HasVLX], AddedComplexity = 400 in {
(VMOVNTDQAZ128rm addr:$src)>;
def : Pat<(v2i64 (alignednontemporalload addr:$src)),
(VMOVNTDQAZ128rm addr:$src)>;
- def : Pat<(v4i32 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
- (VMOVNTDQAZ128rm addr:$src)>;
- def : Pat<(v8i16 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
- (VMOVNTDQAZ128rm addr:$src)>;
- def : Pat<(v16i8 (bitconvert (v2i64 (alignednontemporalload addr:$src)))),
- (VMOVNTDQAZ128rm addr:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -4397,16 +4257,16 @@ multiclass avx512_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)),
- itins.rr, IsCommutable>,
- AVX512BIBase, EVEX_4V;
+ itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V,
+ Sched<[itins.Sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)))),
- itins.rm>,
- AVX512BIBase, EVEX_4V;
+ itins.rm>, AVX512BIBase, EVEX_4V,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4420,8 +4280,8 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode _.RC:$src1,
(X86VBroadcast
(_.ScalarLdFrag addr:$src2)))),
- itins.rm>,
- AVX512BIBase, EVEX_4V, EVEX_B;
+ itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_binop_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4473,14 +4333,16 @@ multiclass avx512_binop_rm_vl_w<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i16_info,
- itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>;
+ itins, prd, IsCommutable>, EVEX_CD8<16, CD8VF>,
+ VEX_WIG;
}
multiclass avx512_binop_rm_vl_b<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpndItins itins, Predicate prd,
bit IsCommutable = 0> {
defm NAME : avx512_binop_rm_vl<opc, OpcodeStr, OpNode, avx512vl_i8_info,
- itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>;
+ itins, prd, IsCommutable>, EVEX_CD8<8, CD8VF>,
+ VEX_WIG;
}
multiclass avx512_binop_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
@@ -4524,14 +4386,14 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
itins.rr, IsCommutable>,
- AVX512BIBase, EVEX_4V;
+ AVX512BIBase, EVEX_4V, Sched<[itins.Sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
(bitconvert (_Src.LdFrag addr:$src2)))),
- itins.rm>,
- AVX512BIBase, EVEX_4V;
+ itins.rm>, AVX512BIBase, EVEX_4V,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2),
@@ -4541,8 +4403,8 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr, OpndItins itins,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Brdct.VT (X86VBroadcast
(_Brdct.ScalarLdFrag addr:$src2)))))),
- itins.rm>,
- AVX512BIBase, EVEX_4V, EVEX_B;
+ itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
defm VPADD : avx512_binop_rm_vl_all<0xFC, 0xFD, 0xFE, 0xD4, "vpadd", add,
@@ -4603,7 +4465,8 @@ defm VPMULTISHIFTQB : avx512_binop_all<0x83, "vpmultishiftqb", SSE_INTALU_ITINS_
X86multishift, HasVBMI, 0>, T8PD;
multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _Src, X86VectorVTInfo _Dst> {
+ X86VectorVTInfo _Src, X86VectorVTInfo _Dst,
+ OpndItins itins> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
OpcodeStr,
@@ -4611,57 +4474,60 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Src.VT (X86VBroadcast
- (_Src.ScalarLdFrag addr:$src2))))))>,
- EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>;
+ (_Src.ScalarLdFrag addr:$src2)))))),
+ itins.rm>, EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_packs_rm<bits<8> opc, string OpcodeStr,
SDNode OpNode,X86VectorVTInfo _Src,
- X86VectorVTInfo _Dst, bit IsCommutable = 0> {
+ X86VectorVTInfo _Dst, OpndItins itins,
+ bit IsCommutable = 0> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1","$src1, $src2",
(_Dst.VT (OpNode
(_Src.VT _Src.RC:$src1),
(_Src.VT _Src.RC:$src2))),
- NoItinerary, IsCommutable>,
- EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V;
+ itins.rr, IsCommutable>,
+ EVEX_CD8<_Src.EltSize, CD8VF>, EVEX_4V, Sched<[itins.Sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1),
- (bitconvert (_Src.LdFrag addr:$src2))))>,
- EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>;
+ (bitconvert (_Src.LdFrag addr:$src2)))), itins.rm>,
+ EVEX_4V, EVEX_CD8<_Src.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_packs_all_i32_i16<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i32_info,
- v32i16_info>,
+ v32i16_info, SSE_PACK>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v16i32_info,
- v32i16_info>, EVEX_V512;
+ v32i16_info, SSE_PACK>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i32x_info,
- v16i16x_info>,
+ v16i16x_info, SSE_PACK>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v8i32x_info,
- v16i16x_info>, EVEX_V256;
+ v16i16x_info, SSE_PACK>, EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v4i32x_info,
- v8i16x_info>,
+ v8i16x_info, SSE_PACK>,
avx512_packs_rmb<opc, OpcodeStr, OpNode, v4i32x_info,
- v8i16x_info>, EVEX_V128;
+ v8i16x_info, SSE_PACK>, EVEX_V128;
}
}
multiclass avx512_packs_all_i16_i8<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, v32i16_info,
- v64i8_info>, EVEX_V512;
+ v64i8_info, SSE_PACK>, EVEX_V512, VEX_WIG;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, v16i16x_info,
- v32i8x_info>, EVEX_V256;
+ v32i8x_info, SSE_PACK>, EVEX_V256, VEX_WIG;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, v8i16x_info,
- v16i8x_info>, EVEX_V128;
+ v16i8x_info, SSE_PACK>, EVEX_V128, VEX_WIG;
}
}
@@ -4670,12 +4536,12 @@ multiclass avx512_vpmadd<bits<8> opc, string OpcodeStr,
AVX512VLVectorVTInfo _Dst, bit IsCommutable = 0> {
let Predicates = [HasBWI] in
defm NAME#Z : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info512,
- _Dst.info512, IsCommutable>, EVEX_V512;
+ _Dst.info512, SSE_PMADD, IsCommutable>, EVEX_V512;
let Predicates = [HasBWI, HasVLX] in {
defm NAME#Z256 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info256,
- _Dst.info256, IsCommutable>, EVEX_V256;
+ _Dst.info256, SSE_PMADD, IsCommutable>, EVEX_V256;
defm NAME#Z128 : avx512_packs_rm<opc, OpcodeStr, OpNode, _Src.info128,
- _Dst.info128, IsCommutable>, EVEX_V128;
+ _Dst.info128, SSE_PMADD, IsCommutable>, EVEX_V128;
}
}
@@ -4685,9 +4551,9 @@ defm VPACKSSWB : avx512_packs_all_i16_i8 <0x63, "vpacksswb", X86Packss>, AVX512B
defm VPACKUSWB : avx512_packs_all_i16_i8 <0x67, "vpackuswb", X86Packus>, AVX512BIBase;
defm VPMADDUBSW : avx512_vpmadd<0x04, "vpmaddubsw", X86vpmaddubsw,
- avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD;
+ avx512vl_i8_info, avx512vl_i16_info>, AVX512BIBase, T8PD, VEX_WIG;
defm VPMADDWD : avx512_vpmadd<0xF5, "vpmaddwd", X86vpmaddwd,
- avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase;
+ avx512vl_i16_info, avx512vl_i32_info, 1>, AVX512BIBase, VEX_WIG;
defm VPMAXSB : avx512_binop_rm_vl_b<0x3C, "vpmaxsb", smax,
SSE_INTALU_ITINS_P, HasBWI, 1>, T8PD;
@@ -4734,90 +4600,135 @@ let Predicates = [HasDQI, NoVLX] in {
sub_xmm)>;
}
+// PMULLQ: Use the 512-bit version to implement the 128/256-bit operation
+// when VLX is not available; see the C sketch after these patterns.
+let Predicates = [HasDQI, NoVLX] in {
+ def : Pat<(v4i64 (mul (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ sub_ymm)>;
+
+ def : Pat<(v2i64 (mul (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+ (EXTRACT_SUBREG
+ (VPMULLQZrr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+ sub_xmm)>;
+}
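A sketch in C intrinsics of what the NoVLX patterns above do for a 256-bit multiply (the min/max lowering multiclass below has the same shape); the function name and the explicit casts are illustrative assumptions, not part of the patch:

#include <immintrin.h>

// With AVX512DQ but no VLX, a v4i64 multiply is widened to 512 bits: the YMM
// inputs are placed in the low half of undef ZMM registers (INSERT_SUBREG),
// VPMULLQZrr does the work, and the low YMM is extracted back out
// (EXTRACT_SUBREG sub_ymm). Whatever lands in the upper lanes is ignored.
static __m256i mul_v4i64_no_vlx(__m256i a, __m256i b) {
  __m512i wa = _mm512_castsi256_si512(a);
  __m512i wb = _mm512_castsi256_si512(b);
  return _mm512_castsi512_si256(_mm512_mullo_epi64(wa, wb));
}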
+
+multiclass avx512_min_max_lowering<Instruction Instr, SDNode OpNode> {
+ def : Pat<(v4i64 (OpNode VR256X:$src1, VR256X:$src2)),
+ (EXTRACT_SUBREG
+ (Instr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src1, sub_ymm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src2, sub_ymm)),
+ sub_ymm)>;
+
+ def : Pat<(v2i64 (OpNode VR128X:$src1, VR128X:$src2)),
+ (EXTRACT_SUBREG
+ (Instr
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src1, sub_xmm),
+ (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src2, sub_xmm)),
+ sub_xmm)>;
+}
+
+let Predicates = [HasAVX512] in {
+ defm : avx512_min_max_lowering<VPMAXUQZrr, umax>;
+ defm : avx512_min_max_lowering<VPMINUQZrr, umin>;
+ defm : avx512_min_max_lowering<VPMAXSQZrr, smax>;
+ defm : avx512_min_max_lowering<VPMINSQZrr, smin>;
+}
+
//===----------------------------------------------------------------------===//
// AVX-512 Logical Instructions
//===----------------------------------------------------------------------===//
-multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, bit IsCommutable = 0> {
+// OpNodeMsk is the OpNode to use when element size is important. OpNode will
+// be set to null_frag for 32-bit elements.
+multiclass avx512_logic_rm<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ SDNode OpNodeMsk, OpndItins itins, X86VectorVTInfo _,
+ bit IsCommutable = 0> {
+ let hasSideEffects = 0 in
defm rr : AVX512_maskable_logic<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
(bitconvert (_.VT _.RC:$src2)))),
- (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
- _.RC:$src2)))),
- IIC_SSE_BIT_P_RR, IsCommutable>,
- AVX512BIBase, EVEX_4V;
+ (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
+ _.RC:$src2)))),
+ itins.rr, IsCommutable>, AVX512BIBase, EVEX_4V,
+ Sched<[itins.Sched]>;
+ let hasSideEffects = 0, mayLoad = 1 in
defm rm : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.i64VT (OpNode (bitconvert (_.VT _.RC:$src1)),
(bitconvert (_.LdFrag addr:$src2)))),
- (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
(bitconvert (_.LdFrag addr:$src2)))))),
- IIC_SSE_BIT_P_RM>,
- AVX512BIBase, EVEX_4V;
+ itins.rm>, AVX512BIBase, EVEX_4V,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, bit IsCommutable = 0> :
- avx512_logic_rm<opc, OpcodeStr, OpNode, _, IsCommutable> {
+// OpNodeMsk is the OpNode to use where the element size is important, so it
+// is used for all of the broadcast patterns.
+multiclass avx512_logic_rmb<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ SDNode OpNodeMsk, OpndItins itins, X86VectorVTInfo _,
+ bit IsCommutable = 0> :
+ avx512_logic_rm<opc, OpcodeStr, OpNode, OpNodeMsk, itins, _,
+ IsCommutable> {
defm rmb : AVX512_maskable_logic<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
- (_.i64VT (OpNode _.RC:$src1,
+ (_.i64VT (OpNodeMsk _.RC:$src1,
(bitconvert
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))),
- (_.VT (bitconvert (_.i64VT (OpNode _.RC:$src1,
+ (_.VT (bitconvert (_.i64VT (OpNodeMsk _.RC:$src1,
(bitconvert
(_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2)))))))),
- IIC_SSE_BIT_P_RM>,
- AVX512BIBase, EVEX_4V, EVEX_B;
+ itins.rm>, AVX512BIBase, EVEX_4V, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
-multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
+multiclass avx512_logic_rmb_vl<bits<8> opc, string OpcodeStr,
+ SDPatternOperator OpNode,
+ SDNode OpNodeMsk, OpndItins itins,
AVX512VLVectorVTInfo VTInfo,
bit IsCommutable = 0> {
let Predicates = [HasAVX512] in
- defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info512,
- IsCommutable>, EVEX_V512;
+ defm Z : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins,
+ VTInfo.info512, IsCommutable>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info256,
- IsCommutable>, EVEX_V256;
- defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, VTInfo.info128,
- IsCommutable>, EVEX_V128;
+ defm Z256 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins,
+ VTInfo.info256, IsCommutable>, EVEX_V256;
+ defm Z128 : avx512_logic_rmb<opc, OpcodeStr, OpNode, OpNodeMsk, itins,
+ VTInfo.info128, IsCommutable>, EVEX_V128;
}
}
-multiclass avx512_logic_rm_vl_d<bits<8> opc, string OpcodeStr, SDNode OpNode,
- bit IsCommutable = 0> {
- defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i32_info,
- IsCommutable>, EVEX_CD8<32, CD8VF>;
-}
-
-multiclass avx512_logic_rm_vl_q<bits<8> opc, string OpcodeStr, SDNode OpNode,
- bit IsCommutable = 0> {
- defm NAME : avx512_logic_rmb_vl<opc, OpcodeStr, OpNode, avx512vl_i64_info,
- IsCommutable>,
- VEX_W, EVEX_CD8<64, CD8VF>;
-}
-
multiclass avx512_logic_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
- SDNode OpNode, bit IsCommutable = 0> {
- defm Q : avx512_logic_rm_vl_q<opc_q, OpcodeStr#"q", OpNode, IsCommutable>;
- defm D : avx512_logic_rm_vl_d<opc_d, OpcodeStr#"d", OpNode, IsCommutable>;
+ SDNode OpNode, OpndItins itins,
+ bit IsCommutable = 0> {
+ defm Q : avx512_logic_rmb_vl<opc_q, OpcodeStr#"q", OpNode, OpNode, itins,
+ avx512vl_i64_info, IsCommutable>,
+ VEX_W, EVEX_CD8<64, CD8VF>;
+ defm D : avx512_logic_rmb_vl<opc_d, OpcodeStr#"d", null_frag, OpNode, itins,
+ avx512vl_i32_info, IsCommutable>,
+ EVEX_CD8<32, CD8VF>;
}
-defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, 1>;
-defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, 1>;
-defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, 1>;
-defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp>;
+defm VPAND : avx512_logic_rm_vl_dq<0xDB, 0xDB, "vpand", and, SSE_BIT_ITINS_P, 1>;
+defm VPOR : avx512_logic_rm_vl_dq<0xEB, 0xEB, "vpor", or, SSE_BIT_ITINS_P, 1>;
+defm VPXOR : avx512_logic_rm_vl_dq<0xEF, 0xEF, "vpxor", xor, SSE_BIT_ITINS_P, 1>;
+defm VPANDN : avx512_logic_rm_vl_dq<0xDF, 0xDF, "vpandn", X86andnp, SSE_BIT_ITINS_P>;
//===----------------------------------------------------------------------===//
// AVX-512 FP arithmetic
@@ -4831,7 +4742,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1, _.RC:$src2,
(i32 FROUND_CURRENT))),
- itins.rr>;
+ itins.rr>, Sched<[itins.Sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
@@ -4839,20 +4750,21 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(_.VT (VecNode _.RC:$src1,
_.ScalarIntMemCPat:$src2,
(i32 FROUND_CURRENT))),
- itins.rm>;
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr> {
+ itins.rr>, Sched<[itins.Sched]> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+ (_.ScalarLdFrag addr:$src2)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
}
@@ -4860,12 +4772,12 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode VecNode, OpndItins itins, bit IsCommutable = 0> {
let ExeDomain = _.ExeDomain in
- defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 imm:$rc)), itins.rr, IsCommutable>,
- EVEX_B, EVEX_RC;
+ EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
}
multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode, SDNode SaeNode,
@@ -4875,35 +4787,37 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1, _.RC:$src2)),
- itins.rr>;
+ itins.rr>, Sched<[itins.Sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
_.ScalarIntMemCPat:$src2)),
- itins.rm>;
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr> {
+ itins.rr>, Sched<[itins.Sched]> {
let isCommutable = IsCommutable;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+ (_.ScalarLdFrag addr:$src2)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
- defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC))>, EVEX_B;
+ (i32 FROUND_NO_EXC)), itins.rr>, EVEX_B,
+ Sched<[itins.Sched]>;
}
}
@@ -4950,14 +4864,15 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
- itins.rr> {
+ itins.rr>, Sched<[itins.Sched]> {
let isCommutable = 1;
}
def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.FRC:$dst, (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+ (_.ScalarLdFrag addr:$src2)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
@@ -4984,43 +4899,43 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2)), itins.rr,
- IsCommutable>, EVEX_4V;
+ IsCommutable>, EVEX_4V, Sched<[itins.Sched]>;
let mayLoad = 1 in {
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, (_.LdFrag addr:$src2)), itins.rm>,
- EVEX_4V;
+ EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2)))),
- itins.rm>, EVEX_4V, EVEX_B;
+ itins.rm>, EVEX_4V, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
}
multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
- defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc)))>,
- EVEX_4V, EVEX_B, EVEX_RC;
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 imm:$rc))), itins.rr>,
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
}
-
multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNodeRnd,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
- defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
- (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC)))>,
- EVEX_4V, EVEX_B;
+ (_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 FROUND_NO_EXC))), itins.rr>,
+ EVEX_4V, EVEX_B, Sched<[itins.Sched]>;
}
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
@@ -5052,36 +4967,38 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator Op
}
}
-multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
- defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
+multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+ SizeItins itins> {
+ defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, itins.s, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>,
+ defm PDZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, itins.d, v8f64_info>,
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
-multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd> {
- defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v16f32_info>,
+multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
+ SizeItins itins> {
+ defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, itins.s, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, v8f64_info>,
+ defm PDZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, itins.d, v8f64_info>,
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512,
SSE_ALU_ITINS_P, 1>,
- avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd>;
+ avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SSE_ALU_ITINS_P>;
defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512,
SSE_MUL_ITINS_P, 1>,
- avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd>;
+ avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SSE_MUL_ITINS_P>;
defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512, SSE_ALU_ITINS_P>,
- avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd>;
+ avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SSE_ALU_ITINS_P>;
defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512, SSE_DIV_ITINS_P>,
- avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd>;
+ avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SSE_DIV_ITINS_P>;
defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
SSE_ALU_ITINS_P, 0>,
- avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd>;
+ avx512_fp_binop_p_sae<0x5D, "vmin", X86fminRnd, SSE_ALU_ITINS_P>;
defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
SSE_ALU_ITINS_P, 0>,
- avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd>;
+ avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxRnd, SSE_ALU_ITINS_P>;
let isCodeGenOnly = 1 in {
defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
SSE_ALU_ITINS_P, 1>;
@@ -5202,65 +5119,69 @@ let Predicates = [HasVLX,HasDQI] in {
}
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>, EVEX_4V;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT))),
+ itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT))>, EVEX_4V;
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2), (i32 FROUND_CURRENT)),
+ itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>,
- EVEX_4V, EVEX_B;
+ (_.ScalarLdFrag addr:$src2))),
+ (i32 FROUND_CURRENT)), itins.rm>,
+ EVEX_4V, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT)))>;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2, (i32 FROUND_CURRENT))), itins.rr>,
+ Sched<[itins.Sched]>;
defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1,
- (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
- (i32 FROUND_CURRENT))>;
+ (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2,
+ (i32 FROUND_CURRENT)), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> {
- defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v16f32_info>,
- avx512_fp_round_packed<opc, OpcodeStr, OpNode, v16f32_info>,
+ defm PSZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v16f32_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v16f32_info>,
EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f64_info>,
- avx512_fp_round_packed<opc, OpcodeStr, OpNode, v8f64_info>,
+ defm PDZ : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v8f64_info>,
+ avx512_fp_round_packed<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
- defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f32x_info>,
+ defm SSZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, SSE_ALU_F32S, f32x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info, OpNodeScal, SSE_ALU_ITINS_S.s>,
EVEX_4V,EVEX_CD8<32, CD8VT1>;
- defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, f64x_info>,
+ defm SDZ128 : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, OpNodeScal, SSE_ALU_F64S, f64x_info>,
avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info, OpNodeScal, SSE_ALU_ITINS_S.d>,
EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f32x_info>,
+ defm PSZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v4f32x_info>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v8f32x_info>,
+ defm PSZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F32P, v8f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v2f64x_info>,
+ defm PDZ128 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v2f64x_info>,
EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, v4f64x_info>,
+ defm PDZ256 : avx512_fp_scalef_p<opc, OpcodeStr, OpNode, SSE_ALU_F64P, v4f64x_info>,
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
@@ -5271,31 +5192,35 @@ defm VSCALEF : avx512_fp_scalef_all<0x2C, 0x2D, "vscalef", X86scalef, X86scalefs
//===----------------------------------------------------------------------===//
multiclass avx512_vptest<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in {
let isCommutable = 1 in
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
- EVEX_4V;
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)), itins.rr>,
+ EVEX_4V, Sched<[itins.Sched]>;
defm rm : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- (_.VT (bitconvert (_.LdFrag addr:$src2))))>,
- EVEX_4V,
- EVEX_CD8<_.EltSize, CD8VF>;
+ (_.VT (bitconvert (_.LdFrag addr:$src2)))), itins.rm>,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
}
multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
+ let ExeDomain = _.ExeDomain in
defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr,
(OpNode (_.VT _.RC:$src1), (_.VT (X86VBroadcast
- (_.ScalarLdFrag addr:$src2))))>,
- EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ (_.ScalarLdFrag addr:$src2)))),
+ itins.rm>, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
// Use 512bit version to implement 128/256 bit in case NoVLX.
@@ -5312,16 +5237,17 @@ multiclass avx512_vptest_lowering<SDNode OpNode, X86VectorVTInfo ExtendInfo,
}
multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo _, string Suffix> {
+ OpndItins itins, AVX512VLVectorVTInfo _,
+ string Suffix> {
let Predicates = [HasAVX512] in
- defm Z : avx512_vptest<opc, OpcodeStr, OpNode, _.info512>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+ defm Z : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info512>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, _.info256>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
- defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, _.info128>,
- avx512_vptest_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ defm Z256 : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info256>,
+                 avx512_vptest_mb<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vptest<opc, OpcodeStr, OpNode, itins, _.info128>,
+ avx512_vptest_mb<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128;
}
let Predicates = [HasAVX512, NoVLX] in {
defm Z256_Alt : avx512_vptest_lowering< OpNode, _.info512, _.info256, Suffix>;
@@ -5329,30 +5255,31 @@ multiclass avx512_vptest_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode> {
- defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode,
+multiclass avx512_vptest_dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
+ defm D : avx512_vptest_dq_sizes<opc, OpcodeStr#"d", OpNode, itins,
avx512vl_i32_info, "D">;
- defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode,
+ defm Q : avx512_vptest_dq_sizes<opc, OpcodeStr#"q", OpNode, itins,
avx512vl_i64_info, "Q">, VEX_W;
}
multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
+ SDNode OpNode, OpndItins itins> {
let Predicates = [HasBWI] in {
- defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, v32i16_info>,
+ defm WZ: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v32i16_info>,
EVEX_V512, VEX_W;
- defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, v64i8_info>,
+ defm BZ: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v64i8_info>,
EVEX_V512;
}
let Predicates = [HasVLX, HasBWI] in {
- defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, v16i16x_info>,
+ defm WZ256: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v16i16x_info>,
EVEX_V256, VEX_W;
- defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, v8i16x_info>,
+ defm WZ128: avx512_vptest<opc, OpcodeStr#"w", OpNode, itins, v8i16x_info>,
EVEX_V128, VEX_W;
- defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, v32i8x_info>,
+ defm BZ256: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v32i8x_info>,
EVEX_V256;
- defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, v16i8x_info>,
+ defm BZ128: avx512_vptest<opc, OpcodeStr#"b", OpNode, itins, v16i8x_info>,
EVEX_V128;
}
@@ -5362,151 +5289,165 @@ multiclass avx512_vptest_wb<bits<8> opc, string OpcodeStr,
defm WZ256_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v16i16x_info, "W">;
defm WZ128_Alt : avx512_vptest_lowering< OpNode, v32i16_info, v8i16x_info, "W">;
}
-
}
multiclass avx512_vptest_all_forms<bits<8> opc_wb, bits<8> opc_dq, string OpcodeStr,
- SDNode OpNode> :
- avx512_vptest_wb <opc_wb, OpcodeStr, OpNode>,
- avx512_vptest_dq<opc_dq, OpcodeStr, OpNode>;
+ SDNode OpNode, OpndItins itins> :
+ avx512_vptest_wb <opc_wb, OpcodeStr, OpNode, itins>,
+ avx512_vptest_dq<opc_dq, OpcodeStr, OpNode, itins>;
-defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm>, T8PD;
-defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm>, T8XS;
+defm VPTESTM : avx512_vptest_all_forms<0x26, 0x27, "vptestm", X86testm,
+ SSE_BIT_ITINS_P>, T8PD;
+defm VPTESTNM : avx512_vptest_all_forms<0x26, 0x27, "vptestnm", X86testnm,
+ SSE_BIT_ITINS_P>, T8XS;
//===----------------------------------------------------------------------===//
// AVX-512 Shift instructions
//===----------------------------------------------------------------------===//
multiclass avx512_shift_rmi<bits<8> opc, Format ImmFormR, Format ImmFormM,
- string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+ string OpcodeStr, SDNode OpNode, OpndItins itins,
+ X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm ri : AVX512_maskable<opc, ImmFormR, _, (outs _.RC:$dst),
(ins _.RC:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))),
- SSE_INTSHIFT_ITINS_P.rr>;
+ itins.rr>, Sched<[itins.Sched]>;
defm mi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
(i8 imm:$src2))),
- SSE_INTSHIFT_ITINS_P.rm>;
+ itins.rm>, Sched<[itins.Sched.Folded]>;
}
}
multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
- string OpcodeStr, SDNode OpNode, X86VectorVTInfo _> {
+ string OpcodeStr, SDNode OpNode, OpndItins itins,
+ X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
"$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src1)), (i8 imm:$src2))),
- SSE_INTSHIFT_ITINS_P.rm>, EVEX_B;
+ itins.rm>, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_shift_rrm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType SrcVT, PatFrag bc_frag, X86VectorVTInfo _> {
+ OpndItins itins, ValueType SrcVT, PatFrag bc_frag,
+ X86VectorVTInfo _> {
// src2 is always 128-bit
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, VR128X:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (SrcVT VR128X:$src2))),
- SSE_INTSHIFT_ITINS_P.rr>, AVX512BIBase, EVEX_4V;
+ itins.rr>, AVX512BIBase, EVEX_4V, Sched<[itins.Sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, i128mem:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (bc_frag (loadv2i64 addr:$src2)))),
- SSE_INTSHIFT_ITINS_P.rm>, AVX512BIBase,
- EVEX_4V;
+ itins.rm>, AVX512BIBase,
+ EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- ValueType SrcVT, PatFrag bc_frag,
- AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ OpndItins itins, ValueType SrcVT, PatFrag bc_frag,
+ AVX512VLVectorVTInfo VTInfo, Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+ defm Z : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag,
VTInfo.info512>, EVEX_V512,
EVEX_CD8<VTInfo.info512.EltSize, CD8VQ> ;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+ defm Z256 : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag,
VTInfo.info256>, EVEX_V256,
EVEX_CD8<VTInfo.info256.EltSize, CD8VH>;
- defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, SrcVT, bc_frag,
+ defm Z128 : avx512_shift_rrm<opc, OpcodeStr, OpNode, itins, SrcVT, bc_frag,
VTInfo.info128>, EVEX_V128,
EVEX_CD8<VTInfo.info128.EltSize, CD8VF>;
}
}
multiclass avx512_shift_types<bits<8> opcd, bits<8> opcq, bits<8> opcw,
- string OpcodeStr, SDNode OpNode> {
- defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, v4i32, bc_v4i32,
- avx512vl_i32_info, HasAVX512>;
- defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, v2i64, bc_v2i64,
- avx512vl_i64_info, HasAVX512>, VEX_W;
- defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, v8i16, bc_v8i16,
- avx512vl_i16_info, HasBWI>;
+ string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
+ defm D : avx512_shift_sizes<opcd, OpcodeStr#"d", OpNode, itins, v4i32,
+ bc_v4i32, avx512vl_i32_info, HasAVX512>;
+ defm Q : avx512_shift_sizes<opcq, OpcodeStr#"q", OpNode, itins, v2i64,
+ bc_v2i64, avx512vl_i64_info, HasAVX512>, VEX_W;
+ defm W : avx512_shift_sizes<opcw, OpcodeStr#"w", OpNode, itins, v8i16,
+ bc_v2i64, avx512vl_i16_info, HasBWI>;
}
multiclass avx512_shift_rmi_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
- string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo> {
+ string OpcodeStr, SDNode OpNode,
+ OpndItins itins, AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasAVX512] in
- defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, itins,
VTInfo.info512>,
- avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins,
VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
+ defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode, itins,
VTInfo.info256>,
- avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins,
VTInfo.info256>, EVEX_V256;
defm Z128: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- VTInfo.info128>,
- avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
+ itins, VTInfo.info128>,
+ avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode, itins,
VTInfo.info128>, EVEX_V128;
}
}
multiclass avx512_shift_rmi_w<bits<8> opcw,
Format ImmFormR, Format ImmFormM,
- string OpcodeStr, SDNode OpNode> {
+ string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
let Predicates = [HasBWI] in
defm WZ: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- v32i16_info>, EVEX_V512;
+ itins, v32i16_info>, EVEX_V512, VEX_WIG;
let Predicates = [HasVLX, HasBWI] in {
defm WZ256: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- v16i16x_info>, EVEX_V256;
+ itins, v16i16x_info>, EVEX_V256, VEX_WIG;
defm WZ128: avx512_shift_rmi<opcw, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- v8i16x_info>, EVEX_V128;
+ itins, v8i16x_info>, EVEX_V128, VEX_WIG;
}
}
multiclass avx512_shift_rmi_dq<bits<8> opcd, bits<8> opcq,
Format ImmFormR, Format ImmFormM,
- string OpcodeStr, SDNode OpNode> {
+ string OpcodeStr, SDNode OpNode, OpndItins itins> {
defm D: avx512_shift_rmi_sizes<opcd, ImmFormR, ImmFormM, OpcodeStr#"d", OpNode,
- avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ itins, avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
defm Q: avx512_shift_rmi_sizes<opcq, ImmFormR, ImmFormM, OpcodeStr#"q", OpNode,
- avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
+ itins, avx512vl_i64_info>, EVEX_CD8<64, CD8VF>, VEX_W;
}
-defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli>,
- avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli>, AVX512BIi8Base, EVEX_4V;
+defm VPSRL : avx512_shift_rmi_dq<0x72, 0x73, MRM2r, MRM2m, "vpsrl", X86vsrli,
+ SSE_INTSHIFT_P>,
+ avx512_shift_rmi_w<0x71, MRM2r, MRM2m, "vpsrlw", X86vsrli,
+ SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
-defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli>,
- avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli>, AVX512BIi8Base, EVEX_4V;
+defm VPSLL : avx512_shift_rmi_dq<0x72, 0x73, MRM6r, MRM6m, "vpsll", X86vshli,
+ SSE_INTSHIFT_P>,
+ avx512_shift_rmi_w<0x71, MRM6r, MRM6m, "vpsllw", X86vshli,
+ SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
-defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai>,
- avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai>, AVX512BIi8Base, EVEX_4V;
+defm VPSRA : avx512_shift_rmi_dq<0x72, 0x72, MRM4r, MRM4m, "vpsra", X86vsrai,
+ SSE_INTSHIFT_P>,
+ avx512_shift_rmi_w<0x71, MRM4r, MRM4m, "vpsraw", X86vsrai,
+ SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
-defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri>, AVX512BIi8Base, EVEX_4V;
-defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli>, AVX512BIi8Base, EVEX_4V;
+defm VPROR : avx512_shift_rmi_dq<0x72, 0x72, MRM0r, MRM0m, "vpror", X86vrotri,
+ SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
+defm VPROL : avx512_shift_rmi_dq<0x72, 0x72, MRM1r, MRM1m, "vprol", X86vrotli,
+ SSE_INTSHIFT_P>, AVX512BIi8Base, EVEX_4V;
-defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>;
-defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>;
-defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
+defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl, SSE_INTSHIFT_P>;
+defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra, SSE_INTSHIFT_P>;
+defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl, SSE_INTSHIFT_P>;
// Use 512bit VPSRA/VPSRAI version to implement v2i64/v4i64 in case NoVLX.
let Predicates = [HasAVX512, NoVLX] in {
@@ -5539,25 +5480,27 @@ let Predicates = [HasAVX512, NoVLX] in {
// Variable Bit Shifts
//===-------------------------------------------------------------------===//
multiclass avx512_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, (_.VT _.RC:$src2))),
- SSE_INTSHIFT_ITINS_P.rr>, AVX5128IBase, EVEX_4V;
+ itins.rr>, AVX5128IBase, EVEX_4V,
+ Sched<[itins.Sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
(_.VT (bitconvert (_.LdFrag addr:$src2))))),
- SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_4V,
- EVEX_CD8<_.EltSize, CD8VF>;
+ itins.rm>, AVX5128IBase, EVEX_4V,
+ EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
@@ -5565,29 +5508,30 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src1, (_.VT (X86VBroadcast
(_.ScalarLdFrag addr:$src2))))),
- SSE_INTSHIFT_ITINS_P.rm>, AVX5128IBase, EVEX_B,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ itins.rm>, AVX5128IBase, EVEX_B,
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_var_shift_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo _> {
+ OpndItins itins, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
- defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256;
+ defm Z128 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info128>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128;
}
}
multiclass avx512_var_shift_types<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
- defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode,
+ SDNode OpNode, OpndItins itins> {
+ defm D : avx512_var_shift_sizes<opc, OpcodeStr#"d", OpNode, itins,
avx512vl_i32_info>;
- defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode,
+ defm Q : avx512_var_shift_sizes<opc, OpcodeStr#"q", OpNode, itins,
avx512vl_i64_info>, VEX_W;
}
@@ -5613,30 +5557,30 @@ multiclass avx512_var_shift_lowering<AVX512VLVectorVTInfo _, string OpcodeStr,
}
}
multiclass avx512_var_shift_w<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
+ SDNode OpNode, OpndItins itins> {
let Predicates = [HasBWI] in
- defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, v32i16_info>,
+ defm WZ: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v32i16_info>,
EVEX_V512, VEX_W;
let Predicates = [HasVLX, HasBWI] in {
- defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, v16i16x_info>,
+ defm WZ256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v16i16x_info>,
EVEX_V256, VEX_W;
- defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, v8i16x_info>,
+ defm WZ128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v8i16x_info>,
EVEX_V128, VEX_W;
}
}
-defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl>,
- avx512_var_shift_w<0x12, "vpsllvw", shl>;
+defm VPSLLV : avx512_var_shift_types<0x47, "vpsllv", shl, SSE_INTSHIFT_P>,
+ avx512_var_shift_w<0x12, "vpsllvw", shl, SSE_INTSHIFT_P>;
-defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra>,
- avx512_var_shift_w<0x11, "vpsravw", sra>;
+defm VPSRAV : avx512_var_shift_types<0x46, "vpsrav", sra, SSE_INTSHIFT_P>,
+ avx512_var_shift_w<0x11, "vpsravw", sra, SSE_INTSHIFT_P>;
-defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl>,
- avx512_var_shift_w<0x10, "vpsrlvw", srl>;
+defm VPSRLV : avx512_var_shift_types<0x45, "vpsrlv", srl, SSE_INTSHIFT_P>,
+ avx512_var_shift_w<0x10, "vpsrlvw", srl, SSE_INTSHIFT_P>;
-defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr>;
-defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl>;
+defm VPRORV : avx512_var_shift_types<0x14, "vprorv", rotr, SSE_INTSHIFT_P>;
+defm VPROLV : avx512_var_shift_types<0x15, "vprolv", rotl, SSE_INTSHIFT_P>;
defm : avx512_var_shift_lowering<avx512vl_i64_info, "VPSRAVQ", sra, [HasAVX512, NoVLX]>;
defm : avx512_var_shift_lowering<avx512vl_i16_info, "VPSLLVW", shl, [HasBWI, NoVLX]>;
@@ -5714,26 +5658,26 @@ let Predicates = [HasAVX512, NoVLX] in {
(EXTRACT_SUBREG (v8i64
(VPROLVQZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
sub_xmm)>;
def : Pat<(v4i64 (rotl (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPROLVQZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
sub_ymm)>;
def : Pat<(v4i32 (rotl (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
(EXTRACT_SUBREG (v16i32
(VPROLVDZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
sub_xmm)>;
def : Pat<(v8i32 (rotl (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(EXTRACT_SUBREG (v16i32
(VPROLVDZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
sub_ymm)>;
def : Pat<(v2i64 (X86vrotli (v2i64 VR128X:$src1), (i8 imm:$src2))),
@@ -5765,26 +5709,26 @@ let Predicates = [HasAVX512, NoVLX] in {
(EXTRACT_SUBREG (v8i64
(VPRORVQZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
sub_xmm)>;
def : Pat<(v4i64 (rotr (v4i64 VR256X:$src1), (v4i64 VR256X:$src2))),
(EXTRACT_SUBREG (v8i64
(VPRORVQZrr
(v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
sub_ymm)>;
def : Pat<(v4i32 (rotr (v4i32 VR128X:$src1), (v4i32 VR128X:$src2))),
(EXTRACT_SUBREG (v16i32
(VPRORVDZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
- (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm))),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src2, sub_xmm)))),
sub_xmm)>;
def : Pat<(v8i32 (rotr (v8i32 VR256X:$src1), (v8i32 VR256X:$src2))),
(EXTRACT_SUBREG (v16i32
(VPRORVDZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
- (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm))),
+ (v16i32 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src2, sub_ymm)))),
sub_ymm)>;
def : Pat<(v2i64 (X86vrotri (v2i64 VR128X:$src1), (i8 imm:$src2))),
@@ -5814,84 +5758,86 @@ let Predicates = [HasAVX512, NoVLX] in {
// 1-src variable permutation VPERMW/D/Q
//===-------------------------------------------------------------------===//
multiclass avx512_vperm_dq_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo _> {
+ OpndItins itins, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+ defm Z : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in
- defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
- avx512_var_shift_mb<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ defm Z256 : avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>,
+ avx512_var_shift_mb<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256;
}
multiclass avx512_vpermi_dq_sizes<bits<8> opc, Format ImmFormR, Format ImmFormM,
string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo> {
+ OpndItins itins, AVX512VLVectorVTInfo VTInfo> {
let Predicates = [HasAVX512] in
defm Z: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- VTInfo.info512>,
+ itins, VTInfo.info512>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
- VTInfo.info512>, EVEX_V512;
+ itins, VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in
defm Z256: avx512_shift_rmi<opc, ImmFormR, ImmFormM, OpcodeStr, OpNode,
- VTInfo.info256>,
+ itins, VTInfo.info256>,
avx512_shift_rmbi<opc, ImmFormM, OpcodeStr, OpNode,
- VTInfo.info256>, EVEX_V256;
+ itins, VTInfo.info256>, EVEX_V256;
}
multiclass avx512_vperm_bw<bits<8> opc, string OpcodeStr,
Predicate prd, SDNode OpNode,
- AVX512VLVectorVTInfo _> {
+ OpndItins itins, AVX512VLVectorVTInfo _> {
let Predicates = [prd] in
- defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, _.info512>,
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info512>,
EVEX_V512 ;
let Predicates = [HasVLX, prd] in {
- defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, _.info256>,
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info256>,
EVEX_V256 ;
- defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, _.info128>,
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, _.info128>,
EVEX_V128 ;
}
}
defm VPERMW : avx512_vperm_bw<0x8D, "vpermw", HasBWI, X86VPermv,
- avx512vl_i16_info>, VEX_W;
+ AVX2_PERMV_I, avx512vl_i16_info>, VEX_W;
defm VPERMB : avx512_vperm_bw<0x8D, "vpermb", HasVBMI, X86VPermv,
- avx512vl_i8_info>;
+ AVX2_PERMV_I, avx512vl_i8_info>;
defm VPERMD : avx512_vperm_dq_sizes<0x36, "vpermd", X86VPermv,
- avx512vl_i32_info>;
+ AVX2_PERMV_I, avx512vl_i32_info>;
defm VPERMQ : avx512_vperm_dq_sizes<0x36, "vpermq", X86VPermv,
- avx512vl_i64_info>, VEX_W;
+ AVX2_PERMV_I, avx512vl_i64_info>, VEX_W;
defm VPERMPS : avx512_vperm_dq_sizes<0x16, "vpermps", X86VPermv,
- avx512vl_f32_info>;
+ AVX2_PERMV_F, avx512vl_f32_info>;
defm VPERMPD : avx512_vperm_dq_sizes<0x16, "vpermpd", X86VPermv,
- avx512vl_f64_info>, VEX_W;
+ AVX2_PERMV_F, avx512vl_f64_info>, VEX_W;
defm VPERMQ : avx512_vpermi_dq_sizes<0x00, MRMSrcReg, MRMSrcMem, "vpermq",
- X86VPermi, avx512vl_i64_info>,
+ X86VPermi, AVX2_PERMV_I, avx512vl_i64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
defm VPERMPD : avx512_vpermi_dq_sizes<0x01, MRMSrcReg, MRMSrcMem, "vpermpd",
- X86VPermi, avx512vl_f64_info>,
+ X86VPermi, AVX2_PERMV_F, avx512vl_f64_info>,
EVEX, AVX512AIi8Base, EVEX_CD8<64, CD8VF>, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - VPERMIL
//===----------------------------------------------------------------------===//
-multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, X86VectorVTInfo Ctrl> {
+multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, X86VectorVTInfo _,
+ X86VectorVTInfo Ctrl> {
defm rr: AVX512_maskable<OpcVar, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, Ctrl.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1,
- (Ctrl.VT Ctrl.RC:$src2)))>,
- T8PD, EVEX_4V;
+ (Ctrl.VT Ctrl.RC:$src2))), itins.rr>,
+ T8PD, EVEX_4V, Sched<[itins.Sched]>;
defm rm: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, Ctrl.MemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode
_.RC:$src1,
- (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2)))))>,
- T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ (Ctrl.VT (bitconvert(Ctrl.LdFrag addr:$src2))))),
+ itins.rm>, T8PD, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
"${src2}"##_.BroadcastStr##", $src1",
@@ -5899,30 +5845,31 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (X86VBroadcast
- (Ctrl.ScalarLdFrag addr:$src2)))))>,
- T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ (Ctrl.ScalarLdFrag addr:$src2))))),
+ itins.rm>, T8PD, EVEX_4V, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_permil_vec_common<string OpcodeStr, bits<8> OpcVar,
- AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
+ OpndItins itins, AVX512VLVectorVTInfo _,
+ AVX512VLVectorVTInfo Ctrl> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info512,
- Ctrl.info512>, EVEX_V512;
+ defm Z : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins,
+ _.info512, Ctrl.info512>, EVEX_V512;
}
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info128,
- Ctrl.info128>, EVEX_V128;
- defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, _.info256,
- Ctrl.info256>, EVEX_V256;
+ defm Z128 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins,
+ _.info128, Ctrl.info128>, EVEX_V128;
+ defm Z256 : avx512_permil_vec<OpcVar, OpcodeStr, X86VPermilpv, itins,
+ _.info256, Ctrl.info256>, EVEX_V256;
}
}
multiclass avx512_permil<string OpcodeStr, bits<8> OpcImm, bits<8> OpcVar,
AVX512VLVectorVTInfo _, AVX512VLVectorVTInfo Ctrl>{
-
- defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, _, Ctrl>;
+ defm NAME: avx512_permil_vec_common<OpcodeStr, OpcVar, AVX_VPERMILV, _, Ctrl>;
defm NAME: avx512_shift_rmi_sizes<OpcImm, MRMSrcReg, MRMSrcMem, OpcodeStr,
- X86VPermilpi, _>,
+ X86VPermilpi, AVX_VPERMILV, _>,
EVEX, AVX512AIi8Base, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
@@ -5932,29 +5879,31 @@ defm VPERMILPS : avx512_permil<"vpermilps", 0x04, 0x0C, avx512vl_f32_info,
let ExeDomain = SSEPackedDouble in
defm VPERMILPD : avx512_permil<"vpermilpd", 0x05, 0x0D, avx512vl_f64_info,
avx512vl_i64_info>, VEX_W;
+
//===----------------------------------------------------------------------===//
// AVX-512 - VPSHUFD, VPSHUFLW, VPSHUFHW
//===----------------------------------------------------------------------===//
defm VPSHUFD : avx512_shift_rmi_sizes<0x70, MRMSrcReg, MRMSrcMem, "vpshufd",
- X86PShufd, avx512vl_i32_info>,
+ X86PShufd, SSE_PSHUF, avx512vl_i32_info>,
EVEX, AVX512BIi8Base, EVEX_CD8<32, CD8VF>;
defm VPSHUFH : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshufhw",
- X86PShufhw>, EVEX, AVX512XSIi8Base;
+ X86PShufhw, SSE_PSHUF>, EVEX, AVX512XSIi8Base;
defm VPSHUFL : avx512_shift_rmi_w<0x70, MRMSrcReg, MRMSrcMem, "vpshuflw",
- X86PShuflw>, EVEX, AVX512XDIi8Base;
+ X86PShuflw, SSE_PSHUF>, EVEX, AVX512XDIi8Base;
-multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode> {
+multiclass avx512_pshufb_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
let Predicates = [HasBWI] in
- defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, v64i8_info>, EVEX_V512;
+ defm Z: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v64i8_info>, EVEX_V512;
let Predicates = [HasVLX, HasBWI] in {
- defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, v32i8x_info>, EVEX_V256;
- defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, v16i8x_info>, EVEX_V128;
+ defm Z256: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v32i8x_info>, EVEX_V256;
+ defm Z128: avx512_var_shift<opc, OpcodeStr, OpNode, itins, v16i8x_info>, EVEX_V128;
}
}
-defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb>;
+defm VPSHUFB: avx512_pshufb_sizes<0x00, "vpshufb", X86pshufb, SSE_PSHUFB>, VEX_WIG;
//===----------------------------------------------------------------------===//
// Move Low to High and High to Low packed FP Instructions
@@ -5970,18 +5919,6 @@ def VMOVHLPSZrr : AVX512PSI<0x12, MRMSrcReg, (outs VR128X:$dst),
[(set VR128X:$dst, (v4f32 (X86Movhlps VR128X:$src1, VR128X:$src2)))],
IIC_SSE_MOV_LH>, EVEX_4V;
-let Predicates = [HasAVX512] in {
- // MOVLHPS patterns
- def : Pat<(v4i32 (X86Movlhps VR128X:$src1, VR128X:$src2)),
- (VMOVLHPSZrr VR128X:$src1, VR128X:$src2)>;
- def : Pat<(v2i64 (X86Movlhps VR128X:$src1, VR128X:$src2)),
- (VMOVLHPSZrr (v2i64 VR128X:$src1), VR128X:$src2)>;
-
- // MOVHLPS patterns
- def : Pat<(v4i32 (X86Movhlps VR128X:$src1, VR128X:$src2)),
- (VMOVHLPSZrr VR128X:$src1, VR128X:$src2)>;
-}
-
//===----------------------------------------------------------------------===//
// VMOVHPS/PD VMOVLPS Instructions
// All patterns were taken from the SSE implementation.
@@ -6002,7 +5939,7 @@ multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm VMOVHPSZ128 : avx512_mov_hilo_packed<0x16, "vmovhps", X86Movlhps,
v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
-defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Movlhpd,
+defm VMOVHPDZ128 : avx512_mov_hilo_packed<0x16, "vmovhpd", X86Unpckl,
v2f64x_info>, EVEX_CD8<64, CD8VT1>, PD, VEX_W;
defm VMOVLPSZ128 : avx512_mov_hilo_packed<0x12, "vmovlps", X86Movlps,
v4f32x_info>, EVEX_CD8<32, CD8VT2>, PS;
@@ -6015,25 +5952,18 @@ let Predicates = [HasAVX512] in {
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
(VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128X:$src1,
- (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
(VMOVHPSZ128rm VR128X:$src1, addr:$src2)>;
// VMOVHPD patterns
def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
- (scalar_to_vector (loadf64 addr:$src2)))),
- (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
- def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
(bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
// VMOVLPS patterns
def : Pat<(v4f32 (X86Movlps VR128X:$src1, (load addr:$src2))),
(VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
- def : Pat<(v4i32 (X86Movlps VR128X:$src1, (load addr:$src2))),
- (VMOVLPSZ128rm VR128X:$src1, addr:$src2)>;
// VMOVLPD patterns
def : Pat<(v2f64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
(VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
- def : Pat<(v2i64 (X86Movlpd VR128X:$src1, (load addr:$src2))),
- (VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Movsd VR128X:$src1,
(v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
(VMOVLPDZ128rm VR128X:$src1, addr:$src2)>;
@@ -6079,16 +6009,10 @@ let Predicates = [HasAVX512] in {
def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128X:$src2)),
addr:$src1),
(VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
- def : Pat<(store (v4i32 (X86Movlps
- (bc_v4i32 (loadv2i64 addr:$src1)), VR128X:$src2)), addr:$src1),
- (VMOVLPSZ128mr addr:$src1, VR128X:$src2)>;
// VMOVLPD patterns
def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
addr:$src1),
(VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
- def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128X:$src2)),
- addr:$src1),
- (VMOVLPDZ128mr addr:$src1, VR128X:$src2)>;
}
//===----------------------------------------------------------------------===//
// FMA - Fused Multiply Operations
@@ -6096,45 +6020,38 @@ let Predicates = [HasAVX512] in {
multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
- AVX512FMA3Base;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), NoItinerary, 1, 1>,
+ AVX512FMA3Base, Sched<[WriteFMA]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
- AVX512FMA3Base;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))),
+ NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
- _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))), 1, 0>,
- AVX512FMA3Base, EVEX_B;
+ _.RC:$src1,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3)))),
+ NoItinerary, 1, 0>, AVX512FMA3Base, EVEX_B,
+ Sched<[WriteFMALd, ReadAfterLd]>;
}
-
- // Additional pattern for folding broadcast nodes in other orders.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode _.RC:$src1, _.RC:$src2,
- (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
- _.RC:$src1)),
- (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
- _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
}
multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))), 1, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC;
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 imm:$rc))),
+ NoItinerary, 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>;
}
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -6171,18 +6088,18 @@ defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubR
multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
- AVX512FMA3Base;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), NoItinerary, 1, 1,
+ vselect, 1>, AVX512FMA3Base, Sched<[WriteFMA]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
- AVX512FMA3Base;
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
+ NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -6190,36 +6107,20 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src2, ${src3}"##_.BroadcastStr,
(_.VT (OpNode _.RC:$src2,
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B;
+ _.RC:$src1)), NoItinerary, 1, 0>, AVX512FMA3Base, EVEX_B,
+ Sched<[WriteFMALd, ReadAfterLd]>;
}
-
- // Additional patterns for folding broadcast nodes in other orders.
- def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1)),
- (!cast<Instruction>(NAME#Suff#_.ZSuffix#mb) _.RC:$src1,
- _.RC:$src2, addr:$src3)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1),
- _.RC:$src1)),
- (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
- _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src2, _.RC:$src1),
- _.ImmAllZerosV)),
- (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbkz) _.RC:$src1,
- _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
}
multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))), 1, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC;
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 imm:$rc))),
+ NoItinerary, 1, 1, vselect, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>;
}
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -6255,45 +6156,42 @@ defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubR
multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
- AVX512FMA3Base;
+ (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), NoItinerary,
+ 1, 1, vselect, 1>, AVX512FMA3Base, Sched<[WriteFMA]>;
+ // Pattern is 312 order so that the load is in a different place from the
+    // 213 and 231 patterns. This helps tablegen's duplicate pattern detection.
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, (_.LdFrag addr:$src3), _.RC:$src2)), 1, 0>,
- AVX512FMA3Base;
+ (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)),
+ NoItinerary, 1, 0>, AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
+ // Pattern is 312 order so that the load is in a different place from the
+    // 213 and 231 patterns. This helps tablegen's duplicate pattern detection.
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
"$src2, ${src3}"##_.BroadcastStr,
- (_.VT (OpNode _.RC:$src1,
- (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- _.RC:$src2)), 1, 0>, AVX512FMA3Base, EVEX_B;
+ (_.VT (OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
+ _.RC:$src1, _.RC:$src2)), NoItinerary, 1, 0>,
+ AVX512FMA3Base, EVEX_B, Sched<[WriteFMALd, ReadAfterLd]>;
}
-
- // Additional patterns for folding broadcast nodes in other orders.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
- _.RC:$src1, _.RC:$src2),
- _.RC:$src1)),
- (!cast<Instruction>(NAME#Suff#_.ZSuffix#mbk) _.RC:$src1,
- _.KRCWM:$mask, _.RC:$src2, addr:$src3)>;
}
multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))), 1, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC;
+ (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 imm:$rc))),
+ NoItinerary, 1, 1, vselect, 1>,
+ AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[WriteFMA]>;
}
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -6328,167 +6226,179 @@ defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddR
defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>;
// Scalar FMA
-let Constraints = "$src1 = $dst" in {
multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
dag RHS_VEC_r, dag RHS_VEC_m, dag RHS_VEC_rb,
- dag RHS_r, dag RHS_m > {
+ dag RHS_r, dag RHS_m, bit MaskOnlyReg> {
+let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
- "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base;
+ "$src3, $src2", "$src2, $src3", RHS_VEC_r, NoItinerary, 1, 1>,
+ AVX512FMA3Base, Sched<[WriteFMA]>;
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
- "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base;
+ "$src3, $src2", "$src2, $src3", RHS_VEC_m, NoItinerary, 1, 1>,
+ AVX512FMA3Base, Sched<[WriteFMALd, ReadAfterLd]>;
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
- OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb, 1, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC;
+ OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", RHS_VEC_rb,
+ NoItinerary, 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC,
+ Sched<[WriteFMA]>;
let isCodeGenOnly = 1, isCommutable = 1 in {
- def r : AVX512FMA3<opc, MRMSrcReg, (outs _.FRC:$dst),
+ def r : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [RHS_r]>;
- def m : AVX512FMA3<opc, MRMSrcMem, (outs _.FRC:$dst),
+ !if(MaskOnlyReg, [], [RHS_r])>, Sched<[WriteFMA]>;
+ def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [RHS_m]>;
+ [RHS_m]>, Sched<[WriteFMALd, ReadAfterLd]>;
}// isCodeGenOnly = 1
-}
}// Constraints = "$src1 = $dst"
+}
multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
- SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
+ string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
+ SDNode OpNodeRnds1, SDNode OpNodes3,
+ SDNode OpNodeRnds3, X86VectorVTInfo _,
+ string SUFF> {
let ExeDomain = _.ExeDomain in {
- defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
+ defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix, _,
// Operands for the intrinsic are in 123 order to preserve passthru
// semantics.
- (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
- _.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))),
+ (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2, _.RC:$src3)),
+ (_.VT (OpNodes1 _.RC:$src1, _.RC:$src2,
+ _.ScalarIntMemCPat:$src3)),
(_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
_.FRC:$src3))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
- (_.ScalarLdFrag addr:$src3))))>;
+ (_.ScalarLdFrag addr:$src3)))), 0>;
- defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
- (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
- _.RC:$src1, (i32 FROUND_CURRENT))),
+ defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix, _,
+ (_.VT (OpNodes3 _.RC:$src2, _.RC:$src3, _.RC:$src1)),
+ (_.VT (OpNodes3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
+ _.RC:$src1)),
(_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
(i32 imm:$rc))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src3,
_.FRC:$src1))),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2,
- (_.ScalarLdFrag addr:$src3), _.FRC:$src1)))>;
-
- defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
- (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
- _.RC:$src2, (i32 FROUND_CURRENT))),
- (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2,
- (i32 imm:$rc))),
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src1))), 1>;
+
+ // One pattern is in 312 order so that the load is in a different place from
+ // the 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
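+ // For reference, the folded-load dags used below place the memory operand in
+ // a distinct position in each form (sketched roughly from the RHS_m args):
+ //   213: (OpNode $src2, $src1, (load addr:$src3))
+ //   231: (OpNode $src2, (load addr:$src3), $src1)
+ //   312: (OpNode (load addr:$src3), $src1, $src2)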
+ defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix, _,
+ (null_frag),
+ (_.VT (OpNodes1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
+ _.RC:$src2)),
+ (null_frag),
(set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1, _.FRC:$src3,
_.FRC:$src2))),
- (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1,
- (_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>;
+ (set _.FRC:$dst, (_.EltVT (OpNode (_.ScalarLdFrag addr:$src3),
+ _.FRC:$src1, _.FRC:$src2))), 1>;
}
}
multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
- string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
+ string OpcodeStr, SDNode OpNode, SDNode OpNodes1,
+ SDNode OpNodeRnds1, SDNode OpNodes3,
SDNode OpNodeRnds3> {
let Predicates = [HasAVX512] in {
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodeRnds1, OpNodeRnds3, f32x_info, "SS">,
+ OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
+ f32x_info, "SS">,
EVEX_CD8<32, CD8VT1>, VEX_LIG;
defm NAME : avx512_fma3s_all<opc213, opc231, opc132, OpcodeStr, OpNode,
- OpNodeRnds1, OpNodeRnds3, f64x_info, "SD">,
+ OpNodes1, OpNodeRnds1, OpNodes3, OpNodeRnds3,
+ f64x_info, "SD">,
EVEX_CD8<64, CD8VT1>, VEX_LIG, VEX_W;
}
}
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnds1,
- X86FmaddRnds3>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnds1,
- X86FmsubRnds3>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd,
- X86FnmaddRnds1, X86FnmaddRnds3>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
- X86FnmsubRnds1, X86FnmsubRnds3>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86Fmadds1,
+ X86FmaddRnds1, X86Fmadds3, X86FmaddRnds3>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86Fmsubs1,
+ X86FmsubRnds1, X86Fmsubs3, X86FmsubRnds3>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86Fnmadds1,
+ X86FnmaddRnds1, X86Fnmadds3, X86FnmaddRnds3>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86Fnmsubs1,
+ X86FnmsubRnds1, X86Fnmsubs3, X86FnmsubRnds3>;
//===----------------------------------------------------------------------===//
// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52 bits (IFMA)
//===----------------------------------------------------------------------===//
let Constraints = "$src1 = $dst" in {
multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
+ // NOTE: The SDNode has the multiply operands first and the add last.
+ // This enables commuted load patterns to be autogenerated by tablegen.
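+ // For example (illustrative, not part of this patch): the memory form below
+ // is written as (OpNode $src2, (load addr:$src3), $src1), so when the node
+ // is commutable in its two multiply operands tablegen can also emit the
+ // swapped (OpNode (load addr:$src3), $src2, $src1) pattern automatically.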
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
- AVX512FMA3Base;
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), itins.rr, 1, 1>,
+ AVX512FMA3Base, Sched<[itins.Sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, (_.LdFrag addr:$src3)))>,
- AVX512FMA3Base;
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
+ itins.rm>, AVX512FMA3Base, Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
- (OpNode _.RC:$src1,
- _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
- AVX512FMA3Base, EVEX_B;
+ (OpNode _.RC:$src2,
+ (_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))),
+ _.RC:$src1), itins.rm>,
+ AVX512FMA3Base, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
} // Constraints = "$src1 = $dst"
multiclass avx512_pmadd52_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo _> {
+ OpndItins itins, AVX512VLVectorVTInfo _> {
let Predicates = [HasIFMA] in {
- defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info512>,
+ defm Z : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info512>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasIFMA] in {
- defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info256>,
+ defm Z256 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info256>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, _.info128>,
+ defm Z128 : avx512_pmadd52_rm<opc, OpcodeStr, OpNode, itins, _.info128>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
defm VPMADD52LUQ : avx512_pmadd52_common<0xb4, "vpmadd52luq", x86vpmadd52l,
- avx512vl_i64_info>, VEX_W;
+ SSE_PMADD, avx512vl_i64_info>, VEX_W;
defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
- avx512vl_i64_info>, VEX_W;
+ SSE_PMADD, avx512vl_i64_info>, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 Scalar convert from signed integer to float/double
//===----------------------------------------------------------------------===//
-multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
- X86VectorVTInfo DstVT, X86MemOperand x86memop,
- PatFrag ld_frag, string asm> {
+multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, OpndItins itins,
+ RegisterClass SrcRC, X86VectorVTInfo DstVT,
+ X86MemOperand x86memop, PatFrag ld_frag, string asm> {
let hasSideEffects = 0 in {
def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, SrcRC:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- EVEX_4V;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
+ itins.rr>, EVEX_4V, Sched<[itins.Sched]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, x86memop:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- EVEX_4V;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
+ itins.rm>, EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
} // hasSideEffects = 0
let isCodeGenOnly = 1 in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
@@ -6497,7 +6407,8 @@ multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
- (i32 FROUND_CURRENT)))]>, EVEX_4V;
+ (i32 FROUND_CURRENT)))], itins.rr>,
+ EVEX_4V, Sched<[itins.Sched]>;
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, x86memop:$src2),
@@ -6505,12 +6416,13 @@ multiclass avx512_vcvtsi<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
(ld_frag addr:$src2),
- (i32 FROUND_CURRENT)))]>, EVEX_4V;
+ (i32 FROUND_CURRENT)))], itins.rm>,
+ EVEX_4V, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}//isCodeGenOnly = 1
}
-multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
- X86VectorVTInfo DstVT, string asm> {
+multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, OpndItins itins,
+ RegisterClass SrcRC, X86VectorVTInfo DstVT, string asm> {
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
!strconcat(asm,
@@ -6518,28 +6430,29 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
[(set DstVT.RC:$dst,
(OpNode (DstVT.VT DstVT.RC:$src1),
SrcRC:$src2,
- (i32 imm:$rc)))]>, EVEX_4V, EVEX_B, EVEX_RC;
+ (i32 imm:$rc)))], itins.rr>,
+ EVEX_4V, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
}
-multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, RegisterClass SrcRC,
- X86VectorVTInfo DstVT, X86MemOperand x86memop,
- PatFrag ld_frag, string asm> {
- defm NAME : avx512_vcvtsi_round<opc, OpNode, SrcRC, DstVT, asm>,
- avx512_vcvtsi<opc, OpNode, SrcRC, DstVT, x86memop, ld_frag, asm>,
- VEX_LIG;
+multiclass avx512_vcvtsi_common<bits<8> opc, SDNode OpNode, OpndItins itins,
+ RegisterClass SrcRC, X86VectorVTInfo DstVT,
+ X86MemOperand x86memop, PatFrag ld_frag, string asm> {
+ defm NAME : avx512_vcvtsi_round<opc, OpNode, itins, SrcRC, DstVT, asm>,
+ avx512_vcvtsi<opc, OpNode, itins, SrcRC, DstVT, x86memop,
+ ld_frag, asm>, VEX_LIG;
}
let Predicates = [HasAVX512] in {
-defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
+defm VCVTSI2SSZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SS, GR32,
v4f32x_info, i32mem, loadi32, "cvtsi2ss{l}">,
XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
+defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SS, GR64,
v4f32x_info, i64mem, loadi64, "cvtsi2ss{q}">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR32,
+defm VCVTSI2SDZ : avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SD, GR32,
v2f64x_info, i32mem, loadi32, "cvtsi2sd{l}">,
XD, EVEX_CD8<32, CD8VT1>;
-defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, GR64,
+defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFpRnd, SSE_CVT_SI2SD, GR64,
v2f64x_info, i64mem, loadi64, "cvtsi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -6566,16 +6479,16 @@ def : Pat<(f64 (sint_to_fp GR32:$src)),
def : Pat<(f64 (sint_to_fp GR64:$src)),
(VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
-defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR32,
+defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SS, GR32,
v4f32x_info, i32mem, loadi32,
"cvtusi2ss{l}">, XS, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
+defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SS, GR64,
v4f32x_info, i64mem, loadi64, "cvtusi2ss{q}">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
-defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, GR32, v2f64x_info,
+defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, X86UintToFpRnd, SSE_CVT_SI2SD, GR32, v2f64x_info,
i32mem, loadi32, "cvtusi2sd{l}">,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
-defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, GR64,
+defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFpRnd, SSE_CVT_SI2SD, GR64,
v2f64x_info, i64mem, loadi64, "cvtusi2sd{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
@@ -6606,71 +6519,74 @@ def : Pat<(f64 (uint_to_fp GR64:$src)),
//===----------------------------------------------------------------------===//
// AVX-512 Scalar convert from float/double to integer
//===----------------------------------------------------------------------===//
-multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT ,
- X86VectorVTInfo DstVT, SDNode OpNode, string asm> {
+
+multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
+ X86VectorVTInfo DstVT, SDNode OpNode,
+ OpndItins itins, string asm> {
let Predicates = [HasAVX512] in {
- def rr : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))]>,
- EVEX, VEX_LIG;
- def rb : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
- !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
- [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
- EVEX, VEX_LIG, EVEX_B, EVEX_RC;
- def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 FROUND_CURRENT)))],
+ itins.rr>, EVEX, VEX_LIG, Sched<[itins.Sched]>;
+ def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
+ !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))],
+ itins.rr>, EVEX, VEX_LIG, EVEX_B, EVEX_RC,
+ Sched<[itins.Sched]>;
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
(SrcVT.VT SrcVT.ScalarIntMemCPat:$src),
- (i32 FROUND_CURRENT)))]>,
- EVEX, VEX_LIG;
+ (i32 FROUND_CURRENT)))], itins.rm>,
+ EVEX, VEX_LIG, Sched<[itins.Sched.Folded, ReadAfterLd]>;
} // Predicates = [HasAVX512]
}
// Convert float/double to signed/unsigned int 32/64
defm VCVTSS2SIZ: avx512_cvt_s_int_round<0x2D, f32x_info, i32x_info,
- X86cvts2si, "cvtss2si">,
+ X86cvts2si, SSE_CVT_SS2SI_32, "cvtss2si">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2SI64Z: avx512_cvt_s_int_round<0x2D, f32x_info, i64x_info,
- X86cvts2si, "cvtss2si">,
+ X86cvts2si, SSE_CVT_SS2SI_64, "cvtss2si">,
XS, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2USIZ: avx512_cvt_s_int_round<0x79, f32x_info, i32x_info,
- X86cvts2usi, "cvtss2usi">,
+ X86cvts2usi, SSE_CVT_SS2SI_32, "cvtss2usi">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTSS2USI64Z: avx512_cvt_s_int_round<0x79, f32x_info, i64x_info,
- X86cvts2usi, "cvtss2usi">, XS, VEX_W,
- EVEX_CD8<32, CD8VT1>;
+ X86cvts2usi, SSE_CVT_SS2SI_64, "cvtss2usi">,
+ XS, VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTSD2SIZ: avx512_cvt_s_int_round<0x2D, f64x_info, i32x_info,
- X86cvts2si, "cvtsd2si">,
+ X86cvts2si, SSE_CVT_SD2SI, "cvtsd2si">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2SI64Z: avx512_cvt_s_int_round<0x2D, f64x_info, i64x_info,
- X86cvts2si, "cvtsd2si">,
+ X86cvts2si, SSE_CVT_SD2SI, "cvtsd2si">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2USIZ: avx512_cvt_s_int_round<0x79, f64x_info, i32x_info,
- X86cvts2usi, "cvtsd2usi">,
+ X86cvts2usi, SSE_CVT_SD2SI, "cvtsd2usi">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
- X86cvts2usi, "cvtsd2usi">, XD, VEX_W,
- EVEX_CD8<64, CD8VT1>;
+ X86cvts2usi, SSE_CVT_SD2SI, "cvtsd2usi">,
+ XD, VEX_W, EVEX_CD8<64, CD8VT1>;
// The SSE versions of these instructions are disabled for AVX512.
// Therefore, the SSE intrinsics are mapped to the AVX512 instructions.
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))),
- (VCVTSS2SIZrr VR128X:$src)>;
+ (VCVTSS2SIZrr_Int VR128X:$src)>;
def : Pat<(i32 (int_x86_sse_cvtss2si sse_load_f32:$src)),
- (VCVTSS2SIZrm sse_load_f32:$src)>;
+ (VCVTSS2SIZrm_Int sse_load_f32:$src)>;
def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))),
- (VCVTSS2SI64Zrr VR128X:$src)>;
+ (VCVTSS2SI64Zrr_Int VR128X:$src)>;
def : Pat<(i64 (int_x86_sse_cvtss2si64 sse_load_f32:$src)),
- (VCVTSS2SI64Zrm sse_load_f32:$src)>;
+ (VCVTSS2SI64Zrm_Int sse_load_f32:$src)>;
def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))),
- (VCVTSD2SIZrr VR128X:$src)>;
+ (VCVTSD2SIZrr_Int VR128X:$src)>;
def : Pat<(i32 (int_x86_sse2_cvtsd2si sse_load_f64:$src)),
- (VCVTSD2SIZrm sse_load_f64:$src)>;
+ (VCVTSD2SIZrm_Int sse_load_f64:$src)>;
def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))),
- (VCVTSD2SI64Zrr VR128X:$src)>;
+ (VCVTSD2SI64Zrr_Int VR128X:$src)>;
def : Pat<(i64 (int_x86_sse2_cvtsd2si64 sse_load_f64:$src)),
- (VCVTSD2SI64Zrm sse_load_f64:$src)>;
+ (VCVTSD2SI64Zrm_Int sse_load_f64:$src)>;
} // HasAVX512
let Predicates = [HasAVX512] in {
@@ -6723,24 +6639,25 @@ def : Pat<(v2f64 (X86Movsd
// Convert float/double to signed/unsigned int 32/64 with truncation
multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDNode OpNode,
- SDNode OpNodeRnd, string aliasStr>{
+ SDNode OpNodeRnd, OpndItins itins, string aliasStr>{
let Predicates = [HasAVX512] in {
def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>, EVEX;
+ [(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))], itins.rr>,
+ EVEX, Sched<[itins.Sched]>;
let hasSideEffects = 0 in
- def rb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
+ def rrb : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
- []>, EVEX, EVEX_B;
+ [], itins.rr>, EVEX, EVEX_B, Sched<[itins.Sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
- EVEX;
+ [(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))],
+ itins.rm>, EVEX, Sched<[itins.Sched.Folded, ReadAfterLd]>;
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "rr") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
def : InstAlias<asm # aliasStr # "\t\t{{sae}, $src, $dst|$dst, $src, {sae}}",
- (!cast<Instruction>(NAME # "rb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
+ (!cast<Instruction>(NAME # "rrb") _DstRC.RC:$dst, _SrcRC.FRC:$src), 0>;
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "rm") _DstRC.RC:$dst,
_SrcRC.ScalarMemOp:$src), 0>;
@@ -6749,47 +6666,48 @@ let Predicates = [HasAVX512] in {
def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
- (i32 FROUND_CURRENT)))]>, EVEX, VEX_LIG;
- def rb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
+ (i32 FROUND_CURRENT)))], itins.rr>,
+ EVEX, VEX_LIG, Sched<[itins.Sched]>;
+ def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
!strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
[(set _DstRC.RC:$dst, (OpNodeRnd (_SrcRC.VT _SrcRC.RC:$src),
- (i32 FROUND_NO_EXC)))]>,
- EVEX,VEX_LIG , EVEX_B;
+ (i32 FROUND_NO_EXC)))], itins.rr>,
+ EVEX,VEX_LIG , EVEX_B, Sched<[itins.Sched]>;
let mayLoad = 1, hasSideEffects = 0 in
def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
(ins _SrcRC.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
- []>, EVEX, VEX_LIG;
-
+ [], itins.rm>, EVEX, VEX_LIG,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
} // isCodeGenOnly = 1
} //HasAVX512
}
defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
- fp_to_sint, X86cvtts2IntRnd, "{l}">,
+ fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SS2SI_32, "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
- fp_to_sint, X86cvtts2IntRnd, "{q}">,
+ fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SS2SI_64, "{q}">,
VEX_W, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
- fp_to_sint, X86cvtts2IntRnd, "{l}">,
+ fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SD2SI, "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
- fp_to_sint, X86cvtts2IntRnd, "{q}">,
+ fp_to_sint, X86cvtts2IntRnd, SSE_CVT_SD2SI, "{q}">,
VEX_W, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
- fp_to_uint, X86cvtts2UIntRnd, "{l}">,
+ fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SS2SI_32, "{l}">,
XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
- fp_to_uint, X86cvtts2UIntRnd, "{q}">,
+ fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SS2SI_64, "{q}">,
XS,VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
- fp_to_uint, X86cvtts2UIntRnd, "{l}">,
+ fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SD2SI, "{l}">,
XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
- fp_to_uint, X86cvtts2UIntRnd, "{q}">,
+ fp_to_uint, X86cvtts2UIntRnd, SSE_CVT_SD2SI, "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
let Predicates = [HasAVX512] in {
def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
@@ -6809,88 +6727,95 @@ let Predicates = [HasAVX512] in {
def : Pat<(i64 (int_x86_sse2_cvttsd2si64 sse_load_f64:$src)),
(VCVTTSD2SI64Zrm_Int sdmem:$src)>;
} // HasAVX512
+
//===----------------------------------------------------------------------===//
// AVX-512 Convert from float to double and back
//===----------------------------------------------------------------------===//
+
multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNode> {
+ X86VectorVTInfo _Src, SDNode OpNode, OpndItins itins> {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
- (i32 FROUND_CURRENT)))>,
- EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+ (i32 FROUND_CURRENT))), itins.rr>,
+ EVEX_4V, VEX_LIG, Sched<[itins.Sched]>;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
(_Src.VT _Src.ScalarIntMemCPat:$src2),
- (i32 FROUND_CURRENT)))>,
- EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ (i32 FROUND_CURRENT))), itins.rm>,
+ EVEX_4V, VEX_LIG,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.FRC:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ itins.rr>, EVEX_4V, VEX_LIG, Sched<[itins.Sched]>;
let mayLoad = 1 in
def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _Src.ScalarMemOp:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ itins.rm>, EVEX_4V, VEX_LIG,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
// Scalar Conversion with SAE - suppress all exceptions
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ X86VectorVTInfo _Src, SDNode OpNodeRnd, OpndItins itins> {
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
(_Src.VT _Src.RC:$src2),
- (i32 FROUND_NO_EXC)))>,
- EVEX_4V, VEX_LIG, EVEX_B;
+ (i32 FROUND_NO_EXC))), itins.rr>,
+ EVEX_4V, VEX_LIG, EVEX_B, Sched<[itins.Sched]>;
}
// Scalar Conversion with rounding control (RC)
multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ X86VectorVTInfo _Src, SDNode OpNodeRnd, OpndItins itins> {
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd (_.VT _.RC:$src1),
- (_Src.VT _Src.RC:$src2), (i32 imm:$rc)))>,
- EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>,
+ (_Src.VT _Src.RC:$src2), (i32 imm:$rc))),
+ itins.rr>,
+ EVEX_4V, VEX_LIG, Sched<[itins.Sched]>,
EVEX_B, EVEX_RC;
}
multiclass avx512_cvt_fp_scalar_sd2ss<bits<8> opc, string OpcodeStr,
- SDNode OpNodeRnd, X86VectorVTInfo _src,
- X86VectorVTInfo _dst> {
+ SDNode OpNodeRnd, OpndItins itins,
+ X86VectorVTInfo _src, X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>,
avx512_cvt_fp_rc_scalar<opc, OpcodeStr, _dst, _src,
- OpNodeRnd>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
+ OpNodeRnd, itins>, VEX_W, EVEX_CD8<64, CD8VT1>, XD;
}
}
multiclass avx512_cvt_fp_scalar_ss2sd<bits<8> opc, string OpcodeStr,
- SDNode OpNodeRnd, X86VectorVTInfo _src,
- X86VectorVTInfo _dst> {
+ SDNode OpNodeRnd, OpndItins itins,
+ X86VectorVTInfo _src, X86VectorVTInfo _dst> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
- avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd>,
+ defm Z : avx512_cvt_fp_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>,
+ avx512_cvt_fp_sae_scalar<opc, OpcodeStr, _dst, _src, OpNodeRnd, itins>,
EVEX_CD8<32, CD8VT1>, XS;
}
}
defm VCVTSD2SS : avx512_cvt_fp_scalar_sd2ss<0x5A, "vcvtsd2ss",
- X86froundRnd, f64x_info, f32x_info>;
+ X86froundRnd, SSE_CVT_SD2SS, f64x_info,
+ f32x_info>, NotMemoryFoldable;
defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
- X86fpextRnd,f32x_info, f64x_info >;
+ X86fpextRnd, SSE_CVT_SS2SD, f32x_info,
+ f64x_info>, NotMemoryFoldable;
def : Pat<(f64 (fpextend FR32X:$src)),
- (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>,
+ (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(f64 (fpextend (loadf32 addr:$src))),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
@@ -6905,7 +6830,7 @@ def : Pat<(f64 (extloadf32 addr:$src)),
Requires<[HasAVX512, OptForSpeed]>;
def : Pat<(f32 (fpround FR64X:$src)),
- (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>,
+ (VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v4f32 (X86Movss
@@ -6928,74 +6853,81 @@ def : Pat<(v2f64 (X86Movsd
//===----------------------------------------------------------------------===//
multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNode,
+ X86VectorVTInfo _Src, SDNode OpNode, OpndItins itins,
string Broadcast = _.BroadcastStr,
string Alias = "", X86MemOperand MemOp = _Src.MemOp> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_Src.VT _Src.RC:$src)))>, EVEX;
+ (_.VT (OpNode (_Src.VT _Src.RC:$src))), itins.rr>,
+ EVEX, Sched<[itins.Sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins MemOp:$src), OpcodeStr#Alias, "$src", "$src",
(_.VT (OpNode (_Src.VT
- (bitconvert (_Src.LdFrag addr:$src)))))>, EVEX;
+ (bitconvert (_Src.LdFrag addr:$src))))), itins.rm>,
+ EVEX, Sched<[itins.Sched.Folded]>;
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _Src.ScalarMemOp:$src), OpcodeStr,
"${src}"##Broadcast, "${src}"##Broadcast,
(_.VT (OpNode (_Src.VT
(X86VBroadcast (_Src.ScalarLdFrag addr:$src)))
- ))>, EVEX, EVEX_B;
+ )), itins.rm>, EVEX, EVEX_B,
+ Sched<[itins.Sched.Folded]>;
}
// Conversion with SAE - suppress all exceptions
multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ OpndItins itins> {
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
(_.VT (OpNodeRnd (_Src.VT _Src.RC:$src),
- (i32 FROUND_NO_EXC)))>,
- EVEX, EVEX_B;
+ (i32 FROUND_NO_EXC))), itins.rr>,
+ EVEX, EVEX_B, Sched<[itins.Sched]>;
}
// Conversion with rounding control (RC)
multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNodeRnd> {
+ X86VectorVTInfo _Src, SDNode OpNodeRnd,
+ OpndItins itins> {
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
"$rc, $src", "$src, $rc",
- (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc)))>,
- EVEX, EVEX_B, EVEX_RC;
+ (_.VT (OpNodeRnd (_Src.VT _Src.RC:$src), (i32 imm:$rc))),
+ itins.rr>, EVEX, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
}
// Extend Float to Double
-multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr> {
+multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
+ OpndItins itins> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info, fpextend>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8f32x_info,
+ fpextend, itins>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
- X86vfpextRnd>, EVEX_V512;
+ X86vfpextRnd, itins>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4f32x_info,
- X86vfpext, "{1to2}", "", f64mem>, EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend>,
- EVEX_V256;
+ X86vfpext, itins, "{1to2}", "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend,
+ itins>, EVEX_V256;
}
}
// Truncate Double to Float
-multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> {
+multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, OpndItins itins> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, fpround, itins>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
- X86vfproundRnd>, EVEX_V512;
+ X86vfproundRnd, itins>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
- X86vfpround, "{1to2}", "{x}">, EVEX_V128;
+ X86vfpround, itins, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, fpround,
- "{1to4}", "{y}">, EVEX_V256;
+ itins, "{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
@@ -7008,19 +6940,23 @@ multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr> {
}
}
-defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps">,
+defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SSE_CVT_PD2PS>,
VEX_W, PD, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd">,
+defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SSE_CVT_PS2PD>,
PS, EVEX_CD8<32, CD8VH>;
def : Pat<(v8f64 (extloadv8f32 addr:$src)),
(VCVTPS2PDZrm addr:$src)>;
let Predicates = [HasVLX] in {
- let AddedComplexity = 15 in
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
- (VCVTPD2PSZ128rr VR128X:$src)>;
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128X:$src)))))),
+ (VCVTPD2PSZ128rr VR128X:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
+ (VCVTPD2PSZ128rm addr:$src)>;
+ }
def : Pat<(v2f64 (extloadv2f32 addr:$src)),
(VCVTPS2PDZ128rm addr:$src)>;
def : Pat<(v4f64 (extloadv4f32 addr:$src)),
@@ -7029,75 +6965,80 @@ let Predicates = [HasVLX] in {
// Convert Signed/Unsigned Doubleword to Double
multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128> {
+ SDNode OpNode128, OpndItins itins> {
// No rounding in this op
let Predicates = [HasAVX512] in
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode>,
- EVEX_V512;
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode,
+ itins>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
- OpNode128, "{1to2}", "", i64mem>, EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode>,
- EVEX_V256;
+ OpNode128, itins, "{1to2}", "", i64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode,
+ itins>, EVEX_V256;
}
}
// Convert Signed/Unsigned Doubleword to Float
multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd> {
+ SDNode OpNodeRnd, OpndItins itins> {
let Predicates = [HasAVX512] in
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode,
+ itins>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
let Predicates = [HasVLX] in {
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode>,
- EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode>,
- EVEX_V256;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode,
+ itins>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode,
+ itins>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword with truncation
-multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, OpndItins itins> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
+ itins>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
}
let Predicates = [HasVLX] in {
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
- EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
- EVEX_V256;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
+ itins>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
+ itins>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword
-multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, OpndItins itins> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
+ itins>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
}
let Predicates = [HasVLX] in {
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode>,
- EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode>,
- EVEX_V256;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
+ itins>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
+ itins>, EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd> {
+ SDNode OpNode128, SDNode OpNodeRnd,
+ OpndItins itins> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
+ itins>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
}
let Predicates = [HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -7105,9 +7046,9 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
- OpNode128, "{1to2}", "{x}">, EVEX_V128;
+ OpNode128, itins, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
- "{1to4}", "{y}">, EVEX_V256;
+ itins, "{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
@@ -7121,12 +7062,13 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
// Convert Double to Signed/Unsigned Doubleword
-multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, OpndItins itins> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
+ itins>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
}
let Predicates = [HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -7134,9 +7076,9 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info, OpNode,
- "{1to2}", "{x}">, EVEX_V128;
+ itins, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
- "{1to4}", "{y}">, EVEX_V256;
+ itins, "{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
@@ -7150,96 +7092,102 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr,
}
// Convert Double to Signed/Unsigned Quadword
-multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, OpndItins itins> {
let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
+ itins>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
- EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
- EVEX_V256;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
+ itins>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
+ itins>, EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Quadword with truncation
-multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, OpndItins itins> {
let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
+ itins>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode>,
- EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode>,
- EVEX_V256;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
+ itins>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
+ itins>, EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Double
-multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, OpndItins itins> {
let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode,
+ itins>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
- defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode>,
- EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode>,
- EVEX_V256;
+ defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode,
+ itins>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode,
+ itins>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Quadword
-multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr,
- SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode OpNodeRnd, OpndItins itins> {
let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
+ itins>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- "{1to2}", "", f64mem>, EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
- EVEX_V256;
+ itins, "{1to2}", "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
+ itins>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd> {
+ SDNode OpNode128, SDNode OpNodeRnd, OpndItins itins> {
let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
+ itins>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode128,
- "{1to2}", "", f64mem>, EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode>,
- EVEX_V256;
+ itins, "{1to2}", "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
+ itins>, EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Float
multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, SDNode OpNodeRnd> {
+ SDNode OpNode128, SDNode OpNodeRnd, OpndItins itins> {
let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode,
+ itins>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
- OpNodeRnd>, EVEX_V512;
+ OpNodeRnd, itins>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
// we need "x"/"y" suffixes in order to distinguish between 128 and 256
@@ -7247,9 +7195,9 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, OpNode128,
- "{1to2}", "{x}">, EVEX_V128;
+ itins, "{1to2}", "{x}">, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
- "{1to4}", "{y}">, EVEX_V256;
+ itins, "{1to4}", "{y}">, EVEX_V256;
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0>;
@@ -7262,89 +7210,100 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
-defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP>,
- XS, EVEX_CD8<32, CD8VH>;
+defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP,
+ SSE_CVT_I2PD>, XS, EVEX_CD8<32, CD8VH>;
defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
- X86VSintToFpRnd>,
+ X86VSintToFpRnd, SSE_CVT_I2PS>,
PS, EVEX_CD8<32, CD8VF>;
defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", fp_to_sint,
- X86cvttp2siRnd>,
+ X86cvttp2siRnd, SSE_CVT_PS2I>,
XS, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", fp_to_sint, X86cvttp2si,
- X86cvttp2siRnd>,
+ X86cvttp2siRnd, SSE_CVT_PD2I>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", fp_to_uint,
- X86cvttp2uiRnd>, PS,
+ X86cvttp2uiRnd, SSE_CVT_PS2I>, PS,
EVEX_CD8<32, CD8VF>;
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", fp_to_uint,
- X86cvttp2ui, X86cvttp2uiRnd>, PS, VEX_W,
- EVEX_CD8<64, CD8VF>;
+ X86cvttp2ui, X86cvttp2uiRnd, SSE_CVT_PD2I>,
+ PS, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp, X86VUintToFP>,
- XS, EVEX_CD8<32, CD8VH>;
+defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp,
+ X86VUintToFP, SSE_CVT_I2PD>, XS,
+ EVEX_CD8<32, CD8VH>;
defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
- X86VUintToFpRnd>, XD,
+ X86VUintToFpRnd, SSE_CVT_I2PS>, XD,
EVEX_CD8<32, CD8VF>;
defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int,
- X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VF>;
+ X86cvtp2IntRnd, SSE_CVT_PS2I>, PD,
+ EVEX_CD8<32, CD8VF>;
defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int,
- X86cvtp2IntRnd>, XD, VEX_W,
- EVEX_CD8<64, CD8VF>;
+ X86cvtp2IntRnd, SSE_CVT_PD2I>, XD,
+ VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt,
- X86cvtp2UIntRnd>,
+ X86cvtp2UIntRnd, SSE_CVT_PS2I>,
PS, EVEX_CD8<32, CD8VF>;
+
defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt,
- X86cvtp2UIntRnd>, VEX_W,
+ X86cvtp2UIntRnd, SSE_CVT_PD2I>, VEX_W,
PS, EVEX_CD8<64, CD8VF>;
defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int,
- X86cvtp2IntRnd>, VEX_W,
+ X86cvtp2IntRnd, SSE_CVT_PD2I>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int,
- X86cvtp2IntRnd>, PD, EVEX_CD8<32, CD8VH>;
+ X86cvtp2IntRnd, SSE_CVT_PS2I>, PD,
+ EVEX_CD8<32, CD8VH>;
defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt,
- X86cvtp2UIntRnd>, VEX_W,
+ X86cvtp2UIntRnd, SSE_CVT_PD2I>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
- X86cvtp2UIntRnd>, PD, EVEX_CD8<32, CD8VH>;
+ X86cvtp2UIntRnd, SSE_CVT_PS2I>, PD,
+ EVEX_CD8<32, CD8VH>;
defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", fp_to_sint,
- X86cvttp2siRnd>, VEX_W,
+ X86cvttp2siRnd, SSE_CVT_PD2I>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", fp_to_sint, X86cvttp2si,
- X86cvttp2siRnd>, PD, EVEX_CD8<32, CD8VH>;
+ X86cvttp2siRnd, SSE_CVT_PS2I>, PD,
+ EVEX_CD8<32, CD8VH>;
defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", fp_to_uint,
- X86cvttp2uiRnd>, VEX_W,
+ X86cvttp2uiRnd, SSE_CVT_PD2I>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", fp_to_uint, X86cvttp2ui,
- X86cvttp2uiRnd>, PD, EVEX_CD8<32, CD8VH>;
+ X86cvttp2uiRnd, SSE_CVT_PS2I>, PD,
+ EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
- X86VSintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+ X86VSintToFpRnd, SSE_CVT_I2PD>, VEX_W, XS,
+ EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
- X86VUintToFpRnd>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
+ X86VUintToFpRnd, SSE_CVT_I2PD>, VEX_W, XS,
+ EVEX_CD8<64, CD8VF>;
defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp, X86VSintToFP,
- X86VSintToFpRnd>, VEX_W, PS, EVEX_CD8<64, CD8VF>;
+ X86VSintToFpRnd, SSE_CVT_I2PS>, VEX_W, PS,
+ EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp, X86VUintToFP,
- X86VUintToFpRnd>, VEX_W, XD, EVEX_CD8<64, CD8VF>;
+ X86VUintToFpRnd, SSE_CVT_I2PS>, VEX_W, XD,
+ EVEX_CD8<64, CD8VF>;
let Predicates = [HasAVX512, NoVLX] in {
def : Pat<(v8i32 (fp_to_uint (v8f32 VR256X:$src1))),
@@ -7362,11 +7321,6 @@ def : Pat<(v4i32 (fp_to_uint (v4f64 VR256X:$src1))),
(v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
VR256X:$src1, sub_ymm)))), sub_xmm)>;
-def : Pat<(v4i32 (X86cvttp2ui (v2f64 VR128X:$src))),
- (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
- (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src, sub_xmm)))), sub_xmm)>;
-
def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
(EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
(v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
@@ -7393,16 +7347,32 @@ let Predicates = [HasAVX512, HasVLX] in {
def : Pat<(X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvtp2Int (v2f64 VR128X:$src)))))),
(VCVTPD2DQZ128rr VR128X:$src)>;
- def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))))),
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
+ (VCVTPD2DQZ128rm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2UInt (v2f64 VR128X:$src)))))),
(VCVTPD2UDQZ128rr VR128X:$src)>;
def : Pat<(X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttp2si (v2f64 VR128X:$src)))))),
(VCVTTPD2DQZ128rr VR128X:$src)>;
- def : Pat<(v4i32 (bitconvert (X86vzmovl (v2i64 (bitconvert
- (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))))),
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
+ (VCVTTPD2DQZ128rm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2ui (v2f64 VR128X:$src)))))),
(VCVTTPD2UDQZ128rr VR128X:$src)>;
}
+
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (VCVTDQ2PDZ128rm addr:$src)>;
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ (VCVTDQ2PDZ128rm addr:$src)>;
+
+ def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+ (VCVTUDQ2PDZ128rm addr:$src)>;
+ def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ (VCVTUDQ2PDZ128rm addr:$src)>;
}
let Predicates = [HasAVX512] in {
@@ -7488,76 +7458,113 @@ def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
-multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
- X86MemOperand x86memop, PatFrag ld_frag> {
- defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
- "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT _src.RC:$src),
- (i32 FROUND_CURRENT))>, T8PD;
- defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst), (ins x86memop:$src),
- "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT (bitconvert (ld_frag addr:$src))),
- (i32 FROUND_CURRENT))>, T8PD;
-}
-
-multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
- defm rb : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst), (ins _src.RC:$src),
- "vcvtph2ps", "{sae}, $src", "$src, {sae}",
- (X86cvtph2ps (_src.VT _src.RC:$src),
- (i32 FROUND_NO_EXC))>, T8PD, EVEX_B;
+multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ X86MemOperand x86memop, PatFrag ld_frag,
+ OpndItins itins> {
+ defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
+ (ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT _src.RC:$src)),itins.rr>,
+ T8PD, Sched<[itins.Sched]>;
+ defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
+ (ins x86memop:$src), "vcvtph2ps", "$src", "$src",
+ (X86cvtph2ps (_src.VT
+ (bitconvert
+ (ld_frag addr:$src)))), itins.rm>,
+ T8PD, Sched<[itins.Sched.Folded]>;
+}
+
+multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ OpndItins itins> {
+ defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
+ (ins _src.RC:$src), "vcvtph2ps",
+ "{sae}, $src", "$src, {sae}",
+ (X86cvtph2psRnd (_src.VT _src.RC:$src),
+ (i32 FROUND_NO_EXC)), itins.rr>,
+ T8PD, EVEX_B, Sched<[itins.Sched]>;
}
-let Predicates = [HasAVX512] in {
- defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64>,
- avx512_cvtph2ps_sae<v16f32_info, v16i16x_info>,
+let Predicates = [HasAVX512] in
+ defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, loadv4i64,
+ SSE_CVT_PH2PS>,
+ avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, SSE_CVT_PH2PS>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
- let Predicates = [HasVLX] in {
- defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
- loadv2i64>,EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
- defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
- loadv2i64>, EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
- }
+
+let Predicates = [HasVLX] in {
+ defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
+ loadv2i64, SSE_CVT_PH2PS>, EVEX, EVEX_V256,
+ EVEX_CD8<32, CD8VH>;
+ defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
+ loadv2i64, SSE_CVT_PH2PS>, EVEX, EVEX_V128,
+ EVEX_CD8<32, CD8VH>;
+
+ // Pattern match vcvtph2ps of a scalar i64 load.
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
+ (VCVTPH2PSZ128rm addr:$src)>;
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
+ (VCVTPH2PSZ128rm addr:$src)>;
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
+ (VCVTPH2PSZ128rm addr:$src)>;
}
multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
- X86MemOperand x86memop> {
+ X86MemOperand x86memop, OpndItins itins> {
defm rr : AVX512_maskable<0x1D, MRMDestReg, _dest ,(outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph", "$src2, $src1", "$src1, $src2",
(X86cvtps2ph (_src.VT _src.RC:$src1),
(i32 imm:$src2)),
- NoItinerary, 0, 0, X86select>, AVX512AIi8Base;
- def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
- (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(store (_dest.VT (X86cvtps2ph (_src.VT _src.RC:$src1),
- (i32 imm:$src2))),
- addr:$dst)]>;
- let hasSideEffects = 0, mayStore = 1 in
- def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
- (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
- "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
- []>, EVEX_K;
+ itins.rr, 0, 0>, AVX512AIi8Base, Sched<[itins.Sched]>;
+ let hasSideEffects = 0, mayStore = 1 in {
+ def mr : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ def mrk : AVX512AIi8<0x1D, MRMDestMem, (outs),
+ (ins x86memop:$dst, _dest.KRCWM:$mask, _src.RC:$src1, i32u8imm:$src2),
+ "vcvtps2ph\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+ [], itins.rm>, EVEX_K, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
}
-multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src> {
+
+multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
+ OpndItins itins> {
let hasSideEffects = 0 in
- defm rb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
+ defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
(outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph", "$src2, {sae}, $src1", "$src1, {sae}, $src2",
- []>, EVEX_B, AVX512AIi8Base;
+ [], itins.rr>, EVEX_B, AVX512AIi8Base, Sched<[itins.Sched]>;
}
+
let Predicates = [HasAVX512] in {
- defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem>,
- avx512_cvtps2ph_sae<v16i16x_info, v16f32_info>,
- EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
+ defm VCVTPS2PHZ : avx512_cvtps2ph<v16i16x_info, v16f32_info, f256mem,
+ SSE_CVT_PS2PH>,
+ avx512_cvtps2ph_sae<v16i16x_info, v16f32_info,
+ SSE_CVT_PS2PH>, EVEX, EVEX_V512,
+ EVEX_CD8<32, CD8VH>;
let Predicates = [HasVLX] in {
- defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>,
- EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
- defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>,
- EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
+ defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem,
+ SSE_CVT_PS2PH>, EVEX, EVEX_V256,
+ EVEX_CD8<32, CD8VH>;
+ defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem,
+ SSE_CVT_PS2PH>, EVEX, EVEX_V128,
+ EVEX_CD8<32, CD8VH>;
}
+
+ def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+ def : Pat<(store (i64 (extractelt
+ (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, imm:$src2)>;
+ def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, i32:$src2)), addr:$dst),
+ (VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, imm:$src2)>;
+ def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, i32:$src2)), addr:$dst),
+ (VCVTPS2PHZmr addr:$dst, VR512:$src1, imm:$src2)>;
}
// Patterns for matching conversions from float to half-float and vice versa.
@@ -7580,502 +7587,500 @@ let Predicates = [HasVLX] in {
(VCVTPS2PHZ128rr (COPY_TO_REGCLASS FR32X:$src, VR128X), 4)), FR32X)) >;
}
-// Patterns for matching float to half-float conversion when AVX512 is supported
-// but F16C isn't. In that case we have to use 512-bit vectors.
-let Predicates = [HasAVX512, NoVLX, NoF16C] in {
- def : Pat<(fp_to_f16 FR32X:$src),
- (i16 (EXTRACT_SUBREG
- (VMOVPDI2DIZrr
- (v8i16 (EXTRACT_SUBREG
- (VCVTPS2PHZrr
- (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
- sub_xmm), 4), sub_xmm))), sub_16bit))>;
-
- def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS
- (v4f32 (EXTRACT_SUBREG
- (VCVTPH2PSZrr
- (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)),
- (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)),
- sub_xmm)), sub_xmm)), FR32X))>;
-
- def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
- (f32 (COPY_TO_REGCLASS
- (v4f32 (EXTRACT_SUBREG
- (VCVTPH2PSZrr
- (VCVTPS2PHZrr (INSERT_SUBREG (v16f32 (IMPLICIT_DEF)),
- (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)),
- sub_xmm), 4)), sub_xmm)), FR32X))>;
-}
-
// Unordered/Ordered scalar fp compare with Sae and set EFLAGS
multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr> {
- def rb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
- !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
- [], IIC_SSE_COMIS_RR>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
- Sched<[WriteFAdd]>;
+ string OpcodeStr, OpndItins itins> {
+ let hasSideEffects = 0 in
+ def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
+ !strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"),
+ [], itins.rr>, EVEX, EVEX_B, VEX_LIG, EVEX_V128,
+ Sched<[itins.Sched]>;
}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
- defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss">,
+ defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", SSE_COMIS>,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
- defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd">,
+ defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", SSE_COMIS>,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
- defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss">,
+ defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", SSE_COMIS>,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
- defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd">,
+ defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", SSE_COMIS>,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
- "ucomiss">, PS, EVEX, VEX_LIG,
+ "ucomiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
- "ucomisd">, PD, EVEX,
+ "ucomisd", SSE_COMIS>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
let Pattern = []<dag> in {
defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
- "comiss">, PS, EVEX, VEX_LIG,
+ "comiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
- "comisd">, PD, EVEX,
+ "comisd", SSE_COMIS>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
let isCodeGenOnly = 1 in {
defm Int_VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss">, PS, EVEX, VEX_LIG,
+ sse_load_f32, "ucomiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm Int_VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd">, PD, EVEX,
+ sse_load_f64, "ucomisd", SSE_COMIS>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
defm Int_VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss">, PS, EVEX, VEX_LIG,
+ sse_load_f32, "comiss", SSE_COMIS>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm Int_VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd">, PD, EVEX,
+ sse_load_f64, "comisd", SSE_COMIS>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
}
/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>, EVEX_4V;
+ (OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2)), itins.rr>,
+ EVEX_4V, Sched<[itins.Sched]>;
defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))))>, EVEX_4V;
+ _.ScalarIntMemCPat:$src2), itins.rm>, EVEX_4V,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
-defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86frcp14s, f32x_info>,
- EVEX_CD8<32, CD8VT1>, T8PD;
-defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86frcp14s, f64x_info>,
- VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
-defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86frsqrt14s, f32x_info>,
- EVEX_CD8<32, CD8VT1>, T8PD;
-defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
- VEX_W, EVEX_CD8<64, CD8VT1>, T8PD;
+defm VRCP14SS : avx512_fp14_s<0x4D, "vrcp14ss", X86rcp14s, SSE_RCPS, f32x_info>,
+ EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
+defm VRCP14SD : avx512_fp14_s<0x4D, "vrcp14sd", X86rcp14s, SSE_RCPS, f64x_info>,
+ VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
+defm VRSQRT14SS : avx512_fp14_s<0x4F, "vrsqrt14ss", X86rsqrt14s, SSE_RSQRTSS, f32x_info>,
+ EVEX_CD8<32, CD8VT1>, T8PD, NotMemoryFoldable;
+defm VRSQRT14SD : avx512_fp14_s<0x4F, "vrsqrt14sd", X86rsqrt14s, SSE_RSQRTSS, f64x_info>,
+ VEX_W, EVEX_CD8<64, CD8VT1>, T8PD, NotMemoryFoldable;
/// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD;
+ (_.FloatVT (OpNode _.RC:$src)), itins.rr>, EVEX, T8PD,
+ Sched<[itins.Sched]>;
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
(OpNode (_.FloatVT
- (bitconvert (_.LdFrag addr:$src))))>, EVEX, T8PD;
+ (bitconvert (_.LdFrag addr:$src)))), itins.rm>, EVEX, T8PD,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.FloatVT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
- EVEX, T8PD, EVEX_B;
+ (X86VBroadcast (_.ScalarLdFrag addr:$src)))), itins.rm>,
+ EVEX, T8PD, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
- defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, v16f32_info>,
- EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, v8f64_info>,
- EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
+multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SizeItins itins> {
+ defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, itins.s,
+ v16f32_info>, EVEX_V512, EVEX_CD8<32, CD8VF>;
+ defm PDZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"), OpNode, itins.d,
+ v8f64_info>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, v4f32x_info>,
+ OpNode, itins.s, v4f32x_info>,
EVEX_V128, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, v8f32x_info>,
+ OpNode, itins.s, v8f32x_info>,
EVEX_V256, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, v2f64x_info>,
+ OpNode, itins.d, v2f64x_info>,
EVEX_V128, VEX_W, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_fp14_p<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, v4f64x_info>,
+ OpNode, itins.d, v4f64x_info>,
EVEX_V256, VEX_W, EVEX_CD8<64, CD8VF>;
}
}
-defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86frsqrt>;
-defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
+defm VRSQRT14 : avx512_fp14_p_vl_all<0x4E, "vrsqrt14", X86rsqrt14, SSE_RSQRT_P>;
+defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SSE_RCP_P>;
/// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- SDNode OpNode> {
+ SDNode OpNode, OpndItins itins> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_CURRENT))>;
+ (i32 FROUND_CURRENT)), itins.rr>,
+ Sched<[itins.Sched]>;
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 FROUND_NO_EXC))>, EVEX_B;
+ (i32 FROUND_NO_EXC)), itins.rm>, EVEX_B,
+ Sched<[itins.Sched]>;
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
- (i32 FROUND_CURRENT))>;
+ (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
+ (i32 FROUND_CURRENT)), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> {
- defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode>,
+multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SizeItins itins> {
+ defm SS : avx512_fp28_s<opc, OpcodeStr#"ss", f32x_info, OpNode, itins.s>,
EVEX_CD8<32, CD8VT1>;
- defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode>,
+ defm SD : avx512_fp28_s<opc, OpcodeStr#"sd", f64x_info, OpNode, itins.d>,
EVEX_CD8<64, CD8VT1>, VEX_W;
}
let Predicates = [HasERI] in {
- defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s>, T8PD, EVEX_4V;
- defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s>, T8PD, EVEX_4V;
+ defm VRCP28 : avx512_eri_s<0xCB, "vrcp28", X86rcp28s, SSE_RCP_S>,
+ T8PD, EVEX_4V;
+ defm VRSQRT28 : avx512_eri_s<0xCD, "vrsqrt28", X86rsqrt28s, SSE_RSQRT_S>,
+ T8PD, EVEX_4V;
}
-defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V;
+defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds, SSE_ALU_ITINS_S>,
+ T8PD, EVEX_4V;
/// avx512_fp28_p rcp28ps, rcp28pd, rsqrt28ps, rsqrt28pd
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- SDNode OpNode> {
+ SDNode OpNode, OpndItins itins> {
let ExeDomain = _.ExeDomain in {
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT)),
+ itins.rr>, Sched<[itins.Sched]>;
defm m : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
(OpNode (_.FloatVT
(bitconvert (_.LdFrag addr:$src))),
- (i32 FROUND_CURRENT))>;
+ (i32 FROUND_CURRENT)), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
(OpNode (_.FloatVT
(X86VBroadcast (_.ScalarLdFrag addr:$src))),
- (i32 FROUND_CURRENT))>, EVEX_B;
+ (i32 FROUND_CURRENT)), itins.rm>, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- SDNode OpNode> {
+ SDNode OpNode, OpndItins itins> {
let ExeDomain = _.ExeDomain in
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
- (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC))>, EVEX_B;
+ (OpNode (_.VT _.RC:$src), (i32 FROUND_NO_EXC)),
+ itins.rr>, EVEX_B, Sched<[itins.Sched]>;
}
-multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode> {
- defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
- avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode>,
+multiclass avx512_eri<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SizeItins itins> {
+ defm PS : avx512_fp28_p<opc, OpcodeStr#"ps", v16f32_info, OpNode, itins.s>,
+ avx512_fp28_p_round<opc, OpcodeStr#"ps", v16f32_info, OpNode, itins.s>,
T8PD, EVEX_V512, EVEX_CD8<32, CD8VF>;
- defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
- avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode>,
+ defm PD : avx512_fp28_p<opc, OpcodeStr#"pd", v8f64_info, OpNode, itins.d>,
+ avx512_fp28_p_round<opc, OpcodeStr#"pd", v8f64_info, OpNode, itins.d>,
T8PD, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass avx512_fp_unaryop_packed<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
+ SDNode OpNode, SizeItins itins> {
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
- defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode>,
+ defm PSZ128 : avx512_fp28_p<opc, OpcodeStr#"ps", v4f32x_info, OpNode, itins.s>,
EVEX_V128, T8PD, EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode>,
+ defm PSZ256 : avx512_fp28_p<opc, OpcodeStr#"ps", v8f32x_info, OpNode, itins.s>,
EVEX_V256, T8PD, EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode>,
+ defm PDZ128 : avx512_fp28_p<opc, OpcodeStr#"pd", v2f64x_info, OpNode, itins.d>,
EVEX_V128, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode>,
+ defm PDZ256 : avx512_fp28_p<opc, OpcodeStr#"pd", v4f64x_info, OpNode, itins.d>,
EVEX_V256, VEX_W, T8PD, EVEX_CD8<64, CD8VF>;
}
}
let Predicates = [HasERI] in {
- defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28>, EVEX;
- defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28>, EVEX;
- defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2>, EVEX;
+ defm VRSQRT28 : avx512_eri<0xCC, "vrsqrt28", X86rsqrt28, SSE_RSQRT_P>, EVEX;
+ defm VRCP28 : avx512_eri<0xCA, "vrcp28", X86rcp28, SSE_RCP_P>, EVEX;
+ defm VEXP2 : avx512_eri<0xC8, "vexp2", X86exp2, SSE_ALU_ITINS_P>, EVEX;
}
-defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>,
- avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd> , EVEX;
+defm VGETEXP : avx512_eri<0x42, "vgetexp", X86fgetexpRnd, SSE_ALU_ITINS_P>,
+ avx512_fp_unaryop_packed<0x42, "vgetexp", X86fgetexpRnd,
+ SSE_ALU_ITINS_P>, EVEX;
-multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
- SDNode OpNodeRnd, X86VectorVTInfo _>{
+multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr, OpndItins itins,
+ X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in
defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
- (_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>,
- EVEX, EVEX_B, EVEX_RC;
+ (_.VT (X86fsqrtRnd _.RC:$src, (i32 imm:$rc))), itins.rr>,
+ EVEX, EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
}
-multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86VectorVTInfo _>{
+multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr, OpndItins itins,
+ X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.FloatVT (OpNode _.RC:$src))>, EVEX;
+ (_.FloatVT (fsqrt _.RC:$src)), itins.rr>, EVEX,
+ Sched<[itins.Sched]>;
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (OpNode (_.FloatVT
- (bitconvert (_.LdFrag addr:$src))))>, EVEX;
-
+ (fsqrt (_.FloatVT
+ (bitconvert (_.LdFrag addr:$src)))), itins.rm>, EVEX,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (OpNode (_.FloatVT
- (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
- EVEX, EVEX_B;
+ (fsqrt (_.FloatVT
+ (X86VBroadcast (_.ScalarLdFrag addr:$src)))), itins.rm>,
+ EVEX, EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
- SDNode OpNode> {
- defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
- v16f32_info>,
+multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr> {
+ defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"), SSE_SQRTPS, v16f32_info>,
EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
- v8f64_info>,
+ defm PDZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"), SSE_SQRTPD, v8f64_info>,
EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
// Define only if AVX512VL feature is present.
let Predicates = [HasVLX] in {
defm PSZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, v4f32x_info>,
+ SSE_SQRTPS, v4f32x_info>,
EVEX_V128, PS, EVEX_CD8<32, CD8VF>;
defm PSZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
- OpNode, v8f32x_info>,
+ SSE_SQRTPS, v8f32x_info>,
EVEX_V256, PS, EVEX_CD8<32, CD8VF>;
defm PDZ128 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, v2f64x_info>,
+ SSE_SQRTPD, v2f64x_info>,
EVEX_V128, VEX_W, PD, EVEX_CD8<64, CD8VF>;
defm PDZ256 : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "pd"),
- OpNode, v4f64x_info>,
+ SSE_SQRTPD, v4f64x_info>,
EVEX_V256, VEX_W, PD, EVEX_CD8<64, CD8VF>;
}
}
-multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
- SDNode OpNodeRnd> {
- defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), OpNodeRnd,
+multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr> {
+ defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"), SSE_SQRTPS,
v16f32_info>, EVEX_V512, PS, EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), OpNodeRnd,
+ defm PDZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "pd"), SSE_SQRTPD,
v8f64_info>, EVEX_V512, VEX_W, PD, EVEX_CD8<64, CD8VF>;
}
-multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
- string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
+multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, OpndItins itins,
+ X86VectorVTInfo _, string SUFF, Intrinsic Intr> {
let ExeDomain = _.ExeDomain in {
defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNodeRnd (_.VT _.RC:$src1),
+ (X86fsqrtRnds (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 FROUND_CURRENT))>;
+ (i32 FROUND_CURRENT)), itins.rr>,
+ Sched<[itins.Sched]>;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNodeRnd (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src2))),
- (i32 FROUND_CURRENT))>;
-
+ (X86fsqrtRnds (_.VT _.RC:$src1),
+ _.ScalarIntMemCPat:$src2,
+ (i32 FROUND_CURRENT)), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
- (OpNodeRnd (_.VT _.RC:$src1),
+ (X86fsqrtRnds (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$rc))>,
- EVEX_B, EVEX_RC;
+ (i32 imm:$rc)), itins.rr>,
+ EVEX_B, EVEX_RC, Sched<[itins.Sched]>;
let isCodeGenOnly = 1, hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
-
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], itins.rr>,
+ Sched<[itins.Sched]>;
let mayLoad = 1 in
def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
- OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", [], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
- def : Pat<(_.EltVT (OpNode _.FRC:$src)),
+let Predicates = [HasAVX512] in {
+ def : Pat<(_.EltVT (fsqrt _.FRC:$src)),
(!cast<Instruction>(NAME#SUFF#Zr)
(_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
- def : Pat<(_.EltVT (OpNode (load addr:$src))),
+ def : Pat<(Intr VR128X:$src),
+ (!cast<Instruction>(NAME#SUFF#Zr_Int) VR128X:$src,
+ VR128X:$src)>;
+}
+
+let Predicates = [HasAVX512, OptForSize] in {
+ def : Pat<(_.EltVT (fsqrt (load addr:$src))),
(!cast<Instruction>(NAME#SUFF#Zm)
- (_.EltVT (IMPLICIT_DEF)), addr:$src)>, Requires<[HasAVX512, OptForSize]>;
+ (_.EltVT (IMPLICIT_DEF)), addr:$src)>;
+
+ def : Pat<(Intr _.ScalarIntMemCPat:$src2),
+ (!cast<Instruction>(NAME#SUFF#Zm_Int)
+ (_.VT (IMPLICIT_DEF)), addr:$src2)>;
+}
+
}
multiclass avx512_sqrt_scalar_all<bits<8> opc, string OpcodeStr> {
- defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", f32x_info, "SS", fsqrt,
- X86fsqrtRnds>, EVEX_CD8<32, CD8VT1>, EVEX_4V, XS;
- defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", f64x_info, "SD", fsqrt,
- X86fsqrtRnds>, EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W;
+ defm SSZ : avx512_sqrt_scalar<opc, OpcodeStr#"ss", SSE_SQRTPS, f32x_info, "SS",
+ int_x86_sse_sqrt_ss>,
+ EVEX_CD8<32, CD8VT1>, EVEX_4V, XS, NotMemoryFoldable;
+ defm SDZ : avx512_sqrt_scalar<opc, OpcodeStr#"sd", SSE_SQRTPD, f64x_info, "SD",
+ int_x86_sse2_sqrt_sd>,
+ EVEX_CD8<64, CD8VT1>, EVEX_4V, XD, VEX_W,
+ NotMemoryFoldable;
}
-defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt", fsqrt>,
- avx512_sqrt_packed_all_round<0x51, "vsqrt", X86fsqrtRnd>;
+defm VSQRT : avx512_sqrt_packed_all<0x51, "vsqrt">,
+ avx512_sqrt_packed_all_round<0x51, "vsqrt">;
defm VSQRT : avx512_sqrt_scalar_all<0x51, "vsqrt">, VEX_LIG;
-let Predicates = [HasAVX512] in {
- def : Pat<(f32 (X86frsqrt FR32X:$src)),
- (COPY_TO_REGCLASS (VRSQRT14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>;
- def : Pat<(f32 (X86frsqrt (load addr:$src))),
- (COPY_TO_REGCLASS (VRSQRT14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
- Requires<[OptForSize]>;
- def : Pat<(f32 (X86frcp FR32X:$src)),
- (COPY_TO_REGCLASS (VRCP14SSrr (v4f32 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X )>;
- def : Pat<(f32 (X86frcp (load addr:$src))),
- (COPY_TO_REGCLASS (VRCP14SSrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
- Requires<[OptForSize]>;
-}
-
-multiclass
-avx512_rndscale_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _> {
-
+multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
- defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3), (i32 FROUND_CURRENT)))>;
+ (i32 imm:$src3))), itins.rr>,
+ Sched<[itins.Sched]>;
- defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
- (_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- (i32 imm:$src3), (i32 FROUND_NO_EXC)))>, EVEX_B;
+ (_.VT (X86RndScalesRnd (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+ (i32 imm:$src3), (i32 FROUND_NO_EXC))), itins.rr>, EVEX_B,
+ Sched<[itins.Sched]>;
- defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
- (_.VT (X86RndScales (_.VT _.RC:$src1),
- (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3), (i32 FROUND_CURRENT)))>;
+ (_.VT (X86RndScales _.RC:$src1,
+ _.ScalarIntMemCPat:$src2, (i32 imm:$src3))), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+
+ let isCodeGenOnly = 1, hasSideEffects = 0 in {
+ def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.FRC:$src2, i32u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [], itins.rr>, Sched<[itins.Sched]>;
+
+ let mayLoad = 1 in
+ def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+ (ins _.FRC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
+ OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [], itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
+ }
+
let Predicates = [HasAVX512] in {
- def : Pat<(ffloor _.FRC:$src), (COPY_TO_REGCLASS
- (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x9))), _.FRC)>;
- def : Pat<(fceil _.FRC:$src), (COPY_TO_REGCLASS
- (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xa))), _.FRC)>;
- def : Pat<(ftrunc _.FRC:$src), (COPY_TO_REGCLASS
- (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xb))), _.FRC)>;
- def : Pat<(frint _.FRC:$src), (COPY_TO_REGCLASS
- (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0x4))), _.FRC)>;
- def : Pat<(fnearbyint _.FRC:$src), (COPY_TO_REGCLASS
- (_.VT (!cast<Instruction>(NAME##r) (_.VT (IMPLICIT_DEF)),
- (_.VT (COPY_TO_REGCLASS _.FRC:$src, _.RC)), (i32 0xc))), _.FRC)>;
-
- def : Pat<(ffloor (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
- (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x9))), _.FRC)>;
- def : Pat<(fceil (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
- (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0xa))), _.FRC)>;
- def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
- (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0xb))), _.FRC)>;
- def : Pat<(frint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
- (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0x4))), _.FRC)>;
- def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)), (COPY_TO_REGCLASS
- (_.VT (!cast<Instruction>(NAME##m) (_.VT (IMPLICIT_DEF)),
- addr:$src, (i32 0xc))), _.FRC)>;
- }
-}
-
-defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", f32x_info>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>;
-
-defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", f64x_info>, VEX_W,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VT1>;
+ def : Pat<(ffloor _.FRC:$src),
+ (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
+ _.FRC:$src, (i32 0x9)))>;
+ def : Pat<(fceil _.FRC:$src),
+ (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
+ _.FRC:$src, (i32 0xa)))>;
+ def : Pat<(ftrunc _.FRC:$src),
+ (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
+ _.FRC:$src, (i32 0xb)))>;
+ def : Pat<(frint _.FRC:$src),
+ (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
+ _.FRC:$src, (i32 0x4)))>;
+ def : Pat<(fnearbyint _.FRC:$src),
+ (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
+ _.FRC:$src, (i32 0xc)))>;
+ }
+
+ let Predicates = [HasAVX512, OptForSize] in {
+ def : Pat<(ffloor (_.ScalarLdFrag addr:$src)),
+ (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
+ addr:$src, (i32 0x9)))>;
+ def : Pat<(fceil (_.ScalarLdFrag addr:$src)),
+ (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
+ addr:$src, (i32 0xa)))>;
+ def : Pat<(ftrunc (_.ScalarLdFrag addr:$src)),
+ (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
+ addr:$src, (i32 0xb)))>;
+ def : Pat<(frint (_.ScalarLdFrag addr:$src)),
+ (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
+ addr:$src, (i32 0x4)))>;
+ def : Pat<(fnearbyint (_.ScalarLdFrag addr:$src)),
+ (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
+ addr:$src, (i32 0xc)))>;
+ }
+}
+
+defm VRNDSCALESS : avx512_rndscale_scalar<0x0A, "vrndscaless", SSE_ALU_F32S,
+ f32x_info>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VT1>;
+
+defm VRNDSCALESD : avx512_rndscale_scalar<0x0B, "vrndscalesd", SSE_ALU_F64S,
+ f64x_info>, VEX_W, AVX512AIi8Base, EVEX_4V,
+ EVEX_CD8<64, CD8VT1>;
//-------------------------------------------------
// Integer truncate and extend operations
//-------------------------------------------------
+let Sched = WriteShuffle256 in
+def AVX512_EXTEND : OpndItins<
+ IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
+>;
+
+let Sched = WriteShuffle256 in
+def AVX512_TRUNCATE : OpndItins<
+ IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
+>;
+
multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo SrcInfo, X86VectorVTInfo DestInfo,
- X86MemOperand x86memop> {
+ OpndItins itins, X86VectorVTInfo SrcInfo,
+ X86VectorVTInfo DestInfo, X86MemOperand x86memop> {
let ExeDomain = DestInfo.ExeDomain in
defm rr : AVX512_maskable<opc, MRMDestReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1), OpcodeStr ,"$src1", "$src1",
- (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1)))>,
- EVEX, T8XS;
-
- // for intrinsic patter match
- def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
- (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
- undef)),
- (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
- SrcInfo.RC:$src1)>;
-
- def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
- (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
- DestInfo.ImmAllZerosV)),
- (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrkz) DestInfo.KRCWM:$mask ,
- SrcInfo.RC:$src1)>;
-
- def : Pat<(DestInfo.VT (X86select DestInfo.KRCWM:$mask,
- (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
- DestInfo.RC:$src0)),
- (!cast<Instruction>(NAME#SrcInfo.ZSuffix##rrk) DestInfo.RC:$src0,
- DestInfo.KRCWM:$mask ,
- SrcInfo.RC:$src1)>;
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1))),
+ itins.rr>, EVEX, T8XS, Sched<[itins.Sched]>;
let mayStore = 1, mayLoad = 1, hasSideEffects = 0,
ExeDomain = DestInfo.ExeDomain in {
def mr : AVX512XS8I<opc, MRMDestMem, (outs),
(ins x86memop:$dst, SrcInfo.RC:$src),
OpcodeStr # "\t{$src, $dst|$dst, $src}",
- []>, EVEX;
+ [], itins.rm>, EVEX, Sched<[itins.Sched.Folded]>;
def mrk : AVX512XS8I<opc, MRMDestMem, (outs),
(ins x86memop:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
- []>, EVEX, EVEX_K;
+ [], itins.rm>, EVEX, EVEX_K, Sched<[itins.Sched.Folded]>;
}//mayStore = 1, mayLoad = 1, hasSideEffects = 0
}
@@ -8094,112 +8099,118 @@ multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
}
multiclass avx512_trunc<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
+ OpndItins itins, AVX512VLVectorVTInfo VTSrcInfo, X86VectorVTInfo DestInfoZ128,
X86VectorVTInfo DestInfoZ256, X86VectorVTInfo DestInfoZ,
X86MemOperand x86memopZ128, X86MemOperand x86memopZ256,
X86MemOperand x86memopZ, PatFrag truncFrag, PatFrag mtruncFrag,
Predicate prd = HasAVX512>{
let Predicates = [HasVLX, prd] in {
- defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info128,
- DestInfoZ128, x86memopZ128>,
+ defm Z128: avx512_trunc_common<opc, OpcodeStr, OpNode, itins,
+ VTSrcInfo.info128, DestInfoZ128, x86memopZ128>,
avx512_trunc_mr_lowering<VTSrcInfo.info128, DestInfoZ128,
truncFrag, mtruncFrag>, EVEX_V128;
- defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info256,
- DestInfoZ256, x86memopZ256>,
+ defm Z256: avx512_trunc_common<opc, OpcodeStr, OpNode, itins,
+ VTSrcInfo.info256, DestInfoZ256, x86memopZ256>,
avx512_trunc_mr_lowering<VTSrcInfo.info256, DestInfoZ256,
truncFrag, mtruncFrag>, EVEX_V256;
}
let Predicates = [prd] in
- defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, VTSrcInfo.info512,
- DestInfoZ, x86memopZ>,
+ defm Z: avx512_trunc_common<opc, OpcodeStr, OpNode, itins,
+ VTSrcInfo.info512, DestInfoZ, x86memopZ>,
avx512_trunc_mr_lowering<VTSrcInfo.info512, DestInfoZ,
truncFrag, mtruncFrag>, EVEX_V512;
}
multiclass avx512_trunc_qb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- PatFrag StoreNode, PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ OpndItins itins, PatFrag StoreNode,
+ PatFrag MaskedStoreNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info,
v16i8x_info, v16i8x_info, v16i8x_info, i16mem, i32mem, i64mem,
StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VO>;
}
multiclass avx512_trunc_qw<bits<8> opc, string OpcodeStr, SDNode OpNode,
- PatFrag StoreNode, PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ OpndItins itins, PatFrag StoreNode,
+ PatFrag MaskedStoreNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info,
v8i16x_info, v8i16x_info, v8i16x_info, i32mem, i64mem, i128mem,
StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VQ>;
}
multiclass avx512_trunc_qd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- PatFrag StoreNode, PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i64_info,
+ OpndItins itins, PatFrag StoreNode,
+ PatFrag MaskedStoreNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i64_info,
v4i32x_info, v4i32x_info, v8i32x_info, i64mem, i128mem, i256mem,
StoreNode, MaskedStoreNode>, EVEX_CD8<32, CD8VH>;
}
multiclass avx512_trunc_db<bits<8> opc, string OpcodeStr, SDNode OpNode,
- PatFrag StoreNode, PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ OpndItins itins, PatFrag StoreNode,
+ PatFrag MaskedStoreNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i32_info,
v16i8x_info, v16i8x_info, v16i8x_info, i32mem, i64mem, i128mem,
StoreNode, MaskedStoreNode>, EVEX_CD8<8, CD8VQ>;
}
multiclass avx512_trunc_dw<bits<8> opc, string OpcodeStr, SDNode OpNode,
- PatFrag StoreNode, PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i32_info,
+ OpndItins itins, PatFrag StoreNode,
+ PatFrag MaskedStoreNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i32_info,
v8i16x_info, v8i16x_info, v16i16x_info, i64mem, i128mem, i256mem,
StoreNode, MaskedStoreNode>, EVEX_CD8<16, CD8VH>;
}
multiclass avx512_trunc_wb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- PatFrag StoreNode, PatFrag MaskedStoreNode> {
- defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, avx512vl_i16_info,
+ OpndItins itins, PatFrag StoreNode,
+ PatFrag MaskedStoreNode> {
+ defm NAME: avx512_trunc<opc, OpcodeStr, OpNode, itins, avx512vl_i16_info,
v16i8x_info, v16i8x_info, v32i8x_info, i64mem, i128mem, i256mem,
StoreNode, MaskedStoreNode, HasBWI>, EVEX_CD8<16, CD8VH>;
}
-defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc,
+defm VPMOVQB : avx512_trunc_qb<0x32, "vpmovqb", X86vtrunc, AVX512_TRUNCATE,
truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs,
+defm VPMOVSQB : avx512_trunc_qb<0x22, "vpmovsqb", X86vtruncs, AVX512_TRUNCATE,
truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus,
+defm VPMOVUSQB : avx512_trunc_qb<0x12, "vpmovusqb", X86vtruncus, AVX512_TRUNCATE,
truncstore_us_vi8, masked_truncstore_us_vi8>;
-defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc,
+defm VPMOVQW : avx512_trunc_qw<0x34, "vpmovqw", X86vtrunc, AVX512_TRUNCATE,
truncstorevi16, masked_truncstorevi16>;
-defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs,
+defm VPMOVSQW : avx512_trunc_qw<0x24, "vpmovsqw", X86vtruncs, AVX512_TRUNCATE,
truncstore_s_vi16, masked_truncstore_s_vi16>;
-defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus,
+defm VPMOVUSQW : avx512_trunc_qw<0x14, "vpmovusqw", X86vtruncus, AVX512_TRUNCATE,
truncstore_us_vi16, masked_truncstore_us_vi16>;
-defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc,
+defm VPMOVQD : avx512_trunc_qd<0x35, "vpmovqd", X86vtrunc, AVX512_TRUNCATE,
truncstorevi32, masked_truncstorevi32>;
-defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs,
+defm VPMOVSQD : avx512_trunc_qd<0x25, "vpmovsqd", X86vtruncs, AVX512_TRUNCATE,
truncstore_s_vi32, masked_truncstore_s_vi32>;
-defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus,
+defm VPMOVUSQD : avx512_trunc_qd<0x15, "vpmovusqd", X86vtruncus, AVX512_TRUNCATE,
truncstore_us_vi32, masked_truncstore_us_vi32>;
-defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc,
+defm VPMOVDB : avx512_trunc_db<0x31, "vpmovdb", X86vtrunc, AVX512_TRUNCATE,
truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs,
+defm VPMOVSDB : avx512_trunc_db<0x21, "vpmovsdb", X86vtruncs, AVX512_TRUNCATE,
truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus,
+defm VPMOVUSDB : avx512_trunc_db<0x11, "vpmovusdb", X86vtruncus, AVX512_TRUNCATE,
truncstore_us_vi8, masked_truncstore_us_vi8>;
-defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc,
+defm VPMOVDW : avx512_trunc_dw<0x33, "vpmovdw", X86vtrunc, AVX512_TRUNCATE,
truncstorevi16, masked_truncstorevi16>;
-defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs,
+defm VPMOVSDW : avx512_trunc_dw<0x23, "vpmovsdw", X86vtruncs, AVX512_TRUNCATE,
truncstore_s_vi16, masked_truncstore_s_vi16>;
-defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus,
+defm VPMOVUSDW : avx512_trunc_dw<0x13, "vpmovusdw", X86vtruncus, AVX512_TRUNCATE,
truncstore_us_vi16, masked_truncstore_us_vi16>;
-defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc,
+defm VPMOVWB : avx512_trunc_wb<0x30, "vpmovwb", X86vtrunc, AVX512_TRUNCATE,
truncstorevi8, masked_truncstorevi8>;
-defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs,
+defm VPMOVSWB : avx512_trunc_wb<0x20, "vpmovswb", X86vtruncs, AVX512_TRUNCATE,
truncstore_s_vi8, masked_truncstore_s_vi8>;
-defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus,
+defm VPMOVUSWB : avx512_trunc_wb<0x10, "vpmovuswb", X86vtruncus, AVX512_TRUNCATE,
truncstore_us_vi8, masked_truncstore_us_vi8>;
let Predicates = [HasAVX512, NoVLX] in {
@@ -8219,191 +8230,151 @@ def: Pat<(v16i8 (X86vtrunc (v16i16 VR256X:$src))),
VR256X:$src, sub_ymm))), sub_xmm))>;
}
-multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
+multiclass avx512_extend_common<bits<8> opc, string OpcodeStr, OpndItins itins,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo,
X86MemOperand x86memop, PatFrag LdFrag, SDPatternOperator OpNode>{
let ExeDomain = DestInfo.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src), OpcodeStr ,"$src", "$src",
- (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src)))>,
- EVEX;
+ (DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src))), itins.rr>,
+ EVEX, Sched<[itins.Sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins x86memop:$src), OpcodeStr ,"$src", "$src",
- (DestInfo.VT (LdFrag addr:$src))>,
- EVEX;
+ (DestInfo.VT (LdFrag addr:$src)), itins.rm>,
+ EVEX, Sched<[itins.Sched.Folded]>;
}
}
multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode,
- string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+ SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
+ OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasBWI] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, v8i16x_info,
+ defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v8i16x_info,
v16i8x_info, i64mem, LdFrag, InVecNode>,
- EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128;
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, v16i16x_info,
+ defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v16i16x_info,
v16i8x_info, i128mem, LdFrag, OpNode>,
- EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256;
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasBWI] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, v32i16_info,
+ defm Z : avx512_extend_common<opc, OpcodeStr, itins, v32i16_info,
v32i8x_info, i256mem, LdFrag, OpNode>,
- EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512;
+ EVEX_CD8<8, CD8VH>, T8PD, EVEX_V512, VEX_WIG;
}
}
multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode,
- string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+ SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
+ OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
+ defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v4i32x_info,
v16i8x_info, i32mem, LdFrag, InVecNode>,
- EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128;
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
+ defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v8i32x_info,
v16i8x_info, i64mem, LdFrag, OpNode>,
- EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256;
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info,
+ defm Z : avx512_extend_common<opc, OpcodeStr, itins, v16i32_info,
v16i8x_info, i128mem, LdFrag, OpNode>,
- EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512;
+ EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V512, VEX_WIG;
}
}
multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode,
- string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
+ SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
+ OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+ defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info,
v16i8x_info, i16mem, LdFrag, InVecNode>,
- EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128;
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+ defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info,
v16i8x_info, i32mem, LdFrag, OpNode>,
- EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256;
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
+ defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info,
v16i8x_info, i64mem, LdFrag, OpNode>,
- EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512;
+ EVEX_CD8<8, CD8VO>, T8PD, EVEX_V512, VEX_WIG;
}
}
multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode,
- string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+ SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
+ OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, v4i32x_info,
+ defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v4i32x_info,
v8i16x_info, i64mem, LdFrag, InVecNode>,
- EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128;
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, v8i32x_info,
+ defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v8i32x_info,
v8i16x_info, i128mem, LdFrag, OpNode>,
- EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256;
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, v16i32_info,
+ defm Z : avx512_extend_common<opc, OpcodeStr, itins, v16i32_info,
v16i16x_info, i256mem, LdFrag, OpNode>,
- EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512;
+ EVEX_CD8<16, CD8VH>, T8PD, EVEX_V512, VEX_WIG;
}
}
multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode,
- string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
+ SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
+ OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+ defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info,
v8i16x_info, i32mem, LdFrag, InVecNode>,
- EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128;
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128, VEX_WIG;
- defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+ defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info,
v8i16x_info, i64mem, LdFrag, OpNode>,
- EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256;
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
+ defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info,
v8i16x_info, i128mem, LdFrag, OpNode>,
- EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512;
+ EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V512, VEX_WIG;
}
}
multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
- SDPatternOperator OpNode, SDPatternOperator InVecNode,
- string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
+ SDPatternOperator OpNode, SDPatternOperator InVecNode, string ExtTy,
+ OpndItins itins, PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
let Predicates = [HasVLX, HasAVX512] in {
- defm Z128: avx512_extend_common<opc, OpcodeStr, v2i64x_info,
+ defm Z128: avx512_extend_common<opc, OpcodeStr, itins, v2i64x_info,
v4i32x_info, i64mem, LdFrag, InVecNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
- defm Z256: avx512_extend_common<opc, OpcodeStr, v4i64x_info,
+ defm Z256: avx512_extend_common<opc, OpcodeStr, itins, v4i64x_info,
v4i32x_info, i128mem, LdFrag, OpNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V256;
}
let Predicates = [HasAVX512] in {
- defm Z : avx512_extend_common<opc, OpcodeStr, v8i64_info,
+ defm Z : avx512_extend_common<opc, OpcodeStr, itins, v8i64_info,
v8i32x_info, i256mem, LdFrag, OpNode>,
EVEX_CD8<32, CD8VH>, T8PD, EVEX_V512;
}
}
-defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z">;
-defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z">;
-defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z">;
-defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z">;
-defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z">;
-defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z">;
+defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z", AVX512_EXTEND>;
+defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z", AVX512_EXTEND>;
+defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z", AVX512_EXTEND>;
+defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z", AVX512_EXTEND>;
+defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z", AVX512_EXTEND>;
+defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z", AVX512_EXTEND>;
-defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s">;
-defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s">;
-defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s">;
-defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s">;
-defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">;
-defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">;
+defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s", AVX512_EXTEND>;
+defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s", AVX512_EXTEND>;
+defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s", AVX512_EXTEND>;
+defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s", AVX512_EXTEND>;
+defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s", AVX512_EXTEND>;
+defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s", AVX512_EXTEND>;
-// EXTLOAD patterns, implemented using vpmovz
-multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To,
- X86VectorVTInfo From, PatFrag LdFrag> {
- def : Pat<(To.VT (LdFrag addr:$src)),
- (!cast<Instruction>("VPMOVZX"#InstrStr#"rm") addr:$src)>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src), To.RC:$src0)),
- (!cast<Instruction>("VPMOVZX"#InstrStr#"rmk") To.RC:$src0,
- To.KRC:$mask, addr:$src)>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask, (LdFrag addr:$src),
- To.ImmAllZerosV)),
- (!cast<Instruction>("VPMOVZX"#InstrStr#"rmkz") To.KRC:$mask,
- addr:$src)>;
-}
-
-let Predicates = [HasVLX, HasBWI] in {
- defm : avx512_ext_lowering<"BWZ128", v8i16x_info, v16i8x_info, extloadvi8>;
- defm : avx512_ext_lowering<"BWZ256", v16i16x_info, v16i8x_info, extloadvi8>;
-}
-let Predicates = [HasBWI] in {
- defm : avx512_ext_lowering<"BWZ", v32i16_info, v32i8x_info, extloadvi8>;
-}
-let Predicates = [HasVLX, HasAVX512] in {
- defm : avx512_ext_lowering<"BDZ128", v4i32x_info, v16i8x_info, extloadvi8>;
- defm : avx512_ext_lowering<"BDZ256", v8i32x_info, v16i8x_info, extloadvi8>;
- defm : avx512_ext_lowering<"BQZ128", v2i64x_info, v16i8x_info, extloadvi8>;
- defm : avx512_ext_lowering<"BQZ256", v4i64x_info, v16i8x_info, extloadvi8>;
- defm : avx512_ext_lowering<"WDZ128", v4i32x_info, v8i16x_info, extloadvi16>;
- defm : avx512_ext_lowering<"WDZ256", v8i32x_info, v8i16x_info, extloadvi16>;
- defm : avx512_ext_lowering<"WQZ128", v2i64x_info, v8i16x_info, extloadvi16>;
- defm : avx512_ext_lowering<"WQZ256", v4i64x_info, v8i16x_info, extloadvi16>;
- defm : avx512_ext_lowering<"DQZ128", v2i64x_info, v4i32x_info, extloadvi32>;
- defm : avx512_ext_lowering<"DQZ256", v4i64x_info, v4i32x_info, extloadvi32>;
-}
-let Predicates = [HasAVX512] in {
- defm : avx512_ext_lowering<"BDZ", v16i32_info, v16i8x_info, extloadvi8>;
- defm : avx512_ext_lowering<"BQZ", v8i64_info, v16i8x_info, extloadvi8>;
- defm : avx512_ext_lowering<"WDZ", v16i32_info, v16i16x_info, extloadvi16>;
- defm : avx512_ext_lowering<"WQZ", v8i64_info, v8i16x_info, extloadvi16>;
- defm : avx512_ext_lowering<"DQZ", v8i64_info, v8i32x_info, extloadvi32>;
-}
multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
SDNode InVecOp, PatFrag ExtLoad16> {
@@ -8552,18 +8523,20 @@ defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec, loadi16_anyext>;
//===----------------------------------------------------------------------===//
// GATHER - SCATTER Operations
+// FIXME: Improve scheduling of gather/scatter instructions.
multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86MemOperand memop, PatFrag GatherNode> {
+ X86MemOperand memop, PatFrag GatherNode,
+ RegisterClass MaskRC = _.KRCWM> {
let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb",
ExeDomain = _.ExeDomain in
- def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, _.KRCWM:$mask_wb),
- (ins _.RC:$src1, _.KRCWM:$mask, memop:$src2),
+ def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, MaskRC:$mask_wb),
+ (ins _.RC:$src1, MaskRC:$mask, memop:$src2),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
- [(set _.RC:$dst, _.KRCWM:$mask_wb,
- (GatherNode (_.VT _.RC:$src1), _.KRCWM:$mask,
+ [(set _.RC:$dst, MaskRC:$mask_wb,
+ (GatherNode (_.VT _.RC:$src1), MaskRC:$mask,
vectoraddr:$src2))]>, EVEX, EVEX_K,
- EVEX_CD8<_.EltSize, CD8VT1>;
+ EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>;
}
multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
@@ -8598,7 +8571,8 @@ let Predicates = [HasVLX] in {
defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
vx128xmem, mgatherv4i32>, EVEX_V128;
defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, X86mgatherv2i64>, EVEX_V128;
+ vx64xmem, mgatherv2i64, VK2WM>,
+ EVEX_V128;
}
}
@@ -8620,7 +8594,8 @@ let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
"\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
[(set _.KRCWM:$mask_wb, (ScatterNode (_.VT _.RC:$src),
_.KRCWM:$mask, vectoraddr:$dst))]>,
- EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+ EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[WriteStore]>;
}
multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc,
@@ -8671,7 +8646,7 @@ multiclass avx512_gather_scatter_prefetch<bits<8> opc, Format F, string OpcodeSt
let Predicates = [HasPFI], hasSideEffects = 1 in
def m : AVX5128I<opc, F, (outs), (ins KRC:$mask, memop:$src),
!strconcat(OpcodeStr, "\t{$src {${mask}}|{${mask}}, $src}"),
- []>, EVEX, EVEX_K;
+ [], IIC_SSE_PREFETCH>, EVEX, EVEX_K, Sched<[WriteLoad]>;
}
defm VGATHERPF0DPS: avx512_gather_scatter_prefetch<0xC6, MRM1m, "vgatherpf0dps",
@@ -8722,20 +8697,11 @@ defm VSCATTERPF1DPD: avx512_gather_scatter_prefetch<0xC6, MRM6m, "vscatterpf1dpd
defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd",
VK8WM, vz512mem>, EVEX_V512, VEX_W, EVEX_CD8<64, CD8VT1>;
-// Helper fragments to match sext vXi1 to vXiY.
-def v64i1sextv64i8 : PatLeaf<(v64i8
- (X86vsext
- (v64i1 (X86pcmpgtm
- (bc_v64i8 (v16i32 immAllZerosV)),
- VR512:$src))))>;
-def v32i1sextv32i16 : PatLeaf<(v32i16 (X86vsrai VR512:$src, (i8 15)))>;
-def v16i1sextv16i32 : PatLeaf<(v16i32 (X86vsrai VR512:$src, (i8 31)))>;
-def v8i1sextv8i64 : PatLeaf<(v8i64 (X86vsrai VR512:$src, (i8 63)))>;
-
multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
!strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
- [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
+ [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))],
+ IIC_SSE_MOV_S_RR>, EVEX, Sched<[WriteMove]>;
}
// Use the 512-bit version to implement 128/256-bit in case of NoVLX.
@@ -8773,7 +8739,8 @@ defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI
multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))]>, EVEX;
+ [(set _.KRC:$dst, (X86cvt2mask (_.VT _.RC:$src)))],
+ IIC_SSE_MOV_S_RR>, EVEX, Sched<[WriteMove]>;
}
// Use the 512-bit version to implement 128/256-bit in case of NoVLX.
@@ -8819,27 +8786,39 @@ defm VPMOVQ2M : avx512_convert_vector_to_mask<0x39, "vpmovq2m",
// AVX-512 - COMPRESS and EXPAND
//
+// FIXME: Is there a better scheduler itinerary for VPCOMPRESS/VPEXPAND?
+let Sched = WriteShuffle256 in {
+def AVX512_COMPRESS : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+def AVX512_EXPAND : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+}
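+// For reference: the compress forms pack the elements selected by the write
+// mask into the low-order elements of the destination (or store them
+// contiguously to memory), while the expand forms perform the inverse,
+// scattering contiguous source elements into the mask-selected positions.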
+
multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr> {
+ string OpcodeStr, OpndItins itins> {
defm rr : AVX512_maskable<opc, MRMDestReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86compress _.RC:$src1))>, AVX5128IBase;
+ (_.VT (X86compress _.RC:$src1)), itins.rr>, AVX5128IBase,
+ Sched<[itins.Sched]>;
let mayStore = 1, hasSideEffects = 0 in
def mr : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.RC:$src),
OpcodeStr # "\t{$src, $dst|$dst, $src}",
- []>, EVEX_CD8<_.EltSize, CD8VT1>;
+ []>, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[itins.Sched.Folded]>;
def mrk : AVX5128I<opc, MRMDestMem, (outs),
(ins _.MemOp:$dst, _.KRCWM:$mask, _.RC:$src),
OpcodeStr # "\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
[]>,
- EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>;
+ EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[itins.Sched.Folded]>;
}
multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > {
-
def : Pat<(X86mCompressingStore addr:$dst, _.KRCWM:$mask,
(_.VT _.RC:$src)),
(!cast<Instruction>(NAME#_.ZSuffix##mrk)
@@ -8847,39 +8826,44 @@ multiclass compress_by_vec_width_lowering<X86VectorVTInfo _ > {
}
multiclass compress_by_elt_width<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo VTInfo> {
- defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr>,
+ OpndItins itins,
+ AVX512VLVectorVTInfo VTInfo,
+ Predicate Pred = HasAVX512> {
+ let Predicates = [Pred] in
+ defm Z : compress_by_vec_width_common<opc, VTInfo.info512, OpcodeStr, itins>,
compress_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
- let Predicates = [HasVLX] in {
- defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr>,
+ let Predicates = [Pred, HasVLX] in {
+ defm Z256 : compress_by_vec_width_common<opc, VTInfo.info256, OpcodeStr, itins>,
compress_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
- defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr>,
+ defm Z128 : compress_by_vec_width_common<opc, VTInfo.info128, OpcodeStr, itins>,
compress_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
}
}
-defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", avx512vl_i32_info>,
- EVEX;
-defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", avx512vl_i64_info>,
- EVEX, VEX_W;
-defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", avx512vl_f32_info>,
- EVEX;
-defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", avx512vl_f64_info>,
- EVEX, VEX_W;
+defm VPCOMPRESSD : compress_by_elt_width <0x8B, "vpcompressd", AVX512_COMPRESS,
+ avx512vl_i32_info>, EVEX;
+defm VPCOMPRESSQ : compress_by_elt_width <0x8B, "vpcompressq", AVX512_COMPRESS,
+ avx512vl_i64_info>, EVEX, VEX_W;
+defm VCOMPRESSPS : compress_by_elt_width <0x8A, "vcompressps", AVX512_COMPRESS,
+ avx512vl_f32_info>, EVEX;
+defm VCOMPRESSPD : compress_by_elt_width <0x8A, "vcompresspd", AVX512_COMPRESS,
+ avx512vl_f64_info>, EVEX, VEX_W;
// expand
multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr> {
+ string OpcodeStr, OpndItins itins> {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr, "$src1", "$src1",
- (_.VT (X86expand _.RC:$src1))>, AVX5128IBase;
+ (_.VT (X86expand _.RC:$src1)), itins.rr>, AVX5128IBase,
+ Sched<[itins.Sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr, "$src1", "$src1",
(_.VT (X86expand (_.VT (bitconvert
- (_.LdFrag addr:$src1)))))>,
- AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>;
+ (_.LdFrag addr:$src1))))), itins.rm>,
+ AVX5128IBase, EVEX_CD8<_.EltSize, CD8VT1>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > {
@@ -8895,59 +8879,62 @@ multiclass expand_by_vec_width_lowering<X86VectorVTInfo _ > {
}
multiclass expand_by_elt_width<bits<8> opc, string OpcodeStr,
- AVX512VLVectorVTInfo VTInfo> {
- defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr>,
+ OpndItins itins,
+ AVX512VLVectorVTInfo VTInfo,
+ Predicate Pred = HasAVX512> {
+ let Predicates = [Pred] in
+ defm Z : expand_by_vec_width<opc, VTInfo.info512, OpcodeStr, itins>,
expand_by_vec_width_lowering<VTInfo.info512>, EVEX_V512;
- let Predicates = [HasVLX] in {
- defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr>,
+ let Predicates = [Pred, HasVLX] in {
+ defm Z256 : expand_by_vec_width<opc, VTInfo.info256, OpcodeStr, itins>,
expand_by_vec_width_lowering<VTInfo.info256>, EVEX_V256;
- defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr>,
+ defm Z128 : expand_by_vec_width<opc, VTInfo.info128, OpcodeStr, itins>,
expand_by_vec_width_lowering<VTInfo.info128>, EVEX_V128;
}
}
-defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", avx512vl_i32_info>,
- EVEX;
-defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", avx512vl_i64_info>,
- EVEX, VEX_W;
-defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", avx512vl_f32_info>,
- EVEX;
-defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", avx512vl_f64_info>,
- EVEX, VEX_W;
+defm VPEXPANDD : expand_by_elt_width <0x89, "vpexpandd", AVX512_EXPAND,
+ avx512vl_i32_info>, EVEX;
+defm VPEXPANDQ : expand_by_elt_width <0x89, "vpexpandq", AVX512_EXPAND,
+ avx512vl_i64_info>, EVEX, VEX_W;
+defm VEXPANDPS : expand_by_elt_width <0x88, "vexpandps", AVX512_EXPAND,
+ avx512vl_f32_info>, EVEX;
+defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", AVX512_EXPAND,
+ avx512vl_f64_info>, EVEX, VEX_W;
//handle instruction reg_vec1 = op(reg_vec,imm)
// op(mem_vec,imm)
// op(broadcast(eltVt),imm)
//all instructions created with FROUND_CURRENT
multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _>{
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- (i32 imm:$src2),
- (i32 FROUND_CURRENT))>;
+ (i32 imm:$src2)), itins.rr>, Sched<[itins.Sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 imm:$src2),
- (i32 FROUND_CURRENT))>;
+ (i32 imm:$src2)), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr##", $src2",
(OpNode (_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src1))),
- (i32 imm:$src2),
- (i32 FROUND_CURRENT))>, EVEX_B;
+ (i32 imm:$src2)), itins.rm>, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86VectorVTInfo _>{
+ SDNode OpNode, OpndItins itins,
+ X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
@@ -8955,21 +8942,24 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
"$src1, {sae}, $src2",
(OpNode (_.VT _.RC:$src1),
(i32 imm:$src2),
- (i32 FROUND_NO_EXC))>, EVEX_B;
+ (i32 FROUND_NO_EXC)), itins.rr>,
+ EVEX_B, Sched<[itins.Sched]>;
}
multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
- AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
+ SDNode OpNodeRnd, OpndItins itins, Predicate prd>{
let Predicates = [prd] in {
- defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
- avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
- EVEX_V512;
+ defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins,
+ _.info512>,
+ avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd,
+ itins, _.info512>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
- EVEX_V128;
- defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
- EVEX_V256;
+ defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins,
+ _.info128>, EVEX_V128;
+ defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, itins,
+ _.info256>, EVEX_V256;
}
}
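// The non-rounding forms above are matched through a plain OpNode that carries
// no rounding-mode operand; the {sae} variant is matched through the separate
// OpNodeRnd node, which still takes the FROUND_NO_EXC argument.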
@@ -8978,51 +8968,54 @@ multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
// op(reg_vec2,broadcast(eltVt),imm)
//all instructions created with FROUND_CURRENT
multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _>{
+ OpndItins itins, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3),
- (i32 FROUND_CURRENT))>;
+ (i32 imm:$src3)), itins.rr>,
+ Sched<[itins.Sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.MemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (bitconvert (_.LdFrag addr:$src2))),
- (i32 imm:$src3),
- (i32 FROUND_CURRENT))>;
+ (i32 imm:$src3)), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3),
- (i32 FROUND_CURRENT))>, EVEX_B;
+ (i32 imm:$src3)), itins.rm>, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
// op(reg_vec2,mem_vec,imm)
multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo>{
+ OpndItins itins, X86VectorVTInfo DestInfo,
+ X86VectorVTInfo SrcInfo>{
let ExeDomain = DestInfo.ExeDomain in {
defm rri : AVX512_maskable<opc, MRMSrcReg, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1, SrcInfo.RC:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT SrcInfo.RC:$src2),
- (i8 imm:$src3)))>;
+ (i8 imm:$src3))), itins.rr>,
+ Sched<[itins.Sched]>;
defm rmi : AVX512_maskable<opc, MRMSrcMem, DestInfo, (outs DestInfo.RC:$dst),
(ins SrcInfo.RC:$src1, SrcInfo.MemOp:$src2, u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(DestInfo.VT (OpNode (SrcInfo.VT SrcInfo.RC:$src1),
(SrcInfo.VT (bitconvert
(SrcInfo.LdFrag addr:$src2))),
- (i8 imm:$src3)))>;
+ (i8 imm:$src3))), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
@@ -9030,8 +9023,8 @@ multiclass avx512_3Op_rm_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
// op(reg_vec2,mem_vec,imm)
// op(reg_vec2,broadcast(eltVt),imm)
multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _>:
- avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, _, _>{
+ OpndItins itins, X86VectorVTInfo _>:
+ avx512_3Op_rm_imm8<opc, OpcodeStr, OpNode, itins, _, _>{
let ExeDomain = _.ExeDomain in
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
@@ -9040,36 +9033,37 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
"$src1, ${src2}"##_.BroadcastStr##", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src2))),
- (i8 imm:$src3))>, EVEX_B;
+ (i8 imm:$src3)), itins.rm>, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm)
// op(reg_vec2,mem_scalar,imm)
-//all instruction created with FROUND_CURRENT
multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
- (i32 imm:$src3),
- (i32 FROUND_CURRENT))>;
+ (i32 imm:$src3)), itins.rr>,
+ Sched<[itins.Sched]>;
defm rmi : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (scalar_to_vector
(_.ScalarLdFrag addr:$src2))),
- (i32 imm:$src3),
- (i32 FROUND_CURRENT))>;
+ (i32 imm:$src3)), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
//handle instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86VectorVTInfo _>{
+ SDNode OpNode, OpndItins itins,
+ X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
@@ -9078,11 +9072,13 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$src3),
- (i32 FROUND_NO_EXC))>, EVEX_B;
+ (i32 FROUND_NO_EXC)), itins.rr>,
+ EVEX_B, Sched<[itins.Sched]>;
}
+
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
-multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86VectorVTInfo _> {
+multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in
defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
@@ -9091,113 +9087,114 @@ multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(i32 imm:$src3),
- (i32 FROUND_NO_EXC))>, EVEX_B;
+ (i32 FROUND_NO_EXC)), itins.rr>,
+ EVEX_B, Sched<[itins.Sched]>;
}
multiclass avx512_common_fp_sae_packed_imm<string OpcodeStr,
- AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+ AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
+ SDNode OpNodeRnd, OpndItins itins, Predicate prd>{
let Predicates = [prd] in {
- defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
- avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNode, _.info512>,
+ defm Z : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info512>,
+ avx512_fp_sae_packed_imm<opc, OpcodeStr, OpNodeRnd, itins, _.info512>,
EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info128>,
+ defm Z128 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info128>,
EVEX_V128;
- defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, _.info256>,
+ defm Z256 : avx512_fp_packed_imm<opc, OpcodeStr, OpNode, itins, _.info256>,
EVEX_V256;
}
}
multiclass avx512_common_3Op_rm_imm8<bits<8> opc, SDNode OpNode, string OpStr,
- AVX512VLVectorVTInfo DestInfo, AVX512VLVectorVTInfo SrcInfo>{
- let Predicates = [HasBWI] in {
- defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info512,
+ OpndItins itins, AVX512VLVectorVTInfo DestInfo,
+ AVX512VLVectorVTInfo SrcInfo, Predicate Pred = HasBWI> {
+ let Predicates = [Pred] in {
+ defm Z : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info512,
SrcInfo.info512>, EVEX_V512, AVX512AIi8Base, EVEX_4V;
}
- let Predicates = [HasBWI, HasVLX] in {
- defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info128,
+ let Predicates = [Pred, HasVLX] in {
+ defm Z128 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info128,
SrcInfo.info128>, EVEX_V128, AVX512AIi8Base, EVEX_4V;
- defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, DestInfo.info256,
+ defm Z256 : avx512_3Op_rm_imm8<opc, OpStr, OpNode, itins, DestInfo.info256,
SrcInfo.info256>, EVEX_V256, AVX512AIi8Base, EVEX_4V;
}
}
multiclass avx512_common_3Op_imm8<string OpcodeStr, AVX512VLVectorVTInfo _,
- bits<8> opc, SDNode OpNode>{
- let Predicates = [HasAVX512] in {
- defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
+ bits<8> opc, SDNode OpNode, OpndItins itins,
+ Predicate Pred = HasAVX512> {
+ let Predicates = [Pred] in {
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info512>, EVEX_V512;
}
- let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info128>, EVEX_V128;
- defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
+ let Predicates = [Pred, HasVLX] in {
+ defm Z128 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info128>, EVEX_V128;
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, itins, _.info256>, EVEX_V256;
}
}
multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
- X86VectorVTInfo _, bits<8> opc, SDNode OpNode, Predicate prd>{
+ X86VectorVTInfo _, bits<8> opc, SDNode OpNode,
+ SDNode OpNodeRnd, OpndItins itins, Predicate prd>{
let Predicates = [prd] in {
- defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, _>,
- avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNode, _>;
+ defm Z128 : avx512_fp_scalar_imm<opc, OpcodeStr, OpNode, itins, _>,
+ avx512_fp_sae_scalar_imm<opc, OpcodeStr, OpNodeRnd, itins, _>;
}
}
multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
- bits<8> opcPs, bits<8> opcPd, SDNode OpNode, Predicate prd>{
+ bits<8> opcPs, bits<8> opcPd, SDNode OpNode,
+ SDNode OpNodeRnd, SizeItins itins, Predicate prd>{
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
- opcPs, OpNode, prd>, EVEX_CD8<32, CD8VF>;
+ opcPs, OpNode, OpNodeRnd, itins.s, prd>,
+ EVEX_CD8<32, CD8VF>;
defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
- opcPd, OpNode, prd>, EVEX_CD8<64, CD8VF>, VEX_W;
+ opcPd, OpNode, OpNodeRnd, itins.d, prd>,
+ EVEX_CD8<64, CD8VF>, VEX_W;
}
-
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
- X86VReduce, HasDQI>, AVX512AIi8Base, EVEX;
+ X86VReduce, X86VReduceRnd, SSE_ALU_ITINS_P, HasDQI>,
+ AVX512AIi8Base, EVEX;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
- X86VRndScale, HasAVX512>, AVX512AIi8Base, EVEX;
+ X86VRndScale, X86VRndScaleRnd, SSE_ALU_ITINS_P, HasAVX512>,
+ AVX512AIi8Base, EVEX;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
- X86VGetMant, HasAVX512>, AVX512AIi8Base, EVEX;
-
+ X86VGetMant, X86VGetMantRnd, SSE_ALU_ITINS_P, HasAVX512>,
+ AVX512AIi8Base, EVEX;
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
- 0x50, X86VRange, HasDQI>,
+ 0x50, X86VRange, X86VRangeRnd,
+ SSE_ALU_F64P, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
defm VRANGEPS : avx512_common_fp_sae_packed_imm<"vrangeps", avx512vl_f32_info,
- 0x50, X86VRange, HasDQI>,
+ 0x50, X86VRange, X86VRangeRnd,
+ SSE_ALU_F32P, HasDQI>,
AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd", f64x_info,
- 0x51, X86VRange, HasDQI>,
+defm VRANGESD: avx512_common_fp_sae_scalar_imm<"vrangesd",
+ f64x_info, 0x51, X86Ranges, X86RangesRnd, SSE_ALU_F64S, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VRANGESS: avx512_common_fp_sae_scalar_imm<"vrangess", f32x_info,
- 0x51, X86VRange, HasDQI>,
+ 0x51, X86Ranges, X86RangesRnd, SSE_ALU_F32S, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VREDUCESD: avx512_common_fp_sae_scalar_imm<"vreducesd", f64x_info,
- 0x57, X86Reduces, HasDQI>,
+ 0x57, X86Reduces, X86ReducesRnd, SSE_ALU_F64S, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VREDUCESS: avx512_common_fp_sae_scalar_imm<"vreducess", f32x_info,
- 0x57, X86Reduces, HasDQI>,
+ 0x57, X86Reduces, X86ReducesRnd, SSE_ALU_F32S, HasDQI>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VGETMANTSD: avx512_common_fp_sae_scalar_imm<"vgetmantsd", f64x_info,
- 0x27, X86GetMants, HasAVX512>,
+ 0x27, X86GetMants, X86GetMantsRnd, SSE_ALU_F64S, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
defm VGETMANTSS: avx512_common_fp_sae_scalar_imm<"vgetmantss", f32x_info,
- 0x27, X86GetMants, HasAVX512>,
+ 0x27, X86GetMants, X86GetMantsRnd, SSE_ALU_F32S, HasAVX512>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
-multiclass avx512_shuff_packed_128<string OpcodeStr, AVX512VLVectorVTInfo _,
- bits<8> opc, SDNode OpNode = X86Shuf128>{
- let Predicates = [HasAVX512] in {
- defm Z : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info512>, EVEX_V512;
-
- }
- let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, OpNode, _.info256>, EVEX_V256;
- }
-}
let Predicates = [HasAVX512] in {
def : Pat<(v16f32 (ffloor VR512:$src)),
(VRNDSCALEPSZrri VR512:$src, (i32 0x9))>;
@@ -9222,14 +9219,71 @@ def : Pat<(v8f64 (ftrunc VR512:$src)),
(VRNDSCALEPDZrri VR512:$src, (i32 0xB))>;
}
-defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4",avx512vl_f32_info, 0x23>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2",avx512vl_f64_info, 0x23>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
-defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4",avx512vl_i32_info, 0x43>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
-defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2",avx512vl_i64_info, 0x43>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+let Predicates = [HasVLX] in {
+def : Pat<(v4f32 (ffloor VR128X:$src)),
+ (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x9))>;
+def : Pat<(v4f32 (fnearbyint VR128X:$src)),
+ (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xC))>;
+def : Pat<(v4f32 (fceil VR128X:$src)),
+ (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xA))>;
+def : Pat<(v4f32 (frint VR128X:$src)),
+ (VRNDSCALEPSZ128rri VR128X:$src, (i32 0x4))>;
+def : Pat<(v4f32 (ftrunc VR128X:$src)),
+ (VRNDSCALEPSZ128rri VR128X:$src, (i32 0xB))>;
+
+def : Pat<(v2f64 (ffloor VR128X:$src)),
+ (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x9))>;
+def : Pat<(v2f64 (fnearbyint VR128X:$src)),
+ (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xC))>;
+def : Pat<(v2f64 (fceil VR128X:$src)),
+ (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xA))>;
+def : Pat<(v2f64 (frint VR128X:$src)),
+ (VRNDSCALEPDZ128rri VR128X:$src, (i32 0x4))>;
+def : Pat<(v2f64 (ftrunc VR128X:$src)),
+ (VRNDSCALEPDZ128rri VR128X:$src, (i32 0xB))>;
+
+def : Pat<(v8f32 (ffloor VR256X:$src)),
+ (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x9))>;
+def : Pat<(v8f32 (fnearbyint VR256X:$src)),
+ (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xC))>;
+def : Pat<(v8f32 (fceil VR256X:$src)),
+ (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xA))>;
+def : Pat<(v8f32 (frint VR256X:$src)),
+ (VRNDSCALEPSZ256rri VR256X:$src, (i32 0x4))>;
+def : Pat<(v8f32 (ftrunc VR256X:$src)),
+ (VRNDSCALEPSZ256rri VR256X:$src, (i32 0xB))>;
+
+def : Pat<(v4f64 (ffloor VR256X:$src)),
+ (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x9))>;
+def : Pat<(v4f64 (fnearbyint VR256X:$src)),
+ (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xC))>;
+def : Pat<(v4f64 (fceil VR256X:$src)),
+ (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xA))>;
+def : Pat<(v4f64 (frint VR256X:$src)),
+ (VRNDSCALEPDZ256rri VR256X:$src, (i32 0x4))>;
+def : Pat<(v4f64 (ftrunc VR256X:$src)),
+ (VRNDSCALEPDZ256rri VR256X:$src, (i32 0xB))>;
+}
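+// The VRNDSCALE immediates used above decode as imm[1:0] = rounding mode,
+// imm[2] = use MXCSR rounding, imm[3] = suppress precision exceptions; hence
+// 0x9 = floor, 0xA = ceil, 0xB = trunc, 0x4 = rint and 0xC = nearbyint.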
+
+multiclass avx512_shuff_packed_128<string OpcodeStr, OpndItins itins,
+ AVX512VLVectorVTInfo _, bits<8> opc>{
+ let Predicates = [HasAVX512] in {
+ defm Z : avx512_3Op_imm8<opc, OpcodeStr, X86Shuf128, itins, _.info512>, EVEX_V512;
+
+ }
+ let Predicates = [HasAVX512, HasVLX] in {
+ defm Z256 : avx512_3Op_imm8<opc, OpcodeStr, X86Shuf128, itins, _.info256>, EVEX_V256;
+ }
+}
+
+defm VSHUFF32X4 : avx512_shuff_packed_128<"vshuff32x4", SSE_SHUFP,
+ avx512vl_f32_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFF64X2 : avx512_shuff_packed_128<"vshuff64x2", SSE_SHUFP,
+ avx512vl_f64_info, 0x23>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+defm VSHUFI32X4 : avx512_shuff_packed_128<"vshufi32x4", SSE_SHUFP,
+ avx512vl_i32_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+defm VSHUFI64X2 : avx512_shuff_packed_128<"vshufi64x2", SSE_SHUFP,
+ avx512vl_i64_info, 0x43>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
let Predicates = [HasAVX512] in {
// Provide fallback in case the load node that is used in the broadcast
@@ -9264,120 +9318,230 @@ def : Pat<(v64i8 (X86SubVBroadcast (v16i8 VR128X:$src))),
0)>;
}
-multiclass avx512_valign<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I> {
- defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign>,
+multiclass avx512_valign<string OpcodeStr, OpndItins itins,
+ AVX512VLVectorVTInfo VTInfo_I> {
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_I, 0x03, X86VAlign, itins>,
AVX512AIi8Base, EVEX_4V;
}
-defm VALIGND: avx512_valign<"valignd", avx512vl_i32_info>,
+defm VALIGND: avx512_valign<"valignd", SSE_PALIGN, avx512vl_i32_info>,
EVEX_CD8<32, CD8VF>;
-defm VALIGNQ: avx512_valign<"valignq", avx512vl_i64_info>,
+defm VALIGNQ: avx512_valign<"valignq", SSE_PALIGN, avx512vl_i64_info>,
EVEX_CD8<64, CD8VF>, VEX_W;
-multiclass avx512_vpalignr_lowering<X86VectorVTInfo _ , list<Predicate> p>{
- let Predicates = p in
- def NAME#_.VTName#rri:
- Pat<(_.VT (X86PAlignr _.RC:$src1, _.RC:$src2, (i8 imm:$imm))),
- (!cast<Instruction>(NAME#_.ZSuffix#rri)
- _.RC:$src1, _.RC:$src2, imm:$imm)>;
+defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr", SSE_PALIGN,
+ avx512vl_i8_info, avx512vl_i8_info>,
+ EVEX_CD8<8, CD8VF>;
+
+// Fragments to help convert valignq into masked valignd, or valignq/valignd
+// into vpalignr.
+def ValignqImm32XForm : SDNodeXForm<imm, [{
+ return getI8Imm(N->getZExtValue() * 2, SDLoc(N));
+}]>;
+def ValignqImm8XForm : SDNodeXForm<imm, [{
+ return getI8Imm(N->getZExtValue() * 8, SDLoc(N));
+}]>;
+def ValigndImm8XForm : SDNodeXForm<imm, [{
+ return getI8Imm(N->getZExtValue() * 4, SDLoc(N));
+}]>;
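+// For example, rotating by one 64-bit element (valignq with immediate 1) is
+// the same shuffle as rotating by two 32-bit elements (valignd with immediate
+// 2) or, within 128 bits, by eight bytes (vpalignr with immediate 8), which is
+// why the transforms above scale the immediate by 2, 8 or 4 respectively.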
+
+multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo From, X86VectorVTInfo To,
+ SDNodeXForm ImmXForm> {
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ imm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, To.RC:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ imm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
+ To.RC:$src1, To.RC:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert (To.LdFrag addr:$src2)),
+ imm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert (To.LdFrag addr:$src2)),
+ imm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+}
+
+multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
+ X86VectorVTInfo From,
+ X86VectorVTInfo To,
+ SDNodeXForm ImmXForm> :
+ avx512_vpalign_mask_lowering<OpcodeStr, OpNode, From, To, ImmXForm> {
+ def : Pat<(From.VT (OpNode From.RC:$src1,
+ (bitconvert (To.VT (X86VBroadcast
+ (To.ScalarLdFrag addr:$src2)))),
+ imm:$src3)),
+ (!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (X86VBroadcast
+ (To.ScalarLdFrag addr:$src2)))),
+ imm:$src3))),
+ To.RC:$src0)),
+ (!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
+
+ def : Pat<(To.VT (vselect To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (X86VBroadcast
+ (To.ScalarLdFrag addr:$src2)))),
+ imm:$src3))),
+ To.ImmAllZerosV)),
+ (!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
+ To.RC:$src1, addr:$src2,
+ (ImmXForm imm:$src3))>;
}
-multiclass avx512_vpalignr_lowering_common<AVX512VLVectorVTInfo _>:
- avx512_vpalignr_lowering<_.info512, [HasBWI]>,
- avx512_vpalignr_lowering<_.info128, [HasBWI, HasVLX]>,
- avx512_vpalignr_lowering<_.info256, [HasBWI, HasVLX]>;
+let Predicates = [HasAVX512] in {
+ // For 512-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ", X86VAlign, v8i64_info,
+ v16i32_info, ValignqImm32XForm>;
+}
-defm VPALIGNR: avx512_common_3Op_rm_imm8<0x0F, X86PAlignr, "vpalignr" ,
- avx512vl_i8_info, avx512vl_i8_info>,
- avx512_vpalignr_lowering_common<avx512vl_i16_info>,
- avx512_vpalignr_lowering_common<avx512vl_i32_info>,
- avx512_vpalignr_lowering_common<avx512vl_f32_info>,
- avx512_vpalignr_lowering_common<avx512vl_i64_info>,
- avx512_vpalignr_lowering_common<avx512vl_f64_info>,
- EVEX_CD8<8, CD8VF>;
+let Predicates = [HasVLX] in {
+ // For 128-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ128", X86VAlign, v2i64x_info,
+ v4i32x_info, ValignqImm32XForm>;
+ // For 256-bit we lower to the widest element type we can. So we only need
+ // to handle converting valignq to valignd.
+ defm : avx512_vpalign_mask_lowering_mb<"VALIGNDZ256", X86VAlign, v4i64x_info,
+ v8i32x_info, ValignqImm32XForm>;
+}
+
+let Predicates = [HasVLX, HasBWI] in {
+  // We can turn 128- and 256-bit VALIGND/VALIGNQ into VPALIGNR.
+ defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v2i64x_info,
+ v16i8x_info, ValignqImm8XForm>;
+ defm : avx512_vpalign_mask_lowering<"VPALIGNRZ128", X86VAlign, v4i32x_info,
+ v16i8x_info, ValigndImm8XForm>;
+}
-defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
- avx512vl_i16_info, avx512vl_i8_info>, EVEX_CD8<8, CD8VF>;
+defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw",
+ SSE_INTMUL_ITINS_P, avx512vl_i16_info, avx512vl_i8_info>,
+ EVEX_CD8<8, CD8VF>;
multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1), OpcodeStr,
"$src1", "$src1",
- (_.VT (OpNode _.RC:$src1))>, EVEX, AVX5128IBase;
+ (_.VT (OpNode _.RC:$src1)), itins.rr>, EVEX, AVX5128IBase,
+ Sched<[itins.Sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1), OpcodeStr,
"$src1", "$src1",
- (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
- EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
+ (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1)))), itins.rm>,
+ EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded]>;
}
}
multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> :
- avx512_unary_rm<opc, OpcodeStr, OpNode, _> {
+ OpndItins itins, X86VectorVTInfo _> :
+ avx512_unary_rm<opc, OpcodeStr, OpNode, itins, _> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1), OpcodeStr,
"${src1}"##_.BroadcastStr,
"${src1}"##_.BroadcastStr,
(_.VT (OpNode (X86VBroadcast
- (_.ScalarLdFrag addr:$src1))))>,
- EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ (_.ScalarLdFrag addr:$src1)))), itins.rm>,
+ EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded]>;
}
multiclass avx512_unary_rm_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+ Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+ defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info512>,
+ EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info256>,
EVEX_V256;
- defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ defm Z128 : avx512_unary_rm<opc, OpcodeStr, OpNode, itins, VTInfo.info128>,
EVEX_V128;
}
}
multiclass avx512_unary_rmb_vl<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo, Predicate prd> {
+ OpndItins itins, AVX512VLVectorVTInfo VTInfo,
+ Predicate prd> {
let Predicates = [prd] in
- defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info512>,
+ defm Z : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info512>,
EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ defm Z256 : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info256>,
EVEX_V256;
- defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, VTInfo.info128>,
+ defm Z128 : avx512_unary_rmb<opc, OpcodeStr, OpNode, itins, VTInfo.info128>,
EVEX_V128;
}
}
multiclass avx512_unary_rm_vl_dq<bits<8> opc_d, bits<8> opc_q, string OpcodeStr,
- SDNode OpNode, Predicate prd> {
- defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, avx512vl_i64_info,
- prd>, VEX_W;
- defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, avx512vl_i32_info,
- prd>;
+ SDNode OpNode, OpndItins itins, Predicate prd> {
+ defm Q : avx512_unary_rmb_vl<opc_q, OpcodeStr#"q", OpNode, itins,
+ avx512vl_i64_info, prd>, VEX_W;
+ defm D : avx512_unary_rmb_vl<opc_d, OpcodeStr#"d", OpNode, itins,
+ avx512vl_i32_info, prd>;
}
multiclass avx512_unary_rm_vl_bw<bits<8> opc_b, bits<8> opc_w, string OpcodeStr,
- SDNode OpNode, Predicate prd> {
- defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, avx512vl_i16_info, prd>;
- defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, avx512vl_i8_info, prd>;
+ SDNode OpNode, OpndItins itins, Predicate prd> {
+ defm W : avx512_unary_rm_vl<opc_w, OpcodeStr#"w", OpNode, itins,
+ avx512vl_i16_info, prd>, VEX_WIG;
+ defm B : avx512_unary_rm_vl<opc_b, OpcodeStr#"b", OpNode, itins,
+ avx512vl_i8_info, prd>, VEX_WIG;
}
multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
bits<8> opc_d, bits<8> opc_q,
- string OpcodeStr, SDNode OpNode> {
- defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode,
+ string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
+ defm NAME : avx512_unary_rm_vl_dq<opc_d, opc_q, OpcodeStr, OpNode, itins,
HasAVX512>,
- avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode,
+ avx512_unary_rm_vl_bw<opc_b, opc_w, OpcodeStr, OpNode, itins,
HasBWI>;
}
-defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs>;
+defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs, SSE_PABS>;
// VPABS: Use the 512-bit version to implement 128/256-bit in case of NoVLX.
let Predicates = [HasAVX512, NoVLX] in {
@@ -9393,137 +9557,111 @@ let Predicates = [HasAVX512, NoVLX] in {
sub_xmm)>;
}
-multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
+// Use the 512-bit version to implement 128/256-bit.
+multiclass avx512_unary_lowering<string InstrStr, SDNode OpNode,
+ AVX512VLVectorVTInfo _, Predicate prd> {
+ let Predicates = [prd, NoVLX] in {
+ def : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(InstrStr # "Zrr")
+ (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
+ _.info256.RC:$src1,
+ _.info256.SubRegIdx)),
+ _.info256.SubRegIdx)>;
- defm NAME : avx512_unary_rm_vl_dq<opc, opc, OpcodeStr, ctlz, prd>;
+ def : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)),
+ (EXTRACT_SUBREG
+ (!cast<Instruction>(InstrStr # "Zrr")
+ (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
+ _.info128.RC:$src1,
+ _.info128.SubRegIdx)),
+ _.info128.SubRegIdx)>;
+ }
}
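// As an illustration, with NoVLX the 256-bit ctlz case instantiated below
// selects roughly
//   (EXTRACT_SUBREG (VPLZCNTDZrr
//      (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src, sub_ymm)), sub_ymm)
// i.e. the operand is widened to 512 bits, the Z-form instruction is used, and
// the low half of the result is extracted again.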
-defm VPLZCNT : avx512_ctlz<0x44, "vplzcnt", HasCDI>;
-defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict, HasCDI>;
+// FIXME: Is there a better scheduler itinerary for VPLZCNT?
+defm VPLZCNT : avx512_unary_rm_vl_dq<0x44, 0x44, "vplzcnt", ctlz,
+ SSE_INTALU_ITINS_P, HasCDI>;
-// VPLZCNT: Use 512bit version to implement 128/256 bit in case NoVLX.
-let Predicates = [HasCDI, NoVLX] in {
- def : Pat<(v4i64 (ctlz VR256X:$src)),
- (EXTRACT_SUBREG
- (VPLZCNTQZrr
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
- sub_ymm)>;
- def : Pat<(v2i64 (ctlz VR128X:$src)),
- (EXTRACT_SUBREG
- (VPLZCNTQZrr
- (INSERT_SUBREG (v8i64 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
- sub_xmm)>;
+// FIXME: Is there a better scheduler itinerary for VPCONFLICT?
+defm VPCONFLICT : avx512_unary_rm_vl_dq<0xC4, 0xC4, "vpconflict", X86Conflict,
+ SSE_INTALU_ITINS_P, HasCDI>;
- def : Pat<(v8i32 (ctlz VR256X:$src)),
- (EXTRACT_SUBREG
- (VPLZCNTDZrr
- (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR256X:$src, sub_ymm)),
- sub_ymm)>;
- def : Pat<(v4i32 (ctlz VR128X:$src)),
- (EXTRACT_SUBREG
- (VPLZCNTDZrr
- (INSERT_SUBREG (v16i32 (IMPLICIT_DEF)), VR128X:$src, sub_xmm)),
- sub_xmm)>;
-}
+// VPLZCNT: Use the 512-bit version to implement 128/256-bit in case of NoVLX.
+defm : avx512_unary_lowering<"VPLZCNTQ", ctlz, avx512vl_i64_info, HasCDI>;
+defm : avx512_unary_lowering<"VPLZCNTD", ctlz, avx512vl_i32_info, HasCDI>;
//===---------------------------------------------------------------------===//
// Counts number of ones - VPOPCNTD and VPOPCNTQ
//===---------------------------------------------------------------------===//
-multiclass avx512_unary_rmb_popcnt<bits<8> opc, string OpcodeStr, X86VectorVTInfo VTInfo> {
- let Predicates = [HasVPOPCNTDQ] in
- defm Z : avx512_unary_rmb<opc, OpcodeStr, ctpop, VTInfo>, EVEX_V512;
-}
+// FIXME: Is there a better scheduler itinerary for VPOPCNTD/VPOPCNTQ?
+defm VPOPCNT : avx512_unary_rm_vl_dq<0x55, 0x55, "vpopcnt", ctpop,
+ SSE_INTALU_ITINS_P, HasVPOPCNTDQ>;
-// Use 512bit version to implement 128/256 bit.
-multiclass avx512_unary_lowering<SDNode OpNode, AVX512VLVectorVTInfo _, Predicate prd> {
- let Predicates = [prd] in {
- def Z256_Alt : Pat<(_.info256.VT(OpNode _.info256.RC:$src1)),
- (EXTRACT_SUBREG
- (!cast<Instruction>(NAME # "Zrr")
- (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
- _.info256.RC:$src1,
- _.info256.SubRegIdx)),
- _.info256.SubRegIdx)>;
-
- def Z128_Alt : Pat<(_.info128.VT(OpNode _.info128.RC:$src1)),
- (EXTRACT_SUBREG
- (!cast<Instruction>(NAME # "Zrr")
- (INSERT_SUBREG(_.info512.VT(IMPLICIT_DEF)),
- _.info128.RC:$src1,
- _.info128.SubRegIdx)),
- _.info128.SubRegIdx)>;
- }
-}
-
-defm VPOPCNTD : avx512_unary_rmb_popcnt<0x55, "vpopcntd", v16i32_info>,
- avx512_unary_lowering<ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
-defm VPOPCNTQ : avx512_unary_rmb_popcnt<0x55, "vpopcntq", v8i64_info>,
- avx512_unary_lowering<ctpop, avx512vl_i64_info, HasVPOPCNTDQ>, VEX_W;
+defm : avx512_unary_lowering<"VPOPCNTQ", ctpop, avx512vl_i64_info, HasVPOPCNTDQ>;
+defm : avx512_unary_lowering<"VPOPCNTD", ctpop, avx512vl_i32_info, HasVPOPCNTDQ>;
//===---------------------------------------------------------------------===//
// Replicate Single FP - MOVSHDUP and MOVSLDUP
//===---------------------------------------------------------------------===//
-multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode>{
- defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, avx512vl_f32_info,
- HasAVX512>, XS;
+multiclass avx512_replicate<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
+ defm NAME: avx512_unary_rm_vl<opc, OpcodeStr, OpNode, itins,
+ avx512vl_f32_info, HasAVX512>, XS;
}
-defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup>;
-defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
+defm VMOVSHDUP : avx512_replicate<0x16, "vmovshdup", X86Movshdup, SSE_MOVDDUP>;
+defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup, SSE_MOVDDUP>;
//===----------------------------------------------------------------------===//
// AVX-512 - MOVDDUP
//===----------------------------------------------------------------------===//
multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _> {
+ OpndItins itins, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain in {
defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
+ (_.VT (OpNode (_.VT _.RC:$src))), itins.rr>, EVEX,
+ Sched<[itins.Sched]>;
defm rm : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr, "$src", "$src",
(_.VT (OpNode (_.VT (scalar_to_vector
- (_.ScalarLdFrag addr:$src)))))>,
- EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+ (_.ScalarLdFrag addr:$src))))),
+ itins.rm>, EVEX, EVEX_CD8<_.EltSize, CD8VH>,
+ Sched<[itins.Sched.Folded]>;
}
}
multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo VTInfo> {
+ OpndItins itins, AVX512VLVectorVTInfo VTInfo> {
- defm Z : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info512>, EVEX_V512;
+ defm Z : avx512_unary_rm<opc, OpcodeStr, X86Movddup, itins, VTInfo.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z256 : avx512_unary_rm<opc, OpcodeStr, OpNode, VTInfo.info256>,
+ defm Z256 : avx512_unary_rm<opc, OpcodeStr, X86Movddup, itins, VTInfo.info256>,
EVEX_V256;
- defm Z128 : avx512_movddup_128<opc, OpcodeStr, OpNode, VTInfo.info128>,
- EVEX_V128;
+ defm Z128 : avx512_movddup_128<opc, OpcodeStr, X86VBroadcast, itins, VTInfo.info128>,
+ EVEX_V128;
}
}
-multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode>{
- defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode,
+multiclass avx512_movddup<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ OpndItins itins> {
+ defm NAME: avx512_movddup_common<opc, OpcodeStr, OpNode, itins,
avx512vl_f64_info>, XD, VEX_W;
}
-defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup>;
+defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SSE_MOVDDUP>;
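// A v2f64 broadcast of a 64-bit value and the 128-bit movddup are the same
// operation (duplicate the low double into both elements), which is why the
// Z128 variant above is defined on X86VBroadcast and why the broadcast
// patterns below select VMOVDDUPZ128 instructions.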
let Predicates = [HasVLX] in {
-def : Pat<(X86Movddup (loadv2f64 addr:$src)),
- (VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
(VMOVDDUPZ128rm addr:$src)>;
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
-
-def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
- (v2f64 VR128X:$src0)),
- (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (X86Movddup (loadv2f64 addr:$src)),
- (bitconvert (v4i32 immAllZerosV))),
- (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+ (VMOVDDUPZ128rm addr:$src)>;
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
(v2f64 VR128X:$src0)),
@@ -9539,6 +9677,13 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src)
def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadf64 addr:$src))),
(bitconvert (v4i32 immAllZerosV))),
(VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
+
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+ (v2f64 VR128X:$src0)),
+ (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
+def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+ (bitconvert (v4i32 immAllZerosV))),
+ (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -9576,10 +9721,9 @@ multiclass avx512_extract_elt_bw_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
def mr : AVX512Ii8<opc, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(store (_.EltVT (trunc (assertzext (OpNode (_.VT _.RC:$src1),
- imm:$src2)))),
- addr:$dst)]>,
- EVEX, EVEX_CD8<_.EltSize, CD8VT1>;
+ [(store (_.EltVT (trunc (OpNode (_.VT _.RC:$src1), imm:$src2))),
+ addr:$dst)]>,
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteShuffleLd]>;
}
multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
@@ -9589,7 +9733,7 @@ multiclass avx512_extract_elt_b<string OpcodeStr, X86VectorVTInfo _> {
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
(X86pextrb (_.VT _.RC:$src1), imm:$src2))]>,
- EVEX, TAPD;
+ EVEX, TAPD, Sched<[WriteShuffle]>;
defm NAME : avx512_extract_elt_bw_m<0x14, OpcodeStr, X86pextrb, _>, TAPD;
}
@@ -9601,14 +9745,15 @@ multiclass avx512_extract_elt_w<string OpcodeStr, X86VectorVTInfo _> {
(ins _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GR32orGR64:$dst,
- (X86pextrw (_.VT _.RC:$src1), imm:$src2))]>,
- EVEX, PD;
+ (X86pextrw (_.VT _.RC:$src1), imm:$src2))],
+ IIC_SSE_PEXTRW>, EVEX, PD, Sched<[WriteShuffle]>;
let hasSideEffects = 0 in
def rr_REV : AVX512Ii8<0x15, MRMDestReg, (outs GR32orGR64:$dst),
(ins _.RC:$src1, u8imm:$src2),
- OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- EVEX, TAPD, FoldGenData<NAME#rr>;
+ OpcodeStr#".s\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
+ IIC_SSE_PEXTRW>, EVEX, TAPD, FoldGenData<NAME#rr>,
+ Sched<[WriteShuffle]>;
defm NAME : avx512_extract_elt_bw_m<0x15, OpcodeStr, X86pextrw, _>, TAPD;
}
@@ -9622,19 +9767,20 @@ multiclass avx512_extract_elt_dq<string OpcodeStr, X86VectorVTInfo _,
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set GRC:$dst,
(extractelt (_.VT _.RC:$src1), imm:$src2))]>,
- EVEX, TAPD;
+ EVEX, TAPD, Sched<[WriteShuffle]>;
def mr : AVX512Ii8<0x16, MRMDestMem, (outs),
(ins _.ScalarMemOp:$dst, _.RC:$src1, u8imm:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(store (extractelt (_.VT _.RC:$src1),
imm:$src2),addr:$dst)]>,
- EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD;
+ EVEX, EVEX_CD8<_.EltSize, CD8VT1>, TAPD,
+ Sched<[WriteShuffleLd]>;
}
}
-defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>;
-defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>;
+defm VPEXTRBZ : avx512_extract_elt_b<"vpextrb", v16i8x_info>, VEX_WIG;
+defm VPEXTRWZ : avx512_extract_elt_w<"vpextrw", v8i16x_info>, VEX_WIG;
defm VPEXTRDZ : avx512_extract_elt_dq<"vpextrd", v4i32x_info, GR32>;
defm VPEXTRQZ : avx512_extract_elt_dq<"vpextrq", v2i64x_info, GR64>, VEX_W;
@@ -9645,7 +9791,7 @@ multiclass avx512_insert_elt_m<bits<8> opc, string OpcodeStr, SDNode OpNode,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (OpNode _.RC:$src1, (LdFrag addr:$src2), imm:$src3)))]>,
- EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
+ EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteShuffleLd, ReadAfterLd]>;
}
multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -9655,7 +9801,8 @@ multiclass avx512_insert_elt_bw<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, GR32orGR64:$src2, u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
- (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V;
+ (OpNode _.RC:$src1, GR32orGR64:$src2, imm:$src3))]>, EVEX_4V,
+ Sched<[WriteShuffle]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, OpNode, _, LdFrag>;
}
@@ -9669,7 +9816,7 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
[(set _.RC:$dst,
(_.VT (insertelt _.RC:$src1, GRC:$src2, imm:$src3)))]>,
- EVEX_4V, TAPD;
+ EVEX_4V, TAPD, Sched<[WriteShuffle]>;
defm NAME : avx512_insert_elt_m<opc, OpcodeStr, insertelt, _,
_.ScalarLdFrag>, TAPD;
@@ -9677,92 +9824,109 @@ multiclass avx512_insert_elt_dq<bits<8> opc, string OpcodeStr,
}
defm VPINSRBZ : avx512_insert_elt_bw<0x20, "vpinsrb", X86pinsrb, v16i8x_info,
- extloadi8>, TAPD;
+ extloadi8>, TAPD, VEX_WIG;
defm VPINSRWZ : avx512_insert_elt_bw<0xC4, "vpinsrw", X86pinsrw, v8i16x_info,
- extloadi16>, PD;
+ extloadi16>, PD, VEX_WIG;
defm VPINSRDZ : avx512_insert_elt_dq<0x22, "vpinsrd", v4i32x_info, GR32>;
defm VPINSRQZ : avx512_insert_elt_dq<0x22, "vpinsrq", v2i64x_info, GR64>, VEX_W;
+
//===----------------------------------------------------------------------===//
// VSHUFPS - VSHUFPD Operations
//===----------------------------------------------------------------------===//
+
multiclass avx512_shufp<string OpcodeStr, AVX512VLVectorVTInfo VTInfo_I,
AVX512VLVectorVTInfo VTInfo_FP>{
- defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp>,
- EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
- AVX512AIi8Base, EVEX_4V;
+ defm NAME: avx512_common_3Op_imm8<OpcodeStr, VTInfo_FP, 0xC6, X86Shufp,
+ SSE_SHUFP>, EVEX_CD8<VTInfo_FP.info512.EltSize, CD8VF>,
+ AVX512AIi8Base, EVEX_4V;
}
defm VSHUFPS: avx512_shufp<"vshufps", avx512vl_i32_info, avx512vl_f32_info>, PS;
defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD, VEX_W;
+
//===----------------------------------------------------------------------===//
// AVX-512 - Byte shift Left/Right
//===----------------------------------------------------------------------===//
+let Sched = WriteVecShift in
+def AVX512_BYTESHIFT : OpndItins<
+ IIC_SSE_INTSHDQ_P_RI, IIC_SSE_INTSHDQ_P_RI
+>;
+
multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
- Format MRMm, string OpcodeStr, X86VectorVTInfo _>{
+ Format MRMm, string OpcodeStr,
+ OpndItins itins, X86VectorVTInfo _>{
def rr : AVX512<opc, MRMr,
(outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))]>;
+ [(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 imm:$src2))))],
+ itins.rr>, Sched<[itins.Sched]>;
def rm : AVX512<opc, MRMm,
(outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode
(_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i8 imm:$src2))))]>;
+ (i8 imm:$src2))))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_shift_packed_all<bits<8> opc, SDNode OpNode, Format MRMr,
- Format MRMm, string OpcodeStr, Predicate prd>{
+ Format MRMm, string OpcodeStr,
+ OpndItins itins, Predicate prd>{
let Predicates = [prd] in
- defm Z512 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, v64i8_info>, EVEX_V512;
+ defm Z : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
+ OpcodeStr, itins, v64i8_info>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
defm Z256 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, v32i8x_info>, EVEX_V256;
+ OpcodeStr, itins, v32i8x_info>, EVEX_V256;
defm Z128 : avx512_shift_packed<opc, OpNode, MRMr, MRMm,
- OpcodeStr, v16i8x_info>, EVEX_V128;
+ OpcodeStr, itins, v16i8x_info>, EVEX_V128;
}
}
defm VPSLLDQ : avx512_shift_packed_all<0x73, X86vshldq, MRM7r, MRM7m, "vpslldq",
- HasBWI>, AVX512PDIi8Base, EVEX_4V;
+ AVX512_BYTESHIFT, HasBWI>, AVX512PDIi8Base,
+ EVEX_4V, VEX_WIG;
defm VPSRLDQ : avx512_shift_packed_all<0x73, X86vshrdq, MRM3r, MRM3m, "vpsrldq",
- HasBWI>, AVX512PDIi8Base, EVEX_4V;
+ AVX512_BYTESHIFT, HasBWI>, AVX512PDIi8Base,
+ EVEX_4V, VEX_WIG;
multiclass avx512_psadbw_packed<bits<8> opc, SDNode OpNode,
- string OpcodeStr, X86VectorVTInfo _dst,
- X86VectorVTInfo _src>{
+ string OpcodeStr, OpndItins itins,
+ X86VectorVTInfo _dst, X86VectorVTInfo _src> {
def rr : AVX512BI<opc, MRMSrcReg,
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
- (_src.VT _src.RC:$src2))))]>;
+ (_src.VT _src.RC:$src2))))], itins.rr>,
+ Sched<[itins.Sched]>;
def rm : AVX512BI<opc, MRMSrcMem,
(outs _dst.RC:$dst), (ins _src.RC:$src1, _src.MemOp:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _dst.RC:$dst,(_dst.VT
(OpNode (_src.VT _src.RC:$src1),
(_src.VT (bitconvert
- (_src.LdFrag addr:$src2))))))]>;
+ (_src.LdFrag addr:$src2))))))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
- string OpcodeStr, Predicate prd> {
+ string OpcodeStr, OpndItins itins,
+ Predicate prd> {
let Predicates = [prd] in
- defm Z512 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v8i64_info,
- v64i8_info>, EVEX_V512;
+ defm Z : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v8i64_info,
+ v64i8_info>, EVEX_V512;
let Predicates = [prd, HasVLX] in {
- defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v4i64x_info,
+ defm Z256 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v4i64x_info,
v32i8x_info>, EVEX_V256;
- defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, v2i64x_info,
+ defm Z128 : avx512_psadbw_packed<opc, OpNode, OpcodeStr, itins, v2i64x_info,
v16i8x_info>, EVEX_V128;
}
}
defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
- HasBWI>, EVEX_4V;
+ SSE_MPSADBW_ITINS, HasBWI>, EVEX_4V, VEX_WIG;
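// For reference, psadbw sums the absolute differences of the eight unsigned
// byte pairs in each 64-bit lane and produces the sum as a 64-bit result,
// which is why the destination infos here are 64-bit element types while the
// sources are byte vectors.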
// Transforms to swizzle an immediate to enable better matching when the
// memory operand isn't in the right place.
@@ -9827,7 +9991,7 @@ def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{
}]>;
multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _>{
+ OpndItins itins, X86VectorVTInfo _>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, u8imm:$src4),
@@ -9835,15 +9999,17 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT _.RC:$src3),
- (i8 imm:$src4)), 1, 1>, AVX512AIi8Base, EVEX_4V;
+ (i8 imm:$src4)), itins.rr, 1, 1>,
+ AVX512AIi8Base, EVEX_4V, Sched<[itins.Sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, $src3, $src2", "$src2, $src3, $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (bitconvert (_.LdFrag addr:$src3))),
- (i8 imm:$src4)), 1, 0>,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ (i8 imm:$src4)), itins.rm, 1, 0>,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
@@ -9851,8 +10017,9 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
- (i8 imm:$src4)), 1, 0>, EVEX_B,
- AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
+ (i8 imm:$src4)), itins.rm, 1, 0>, EVEX_B,
+ AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}// Constraints = "$src1 = $dst"
// Additional patterns for matching passthru operand in other positions.
@@ -9968,47 +10135,50 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
(i8 imm:$src4)), _.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode _.RC:$src2,
(X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src1, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
def : Pat<(_.VT (vselect _.KRCWM:$mask,
(OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
_.RC:$src1)),
- (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+ (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
}
-multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{
+multiclass avx512_common_ternlog<string OpcodeStr, OpndItins itins,
+ AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512] in
- defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info512>, EVEX_V512;
+ defm Z : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info128>, EVEX_V128;
- defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, _.info256>, EVEX_V256;
+ defm Z128 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info128>, EVEX_V128;
+ defm Z256 : avx512_ternlog<0x25, OpcodeStr, X86vpternlog, itins, _.info256>, EVEX_V256;
}
}
-defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", avx512vl_i32_info>;
-defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", avx512vl_i64_info>, VEX_W;
+defm VPTERNLOGD : avx512_common_ternlog<"vpternlogd", SSE_INTALU_ITINS_P,
+ avx512vl_i32_info>;
+defm VPTERNLOGQ : avx512_common_ternlog<"vpternlogq", SSE_INTALU_ITINS_P,
+ avx512vl_i64_info>, VEX_W;
//===----------------------------------------------------------------------===//
// AVX-512 - FixupImm
//===----------------------------------------------------------------------===//
multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _>{
+ OpndItins itins, X86VectorVTInfo _>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
@@ -10017,7 +10187,7 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_.IntVT _.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_CURRENT))>;
+ (i32 FROUND_CURRENT)), itins.rr>, Sched<[itins.Sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
@@ -10025,7 +10195,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_.IntVT (bitconvert (_.LdFrag addr:$src3))),
(i32 imm:$src4),
- (i32 FROUND_CURRENT))>;
+ (i32 FROUND_CURRENT)), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
@@ -10034,12 +10205,14 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_.IntVT (X86VBroadcast(_.ScalarLdFrag addr:$src3))),
(i32 imm:$src4),
- (i32 FROUND_CURRENT))>, EVEX_B;
+ (i32 FROUND_CURRENT)), itins.rm>,
+ EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
} // Constraints = "$src1 = $dst"
}
multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
- SDNode OpNode, X86VectorVTInfo _>{
+ SDNode OpNode, OpndItins itins,
+ X86VectorVTInfo _>{
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
@@ -10049,12 +10222,14 @@ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
(_.VT _.RC:$src2),
(_.IntVT _.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_NO_EXC))>, EVEX_B;
+ (i32 FROUND_NO_EXC)), itins.rr>,
+ EVEX_B, Sched<[itins.Sched]>;
}
}
multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86VectorVTInfo _, X86VectorVTInfo _src3VT> {
+ OpndItins itins, X86VectorVTInfo _,
+ X86VectorVTInfo _src3VT> {
let Constraints = "$src1 = $dst" , Predicates = [HasAVX512],
ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -10064,8 +10239,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_CURRENT))>;
-
+ (i32 FROUND_CURRENT)), itins.rr>, Sched<[itins.Sched]>;
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
@@ -10074,7 +10248,8 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
(i32 imm:$src4),
- (i32 FROUND_NO_EXC))>, EVEX_B;
+ (i32 FROUND_NO_EXC)), itins.rm>,
+ EVEX_B, Sched<[itins.Sched.Folded, ReadAfterLd]>;
defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
@@ -10083,32 +10258,34 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
(_src3VT.VT (scalar_to_vector
(_src3VT.ScalarLdFrag addr:$src3))),
(i32 imm:$src4),
- (i32 FROUND_CURRENT))>;
+ (i32 FROUND_CURRENT)), itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
-multiclass avx512_fixupimm_packed_all<AVX512VLVectorVTInfo _Vec>{
+multiclass avx512_fixupimm_packed_all<OpndItins itins, AVX512VLVectorVTInfo _Vec> {
let Predicates = [HasAVX512] in
- defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>,
- avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, _Vec.info512>,
- AVX512AIi8Base, EVEX_4V, EVEX_V512;
+ defm Z : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins,
+ _Vec.info512>,
+ avx512_fixupimm_packed_sae<0x54, "vfixupimm", X86VFixupimm, itins,
+ _Vec.info512>, AVX512AIi8Base, EVEX_4V, EVEX_V512;
let Predicates = [HasAVX512, HasVLX] in {
- defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info128>,
- AVX512AIi8Base, EVEX_4V, EVEX_V128;
- defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, _Vec.info256>,
- AVX512AIi8Base, EVEX_4V, EVEX_V256;
+ defm Z128 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins,
+ _Vec.info128>, AVX512AIi8Base, EVEX_4V, EVEX_V128;
+ defm Z256 : avx512_fixupimm_packed<0x54, "vfixupimm", X86VFixupimm, itins,
+ _Vec.info256>, AVX512AIi8Base, EVEX_4V, EVEX_V256;
}
}
defm VFIXUPIMMSS : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
- f32x_info, v4i32x_info>,
+ SSE_ALU_F32S, f32x_info, v4i32x_info>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<32, CD8VT1>;
defm VFIXUPIMMSD : avx512_fixupimm_scalar<0x55, "vfixupimm", X86VFixupimmScalar,
- f64x_info, v2i64x_info>,
+ SSE_ALU_F64S, f64x_info, v2i64x_info>,
AVX512AIi8Base, VEX_LIG, EVEX_4V, EVEX_CD8<64, CD8VT1>, VEX_W;
-defm VFIXUPIMMPS : avx512_fixupimm_packed_all<avx512vl_f32_info>,
+defm VFIXUPIMMPS : avx512_fixupimm_packed_all<SSE_ALU_F32P, avx512vl_f32_info>,
EVEX_CD8<32, CD8VF>;
-defm VFIXUPIMMPD : avx512_fixupimm_packed_all<avx512vl_f64_info>,
+defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SSE_ALU_F64P, avx512vl_f64_info>,
EVEX_CD8<64, CD8VF>, VEX_W;
@@ -10164,23 +10341,11 @@ multiclass AVX512_scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
(!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
(COPY_TO_REGCLASS FR32X:$src, VR128X))>;
- // extracted scalar math op with insert via blend
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (extractelt (v4f32 VR128X:$dst), (iPTR 0))),
- FR32X:$src))), (i8 1))),
- (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst,
- (COPY_TO_REGCLASS FR32X:$src, VR128X))>;
-
// vector math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128X:$dst),
(Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)))),
(!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
- // vector math op with insert via blend
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128X:$dst),
- (Op (v4f32 VR128X:$dst), (v4f32 VR128X:$src)), (i8 1))),
- (!cast<I>("V"#OpcPrefix#SSZrr_Int) v4f32:$dst, v4f32:$src)>;
-
// extracted masked scalar math op with insert via movss
def : Pat<(X86Movss (v4f32 VR128X:$src1),
(scalar_to_vector
@@ -10208,23 +10373,11 @@ multiclass AVX512_scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
(!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64X:$src, VR128X))>;
- // extracted scalar math op with insert via blend
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (extractelt (v2f64 VR128X:$dst), (iPTR 0))),
- FR64X:$src))), (i8 1))),
- (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst,
- (COPY_TO_REGCLASS FR64X:$src, VR128X))>;
-
// vector math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128X:$dst),
(Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)))),
(!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
- // vector math op with insert via blend
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128X:$dst),
- (Op (v2f64 VR128X:$dst), (v2f64 VR128X:$src)), (i8 1))),
- (!cast<I>("V"#OpcPrefix#SDZrr_Int) v2f64:$dst, v2f64:$src)>;
-
// extracted masked scalar math op with insert via movsd
def : Pat<(X86Movsd (v2f64 VR128X:$src1),
(scalar_to_vector
@@ -10242,3 +10395,292 @@ defm : AVX512_scalar_math_f64_patterns<fadd, "ADD">;
defm : AVX512_scalar_math_f64_patterns<fsub, "SUB">;
defm : AVX512_scalar_math_f64_patterns<fmul, "MUL">;
defm : AVX512_scalar_math_f64_patterns<fdiv, "DIV">;
+
+//===----------------------------------------------------------------------===//
+// AES instructions
+//===----------------------------------------------------------------------===//
+
+multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> {
+ let Predicates = [HasVLX, HasVAES] in {
+ defm Z128 : AESI_binop_rm_int<Op, OpStr,
+ !cast<Intrinsic>(IntPrefix),
+ loadv2i64, 0, VR128X, i128mem>,
+ EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V128, VEX_WIG;
+ defm Z256 : AESI_binop_rm_int<Op, OpStr,
+ !cast<Intrinsic>(IntPrefix##"_256"),
+ loadv4i64, 0, VR256X, i256mem>,
+ EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V256, VEX_WIG;
+ }
+ let Predicates = [HasAVX512, HasVAES] in
+ defm Z : AESI_binop_rm_int<Op, OpStr,
+ !cast<Intrinsic>(IntPrefix##"_512"),
+ loadv8i64, 0, VR512, i512mem>,
+ EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_WIG;
+}
+
+defm VAESENC : avx512_vaes<0xDC, "vaesenc", "int_x86_aesni_aesenc">;
+defm VAESENCLAST : avx512_vaes<0xDD, "vaesenclast", "int_x86_aesni_aesenclast">;
+defm VAESDEC : avx512_vaes<0xDE, "vaesdec", "int_x86_aesni_aesdec">;
+defm VAESDECLAST : avx512_vaes<0xDF, "vaesdeclast", "int_x86_aesni_aesdeclast">;
+
+//===----------------------------------------------------------------------===//
+// PCLMUL instructions - Carry less multiplication
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasAVX512, HasVPCLMULQDQ] in
+defm VPCLMULQDQZ : vpclmulqdq<VR512, i512mem, loadv8i64, int_x86_pclmulqdq_512>,
+ EVEX_4V, EVEX_V512, EVEX_CD8<64, CD8VF>, VEX_WIG;
+
+let Predicates = [HasVLX, HasVPCLMULQDQ] in {
+defm VPCLMULQDQZ128 : vpclmulqdq<VR128X, i128mem, loadv2i64, int_x86_pclmulqdq>,
+ EVEX_4V, EVEX_V128, EVEX_CD8<64, CD8VF>, VEX_WIG;
+
+defm VPCLMULQDQZ256: vpclmulqdq<VR256X, i256mem, loadv4i64,
+ int_x86_pclmulqdq_256>, EVEX_4V, EVEX_V256,
+ EVEX_CD8<64, CD8VF>, VEX_WIG;
+}
+
+// Aliases
+defm : vpclmulqdq_aliases<"VPCLMULQDQZ", VR512, i512mem>;
+defm : vpclmulqdq_aliases<"VPCLMULQDQZ128", VR128X, i128mem>;
+defm : vpclmulqdq_aliases<"VPCLMULQDQZ256", VR256X, i256mem>;
+
+//===----------------------------------------------------------------------===//
+// VBMI2
+//===----------------------------------------------------------------------===//
+
+multiclass VBMI2_shift_var_rm<bits<8> Op, string OpStr, SDNode OpNode,
+ OpndItins itins, X86VectorVTInfo VTI> {
+ let Constraints = "$src1 = $dst",
+ ExeDomain = VTI.ExeDomain in {
+ defm r: AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
+ "$src3, $src2", "$src2, $src3",
+ (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2, VTI.RC:$src3)),
+ itins.rr>, AVX512FMA3Base, Sched<[itins.Sched]>;
+ defm m: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
+ "$src3, $src2", "$src2, $src3",
+ (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
+ (VTI.VT (bitconvert (VTI.LdFrag addr:$src3))))),
+ itins.rm>, AVX512FMA3Base,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ }
+}
+
+multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
+ OpndItins itins, X86VectorVTInfo VTI>
+ : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI> {
+ let Constraints = "$src1 = $dst",
+ ExeDomain = VTI.ExeDomain in
+ defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr,
+ "${src3}"##VTI.BroadcastStr##", $src2",
+ "$src2, ${src3}"##VTI.BroadcastStr,
+ (OpNode VTI.RC:$src1, VTI.RC:$src2,
+ (VTI.VT (X86VBroadcast (VTI.ScalarLdFrag addr:$src3)))),
+ itins.rm>, AVX512FMA3Base, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass VBMI2_shift_var_rm_common<bits<8> Op, string OpStr, SDNode OpNode,
+ OpndItins itins, AVX512VLVectorVTInfo VTI> {
+ let Predicates = [HasVBMI2] in
+ defm Z : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info512>, EVEX_V512;
+ let Predicates = [HasVBMI2, HasVLX] in {
+ defm Z256 : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info256>, EVEX_V256;
+ defm Z128 : VBMI2_shift_var_rm<Op, OpStr, OpNode, itins, VTI.info128>, EVEX_V128;
+ }
+}
+
+multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
+ OpndItins itins, AVX512VLVectorVTInfo VTI> {
+ let Predicates = [HasVBMI2] in
+ defm Z : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info512>, EVEX_V512;
+ let Predicates = [HasVBMI2, HasVLX] in {
+ defm Z256 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info256>, EVEX_V256;
+ defm Z128 : VBMI2_shift_var_rmb<Op, OpStr, OpNode, itins, VTI.info128>, EVEX_V128;
+ }
+}
+multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
+ SDNode OpNode, OpndItins itins> {
+ defm W : VBMI2_shift_var_rm_common<wOp, Prefix##"w", OpNode, itins,
+ avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
+ defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix##"d", OpNode, itins,
+ avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
+ defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix##"q", OpNode, itins,
+ avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
+}
+
+multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
+ SDNode OpNode, OpndItins itins> {
+ defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix##"w", itins,
+ avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>,
+ VEX_W, EVEX_CD8<16, CD8VF>;
+ defm D : avx512_common_3Op_imm8<Prefix##"d", avx512vl_i32_info, dqOp,
+ OpNode, itins, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
+ defm Q : avx512_common_3Op_imm8<Prefix##"q", avx512vl_i64_info, dqOp, OpNode,
+ itins, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
+}
+
+// Concat & Shift
+defm VPSHLDV : VBMI2_shift_var<0x70, 0x71, "vpshldv", X86VShldv, SSE_INTMUL_ITINS_P>;
+defm VPSHRDV : VBMI2_shift_var<0x72, 0x73, "vpshrdv", X86VShrdv, SSE_INTMUL_ITINS_P>;
+defm VPSHLD : VBMI2_shift_imm<0x70, 0x71, "vpshld", X86VShld, SSE_INTMUL_ITINS_P>;
+defm VPSHRD : VBMI2_shift_imm<0x72, 0x73, "vpshrd", X86VShrd, SSE_INTMUL_ITINS_P>;
+
+// Compress
+defm VPCOMPRESSB : compress_by_elt_width<0x63, "vpcompressb", AVX512_COMPRESS,
+ avx512vl_i8_info, HasVBMI2>, EVEX;
+defm VPCOMPRESSW : compress_by_elt_width <0x63, "vpcompressw", AVX512_COMPRESS,
+ avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W;
+// Expand
+defm VPEXPANDB : expand_by_elt_width <0x62, "vpexpandb", AVX512_EXPAND,
+ avx512vl_i8_info, HasVBMI2>, EVEX;
+defm VPEXPANDW : expand_by_elt_width <0x62, "vpexpandw", AVX512_EXPAND,
+ avx512vl_i16_info, HasVBMI2>, EVEX, VEX_W;
+
+//===----------------------------------------------------------------------===//
+// VNNI
+//===----------------------------------------------------------------------===//
+
+let Constraints = "$src1 = $dst" in
+multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
+ OpndItins itins, X86VectorVTInfo VTI> {
+ defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
+ "$src3, $src2", "$src2, $src3",
+ (VTI.VT (OpNode VTI.RC:$src1,
+ VTI.RC:$src2, VTI.RC:$src3)),
+ itins.rr>, EVEX_4V, T8PD, Sched<[itins.Sched]>;
+ defm m : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.MemOp:$src3), OpStr,
+ "$src3, $src2", "$src2, $src3",
+ (VTI.VT (OpNode VTI.RC:$src1, VTI.RC:$src2,
+ (VTI.VT (bitconvert
+ (VTI.LdFrag addr:$src3))))),
+ itins.rm>, EVEX_4V, EVEX_CD8<32, CD8VF>, T8PD,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src2, VTI.ScalarMemOp:$src3),
+ OpStr, "${src3}"##VTI.BroadcastStr##", $src2",
+ "$src2, ${src3}"##VTI.BroadcastStr,
+ (OpNode VTI.RC:$src1, VTI.RC:$src2,
+ (VTI.VT (X86VBroadcast
+ (VTI.ScalarLdFrag addr:$src3)))),
+ itins.rm>, EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
+ T8PD, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode, OpndItins itins> {
+ let Predicates = [HasVNNI] in
+ defm Z : VNNI_rmb<Op, OpStr, OpNode, itins, v16i32_info>, EVEX_V512;
+ let Predicates = [HasVNNI, HasVLX] in {
+ defm Z256 : VNNI_rmb<Op, OpStr, OpNode, itins, v8i32x_info>, EVEX_V256;
+ defm Z128 : VNNI_rmb<Op, OpStr, OpNode, itins, v4i32x_info>, EVEX_V128;
+ }
+}
+
+// FIXME: Is there a better scheduler itinerary for VPDP?
+defm VPDPBUSD : VNNI_common<0x50, "vpdpbusd", X86Vpdpbusd, SSE_PMADD>;
+defm VPDPBUSDS : VNNI_common<0x51, "vpdpbusds", X86Vpdpbusds, SSE_PMADD>;
+defm VPDPWSSD : VNNI_common<0x52, "vpdpwssd", X86Vpdpwssd, SSE_PMADD>;
+defm VPDPWSSDS : VNNI_common<0x53, "vpdpwssds", X86Vpdpwssds, SSE_PMADD>;
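
To make the FIXME above concrete: each VNNI instruction fuses a byte/word multiply-accumulate into a dword lane (roughly the pmaddubsw + pmaddwd + paddd chain, ignoring pmaddubsw's saturation), hence the question of whether SSE_PMADD is the right itinerary. One dword lane of vpdpbusd, as described by the ISA (unsigned bytes from the first source, signed bytes from the second), sketched in C++ for illustration only:

#include <cstdint>

// One 32-bit lane of vpdpbusd: four unsigned-by-signed byte products are
// summed and added to the accumulator dword (vpdpbusds saturates the
// final add instead of wrapping).
int32_t vpdpbusd_lane(int32_t acc, const uint8_t a[4], const int8_t b[4]) {
  int32_t sum = 0;
  for (int i = 0; i < 4; ++i)
    sum += (int32_t)a[i] * (int32_t)b[i];
  return acc + sum;
}
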
+
+//===----------------------------------------------------------------------===//
+// Bit Algorithms
+//===----------------------------------------------------------------------===//
+
+// FIXME: Is there a better scheduler itinerary for VPOPCNTB/VPOPCNTW?
+defm VPOPCNTB : avx512_unary_rm_vl<0x54, "vpopcntb", ctpop, SSE_INTALU_ITINS_P,
+ avx512vl_i8_info, HasBITALG>;
+defm VPOPCNTW : avx512_unary_rm_vl<0x54, "vpopcntw", ctpop, SSE_INTALU_ITINS_P,
+ avx512vl_i16_info, HasBITALG>, VEX_W;
+
+defm : avx512_unary_lowering<"VPOPCNTB", ctpop, avx512vl_i8_info, HasBITALG>;
+defm : avx512_unary_lowering<"VPOPCNTW", ctpop, avx512vl_i16_info, HasBITALG>;
+
+multiclass VPSHUFBITQMB_rm<OpndItins itins, X86VectorVTInfo VTI> {
+ defm rr : AVX512_maskable_cmp<0x8F, MRMSrcReg, VTI, (outs VTI.KRC:$dst),
+ (ins VTI.RC:$src1, VTI.RC:$src2),
+ "vpshufbitqmb",
+ "$src2, $src1", "$src1, $src2",
+ (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
+ (VTI.VT VTI.RC:$src2)), itins.rr>, EVEX_4V, T8PD,
+ Sched<[itins.Sched]>;
+ defm rm : AVX512_maskable_cmp<0x8F, MRMSrcMem, VTI, (outs VTI.KRC:$dst),
+ (ins VTI.RC:$src1, VTI.MemOp:$src2),
+ "vpshufbitqmb",
+ "$src2, $src1", "$src1, $src2",
+ (X86Vpshufbitqmb (VTI.VT VTI.RC:$src1),
+ (VTI.VT (bitconvert (VTI.LdFrag addr:$src2)))),
+ itins.rm>, EVEX_4V, EVEX_CD8<8, CD8VF>, T8PD,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass VPSHUFBITQMB_common<OpndItins itins, AVX512VLVectorVTInfo VTI> {
+ let Predicates = [HasBITALG] in
+ defm Z : VPSHUFBITQMB_rm<itins, VTI.info512>, EVEX_V512;
+ let Predicates = [HasBITALG, HasVLX] in {
+ defm Z256 : VPSHUFBITQMB_rm<itins, VTI.info256>, EVEX_V256;
+ defm Z128 : VPSHUFBITQMB_rm<itins, VTI.info128>, EVEX_V128;
+ }
+}
+
+// FIXME: Is there a better scheduler itinerary for VPSHUFBITQMB?
+defm VPSHUFBITQMB : VPSHUFBITQMB_common<SSE_INTMUL_ITINS_P, avx512vl_i8_info>;
+
+//===----------------------------------------------------------------------===//
+// GFNI
+//===----------------------------------------------------------------------===//
+
+multiclass GF2P8MULB_avx512_common<bits<8> Op, string OpStr, SDNode OpNode> {
+ let Predicates = [HasGFNI, HasAVX512, HasBWI] in
+ defm Z : avx512_binop_rm<Op, OpStr, OpNode, v64i8_info,
+ SSE_INTALU_ITINS_P, 1>, EVEX_V512;
+ let Predicates = [HasGFNI, HasVLX, HasBWI] in {
+ defm Z256 : avx512_binop_rm<Op, OpStr, OpNode, v32i8x_info,
+ SSE_INTALU_ITINS_P, 1>, EVEX_V256;
+ defm Z128 : avx512_binop_rm<Op, OpStr, OpNode, v16i8x_info,
+ SSE_INTALU_ITINS_P, 1>, EVEX_V128;
+ }
+}
+
+defm GF2P8MULB : GF2P8MULB_avx512_common<0xCF, "vgf2p8mulb", X86GF2P8mulb>,
+ EVEX_CD8<8, CD8VF>, T8PD;
+
+multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
+ OpndItins itins, X86VectorVTInfo VTI,
+ X86VectorVTInfo BcstVTI>
+ : avx512_3Op_rm_imm8<Op, OpStr, OpNode, itins, VTI, VTI> {
+ let ExeDomain = VTI.ExeDomain in
+ defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
+ (ins VTI.RC:$src1, VTI.ScalarMemOp:$src2, u8imm:$src3),
+ OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1",
+ "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3",
+ (OpNode (VTI.VT VTI.RC:$src1),
+ (bitconvert (BcstVTI.VT (X86VBroadcast (loadi64 addr:$src2)))),
+ (i8 imm:$src3)), itins.rm>, EVEX_B,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+multiclass GF2P8AFFINE_avx512_common<bits<8> Op, string OpStr, SDNode OpNode,
+ OpndItins itins> {
+ let Predicates = [HasGFNI, HasAVX512, HasBWI] in
+ defm Z : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v64i8_info,
+ v8i64_info>, EVEX_V512;
+ let Predicates = [HasGFNI, HasVLX, HasBWI] in {
+ defm Z256 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v32i8x_info,
+ v4i64x_info>, EVEX_V256;
+ defm Z128 : GF2P8AFFINE_avx512_rmb_imm<Op, OpStr, OpNode, itins, v16i8x_info,
+ v2i64x_info>, EVEX_V128;
+ }
+}
+
+defm GF2P8AFFINEINVQB : GF2P8AFFINE_avx512_common<0xCF, "vgf2p8affineinvqb",
+ X86GF2P8affineinvqb, SSE_INTMUL_ITINS_P>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
+defm GF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb",
+ X86GF2P8affineqb, SSE_INTMUL_ITINS_P>,
+ EVEX_4V, EVEX_CD8<8, CD8VF>, VEX_W, AVX512AIi8Base;
+
diff --git a/lib/Target/X86/X86InstrArithmetic.td b/lib/Target/X86/X86InstrArithmetic.td
index e38bbc9b3d36..d09deb5b7584 100644
--- a/lib/Target/X86/X86InstrArithmetic.td
+++ b/lib/Target/X86/X86InstrArithmetic.td
@@ -104,7 +104,8 @@ def MUL32m : I<0xF7, MRM4m, (outs), (ins i32mem:$src),
// RAX,RDX = RAX*[mem64]
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
def MUL64m : RI<0xF7, MRM4m, (outs), (ins i64mem:$src),
- "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>;
+ "mul{q}\t$src", [], IIC_MUL64>, SchedLoadReg<WriteIMulLd>,
+ Requires<[In64BitMode]>;
}
let hasSideEffects = 0 in {
@@ -143,7 +144,8 @@ def IMUL32m : I<0xF7, MRM5m, (outs), (ins i32mem:$src),
// RAX,RDX = RAX*[mem64]
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX] in
def IMUL64m : RI<0xF7, MRM5m, (outs), (ins i64mem:$src),
- "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>;
+ "imul{q}\t$src", [], IIC_IMUL64>, SchedLoadReg<WriteIMulLd>,
+ Requires<[In64BitMode]>;
}
} // hasSideEffects
@@ -326,7 +328,7 @@ def DIV32m : I<0xF7, MRM6m, (outs), (ins i32mem:$src),
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in
def DIV64m : RI<0xF7, MRM6m, (outs), (ins i64mem:$src),
"div{q}\t$src", [], IIC_DIV64>,
- SchedLoadReg<WriteIDivLd>;
+ SchedLoadReg<WriteIDivLd>, Requires<[In64BitMode]>;
}
// Signed division/remainder.
@@ -362,7 +364,7 @@ def IDIV32m: I<0xF7, MRM7m, (outs), (ins i32mem:$src),
let Defs = [RAX,RDX,EFLAGS], Uses = [RAX,RDX] in // RDX:RAX/[mem64] = RAX,RDX
def IDIV64m: RI<0xF7, MRM7m, (outs), (ins i64mem:$src),
"idiv{q}\t$src", [], IIC_IDIV64>,
- SchedLoadReg<WriteIDivLd>;
+ SchedLoadReg<WriteIDivLd>, Requires<[In64BitMode]>;
}
} // hasSideEffects = 0
@@ -407,7 +409,8 @@ def NEG32m : I<0xF7, MRM3m, (outs), (ins i32mem:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
def NEG64m : RI<0xF7, MRM3m, (outs), (ins i64mem:$dst), "neg{q}\t$dst",
[(store (ineg (loadi64 addr:$dst)), addr:$dst),
- (implicit EFLAGS)], IIC_UNARY_MEM>;
+ (implicit EFLAGS)], IIC_UNARY_MEM>,
+ Requires<[In64BitMode]>;
} // SchedRW
} // Defs = [EFLAGS]
@@ -444,7 +447,8 @@ def NOT32m : I<0xF7, MRM2m, (outs), (ins i32mem:$dst),
[(store (not (loadi32 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
OpSize32;
def NOT64m : RI<0xF7, MRM2m, (outs), (ins i64mem:$dst), "not{q}\t$dst",
- [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>;
+ [(store (not (loadi64 addr:$dst)), addr:$dst)], IIC_UNARY_MEM>,
+ Requires<[In64BitMode]>;
} // SchedRW
} // CodeSize
@@ -482,6 +486,7 @@ def INC32r_alt : I<0x40, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
} // Constraints = "$src1 = $dst", SchedRW
let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+let Predicates = [UseIncDec] in {
def INC8m : I<0xFE, MRM0m, (outs), (ins i8mem :$dst), "inc{b}\t$dst",
[(store (add (loadi8 addr:$dst), 1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>;
@@ -491,9 +496,12 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
def INC32m : I<0xFF, MRM0m, (outs), (ins i32mem:$dst), "inc{l}\t$dst",
[(store (add (loadi32 addr:$dst), 1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+} // Predicates
+let Predicates = [UseIncDec, In64BitMode] in {
def INC64m : RI<0xFF, MRM0m, (outs), (ins i64mem:$dst), "inc{q}\t$dst",
[(store (add (loadi64 addr:$dst), 1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>;
+} // Predicates
} // CodeSize = 2, SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteALU] in {
@@ -529,6 +537,7 @@ def DEC32r_alt : I<0x48, AddRegFrm, (outs GR32:$dst), (ins GR32:$src1),
let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
+let Predicates = [UseIncDec] in {
def DEC8m : I<0xFE, MRM1m, (outs), (ins i8mem :$dst), "dec{b}\t$dst",
[(store (add (loadi8 addr:$dst), -1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>;
@@ -538,9 +547,12 @@ let CodeSize = 2, SchedRW = [WriteALULd, WriteRMW] in {
def DEC32m : I<0xFF, MRM1m, (outs), (ins i32mem:$dst), "dec{l}\t$dst",
[(store (add (loadi32 addr:$dst), -1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>, OpSize32;
+} // Predicates
+let Predicates = [UseIncDec, In64BitMode] in {
def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
[(store (add (loadi64 addr:$dst), -1), addr:$dst),
(implicit EFLAGS)], IIC_UNARY_MEM>;
+} // Predicates
} // CodeSize = 2, SchedRW
} // Defs = [EFLAGS]
@@ -652,9 +664,8 @@ class ITy<bits<8> opcode, Format f, X86TypeInfo typeinfo, dag outs, dag ins,
// BinOpRR - Instructions like "add reg, reg, reg".
class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- dag outlist, list<dag> pattern, InstrItinClass itin,
- Format f = MRMDestReg>
- : ITy<opcode, f, typeinfo, outlist,
+ dag outlist, list<dag> pattern, InstrItinClass itin>
+ : ITy<opcode, MRMDestReg, typeinfo, outlist,
(ins typeinfo.RegClass:$src1, typeinfo.RegClass:$src2),
mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
Sched<[WriteALU]>;
@@ -662,11 +673,11 @@ class BinOpRR<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
// BinOpRR_F - Instructions like "cmp reg, Reg", where the pattern has
// just a EFLAGS as a result.
class BinOpRR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDPatternOperator opnode, Format f = MRMDestReg>
+ SDPatternOperator opnode>
: BinOpRR<opcode, mnemonic, typeinfo, (outs),
[(set EFLAGS,
(opnode typeinfo.RegClass:$src1, typeinfo.RegClass:$src2))],
- IIC_BIN_NONMEM, f>;
+ IIC_BIN_NONMEM>;
// BinOpRR_RF - Instructions like "add reg, reg, reg", where the pattern has
// both a regclass and EFLAGS as a result.
@@ -725,16 +736,9 @@ class BinOpRM<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
mnemonic, "{$src2, $src1|$src1, $src2}", pattern, itin>,
Sched<[WriteALULd, ReadAfterLd]>;
-// BinOpRM_R - Instructions like "add reg, reg, [mem]".
-class BinOpRM_R<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode>
- : BinOpRM<opcode, mnemonic, typeinfo, (outs typeinfo.RegClass:$dst),
- [(set typeinfo.RegClass:$dst,
- (opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
-
// BinOpRM_F - Instructions like "cmp reg, [mem]".
class BinOpRM_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDPatternOperator opnode>
+ SDNode opnode>
: BinOpRM<opcode, mnemonic, typeinfo, (outs),
[(set EFLAGS,
(opnode typeinfo.RegClass:$src1, (typeinfo.LoadNode addr:$src2)))]>;
@@ -844,7 +848,7 @@ class BinOpMR_RMW_FF<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
// BinOpMR_F - Instructions like "cmp [mem], reg".
class BinOpMR_F<bits<8> opcode, string mnemonic, X86TypeInfo typeinfo,
- SDNode opnode>
+ SDPatternOperator opnode>
: BinOpMR<opcode, mnemonic, typeinfo,
[(set EFLAGS, (opnode (load addr:$dst), typeinfo.RegClass:$src))]>;
@@ -1000,11 +1004,13 @@ multiclass ArithBinOp_RF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
// first so that they are slightly preferred to the mi forms.
def NAME#16mi8 : BinOpMI8_RMW<mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi8 : BinOpMI8_RMW<mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
def NAME#64mi8 : BinOpMI8_RMW<mnemonic, Xi64, opnode, MemMRM>;
def NAME#8mi : BinOpMI_RMW<0x80, mnemonic, Xi8 , opnode, MemMRM>;
def NAME#16mi : BinOpMI_RMW<0x80, mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi : BinOpMI_RMW<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
def NAME#64mi32 : BinOpMI_RMW<0x80, mnemonic, Xi64, opnode, MemMRM>;
// These are for the disassembler since 0x82 opcode behaves like 0x80, but
@@ -1083,11 +1089,13 @@ multiclass ArithBinOp_RFF<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
// first so that they are slightly preferred to the mi forms.
def NAME#16mi8 : BinOpMI8_RMW_FF<mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi8 : BinOpMI8_RMW_FF<mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
def NAME#64mi8 : BinOpMI8_RMW_FF<mnemonic, Xi64, opnode, MemMRM>;
def NAME#8mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi8 , opnode, MemMRM>;
def NAME#16mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi : BinOpMI_RMW_FF<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
def NAME#64mi32 : BinOpMI_RMW_FF<0x80, mnemonic, Xi64, opnode, MemMRM>;
// These are for the disassembler since 0x82 opcode behaves like 0x80, but
@@ -1162,11 +1170,13 @@ multiclass ArithBinOp_F<bits<8> BaseOpc, bits<8> BaseOpc2, bits<8> BaseOpc4,
// first so that they are slightly preferred to the mi forms.
def NAME#16mi8 : BinOpMI8_F<mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi8 : BinOpMI8_F<mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
def NAME#64mi8 : BinOpMI8_F<mnemonic, Xi64, opnode, MemMRM>;
def NAME#8mi : BinOpMI_F<0x80, mnemonic, Xi8 , opnode, MemMRM>;
def NAME#16mi : BinOpMI_F<0x80, mnemonic, Xi16, opnode, MemMRM>;
def NAME#32mi : BinOpMI_F<0x80, mnemonic, Xi32, opnode, MemMRM>;
+ let Predicates = [In64BitMode] in
def NAME#64mi32 : BinOpMI_F<0x80, mnemonic, Xi64, opnode, MemMRM>;
// These are for the disassembler since 0x82 opcode behaves like 0x80, but
@@ -1231,19 +1241,21 @@ let isCompare = 1 in {
def TEST64rr : BinOpRR_F<0x84, "test", Xi64, X86testpat>;
} // isCommutable
- def TEST8rm : BinOpRM_F<0x84, "test", Xi8 , X86testpat>;
- def TEST16rm : BinOpRM_F<0x84, "test", Xi16, X86testpat>;
- def TEST32rm : BinOpRM_F<0x84, "test", Xi32, X86testpat>;
- def TEST64rm : BinOpRM_F<0x84, "test", Xi64, X86testpat>;
+ def TEST8mr : BinOpMR_F<0x84, "test", Xi8 , X86testpat>;
+ def TEST16mr : BinOpMR_F<0x84, "test", Xi16, X86testpat>;
+ def TEST32mr : BinOpMR_F<0x84, "test", Xi32, X86testpat>;
+ def TEST64mr : BinOpMR_F<0x84, "test", Xi64, X86testpat>;
def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
+ let Predicates = [In64BitMode] in
def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
def TEST16mi : BinOpMI_F<0xF6, "test", Xi16, X86testpat, MRM0m>;
def TEST32mi : BinOpMI_F<0xF6, "test", Xi32, X86testpat, MRM0m>;
+ let Predicates = [In64BitMode] in
def TEST64mi32 : BinOpMI_F<0xF6, "test", Xi64, X86testpat, MRM0m>;
// When testing the result of EXTRACT_SUBREG sub_8bit_hi, make sure the
diff --git a/lib/Target/X86/X86InstrCMovSetCC.td b/lib/Target/X86/X86InstrCMovSetCC.td
index b85abfb9ca7f..8dd5e1c0626b 100644
--- a/lib/Target/X86/X86InstrCMovSetCC.td
+++ b/lib/Target/X86/X86InstrCMovSetCC.td
@@ -113,6 +113,6 @@ defm SETG : SETCC<0x9F, "setg", X86_COND_G>; // signed greater than
// SALC is an undocumented instruction. Information for this instruction can be found
// here http://www.rcollins.org/secrets/opcodes/SALC.html
// Set AL if carry.
-let Uses = [EFLAGS], Defs = [AL] in {
- def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", []>, Requires<[Not64BitMode]>;
+let Uses = [EFLAGS], Defs = [AL], SchedRW = [WriteALU] in {
+ def SALC : I<0xD6, RawFrm, (outs), (ins), "salc", [], IIC_AHF>, Requires<[Not64BitMode]>;
}
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index d003d027ddb9..06600a4ef286 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -32,9 +32,10 @@ def GetLo8XForm : SDNodeXForm<imm, [{
// PIC base construction. This expands to code that looks like this:
// call $next_inst
// popl %destreg"
-let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
+let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP, SSP],
+ SchedRW = [WriteJump] in
def MOVPC32r : Ii32<0xE8, Pseudo, (outs GR32:$reg), (ins i32imm:$label),
- "", []>;
+ "", [], IIC_CALL_RI>;
// ADJCALLSTACKDOWN/UP implicitly use/def ESP because they may be expanded into
@@ -42,16 +43,15 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in
// pointer before prolog-epilog rewriting occurs.
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
-let Defs = [ESP, EFLAGS], Uses = [ESP] in {
+let Defs = [ESP, EFLAGS, SSP], Uses = [ESP, SSP], SchedRW = [WriteALU] in {
def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
(ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
- "#ADJCALLSTACKDOWN",
- []>,
- Requires<[NotLP64]>;
+ "#ADJCALLSTACKDOWN", [], IIC_ALU_NONMEM>,
+ Requires<[NotLP64]>;
def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
- [(X86callseq_end timm:$amt1, timm:$amt2)]>,
- Requires<[NotLP64]>;
+ [(X86callseq_end timm:$amt1, timm:$amt2)],
+ IIC_ALU_NONMEM>, Requires<[NotLP64]>;
}
def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
(ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
@@ -62,20 +62,20 @@ def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
// pointer before prolog-epilog rewriting occurs.
// Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
// sub / add which can clobber EFLAGS.
-let Defs = [RSP, EFLAGS], Uses = [RSP] in {
+let Defs = [RSP, EFLAGS, SSP], Uses = [RSP, SSP], SchedRW = [WriteALU] in {
def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
(ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
"#ADJCALLSTACKDOWN",
- []>,
- Requires<[IsLP64]>;
+ [], IIC_ALU_NONMEM>, Requires<[IsLP64]>;
def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
"#ADJCALLSTACKUP",
- [(X86callseq_end timm:$amt1, timm:$amt2)]>,
- Requires<[IsLP64]>;
+ [(X86callseq_end timm:$amt1, timm:$amt2)],
+ IIC_ALU_NONMEM>, Requires<[IsLP64]>;
}
def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
(ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
+let SchedRW = [WriteSystem] in {
// x86-64 va_start lowering magic.
let usesCustomInserter = 1, Defs = [EFLAGS] in {
@@ -141,7 +141,19 @@ def WIN_ALLOCA_64 : I<0, Pseudo, (outs), (ins GR64:$size),
"# dynamic stack allocation",
[(X86WinAlloca GR64:$size)]>,
Requires<[In64BitMode]>;
+} // SchedRW
+// These instructions XOR the frame pointer into a GPR. They are used in some
+// stack protection schemes. These are post-RA pseudos because we only know the
+// frame register after register allocation.
+let Constraints = "$src = $dst", isPseudo = 1, Defs = [EFLAGS] in {
+ def XOR32_FP : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$src),
+ "xorl\t$$FP, $src", [], IIC_BIN_NONMEM>,
+ Requires<[NotLP64]>, Sched<[WriteALU]>;
+ def XOR64_FP : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$src),
+ "xorq\t$$FP $src", [], IIC_BIN_NONMEM>,
+ Requires<[In64BitMode]>, Sched<[WriteALU]>;
+}
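
The .td entries above only declare the pseudos; the interesting part happens after register allocation, when the backend rewrites XOR32_FP/XOR64_FP into a plain xor against whatever the frame register turned out to be. A plausible sketch of that expansion (helpers such as setDesc and getFrameRegister do exist in the backend, but this is an assumption about the shape of the code, not the patch's actual implementation):

#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Rewrite the XOR*_FP pseudo in place: switch its descriptor to the real
// xor opcode and append the frame register as the second source.
static bool expandXorFP(MachineInstrBuilder &MIB, const X86InstrInfo &TII) {
  MachineFunction &MF = *MIB->getParent()->getParent();
  const X86RegisterInfo *TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
  unsigned Opc = MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
  MIB->setDesc(TII.get(Opc));
  MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
  return true;
}
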
//===----------------------------------------------------------------------===//
// EH Pseudo Instructions
@@ -207,17 +219,17 @@ let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
Requires<[In64BitMode]>;
}
}
-} // SchedRW
let isBranch = 1, isTerminator = 1, isCodeGenOnly = 1 in {
def EH_SjLj_Setup : I<0, Pseudo, (outs), (ins brtarget:$dst),
"#EH_SjLj_Setup\t$dst", []>;
}
+} // SchedRW
//===----------------------------------------------------------------------===//
// Pseudo instructions used by unwind info.
//
-let isPseudo = 1 in {
+let isPseudo = 1, SchedRW = [WriteSystem] in {
def SEH_PushReg : I<0, Pseudo, (outs), (ins i32imm:$reg),
"#SEH_PushReg $reg", []>;
def SEH_SaveReg : I<0, Pseudo, (outs), (ins i32imm:$reg, i32imm:$dst),
@@ -243,15 +255,15 @@ let isPseudo = 1 in {
// This is lowered into a RET instruction by MCInstLower. We need
// this so that we don't have to have a MachineBasicBlock which ends
// with a RET and also has successors.
-let isPseudo = 1 in {
+let isPseudo = 1, SchedRW = [WriteJumpLd] in {
def MORESTACK_RET: I<0, Pseudo, (outs), (ins),
- "", []>;
+ "", [], IIC_RET>;
// This instruction is lowered to a RET followed by a MOV. The two
// instructions are not generated on a higher level since then the
// verifier sees a MachineBasicBlock ending with a non-terminator.
def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
- "", []>;
+ "", [], IIC_RET>;
}
//===----------------------------------------------------------------------===//
@@ -273,39 +285,42 @@ def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
}
-let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
+let Predicates = [OptForSize, Not64BitMode],
AddedComplexity = 10 in {
+ let SchedRW = [WriteALU] in {
// Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
// which only require 3 bytes compared to MOV32ri which requires 5.
let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
def MOV32r1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, 1)]>;
+ [(set GR32:$dst, 1)], IIC_ALU_NONMEM>;
def MOV32r_1 : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, -1)]>;
+ [(set GR32:$dst, -1)], IIC_ALU_NONMEM>;
}
+ } // SchedRW
// MOV16ri is 4 bytes, so the instructions above are smaller.
def : Pat<(i16 1), (EXTRACT_SUBREG (MOV32r1), sub_16bit)>;
def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
}
-let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5 in {
+let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5,
+ SchedRW = [WriteALU] in {
// AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
-// FIXME: Add itinerary class and Schedule.
def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
- [(set GR32:$dst, i32immSExt8:$src)]>,
- Requires<[OptForMinSize, NotWin64WithoutFP]>;
+ [(set GR32:$dst, i32immSExt8:$src)], IIC_ALU_NONMEM>,
+ Requires<[OptForMinSize, NotWin64WithoutFP]>;
def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
- [(set GR64:$dst, i64immSExt8:$src)]>,
- Requires<[OptForMinSize, NotWin64WithoutFP]>;
+ [(set GR64:$dst, i64immSExt8:$src)], IIC_ALU_NONMEM>,
+ Requires<[OptForMinSize, NotWin64WithoutFP]>;
}
// Materialize i64 constant where top 32-bits are zero. This could theoretically
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.
let isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1, hasSideEffects = 0 in
-def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", []>;
+ isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteALU] in
+def MOV32ri64 : I<0, Pseudo, (outs GR32:$dst), (ins i64i32imm:$src), "", [],
+ IIC_ALU_NONMEM>;
// This 64-bit pseudo-move can be used for both a 64-bit constant that is
// actually the zero-extension of a 32-bit constant and for labels in the
@@ -448,6 +463,7 @@ let Defs = [RCX,RDI], isCodeGenOnly = 1 in {
//===----------------------------------------------------------------------===//
// Thread Local Storage Instructions
//
+let SchedRW = [WriteSystem] in {
// ELF TLS Support
// All calls clobber the non-callee saved registers. ESP is marked as
@@ -458,7 +474,7 @@ let Defs = [EAX, ECX, EDX, FP0, FP1, FP2, FP3, FP4, FP5, FP6, FP7,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- usesCustomInserter = 1, Uses = [ESP] in {
+ usesCustomInserter = 1, Uses = [ESP, SSP] in {
def TLS_addr32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLS_addr32",
[(X86tlsaddr tls32addr:$sym)]>,
@@ -478,7 +494,7 @@ let Defs = [RAX, RCX, RDX, RSI, RDI, R8, R9, R10, R11,
MM0, MM1, MM2, MM3, MM4, MM5, MM6, MM7,
XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, EFLAGS],
- usesCustomInserter = 1, Uses = [RSP] in {
+ usesCustomInserter = 1, Uses = [RSP, SSP] in {
def TLS_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLS_addr64",
[(X86tlsaddr tls64addr:$sym)]>,
@@ -494,7 +510,7 @@ def TLS_base_addr64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
// address of the variable is in %eax. %ecx is trashed during the function
// call. All other registers are preserved.
let Defs = [EAX, ECX, EFLAGS],
- Uses = [ESP],
+ Uses = [ESP, SSP],
usesCustomInserter = 1 in
def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
"# TLSCall_32",
@@ -507,13 +523,13 @@ def TLSCall_32 : I<0, Pseudo, (outs), (ins i32mem:$sym),
// On return the address of the variable is in %rax. All other
// registers are preserved.
let Defs = [RAX, EFLAGS],
- Uses = [RSP],
+ Uses = [RSP, SSP],
usesCustomInserter = 1 in
def TLSCall_64 : I<0, Pseudo, (outs), (ins i64mem:$sym),
"# TLSCall_64",
[(X86TLSCall addr:$sym)]>,
Requires<[In64BitMode]>;
-
+} // SchedRW
//===----------------------------------------------------------------------===//
// Conditional Move Pseudo Instructions
@@ -528,7 +544,7 @@ multiclass CMOVrr_PSEUDO<RegisterClass RC, ValueType VT> {
EFLAGS)))]>;
}
-let usesCustomInserter = 1, Uses = [EFLAGS] in {
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
// X86 doesn't have 8-bit conditional moves. Use a customInserter to
// emit control flow. An alternative to this is to mark i8 SELECT as Promote,
// however that requires promoting the operands, and can induce additional
@@ -566,7 +582,7 @@ let usesCustomInserter = 1, Uses = [EFLAGS] in {
defm _V16I1 : CMOVrr_PSEUDO<VK16, v16i1>;
defm _V32I1 : CMOVrr_PSEUDO<VK32, v32i1>;
defm _V64I1 : CMOVrr_PSEUDO<VK64, v64i1>;
-} // usesCustomInserter = 1, Uses = [EFLAGS]
+} // usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS]
//===----------------------------------------------------------------------===//
// Normal-Instructions-With-Lock-Prefix Pseudo Instructions
@@ -593,7 +609,7 @@ def Int_MemBarrier : I<0, Pseudo, (outs), (ins),
// ImmOpc8 corresponds to the mi8 version of the instruction
// ImmMod corresponds to the instruction format of the mi and mi8 versions
multiclass LOCK_ArithBinOp<bits<8> RegOpc, bits<8> ImmOpc, bits<8> ImmOpc8,
- Format ImmMod, SDPatternOperator Op, string mnemonic> {
+ Format ImmMod, SDNode Op, string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
SchedRW = [WriteALULd, WriteRMW] in {
@@ -696,30 +712,52 @@ defm LOCK_AND : LOCK_ArithBinOp<0x20, 0x80, 0x83, MRM4m, X86lock_and, "and">;
defm LOCK_XOR : LOCK_ArithBinOp<0x30, 0x80, 0x83, MRM6m, X86lock_xor, "xor">;
multiclass LOCK_ArithUnOp<bits<8> Opc8, bits<8> Opc, Format Form,
- int Increment, string mnemonic> {
+ string frag, string mnemonic> {
let Defs = [EFLAGS], mayLoad = 1, mayStore = 1, isCodeGenOnly = 1,
- SchedRW = [WriteALULd, WriteRMW], Predicates = [NotSlowIncDec] in {
+ SchedRW = [WriteALULd, WriteRMW] in {
def NAME#8m : I<Opc8, Form, (outs), (ins i8mem :$dst),
!strconcat(mnemonic, "{b}\t$dst"),
- [(set EFLAGS, (X86lock_add addr:$dst, (i8 Increment)))],
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_8") addr:$dst))],
IIC_UNARY_MEM>, LOCK;
def NAME#16m : I<Opc, Form, (outs), (ins i16mem:$dst),
!strconcat(mnemonic, "{w}\t$dst"),
- [(set EFLAGS, (X86lock_add addr:$dst, (i16 Increment)))],
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_16") addr:$dst))],
IIC_UNARY_MEM>, OpSize16, LOCK;
def NAME#32m : I<Opc, Form, (outs), (ins i32mem:$dst),
!strconcat(mnemonic, "{l}\t$dst"),
- [(set EFLAGS, (X86lock_add addr:$dst, (i32 Increment)))],
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_32") addr:$dst))],
IIC_UNARY_MEM>, OpSize32, LOCK;
def NAME#64m : RI<Opc, Form, (outs), (ins i64mem:$dst),
!strconcat(mnemonic, "{q}\t$dst"),
- [(set EFLAGS, (X86lock_add addr:$dst, (i64 Increment)))],
+ [(set EFLAGS, (!cast<PatFrag>(frag # "_64") addr:$dst))],
IIC_UNARY_MEM>, LOCK;
}
}
-defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, 1, "inc">;
-defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, -1, "dec">;
+multiclass unary_atomic_intrin<SDNode atomic_op> {
+ def _8 : PatFrag<(ops node:$ptr),
+ (atomic_op node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i8;
+ }]>;
+ def _16 : PatFrag<(ops node:$ptr),
+ (atomic_op node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i16;
+ }]>;
+ def _32 : PatFrag<(ops node:$ptr),
+ (atomic_op node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
+ }]>;
+ def _64 : PatFrag<(ops node:$ptr),
+ (atomic_op node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+ }]>;
+}
+
+defm X86lock_inc : unary_atomic_intrin<X86lock_inc>;
+defm X86lock_dec : unary_atomic_intrin<X86lock_dec>;
+
+defm LOCK_INC : LOCK_ArithUnOp<0xFE, 0xFF, MRM0m, "X86lock_inc", "inc">;
+defm LOCK_DEC : LOCK_ArithUnOp<0xFE, 0xFF, MRM1m, "X86lock_dec", "dec">;
// Atomic compare and swap.
multiclass LCMPXCHG_UnOp<bits<8> Opc, Format Form, string mnemonic,
@@ -767,7 +805,7 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
// register and the register allocator will ignore any use/def of
// it. In other words, the register will not fix the clobbering of
// RBX that will happen when setting the arguments for the instruction.
-//
+//
// Unlike the actual related instruction, we mark that this one
// defines EBX (instead of using EBX).
// The rationale is that we will define RBX during the expansion of
@@ -895,7 +933,7 @@ multiclass RELEASE_BINOP_MI<SDNode op> {
[(atomic_store_64 addr:$dst, (op
(atomic_load_64 addr:$dst), GR64:$src))]>;
}
-let Defs = [EFLAGS] in {
+let Defs = [EFLAGS], SchedRW = [WriteMicrocoded] in {
defm RELEASE_ADD : RELEASE_BINOP_MI<add>;
defm RELEASE_AND : RELEASE_BINOP_MI<and>;
defm RELEASE_OR : RELEASE_BINOP_MI<or>;
@@ -908,20 +946,20 @@ let Defs = [EFLAGS] in {
// FIXME: imm version.
// FIXME: Version that doesn't clobber $src, using AVX's VADDSS.
// FIXME: This could also handle SIMD operations with *ps and *pd instructions.
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, SchedRW = [WriteMicrocoded] in {
multiclass RELEASE_FP_BINOP_MI<SDNode op> {
def NAME#32mr : I<0, Pseudo, (outs), (ins i32mem:$dst, FR32:$src),
"#BINOP "#NAME#"32mr PSEUDO!",
[(atomic_store_32 addr:$dst,
- (i32 (bitconvert (op
+ (i32 (bitconvert (op
(f32 (bitconvert (i32 (atomic_load_32 addr:$dst)))),
- FR32:$src))))]>, Requires<[HasSSE1]>;
+ FR32:$src))))]>, Requires<[HasSSE1]>;
def NAME#64mr : I<0, Pseudo, (outs), (ins i64mem:$dst, FR64:$src),
"#BINOP "#NAME#"64mr PSEUDO!",
[(atomic_store_64 addr:$dst,
- (i64 (bitconvert (op
+ (i64 (bitconvert (op
(f64 (bitconvert (i64 (atomic_load_64 addr:$dst)))),
- FR64:$src))))]>, Requires<[HasSSE2]>;
+ FR64:$src))))]>, Requires<[HasSSE2]>;
}
defm RELEASE_FADD : RELEASE_FP_BINOP_MI<fadd>;
// FIXME: Add fsub, fmul, fdiv, ...
@@ -942,17 +980,17 @@ multiclass RELEASE_UNOP<dag dag8, dag dag16, dag dag32, dag dag64> {
[(atomic_store_64 addr:$dst, dag64)]>;
}
-let Defs = [EFLAGS] in {
+let Defs = [EFLAGS], Predicates = [UseIncDec], SchedRW = [WriteMicrocoded] in {
defm RELEASE_INC : RELEASE_UNOP<
(add (atomic_load_8 addr:$dst), (i8 1)),
(add (atomic_load_16 addr:$dst), (i16 1)),
(add (atomic_load_32 addr:$dst), (i32 1)),
- (add (atomic_load_64 addr:$dst), (i64 1))>, Requires<[NotSlowIncDec]>;
+ (add (atomic_load_64 addr:$dst), (i64 1))>;
defm RELEASE_DEC : RELEASE_UNOP<
(add (atomic_load_8 addr:$dst), (i8 -1)),
(add (atomic_load_16 addr:$dst), (i16 -1)),
(add (atomic_load_32 addr:$dst), (i32 -1)),
- (add (atomic_load_64 addr:$dst), (i64 -1))>, Requires<[NotSlowIncDec]>;
+ (add (atomic_load_64 addr:$dst), (i64 -1))>;
}
/*
TODO: These don't work because the type inference of TableGen fails.
@@ -972,18 +1010,19 @@ defm RELEASE_NOT : RELEASE_UNOP<
(not (atomic_load_64 addr:$dst))>;
*/
+let SchedRW = [WriteMicrocoded] in {
def RELEASE_MOV8mi : I<0, Pseudo, (outs), (ins i8mem:$dst, i8imm:$src),
- "#RELEASE_MOV8mi PSEUDO!",
- [(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
+ "#RELEASE_MOV8mi PSEUDO!",
+ [(atomic_store_8 addr:$dst, (i8 imm:$src))]>;
def RELEASE_MOV16mi : I<0, Pseudo, (outs), (ins i16mem:$dst, i16imm:$src),
- "#RELEASE_MOV16mi PSEUDO!",
- [(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
+ "#RELEASE_MOV16mi PSEUDO!",
+ [(atomic_store_16 addr:$dst, (i16 imm:$src))]>;
def RELEASE_MOV32mi : I<0, Pseudo, (outs), (ins i32mem:$dst, i32imm:$src),
- "#RELEASE_MOV32mi PSEUDO!",
- [(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
+ "#RELEASE_MOV32mi PSEUDO!",
+ [(atomic_store_32 addr:$dst, (i32 imm:$src))]>;
def RELEASE_MOV64mi32 : I<0, Pseudo, (outs), (ins i64mem:$dst, i64i32imm:$src),
- "#RELEASE_MOV64mi32 PSEUDO!",
- [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
+ "#RELEASE_MOV64mi32 PSEUDO!",
+ [(atomic_store_64 addr:$dst, i64immSExt32:$src)]>;
def RELEASE_MOV8mr : I<0, Pseudo, (outs), (ins i8mem :$dst, GR8 :$src),
"#RELEASE_MOV8mr PSEUDO!",
@@ -1010,6 +1049,7 @@ def ACQUIRE_MOV32rm : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$src),
def ACQUIRE_MOV64rm : I<0, Pseudo, (outs GR64:$dst), (ins i64mem:$src),
"#ACQUIRE_MOV64rm PSEUDO!",
[(set GR64:$dst, (atomic_load_64 addr:$src))]>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// DAG Pattern Matching Rules
@@ -1239,18 +1279,20 @@ def : Pat<(i64 (anyext GR8 :$src)),
def : Pat<(i64 (anyext GR16:$src)),
(SUBREG_TO_REG (i64 0), (MOVZX32rr16 GR16 :$src), sub_32bit)>;
def : Pat<(i64 (anyext GR32:$src)),
- (SUBREG_TO_REG (i64 0), GR32:$src, sub_32bit)>;
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$src, sub_32bit)>;
// Any instruction that defines a 32-bit result leaves the high half of the
// register. Truncate can be lowered to EXTRACT_SUBREG. CopyFromReg may
// be copying from a truncate. Any other 32-bit operation will zero-extend
-// up to 64 bits.
+// up to 64 bits. AssertSext/AssertZext aren't saying anything about the upper
+// 32 bits, they're probably just qualifying a CopyFromReg.
def def32 : PatLeaf<(i32 GR32:$src), [{
return N->getOpcode() != ISD::TRUNCATE &&
N->getOpcode() != TargetOpcode::EXTRACT_SUBREG &&
N->getOpcode() != ISD::CopyFromReg &&
- N->getOpcode() != ISD::AssertSext;
+ N->getOpcode() != ISD::AssertSext &&
+ N->getOpcode() != ISD::AssertZext;
}]>;
// In the case of a 32-bit def that is known to implicitly zero-extend,
@@ -1397,16 +1439,11 @@ def : Pat<(and GR32:$src1, 0xffff),
(MOVZX32rr16 (EXTRACT_SUBREG GR32:$src1, sub_16bit))>;
// r & (2^8-1) ==> movz
def : Pat<(and GR32:$src1, 0xff),
- (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src1,
- GR32_ABCD)),
- sub_8bit))>,
- Requires<[Not64BitMode]>;
+ (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>;
// r & (2^8-1) ==> movz
def : Pat<(and GR16:$src1, 0xff),
- (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG
- (i16 (COPY_TO_REGCLASS GR16:$src1, GR16_ABCD)), sub_8bit)),
- sub_16bit)>,
- Requires<[Not64BitMode]>;
+ (EXTRACT_SUBREG (MOVZX32rr8 (EXTRACT_SUBREG GR16:$src1, sub_8bit)),
+ sub_16bit)>;
// r & (2^32-1) ==> movz
def : Pat<(and GR64:$src, 0x00000000FFFFFFFF),
@@ -1423,15 +1460,6 @@ def : Pat<(and GR64:$src, 0xff),
(SUBREG_TO_REG (i64 0),
(MOVZX32rr8 (i8 (EXTRACT_SUBREG GR64:$src, sub_8bit))),
sub_32bit)>;
-// r & (2^8-1) ==> movz
-def : Pat<(and GR32:$src1, 0xff),
- (MOVZX32rr8 (EXTRACT_SUBREG GR32:$src1, sub_8bit))>,
- Requires<[In64BitMode]>;
-// r & (2^8-1) ==> movz
-def : Pat<(and GR16:$src1, 0xff),
- (EXTRACT_SUBREG (MOVZX32rr8 (i8
- (EXTRACT_SUBREG GR16:$src1, sub_8bit))), sub_16bit)>,
- Requires<[In64BitMode]>;
} // AddedComplexity = 1
@@ -1439,16 +1467,11 @@ def : Pat<(and GR16:$src1, 0xff),
def : Pat<(sext_inreg GR32:$src, i16),
(MOVSX32rr16 (EXTRACT_SUBREG GR32:$src, sub_16bit))>;
def : Pat<(sext_inreg GR32:$src, i8),
- (MOVSX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
- GR32_ABCD)),
- sub_8bit))>,
- Requires<[Not64BitMode]>;
+ (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>;
def : Pat<(sext_inreg GR16:$src, i8),
- (EXTRACT_SUBREG (i32 (MOVSX32rr8 (EXTRACT_SUBREG
- (i32 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)), sub_8bit))),
- sub_16bit)>,
- Requires<[Not64BitMode]>;
+ (EXTRACT_SUBREG (MOVSX32rr8 (EXTRACT_SUBREG GR16:$src, sub_8bit)),
+ sub_16bit)>;
def : Pat<(sext_inreg GR64:$src, i32),
(MOVSX64rr32 (EXTRACT_SUBREG GR64:$src, sub_32bit))>;
@@ -1456,13 +1479,6 @@ def : Pat<(sext_inreg GR64:$src, i16),
(MOVSX64rr16 (EXTRACT_SUBREG GR64:$src, sub_16bit))>;
def : Pat<(sext_inreg GR64:$src, i8),
(MOVSX64rr8 (EXTRACT_SUBREG GR64:$src, sub_8bit))>;
-def : Pat<(sext_inreg GR32:$src, i8),
- (MOVSX32rr8 (EXTRACT_SUBREG GR32:$src, sub_8bit))>,
- Requires<[In64BitMode]>;
-def : Pat<(sext_inreg GR16:$src, i8),
- (EXTRACT_SUBREG (MOVSX32rr8
- (EXTRACT_SUBREG GR16:$src, sub_8bit)), sub_16bit)>,
- Requires<[In64BitMode]>;
// sext, sext_load, zext, zext_load
def: Pat<(i16 (sext GR8:$src)),
@@ -1500,44 +1516,26 @@ def : Pat<(i8 (trunc GR16:$src)),
// h-register tricks
def : Pat<(i8 (trunc (srl_su GR16:$src, (i8 8)))),
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)>,
+ (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
Requires<[Not64BitMode]>;
def : Pat<(i8 (trunc (srl_su (i32 (anyext GR16:$src)), (i8 8)))),
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)>,
+ (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)>,
Requires<[Not64BitMode]>;
def : Pat<(i8 (trunc (srl_su GR32:$src, (i8 8)))),
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
- sub_8bit_hi)>,
+ (EXTRACT_SUBREG GR32:$src, sub_8bit_hi)>,
Requires<[Not64BitMode]>;
def : Pat<(srl GR16:$src, (i8 8)),
(EXTRACT_SUBREG
- (MOVZX32rr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)),
- sub_16bit)>,
- Requires<[Not64BitMode]>;
+ (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
+ sub_16bit)>;
def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
- GR16_ABCD)),
- sub_8bit_hi))>,
- Requires<[Not64BitMode]>;
+ (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32rr8 (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src,
- GR16_ABCD)),
- sub_8bit_hi))>,
- Requires<[Not64BitMode]>;
+ (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>;
def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
- (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
- GR32_ABCD)),
- sub_8bit_hi))>,
- Requires<[Not64BitMode]>;
+ (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
- (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
- GR32_ABCD)),
- sub_8bit_hi))>,
- Requires<[Not64BitMode]>;
+ (MOVZX32_NOREXrr8 (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>;
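For reference, the transformation these h-register patterns perform is simply "read bits 15..8 as a byte register" instead of materializing a shift; a tiny sketch of the equivalence in plain C++ (highByte is a hypothetical stand-in for the sub_8bit_hi extract, not an LLVM API):

    #include <cassert>
    #include <cstdint>

    // The value the sub_8bit_hi extract (AH/BH/CH/DH) yields for a register.
    static uint8_t highByte(uint32_t x) { return uint8_t(x >> 8); }

    int main() {
      uint32_t eax = 0x12345678;
      assert(highByte(eax) == 0x56);                 // bits 15..8
      assert(((eax >> 8) & 0xff) == highByte(eax));  // same value as the shift+mask form
      return 0;
    }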
// h-register tricks.
// For now, be conservative on x86-64 and use an h-register extract only if the
@@ -1551,68 +1549,35 @@ def : Pat<(and (srl_su GR64:$src, (i8 8)), (i64 255)),
(SUBREG_TO_REG
(i64 0),
(MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
- sub_8bit_hi)),
+ (EXTRACT_SUBREG GR64:$src, sub_8bit_hi)),
sub_32bit)>;
-def : Pat<(and (srl_su GR32:$src, (i8 8)), (i32 255)),
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
- sub_8bit_hi))>,
- Requires<[In64BitMode]>;
-def : Pat<(srl (and_su GR32:$src, 0xff00), (i8 8)),
- (MOVZX32_NOREXrr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src,
- GR32_ABCD)),
- sub_8bit_hi))>,
- Requires<[In64BitMode]>;
-def : Pat<(srl GR16:$src, (i8 8)),
- (EXTRACT_SUBREG
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)),
- sub_16bit)>,
- Requires<[In64BitMode]>;
-def : Pat<(i32 (zext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi))>,
- Requires<[In64BitMode]>;
-def : Pat<(i32 (anyext (srl_su GR16:$src, (i8 8)))),
- (MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi))>,
- Requires<[In64BitMode]>;
def : Pat<(i64 (zext (srl_su GR16:$src, (i8 8)))),
(SUBREG_TO_REG
(i64 0),
(MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)),
+ (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_32bit)>;
def : Pat<(i64 (anyext (srl_su GR16:$src, (i8 8)))),
(SUBREG_TO_REG
(i64 0),
(MOVZX32_NOREXrr8
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi)),
+ (EXTRACT_SUBREG GR16:$src, sub_8bit_hi)),
sub_32bit)>;
// h-register extract and store.
def : Pat<(store (i8 (trunc_su (srl_su GR64:$src, (i8 8)))), addr:$dst),
(MOV8mr_NOREX
addr:$dst,
- (EXTRACT_SUBREG (i64 (COPY_TO_REGCLASS GR64:$src, GR64_ABCD)),
- sub_8bit_hi))>;
+ (EXTRACT_SUBREG GR64:$src, sub_8bit_hi))>;
def : Pat<(store (i8 (trunc_su (srl_su GR32:$src, (i8 8)))), addr:$dst),
(MOV8mr_NOREX
addr:$dst,
- (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS GR32:$src, GR32_ABCD)),
- sub_8bit_hi))>,
+ (EXTRACT_SUBREG GR32:$src, sub_8bit_hi))>,
Requires<[In64BitMode]>;
def : Pat<(store (i8 (trunc_su (srl_su GR16:$src, (i8 8)))), addr:$dst),
(MOV8mr_NOREX
addr:$dst,
- (EXTRACT_SUBREG (i16 (COPY_TO_REGCLASS GR16:$src, GR16_ABCD)),
- sub_8bit_hi))>,
+ (EXTRACT_SUBREG GR16:$src, sub_8bit_hi))>,
Requires<[In64BitMode]>;
@@ -1627,7 +1592,13 @@ def : Pat<(shl GR16:$src1, (i8 1)), (ADD16rr GR16:$src1, GR16:$src1)>;
def : Pat<(shl GR32:$src1, (i8 1)), (ADD32rr GR32:$src1, GR32:$src1)>;
def : Pat<(shl GR64:$src1, (i8 1)), (ADD64rr GR64:$src1, GR64:$src1)>;
-// Helper imms that check if a mask doesn't change significant shift bits.
+// Helper imms to check if a mask doesn't change significant shift/rotate bits.
+def immShift8 : ImmLeaf<i8, [{
+ return countTrailingOnes<uint64_t>(Imm) >= 3;
+}]>;
+def immShift16 : ImmLeaf<i8, [{
+ return countTrailingOnes<uint64_t>(Imm) >= 4;
+}]>;
def immShift32 : ImmLeaf<i8, [{
return countTrailingOnes<uint64_t>(Imm) >= 5;
}]>;
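The immShift* leaves rely on the hardware already masking the shift count: a 32-bit shift only consumes the low 5 bits of CL, so an explicit AND whose mask has at least 5 trailing one bits cannot change those bits and may be dropped. A minimal standalone sketch of that argument, with stand-in helper names (countTrailingOnes here is plain C++20 <bit>, not the LLVM utility):

    // Verify: if a mask has >= 5 trailing ones, ANDing a shift amount with it
    // cannot change the 5 bits a 32-bit x86 shift actually reads (the hardware
    // masks the count with 31), so the AND node is redundant.
    #include <bit>
    #include <cassert>
    #include <cstdint>

    static int countTrailingOnes(uint8_t v) { return std::countr_one(v); }

    // Model of what SHL32rCL does with its count register.
    static uint32_t shl32(uint32_t x, uint8_t amt) { return x << (amt & 31); }

    int main() {
      const uint8_t mask = 0x3f;            // 6 trailing ones, accepted by immShift32
      assert(countTrailingOnes(mask) >= 5);
      for (unsigned amt = 0; amt < 256; ++amt)
        for (uint32_t x : {0u, 1u, 0xdeadbeefu})
          assert(shl32(x, uint8_t(amt & mask)) == shl32(x, uint8_t(amt))); // AND is a no-op
      return 0;
    }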
@@ -1654,15 +1625,45 @@ multiclass MaskedShiftAmountPats<SDNode frag, string name> {
// (shift x (and y, 63)) ==> (shift x, y)
def : Pat<(frag GR64:$src1, (and CL, immShift64)),
(!cast<Instruction>(name # "64rCL") GR64:$src1)>;
- def : Pat<(store (frag (loadi64 addr:$dst), (and CL, 63)), addr:$dst),
+ def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
(!cast<Instruction>(name # "64mCL") addr:$dst)>;
}
defm : MaskedShiftAmountPats<shl, "SHL">;
defm : MaskedShiftAmountPats<srl, "SHR">;
defm : MaskedShiftAmountPats<sra, "SAR">;
-defm : MaskedShiftAmountPats<rotl, "ROL">;
-defm : MaskedShiftAmountPats<rotr, "ROR">;
+
+// ROL/ROR instructions allow a stronger mask optimization than shift for 8- and
+// 16-bit rotates. We can remove a mask of any (bitwidth - 1) on the rotation amount
+// because over-rotating produces the same result. This is noted in the Intel
+// docs with: "tempCOUNT <- (COUNT & COUNTMASK) MOD SIZE". Masking the rotation
+// amount could affect EFLAGS results, but that does not matter because we are
+// not tracking flags for these nodes.
+multiclass MaskedRotateAmountPats<SDNode frag, string name> {
+ // (rot x (and y, BitWidth - 1)) ==> (rot x, y)
+ def : Pat<(frag GR8:$src1, (and CL, immShift8)),
+ (!cast<Instruction>(name # "8rCL") GR8:$src1)>;
+ def : Pat<(frag GR16:$src1, (and CL, immShift16)),
+ (!cast<Instruction>(name # "16rCL") GR16:$src1)>;
+ def : Pat<(frag GR32:$src1, (and CL, immShift32)),
+ (!cast<Instruction>(name # "32rCL") GR32:$src1)>;
+ def : Pat<(store (frag (loadi8 addr:$dst), (and CL, immShift8)), addr:$dst),
+ (!cast<Instruction>(name # "8mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi16 addr:$dst), (and CL, immShift16)), addr:$dst),
+ (!cast<Instruction>(name # "16mCL") addr:$dst)>;
+ def : Pat<(store (frag (loadi32 addr:$dst), (and CL, immShift32)), addr:$dst),
+ (!cast<Instruction>(name # "32mCL") addr:$dst)>;
+
+ // (rot x (and y, 63)) ==> (rot x, y)
+ def : Pat<(frag GR64:$src1, (and CL, immShift64)),
+ (!cast<Instruction>(name # "64rCL") GR64:$src1)>;
+ def : Pat<(store (frag (loadi64 addr:$dst), (and CL, immShift64)), addr:$dst),
+ (!cast<Instruction>(name # "64mCL") addr:$dst)>;
+}
+
+defm : MaskedRotateAmountPats<rotl, "ROL">;
+defm : MaskedRotateAmountPats<rotr, "ROR">;
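The comment above claims over-rotation is harmless; a small self-contained check of that for the 8-bit case (rotl8 is a hypothetical helper mirroring the Intel pseudo-code quoted above):

    // Rotation is periodic in the bit width, so any mask whose low 3 bits are
    // ones (the immShift8 condition) leaves an 8-bit rotate amount unchanged
    // modulo 8, and the AND can be dropped.
    #include <cassert>
    #include <cstdint>

    static uint8_t rotl8(uint8_t x, unsigned amt) {
      amt %= 8;                       // "tempCOUNT <- (COUNT & COUNTMASK) MOD SIZE"
      return amt ? uint8_t((x << amt) | (x >> (8 - amt))) : x;
    }

    int main() {
      const unsigned mask = 0x1f;     // 5 trailing ones >= 3
      for (unsigned amt = 0; amt < 256; ++amt)
        for (uint8_t x : {uint8_t(0x01), uint8_t(0xb7)})
          assert(rotl8(x, amt & mask) == rotl8(x, amt)); // masking the amount is a no-op
      return 0;
    }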
// Double shift amount is implicitly masked.
multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
@@ -1680,6 +1681,66 @@ multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">;
defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
+let Predicates = [HasBMI2] in {
+ let AddedComplexity = 1 in {
+ def : Pat<(sra GR32:$src1, (and GR8:$src2, immShift32)),
+ (SARX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra GR64:$src1, (and GR8:$src2, immShift64)),
+ (SARX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl GR32:$src1, (and GR8:$src2, immShift32)),
+ (SHRX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl GR64:$src1, (and GR8:$src2, immShift64)),
+ (SHRX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl GR32:$src1, (and GR8:$src2, immShift32)),
+ (SHLX32rr GR32:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl GR64:$src1, (and GR8:$src2, immShift64)),
+ (SHLX64rr GR64:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ }
+
+ let AddedComplexity = -20 in {
+ def : Pat<(sra (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ (SARX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ (SARX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ (SHRX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ (SHRX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl (loadi32 addr:$src1), (and GR8:$src2, immShift32)),
+ (SHLX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl (loadi64 addr:$src1), (and GR8:$src2, immShift64)),
+ (SHLX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ }
+}
+
// (anyext (setcc_carry)) -> (setcc_carry)
def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
(SETB_C16r)>;
@@ -1821,7 +1882,7 @@ def : Pat<(mul (loadi64 addr:$src1), i64immSExt32:$src2),
// Increment/Decrement reg.
// Do not make INC/DEC if it is slow
-let Predicates = [NotSlowIncDec] in {
+let Predicates = [UseIncDec] in {
def : Pat<(add GR8:$src, 1), (INC8r GR8:$src)>;
def : Pat<(add GR16:$src, 1), (INC16r GR16:$src)>;
def : Pat<(add GR32:$src, 1), (INC32r GR32:$src)>;
diff --git a/lib/Target/X86/X86InstrControl.td b/lib/Target/X86/X86InstrControl.td
index 4ea223e82be9..5581fd462a1d 100644
--- a/lib/Target/X86/X86InstrControl.td
+++ b/lib/Target/X86/X86InstrControl.td
@@ -171,7 +171,7 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
"ljmp{w}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize16,
Sched<[WriteJumpLd]>;
def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaque48mem:$dst),
- "ljmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
+ "{l}jmp{l}\t{*}$dst", [], IIC_JMP_FAR_MEM>, OpSize32,
Sched<[WriteJumpLd]>;
}
@@ -191,7 +191,7 @@ let isCall = 1 in
// a use to prevent stack-pointer assignments that appear immediately
// before calls from potentially appearing dead. Uses for argument
// registers are added manually.
- let Uses = [ESP] in {
+ let Uses = [ESP, SSP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
(outs), (ins i32imm_pcrel:$dst),
"call{l}\t$dst", [], IIC_CALL_RI>, OpSize32,
@@ -233,7 +233,7 @@ let isCall = 1 in
"lcall{w}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize16,
Sched<[WriteJumpLd]>;
def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaque48mem:$dst),
- "lcall{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
+ "{l}call{l}\t{*}$dst", [], IIC_CALL_FAR_MEM>, OpSize32,
Sched<[WriteJumpLd]>;
}
@@ -241,11 +241,11 @@ let isCall = 1 in
// Tail call stuff.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
- let Uses = [ESP] in {
+ let Uses = [ESP, SSP] in {
def TCRETURNdi : PseudoI<(outs),
- (ins i32imm_pcrel:$dst, i32imm:$offset), []>;
+ (ins i32imm_pcrel:$dst, i32imm:$offset), []>, NotMemoryFoldable;
def TCRETURNri : PseudoI<(outs),
- (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable;
let mayLoad = 1 in
def TCRETURNmi : PseudoI<(outs),
(ins i32mem_TC:$dst, i32imm:$offset), []>;
@@ -268,7 +268,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
// rather than barriers, and they use EFLAGS.
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
- let Uses = [ESP, EFLAGS] in {
+ let Uses = [ESP, EFLAGS, SSP] in {
def TCRETURNdicc : PseudoI<(outs),
(ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;
@@ -287,7 +287,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
// RSP is marked as a use to prevent stack-pointer assignments that appear
// immediately before calls from potentially appearing dead. Uses for argument
// registers are added manually.
-let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
+let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
// NOTE: this pattern doesn't match "X86call imm", because we do not know
// that the offset between an arbitrary immediate and the call will fit in
// the 32-bit pcrel field that we have.
@@ -309,16 +309,16 @@ let isCall = 1, Uses = [RSP], SchedRW = [WriteJump] in {
}
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
- isCodeGenOnly = 1, Uses = [RSP], usesCustomInserter = 1,
+ isCodeGenOnly = 1, Uses = [RSP, SSP], usesCustomInserter = 1,
SchedRW = [WriteJump] in {
def TCRETURNdi64 : PseudoI<(outs),
(ins i64i32imm_pcrel:$dst, i32imm:$offset),
[]>;
def TCRETURNri64 : PseudoI<(outs),
- (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>;
+ (ins ptr_rc_tailcall:$dst, i32imm:$offset), []>, NotMemoryFoldable;
let mayLoad = 1 in
def TCRETURNmi64 : PseudoI<(outs),
- (ins i64mem_TC:$dst, i32imm:$offset), []>;
+ (ins i64mem_TC:$dst, i32imm:$offset), []>, NotMemoryFoldable;
def TAILJMPd64 : Ii32PCRel<0xE9, RawFrm, (outs), (ins i64i32imm_pcrel:$dst),
"jmp\t$dst", [], IIC_JMP_REL>;
@@ -345,7 +345,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
// rather than barriers, and they use EFLAGS.
let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
isCodeGenOnly = 1, SchedRW = [WriteJumpLd] in
- let Uses = [RSP, EFLAGS] in {
+ let Uses = [RSP, EFLAGS, SSP] in {
def TCRETURNdi64cc : PseudoI<(outs),
(ins i64i32imm_pcrel:$dst, i32imm:$offset,
i32imm:$cond), []>;
diff --git a/lib/Target/X86/X86InstrExtension.td b/lib/Target/X86/X86InstrExtension.td
index af43d9f53325..2a8ab0069b1e 100644
--- a/lib/Target/X86/X86InstrExtension.td
+++ b/lib/Target/X86/X86InstrExtension.td
@@ -12,32 +12,30 @@
//===----------------------------------------------------------------------===//
let hasSideEffects = 0 in {
- let Defs = [AX], Uses = [AL] in
+ let Defs = [AX], Uses = [AL] in // AX = signext(AL)
def CBW : I<0x98, RawFrm, (outs), (ins),
- "{cbtw|cbw}", [], IIC_CBW>, OpSize16; // AX = signext(AL)
- let Defs = [EAX], Uses = [AX] in
+ "{cbtw|cbw}", [], IIC_CBW>, OpSize16, Sched<[WriteALU]>;
+ let Defs = [EAX], Uses = [AX] in // EAX = signext(AX)
def CWDE : I<0x98, RawFrm, (outs), (ins),
- "{cwtl|cwde}", [], IIC_CBW>, OpSize32; // EAX = signext(AX)
+ "{cwtl|cwde}", [], IIC_CBW>, OpSize32, Sched<[WriteALU]>;
- let Defs = [AX,DX], Uses = [AX] in
+ let Defs = [AX,DX], Uses = [AX] in // DX:AX = signext(AX)
def CWD : I<0x99, RawFrm, (outs), (ins),
- "{cwtd|cwd}", [], IIC_CBW>, OpSize16; // DX:AX = signext(AX)
- let Defs = [EAX,EDX], Uses = [EAX] in
+ "{cwtd|cwd}", [], IIC_CBW>, OpSize16, Sched<[WriteALU]>;
+ let Defs = [EAX,EDX], Uses = [EAX] in // EDX:EAX = signext(EAX)
def CDQ : I<0x99, RawFrm, (outs), (ins),
- "{cltd|cdq}", [], IIC_CBW>, OpSize32; // EDX:EAX = signext(EAX)
+ "{cltd|cdq}", [], IIC_CBW>, OpSize32, Sched<[WriteALU]>;
- let Defs = [RAX], Uses = [EAX] in
+ let Defs = [RAX], Uses = [EAX] in // RAX = signext(EAX)
def CDQE : RI<0x98, RawFrm, (outs), (ins),
- "{cltq|cdqe}", [], IIC_CBW>; // RAX = signext(EAX)
+ "{cltq|cdqe}", [], IIC_CBW>, Sched<[WriteALU]>;
- let Defs = [RAX,RDX], Uses = [RAX] in
+ let Defs = [RAX,RDX], Uses = [RAX] in // RDX:RAX = signext(RAX)
def CQO : RI<0x99, RawFrm, (outs), (ins),
- "{cqto|cqo}", [], IIC_CBW>; // RDX:RAX = signext(RAX)
+ "{cqto|cqo}", [], IIC_CBW>, Sched<[WriteALU]>;
}
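The comments moved onto the "let" lines above all describe the same operation at different widths; for example CWD replicates the sign bit of AX into every bit of DX. A quick sketch of that effect in plain C++ (not tied to any LLVM API):

    #include <cassert>
    #include <cstdint>

    int main() {
      int16_t ax = -2;                          // 0xFFFE
      int32_t dx_ax = ax;                       // the value CWD leaves in DX:AX
      uint16_t dx = uint16_t(uint32_t(dx_ax) >> 16);
      assert(dx == 0xFFFF);                     // DX is all copies of the sign bit
      assert(uint16_t(dx_ax) == 0xFFFE);        // AX is unchanged
      return 0;
    }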
-
-
// Sign/Zero extenders
let hasSideEffects = 0 in {
def MOVSX16rr8 : I<0xBE, MRMSrcReg, (outs GR16:$dst), (ins GR8:$src),
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 3a3cdc9fa574..35fa45590fc6 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -15,8 +15,8 @@
// FMA3 - Intel 3 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
-// For all FMA opcodes declared in fma3p_rm and fma3s_rm milticlasses defined
-// below, both the register and memory variants are commutable.
+// For all FMA opcodes declared in fma3p_rm_* and fma3s_rm_* multiclasses
+// defined below, both the register and memory variants are commutable.
// For the register form the commutable operands are 1, 2 and 3.
// For the memory variant the folded operand must be in 3. Thus,
// in that case, only the operands 1 and 2 can be swapped.
@@ -34,56 +34,87 @@
// operands 1 and 3 (register forms only): *231* --> *213*;
// operands 2 and 3 (register forms only): *231* --> *231*(no changes).
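The two commutation rules quoted above can be checked numerically with the usual FMA3 operand semantics (src1 tied to dst: 132 = src1*src3 + src2, 213 = src2*src1 + src3, 231 = src2*src3 + src1). A sketch, assuming those semantics:

    #include <cassert>

    static double fma213(double s1, double s2, double s3) { return s2 * s1 + s3; }
    static double fma231(double s1, double s2, double s3) { return s2 * s3 + s1; }

    int main() {
      double a = 2.0, b = 3.0, c = 5.0;
      // operands 1 and 3 swapped: *231* --> *213*
      assert(fma231(c, b, a) == fma213(a, b, c));
      // operands 2 and 3 swapped: *231* --> *231* (multiplication commutes)
      assert(fma231(a, c, b) == fma231(a, b, c));
      return 0;
    }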
-let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
-multiclass fma3p_rm<bits<8> opc, string OpcodeStr,
- PatFrag MemFrag128, PatFrag MemFrag256,
- ValueType OpVT128, ValueType OpVT256,
- SDPatternOperator Op = null_frag> {
- def r : FMA3<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, VR128:$src3),
+multiclass fma3p_rm_213<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
+ SDNode Op> {
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (VT (Op RC:$src2, RC:$src1, RC:$src3)))]>,
+ Sched<[WriteFMA]>;
+
+ let mayLoad = 1 in
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (OpVT128 (Op VR128:$src2,
- VR128:$src1, VR128:$src3)))]>;
+ [(set RC:$dst, (VT (Op RC:$src2, RC:$src1,
+ (MemFrag addr:$src3))))]>,
+ Sched<[WriteFMALd, ReadAfterLd]>;
+}
+
+multiclass fma3p_rm_231<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
+ SDNode Op> {
+ let hasSideEffects = 0 in
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[WriteFMA]>;
let mayLoad = 1 in
- def m : FMA3<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, f128mem:$src3),
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR128:$dst, (OpVT128 (Op VR128:$src2, VR128:$src1,
- (MemFrag128 addr:$src3))))]>;
+ [(set RC:$dst, (VT (Op RC:$src2, (MemFrag addr:$src3),
+ RC:$src1)))]>, Sched<[WriteFMALd, ReadAfterLd]>;
+}
- def Yr : FMA3<opc, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, VR256:$src3),
+multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
+ ValueType VT, X86MemOperand x86memop, PatFrag MemFrag,
+ SDNode Op> {
+ let hasSideEffects = 0 in
+ def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst, (OpVT256 (Op VR256:$src2, VR256:$src1,
- VR256:$src3)))]>, VEX_L;
+ []>, Sched<[WriteFMA]>;
+  // Pattern is in 312 order so that the load is in a different place from the
+  // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
let mayLoad = 1 in
- def Ym : FMA3<opc, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, f256mem:$src3),
+ def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR256:$dst,
- (OpVT256 (Op VR256:$src2, VR256:$src1,
- (MemFrag256 addr:$src3))))]>, VEX_L;
+ [(set RC:$dst, (VT (Op (MemFrag addr:$src3), RC:$src1,
+ RC:$src2)))]>, Sched<[WriteFMALd, ReadAfterLd]>;
}
+let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy, string Suff,
PatFrag MemFrag128, PatFrag MemFrag256,
SDNode Op, ValueType OpTy128, ValueType OpTy256> {
- defm NAME#213#Suff : fma3p_rm<opc213,
- !strconcat(OpcodeStr, "213", PackTy),
- MemFrag128, MemFrag256, OpTy128, OpTy256, Op>;
- defm NAME#132#Suff : fma3p_rm<opc132,
- !strconcat(OpcodeStr, "132", PackTy),
- MemFrag128, MemFrag256, OpTy128, OpTy256>;
- defm NAME#231#Suff : fma3p_rm<opc231,
- !strconcat(OpcodeStr, "231", PackTy),
- MemFrag128, MemFrag256, OpTy128, OpTy256>;
+ defm NAME#213#Suff : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
+ VR128, OpTy128, f128mem, MemFrag128, Op>;
+ defm NAME#231#Suff : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
+ VR128, OpTy128, f128mem, MemFrag128, Op>;
+ defm NAME#132#Suff : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
+ VR128, OpTy128, f128mem, MemFrag128, Op>;
+
+ defm NAME#213#Suff#Y : fma3p_rm_213<opc213, !strconcat(OpcodeStr, "213", PackTy),
+ VR256, OpTy256, f256mem, MemFrag256, Op>,
+ VEX_L;
+ defm NAME#231#Suff#Y : fma3p_rm_231<opc231, !strconcat(OpcodeStr, "231", PackTy),
+ VR256, OpTy256, f256mem, MemFrag256, Op>,
+ VEX_L;
+ defm NAME#132#Suff#Y : fma3p_rm_132<opc132, !strconcat(OpcodeStr, "132", PackTy),
+ VR256, OpTy256, f256mem, MemFrag256, Op>,
+ VEX_L;
}
// Fused Multiply-Add
@@ -93,11 +124,9 @@ let ExeDomain = SSEPackedSingle in {
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32>;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmaddsub,
- v4f32, v8f32>;
+ loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32>;
defm VFMSUBADD : fma3p_forms<0x97, 0xA7, 0xB7, "vfmsubadd", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmsubadd,
- v4f32, v8f32>;
+ loadv4f32, loadv8f32, X86Fmsubadd, v4f32, v8f32>;
}
let ExeDomain = SSEPackedDouble in {
@@ -138,23 +167,79 @@ let ExeDomain = SSEPackedDouble in {
// FMA*231* reg2, reg1, reg3; // reg1 * reg3 + reg2;
// Please see more detailed comment at the very beginning of the section
// defining FMA3 opcodes above.
-let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
-multiclass fma3s_rm<bits<8> opc, string OpcodeStr,
- X86MemOperand x86memop, RegisterClass RC,
- SDPatternOperator OpNode = null_frag> {
- def r : FMA3<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, RC:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>;
+multiclass fma3s_rm_213<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ SDPatternOperator OpNode> {
+ def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst, (OpNode RC:$src2, RC:$src1, RC:$src3))]>,
+ Sched<[WriteFMA]>;
let mayLoad = 1 in
- def m : FMA3<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, x86memop:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set RC:$dst,
- (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>;
+ def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src2, RC:$src1, (load addr:$src3)))]>,
+ Sched<[WriteFMALd, ReadAfterLd]>;
+}
+
+multiclass fma3s_rm_231<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ SDPatternOperator OpNode> {
+ let hasSideEffects = 0 in
+ def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[WriteFMA]>;
+
+ let mayLoad = 1 in
+ def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode RC:$src2, (load addr:$src3), RC:$src1))]>,
+ Sched<[WriteFMALd, ReadAfterLd]>;
+}
+
+multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
+ X86MemOperand x86memop, RegisterClass RC,
+ SDPatternOperator OpNode> {
+ let hasSideEffects = 0 in
+ def r : FMA3S<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[WriteFMA]>;
+
+  // Pattern is in 312 order so that the load is in a different place from the
+  // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
+ let mayLoad = 1 in
+ def m : FMA3S<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, x86memop:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ [(set RC:$dst,
+ (OpNode (load addr:$src3), RC:$src1, RC:$src2))]>,
+ Sched<[WriteFMALd, ReadAfterLd]>;
+}
+
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
+ string OpStr, string PackTy, string Suff,
+ SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop> {
+ defm NAME#213#Suff : fma3s_rm_213<opc213, !strconcat(OpStr, "213", PackTy),
+ x86memop, RC, OpNode>;
+ defm NAME#231#Suff : fma3s_rm_231<opc231, !strconcat(OpStr, "231", PackTy),
+ x86memop, RC, OpNode>;
+ defm NAME#132#Suff : fma3s_rm_132<opc132, !strconcat(OpStr, "132", PackTy),
+ x86memop, RC, OpNode>;
}
// These FMA*_Int instructions are defined specially for being used when
@@ -174,32 +259,18 @@ let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
hasSideEffects = 0 in
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
Operand memopr, RegisterClass RC> {
- def r_Int : FMA3<opc, MRMSrcReg, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, RC:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>;
+ def r_Int : FMA3S_Int<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, RC:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[WriteFMA]>;
let mayLoad = 1 in
- def m_Int : FMA3<opc, MRMSrcMem, (outs RC:$dst),
- (ins RC:$src1, RC:$src2, memopr:$src3),
- !strconcat(OpcodeStr,
- "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- []>;
-}
-
-multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, string PackTy, string Suff,
- SDNode OpNode, RegisterClass RC,
- X86MemOperand x86memop> {
- let Predicates = [HasFMA, NoAVX512] in {
- defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
- x86memop, RC>;
- defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
- x86memop, RC, OpNode>;
- defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
- x86memop, RC>;
- }
+ def m_Int : FMA3S_Int<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, memopr:$src3),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ []>, Sched<[WriteFMALd, ReadAfterLd]>;
}
// The FMA 213 form is created for lowering of scalar FMA intrinsics
@@ -223,8 +294,7 @@ multiclass fma3s_int_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
}
multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
- string OpStr, Intrinsic IntF32, Intrinsic IntF64,
- SDNode OpNode> {
+ string OpStr, SDNode OpNodeIntrin, SDNode OpNode> {
let ExeDomain = SSEPackedSingle in
defm NAME : fma3s_forms<opc132, opc213, opc231, OpStr, "ss", "SS", OpNode,
FR32, f32mem>,
@@ -242,26 +312,44 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// This is because src1 is tied to dest, and the scalar intrinsics
// require the pass-through values to come from the first source
// operand, not the second.
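A sketch of the constraint described above, assuming the conventional vfmadd213ss semantics where the destination's upper lanes come from the operand tied to dst (the first source); this models the intended behavior, not the actual lowering code:

    #include <array>
    #include <cassert>

    using V4 = std::array<float, 4>;

    // Hypothetical model of the 213SS_Int instruction: lane 0 does the FMA,
    // lanes 1..3 pass through the first source (which is tied to dst).
    static V4 fmadd213ss(V4 src1, V4 src2, V4 src3) {
      V4 dst = src1;
      dst[0] = src2[0] * src1[0] + src3[0];
      return dst;
    }

    int main() {
      V4 a{1, 10, 20, 30}, b{2, -1, -1, -1}, c{3, -2, -2, -2};
      V4 r = fmadd213ss(a, b, c);
      assert(r[0] == 2 * 1 + 3);                        // the scalar FMA
      assert(r[1] == 10 && r[2] == 20 && r[3] == 30);   // pass-through from src1
      return 0;
    }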
- let Predicates = [HasFMA] in {
- def : Pat<(IntF32 VR128:$src1, VR128:$src2, VR128:$src3),
- (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SSr_Int")
- $src1, $src2, $src3), VR128)>;
-
- def : Pat<(IntF64 VR128:$src1, VR128:$src2, VR128:$src3),
- (COPY_TO_REGCLASS(!cast<Instruction>(NAME#"213SDr_Int")
- $src1, $src2, $src3), VR128)>;
+ let Predicates = [HasFMA, NoAVX512] in {
+ def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
+ (!cast<Instruction>(NAME#"213SSr_Int")
+ VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+ def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2, VR128:$src3)),
+ (!cast<Instruction>(NAME#"213SDr_Int")
+ VR128:$src1, VR128:$src2, VR128:$src3)>;
+
+ def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, VR128:$src2,
+ sse_load_f32:$src3)),
+ (!cast<Instruction>(NAME#"213SSm_Int")
+ VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
+
+ def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, VR128:$src2,
+ sse_load_f64:$src3)),
+ (!cast<Instruction>(NAME#"213SDm_Int")
+ VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
+
+ def : Pat<(v4f32 (OpNodeIntrin VR128:$src1, sse_load_f32:$src3,
+ VR128:$src2)),
+ (!cast<Instruction>(NAME#"132SSm_Int")
+ VR128:$src1, VR128:$src2, sse_load_f32:$src3)>;
+
+ def : Pat<(v2f64 (OpNodeIntrin VR128:$src1, sse_load_f64:$src3,
+ VR128:$src2)),
+ (!cast<Instruction>(NAME#"132SDm_Int")
+ VR128:$src1, VR128:$src2, sse_load_f64:$src3)>;
}
}
-defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", int_x86_fma_vfmadd_ss,
- int_x86_fma_vfmadd_sd, X86Fmadd>, VEX_LIG;
-defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", int_x86_fma_vfmsub_ss,
- int_x86_fma_vfmsub_sd, X86Fmsub>, VEX_LIG;
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadds1, X86Fmadd>, VEX_LIG;
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsubs1, X86Fmsub>, VEX_LIG;
-defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", int_x86_fma_vfnmadd_ss,
- int_x86_fma_vfnmadd_sd, X86Fnmadd>, VEX_LIG;
-defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", int_x86_fma_vfnmsub_ss,
- int_x86_fma_vfnmsub_sd, X86Fnmsub>, VEX_LIG;
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadds1, X86Fnmadd>,
+ VEX_LIG;
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsubs1, X86Fnmsub>,
+ VEX_LIG;
//===----------------------------------------------------------------------===//
@@ -273,60 +361,66 @@ multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
PatFrag mem_frag> {
let isCommutable = 1 in
- def rr : FMA4<opc, MRMSrcRegOp4, (outs RC:$dst),
+ def rr : FMA4S<opc, MRMSrcRegOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
- (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG;
- def rm : FMA4<opc, MRMSrcMemOp4, (outs RC:$dst),
+ (OpVT (OpNode RC:$src1, RC:$src2, RC:$src3)))]>, VEX_W, VEX_LIG,
+ Sched<[WriteFMA]>;
+ def rm : FMA4S<opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (OpNode RC:$src1, RC:$src2,
- (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG;
- def mr : FMA4<opc, MRMSrcMem, (outs RC:$dst),
+ (mem_frag addr:$src3)))]>, VEX_W, VEX_LIG,
+ Sched<[WriteFMALd, ReadAfterLd]>;
+ def mr : FMA4S<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst,
- (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG;
+ (OpNode RC:$src1, (mem_frag addr:$src2), RC:$src3))]>, VEX_LIG,
+ Sched<[WriteFMALd, ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
- def rr_REV : FMA4<opc, MRMSrcReg, (outs RC:$dst),
+ def rr_REV : FMA4S<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- VEX_LIG, FoldGenData<NAME#rr>;
+ VEX_LIG, FoldGenData<NAME#rr>, Sched<[WriteFMA]>;
}
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
- ComplexPattern mem_cpat, Intrinsic Int> {
+ ValueType VT, ComplexPattern mem_cpat, SDNode OpNode> {
let isCodeGenOnly = 1 in {
- def rr_Int : FMA4<opc, MRMSrcRegOp4, (outs VR128:$dst),
+ def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, VEX_W, VEX_LIG;
- def rm_Int : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
+ (VT (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>, VEX_W,
+ VEX_LIG, Sched<[WriteFMA]>;
+ def rm_Int : FMA4S_Int<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set VR128:$dst, (Int VR128:$src1, VR128:$src2,
- mem_cpat:$src3))]>, VEX_W, VEX_LIG;
- def mr_Int : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
+ [(set VR128:$dst, (VT (OpNode VR128:$src1, VR128:$src2,
+ mem_cpat:$src3)))]>, VEX_W, VEX_LIG,
+ Sched<[WriteFMALd, ReadAfterLd]>;
+ def mr_Int : FMA4S_Int<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, mem_cpat:$src2, VR128:$src3))]>, VEX_LIG;
+ (VT (OpNode VR128:$src1, mem_cpat:$src2, VR128:$src3)))]>,
+ VEX_LIG, Sched<[WriteFMALd, ReadAfterLd]>;
let hasSideEffects = 0 in
- def rr_Int_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
+ def rr_Int_REV : FMA4S_Int<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, VEX_LIG, FoldGenData<NAME#rr_Int>;
+ []>, VEX_LIG, FoldGenData<NAME#rr_Int>, Sched<[WriteFMA]>;
} // isCodeGenOnly = 1
}
@@ -340,19 +434,21 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(OpVT128 (OpNode VR128:$src1, VR128:$src2, VR128:$src3)))]>,
- VEX_W;
+ VEX_W, Sched<[WriteFMA]>;
def rm : FMA4<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, f128mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst, (OpNode VR128:$src1, VR128:$src2,
- (ld_frag128 addr:$src3)))]>, VEX_W;
+ (ld_frag128 addr:$src3)))]>, VEX_W,
+ Sched<[WriteFMALd, ReadAfterLd]>;
def mr : FMA4<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>;
+ (OpNode VR128:$src1, (ld_frag128 addr:$src2), VR128:$src3))]>,
+ Sched<[WriteFMALd, ReadAfterLd]>;
let isCommutable = 1 in
def Yrr : FMA4<opc, MRMSrcRegOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
@@ -360,50 +456,52 @@ multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst,
(OpVT256 (OpNode VR256:$src1, VR256:$src2, VR256:$src3)))]>,
- VEX_W, VEX_L;
+ VEX_W, VEX_L, Sched<[WriteFMA]>;
def Yrm : FMA4<opc, MRMSrcMemOp4, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, f256mem:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (OpNode VR256:$src1, VR256:$src2,
- (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L;
+ (ld_frag256 addr:$src3)))]>, VEX_W, VEX_L,
+ Sched<[WriteFMALd, ReadAfterLd]>;
def Ymr : FMA4<opc, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR256:$dst, (OpNode VR256:$src1,
- (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L;
+ (ld_frag256 addr:$src2), VR256:$src3))]>, VEX_L,
+ Sched<[WriteFMALd, ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
def rr_REV : FMA4<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- FoldGenData<NAME#rr>;
+ Sched<[WriteFMA]>, FoldGenData<NAME#rr>;
def Yrr_REV : FMA4<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, VR256:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), []>,
- VEX_L, FoldGenData<NAME#Yrr>;
+ VEX_L, Sched<[WriteFMA]>, FoldGenData<NAME#Yrr>;
} // isCodeGenOnly = 1
}
let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32>,
- fma4s_int<0x6A, "vfmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfmadd_ss>;
+ fma4s_int<0x6A, "vfmaddss", ssmem, v4f32, sse_load_f32,
+ X86Fmadd4s>;
defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32>,
- fma4s_int<0x6E, "vfmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfmsub_ss>;
+ fma4s_int<0x6E, "vfmsubss", ssmem, v4f32, sse_load_f32,
+ X86Fmsub4s>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
X86Fnmadd, loadf32>,
- fma4s_int<0x7A, "vfnmaddss", ssmem, sse_load_f32,
- int_x86_fma_vfnmadd_ss>;
+ fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32, sse_load_f32,
+ X86Fnmadd4s>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
X86Fnmsub, loadf32>,
- fma4s_int<0x7E, "vfnmsubss", ssmem, sse_load_f32,
- int_x86_fma_vfnmsub_ss>;
+ fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32, sse_load_f32,
+ X86Fnmsub4s>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32>;
@@ -422,19 +520,19 @@ let ExeDomain = SSEPackedSingle in {
let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64>,
- fma4s_int<0x6B, "vfmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfmadd_sd>;
+ fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64, sse_load_f64,
+ X86Fmadd4s>;
defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64>,
- fma4s_int<0x6F, "vfmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfmsub_sd>;
+ fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64, sse_load_f64,
+ X86Fmsub4s>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
X86Fnmadd, loadf64>,
- fma4s_int<0x7B, "vfnmaddsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmadd_sd>;
+ fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64, sse_load_f64,
+ X86Fnmadd4s>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
X86Fnmsub, loadf64>,
- fma4s_int<0x7F, "vfnmsubsd", sdmem, sse_load_f64,
- int_x86_fma_vfnmsub_sd>;
+ fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64, sse_load_f64,
+ X86Fnmsub4s>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64>;
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index 11b1d070ef2f..7e89a4111d86 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -57,24 +57,24 @@ def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
// FPStack pattern fragments
//===----------------------------------------------------------------------===//
-def fpimm0 : PatLeaf<(fpimm), [{
- return N->isExactlyValue(+0.0);
+def fpimm0 : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(+0.0);
}]>;
-def fpimmneg0 : PatLeaf<(fpimm), [{
- return N->isExactlyValue(-0.0);
+def fpimmneg0 : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(-0.0);
}]>;
-def fpimm1 : PatLeaf<(fpimm), [{
- return N->isExactlyValue(+1.0);
+def fpimm1 : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(+1.0);
}]>;
-def fpimmneg1 : PatLeaf<(fpimm), [{
- return N->isExactlyValue(-1.0);
+def fpimmneg1 : FPImmLeaf<fAny, [{
+ return Imm.isExactlyValue(-1.0);
}]>;
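One reason these leaves test the exact value rather than comparing doubles: +0.0 and -0.0 compare equal, so fpimm0 and fpimmneg0 could not be told apart with ==. A minimal illustration (std::signbit stands in for the sign-sensitive check here; this is not the APFloat API):

    #include <cassert>
    #include <cmath>

    int main() {
      double pz = +0.0, nz = -0.0;
      assert(pz == nz);                              // == cannot distinguish the zeros
      assert(!std::signbit(pz) && std::signbit(nz)); // an exact-value check can
      return 0;
    }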
-// Some 'special' instructions
-let usesCustomInserter = 1 in { // Expanded after instruction selection.
+// Some 'special' instructions - expanded after instruction selection.
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
def FP32_TO_INT16_IN_MEM : PseudoI<(outs), (ins i16mem:$dst, RFP32:$src),
[(X86fp_to_i16mem RFP32:$src, addr:$dst)]>;
def FP32_TO_INT32_IN_MEM : PseudoI<(outs), (ins i32mem:$dst, RFP32:$src),
@@ -118,10 +118,12 @@ let usesCustomInserter = 1 in { // Expanded after instruction selection.
// f32 instructions can use SSE1 and are predicated on FPStackf32 == !SSE1.
// f64 instructions can use SSE2 and are predicated on FPStackf64 == !SSE2.
// f80 instructions cannot use SSE and use neither of these.
-class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
- FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32]>;
-class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
- FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64]>;
+class FpIf32<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+ InstrItinClass itin = NoItinerary> :
+ FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf32]>;
+class FpIf64<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+ InstrItinClass itin = NoItinerary> :
+ FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf64]>;
// Factoring for arithmetic.
multiclass FPBinary_rr<SDNode OpNode> {
@@ -235,24 +237,29 @@ def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
let Defs = [FPSW] in {
// FPBinary_rr just defines pseudo-instructions, no need to set scheduling
// resources.
+let hasNoSchedulingInfo = 1 in {
defm ADD : FPBinary_rr<fadd>;
defm SUB : FPBinary_rr<fsub>;
defm MUL : FPBinary_rr<fmul>;
defm DIV : FPBinary_rr<fdiv>;
+}
+
// Sets the scheduling resources for the actual NAME#_F<size>m definitions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<fadd, MRM0m, "add">;
defm SUB : FPBinary<fsub, MRM4m, "sub">;
defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
}
+
let SchedRW = [WriteFMulLd] in {
defm MUL : FPBinary<fmul, MRM1m, "mul">;
}
+
let SchedRW = [WriteFDivLd] in {
defm DIV : FPBinary<fdiv, MRM6m, "div">;
defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>;
}
-}
+} // Defs = [FPSW]
class FPST0rInst<Format fp, string asm>
: FPI<0xD8, fp, (outs), (ins RST:$op), asm>;
@@ -274,6 +281,8 @@ def SUB_FPrST0 : FPrST0PInst<MRM5r, "fsub{r}p\t$op">;
def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t$op">;
def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st(0), $op|$op, st(0)}">;
def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t$op">;
+def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
+def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
} // SchedRW
let SchedRW = [WriteFMul] in {
def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t$op">;
@@ -289,84 +298,98 @@ def DIVR_FrST0 : FPrST0Inst <MRM6r, "fdiv{|r}\t{%st(0), $op|$op, st(0)}">;
def DIVR_FPrST0 : FPrST0PInst<MRM6r, "fdiv{|r}p\t$op">;
} // SchedRW
-def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
-def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
-
// Unary operations.
-multiclass FPUnary<SDNode OpNode, Format fp, string asmstring> {
+multiclass FPUnary<SDNode OpNode, Format fp, string asmstring,
+ InstrItinClass itin> {
def _Fp32 : FpIf32<(outs RFP32:$dst), (ins RFP32:$src), OneArgFPRW,
- [(set RFP32:$dst, (OpNode RFP32:$src))]>;
+ [(set RFP32:$dst, (OpNode RFP32:$src))], itin>;
def _Fp64 : FpIf64<(outs RFP64:$dst), (ins RFP64:$src), OneArgFPRW,
- [(set RFP64:$dst, (OpNode RFP64:$src))]>;
+ [(set RFP64:$dst, (OpNode RFP64:$src))], itin>;
def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
- [(set RFP80:$dst, (OpNode RFP80:$src))]>;
-def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
+ [(set RFP80:$dst, (OpNode RFP80:$src))], itin>;
+def _F : FPI<0xD9, fp, (outs), (ins), asmstring, itin>;
}
let Defs = [FPSW] in {
-defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
-defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
-let SchedRW = [WriteFSqrt] in {
-defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">;
+
+let SchedRW = [WriteVecLogic] in {
+defm CHS : FPUnary<fneg, MRM_E0, "fchs", IIC_FSIGN>;
+defm ABS : FPUnary<fabs, MRM_E1, "fabs", IIC_FSIGN>;
+}
+
+let SchedRW = [WriteFSqrt] in
+defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt", IIC_FSQRT>;
+
+let SchedRW = [WriteMicrocoded] in {
+defm SIN : FPUnary<fsin, MRM_FE, "fsin", IIC_FSINCOS>;
+defm COS : FPUnary<fcos, MRM_FF, "fcos", IIC_FSINCOS>;
}
-defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
-defm COS : FPUnary<fcos, MRM_FF, "fcos">;
+let SchedRW = [WriteFAdd] in {
let hasSideEffects = 0 in {
def TST_Fp32 : FpIf32<(outs), (ins RFP32:$src), OneArgFP, []>;
def TST_Fp64 : FpIf64<(outs), (ins RFP64:$src), OneArgFP, []>;
def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
-}
-def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
+} // hasSideEffects
+
+def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst", IIC_FCOMI>;
+} // SchedRW
} // Defs = [FPSW]
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
+let SchedRW = [WriteFAddLd] in {
def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
-def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
-def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">;
+def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
+def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+
+def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
+def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
def FICOM32m : FPI<0xDA, MRM2m, (outs), (ins i32mem:$src), "ficom{l}\t$src">;
def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
+} // SchedRW
-def FCOM64m : FPI<0xDC, MRM2m, (outs), (ins f64mem:$src), "fcom{l}\t$src">;
-def FCOMP64m : FPI<0xDC, MRM3m, (outs), (ins f64mem:$src), "fcomp{l}\t$src">;
+let SchedRW = [WriteMicrocoded] in {
+def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
+def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">;
def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">;
def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">;
def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">;
-def FICOM16m : FPI<0xDE, MRM2m, (outs), (ins i16mem:$src), "ficom{s}\t$src">;
-def FICOMP16m: FPI<0xDE, MRM3m, (outs), (ins i16mem:$src), "ficomp{s}\t$src">;
-
def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
+} // SchedRW
// Floating point cmovs.
-class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
- FpI_<outs, ins, fp, pattern>, Requires<[FPStackf32, HasCMov]>;
-class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern> :
- FpI_<outs, ins, fp, pattern>, Requires<[FPStackf64, HasCMov]>;
+class FpIf32CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+ InstrItinClass itin> :
+ FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf32, HasCMov]>;
+class FpIf64CMov<dag outs, dag ins, FPFormat fp, list<dag> pattern,
+ InstrItinClass itin> :
+ FpI_<outs, ins, fp, pattern, itin>, Requires<[FPStackf64, HasCMov]>;
multiclass FPCMov<PatLeaf cc> {
def _Fp32 : FpIf32CMov<(outs RFP32:$dst), (ins RFP32:$src1, RFP32:$src2),
CondMovFP,
[(set RFP32:$dst, (X86cmov RFP32:$src1, RFP32:$src2,
- cc, EFLAGS))]>;
+ cc, EFLAGS))], IIC_FCMOV>;
def _Fp64 : FpIf64CMov<(outs RFP64:$dst), (ins RFP64:$src1, RFP64:$src2),
CondMovFP,
[(set RFP64:$dst, (X86cmov RFP64:$src1, RFP64:$src2,
- cc, EFLAGS))]>;
+ cc, EFLAGS))], IIC_FCMOV>;
def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src1, RFP80:$src2),
CondMovFP,
[(set RFP80:$dst, (X86cmov RFP80:$src1, RFP80:$src2,
- cc, EFLAGS))]>,
+ cc, EFLAGS))], IIC_FCMOV>,
Requires<[HasCMov]>;
}
let Defs = [FPSW] in {
+let SchedRW = [WriteFAdd] in {
let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
defm CMOVB : FPCMov<X86_COND_B>;
defm CMOVBE : FPCMov<X86_COND_BE>;
@@ -381,24 +404,26 @@ defm CMOVNP : FPCMov<X86_COND_NP>;
let Predicates = [HasCMov] in {
// These are not factored because there's no clean way to pass DA/DB.
def CMOVB_F : FPI<0xDA, MRM0r, (outs), (ins RST:$op),
- "fcmovb\t{$op, %st(0)|st(0), $op}">;
+ "fcmovb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVBE_F : FPI<0xDA, MRM2r, (outs), (ins RST:$op),
- "fcmovbe\t{$op, %st(0)|st(0), $op}">;
+ "fcmovbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVE_F : FPI<0xDA, MRM1r, (outs), (ins RST:$op),
- "fcmove\t{$op, %st(0)|st(0), $op}">;
+ "fcmove\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVP_F : FPI<0xDA, MRM3r, (outs), (ins RST:$op),
- "fcmovu\t{$op, %st(0)|st(0), $op}">;
+ "fcmovu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVNB_F : FPI<0xDB, MRM0r, (outs), (ins RST:$op),
- "fcmovnb\t{$op, %st(0)|st(0), $op}">;
+ "fcmovnb\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVNBE_F: FPI<0xDB, MRM2r, (outs), (ins RST:$op),
- "fcmovnbe\t{$op, %st(0)|st(0), $op}">;
+ "fcmovnbe\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVNE_F : FPI<0xDB, MRM1r, (outs), (ins RST:$op),
- "fcmovne\t{$op, %st(0)|st(0), $op}">;
+ "fcmovne\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RST:$op),
- "fcmovnu\t{$op, %st(0)|st(0), $op}">;
+ "fcmovnu\t{$op, %st(0)|st(0), $op}", IIC_FCMOV>;
} // Predicates = [HasCMov]
+} // SchedRW
// Floating point loads & stores.
+let SchedRW = [WriteLoad] in {
let canFoldAsLoad = 1 in {
def LD_Fp32m : FpIf32<(outs RFP32:$dst), (ins f32mem:$src), ZeroArgFP,
[(set RFP32:$dst, (loadf32 addr:$src))]>;
@@ -407,7 +432,7 @@ let isReMaterializable = 1 in
[(set RFP64:$dst, (loadf64 addr:$src))]>;
def LD_Fp80m : FpI_<(outs RFP80:$dst), (ins f80mem:$src), ZeroArgFP,
[(set RFP80:$dst, (loadf80 addr:$src))]>;
-}
+} // canFoldAsLoad
def LD_Fp32m64 : FpIf64<(outs RFP64:$dst), (ins f32mem:$src), ZeroArgFP,
[(set RFP64:$dst, (f64 (extloadf32 addr:$src)))]>;
def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
@@ -432,7 +457,9 @@ def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
[(set RFP80:$dst, (X86fild addr:$src, i32))]>;
def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
[(set RFP80:$dst, (X86fild addr:$src, i64))]>;
+} // SchedRW
+let SchedRW = [WriteStore] in {
def ST_Fp32m : FpIf32<(outs), (ins f32mem:$op, RFP32:$src), OneArgFP,
[(store RFP32:$src, addr:$op)]>;
def ST_Fp64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP,
@@ -451,9 +478,11 @@ def ST_FpP64m32 : FpIf64<(outs), (ins f32mem:$op, RFP64:$src), OneArgFP, []>;
def ST_FpP64m : FpIf64<(outs), (ins f64mem:$op, RFP64:$src), OneArgFP, []>;
def ST_FpP80m32 : FpI_<(outs), (ins f32mem:$op, RFP80:$src), OneArgFP, []>;
def ST_FpP80m64 : FpI_<(outs), (ins f64mem:$op, RFP80:$src), OneArgFP, []>;
-}
+} // mayStore
+
def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
[(store RFP80:$src, addr:$op)]>;
+
let mayStore = 1, hasSideEffects = 0 in {
def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
@@ -464,7 +493,8 @@ def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
-}
+} // mayStore
+} // SchedRW
let mayLoad = 1, SchedRW = [WriteLoad] in {
def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src",
@@ -504,7 +534,7 @@ def IST_FP64m : FPI<0xDF, MRM7m, (outs), (ins i64mem:$dst), "fistp{ll}\t$dst",
}
// FISTTP requires SSE3 even though it's a FPStack op.
-let Predicates = [HasSSE3] in {
+let Predicates = [HasSSE3], SchedRW = [WriteStore] in {
def ISTT_Fp16m32 : FpI_<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP,
[(X86fp_to_i16mem RFP32:$src, addr:$op)]>;
def ISTT_Fp32m32 : FpI_<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
@@ -543,7 +573,7 @@ def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RST:$op), "fxch\t$op", IIC_FXCH>;
}
// Floating point constant loads.
-let isReMaterializable = 1 in {
+let isReMaterializable = 1, SchedRW = [WriteZero] in {
def LD_Fp032 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
[(set RFP32:$dst, fpimm0)]>;
def LD_Fp132 : FpIf32<(outs RFP32:$dst), (ins), ZeroArgFP,
@@ -615,19 +645,19 @@ let Defs = [AX], Uses = [FPSW] in
def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
(outs), (ins), "fnstsw\t{%ax|ax}",
[(set AX, (X86fp_stsw FPSW))], IIC_FNSTSW>;
-
+let Defs = [FPSW] in
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control world
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
[(X86fp_cwd_get16 addr:$dst)], IIC_FNSTCW>;
} // SchedRW
-let mayLoad = 1 in
+let Defs = [FPSW], mayLoad = 1 in
def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
(outs), (ins i16mem:$dst), "fldcw\t$dst", [], IIC_FLDCW>,
Sched<[WriteLoad]>;
// FPU control instructions
let SchedRW = [WriteMicrocoded] in {
-let Defs = [FPSW] in
+let Defs = [FPSW] in {
def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", [], IIC_FNINIT>;
def FFREE : FPI<0xDD, MRM0r, (outs), (ins RST:$reg),
"ffree\t$reg", IIC_FFREE>;
@@ -635,16 +665,16 @@ def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RST:$reg),
"ffreep\t$reg", IIC_FFREE>;
// Clear exceptions
-
-let Defs = [FPSW] in
def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", [], IIC_FNCLEX>;
+} // Defs = [FPSW]
} // SchedRW
// Operandless floating-point instructions for the disassembler.
let SchedRW = [WriteMicrocoded] in {
-def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
-
def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", [], IIC_FNOP>;
+
+let Defs = [FPSW] in {
+def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", [], IIC_WAIT>;
def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", [], IIC_FXAM>;
def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", [], IIC_FLDL>;
def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", [], IIC_FLDL>;
@@ -665,20 +695,20 @@ def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", [], IIC_FSINCOS>;
def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", [], IIC_FRNDINT>;
def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", [], IIC_FSCALE>;
def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
+} // Defs = [FPSW]
-let Predicates = [HasFXSR] in {
- def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
- def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
- "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
- IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
- def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>,
- TB;
- def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
- "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
- IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
-} // Predicates = [FeatureFXSR]
+def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB,
+ Requires<[HasFXSR]>;
+def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
+ "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
+ IIC_FXSAVE>, TB, Requires<[HasFXSR, In64BitMode]>;
+def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>,
+ TB, Requires<[HasFXSR]>;
+def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
+ "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
+ IIC_FXRSTOR>, TB, Requires<[HasFXSR, In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index bfcbf71d252f..2a6ed02fadab 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -157,9 +157,10 @@ def EncEVEX : Encoding<3>;
class OperandSize<bits<2> val> {
bits<2> Value = val;
}
-def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
-def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
-def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
+def OpSizeFixed : OperandSize<0>; // Never needs a 0x66 prefix.
+def OpSize16 : OperandSize<1>; // Needs 0x66 prefix in 32-bit mode.
+def OpSize32 : OperandSize<2>; // Needs 0x66 prefix in 16-bit mode.
+def OpSizeIgnore : OperandSize<3>; // Takes 0x66 prefix, never emits.
// Address size for encodings that change based on mode.
class AddressSize<bits<2> val> {
@@ -174,6 +175,7 @@ def AdSize64 : AddressSize<3>; // Encodes a 64-bit address.
// emitter that various prefix bytes are required.
class OpSize16 { OperandSize OpSize = OpSize16; }
class OpSize32 { OperandSize OpSize = OpSize32; }
+class OpSizeIgnore { OperandSize OpSize = OpSizeIgnore; }
class AdSize16 { AddressSize AdSize = AdSize16; }
class AdSize32 { AddressSize AdSize = AdSize32; }
class AdSize64 { AddressSize AdSize = AdSize64; }
@@ -231,6 +233,9 @@ class FoldGenData<string _RegisterForm> {
string FoldGenRegForm = _RegisterForm;
}
+// Mark the instruction as "illegal to memory fold/unfold"
+class NotMemoryFoldable { bit isMemoryFoldable = 0; }
+
class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
string AsmStr,
InstrItinClass itin,
@@ -314,6 +319,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
// instruction to replace the current one in case it got picked during generation.
string FoldGenRegForm = ?;
+ bit isMemoryFoldable = 1; // Is it allowed to memory fold/unfold this instruction?
+
// TSFlags layout should be kept in sync with X86BaseInfo.h.
let TSFlags{6-0} = FormBits;
let TSFlags{8-7} = OpSizeBits;
@@ -822,7 +829,7 @@ class AVX512PIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
class AVX512PI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, Domain d, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin, d>, Requires<[HasAVX512]>;
-class AVX512FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
+class AVX512FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8PD,
EVEX_4V, Requires<[HasAVX512]>;
@@ -839,34 +846,44 @@ class AVX512<bits<8> o, Format F, dag outs, dag ins, string asm,
class AES8I<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = IIC_AES>
: I<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, T8PD,
- Requires<[HasAES]>;
+ Requires<[NoAVX, HasAES]>;
class AESAI<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag> pattern, InstrItinClass itin = NoItinerary>
: Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
- Requires<[HasAES]>;
+ Requires<[NoAVX, HasAES]>;
// PCLMUL Instruction Templates
class PCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
- Requires<[HasPCLMUL]>;
-
-class AVXPCLMULIi8<bits<8> o, Format F, dag outs, dag ins, string asm,
- list<dag>pattern, InstrItinClass itin = NoItinerary>
- : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD,
- VEX_4V, Requires<[HasAVX, HasPCLMUL]>;
+ : Ii8<o, F, outs, ins, asm, pattern, itin, SSEPackedInt>, TAPD;
// FMA3 Instruction Templates
class FMA3<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: I<o, F, outs, ins, asm, pattern, itin>, T8PD,
- VEX_4V, FMASC, Requires<[HasFMA, NoVLX]>;
+ VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoVLX]>;
+class FMA3S<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ VEX_4V, FMASC, Requires<[HasFMA, NoFMA4, NoAVX512]>;
+class FMA3S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : I<o, F, outs, ins, asm, pattern, itin>, T8PD,
+ VEX_4V, FMASC, Requires<[HasFMA, NoAVX512]>;
// FMA4 Instruction Templates
class FMA4<bits<8> o, Format F, dag outs, dag ins, string asm,
list<dag>pattern, InstrItinClass itin = NoItinerary>
: Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ VEX_4V, FMASC, Requires<[HasFMA4, NoVLX]>;
+class FMA4S<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
+ VEX_4V, FMASC, Requires<[HasFMA4, NoAVX512]>;
+class FMA4S_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
+ list<dag>pattern, InstrItinClass itin = NoItinerary>
+ : Ii8Reg<o, F, outs, ins, asm, pattern, itin>, TAPD,
VEX_4V, FMASC, Requires<[HasFMA4]>;
// XOP 2, 3 and 4 Operand Instruction Template
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 8b5bbf24f6f6..ebbef00c01d9 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -56,8 +56,6 @@ def X86fxor : SDNode<"X86ISD::FXOR", SDTFPBinOp,
def X86fandn : SDNode<"X86ISD::FANDN", SDTFPBinOp>;
def X86frsqrt : SDNode<"X86ISD::FRSQRT", SDTFPUnaryOp>;
def X86frcp : SDNode<"X86ISD::FRCP", SDTFPUnaryOp>;
-def X86frsqrt14s: SDNode<"X86ISD::FRSQRTS", SDTFPBinOp>;
-def X86frcp14s : SDNode<"X86ISD::FRCPS", SDTFPBinOp>;
def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
@@ -146,8 +144,11 @@ def X86fpextRnd : SDNode<"X86ISD::VFPEXTS_RND",
SDTCisSameSizeAs<0, 2>,
SDTCisVT<3, i32>]>>;
-def X86vshldq : SDNode<"X86ISD::VSHLDQ", SDTIntShiftOp>;
-def X86vshrdq : SDNode<"X86ISD::VSRLDQ", SDTIntShiftOp>;
+def X86vshiftimm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVT<2, i8>, SDTCisInt<0>]>;
+
+def X86vshldq : SDNode<"X86ISD::VSHLDQ", X86vshiftimm>;
+def X86vshrdq : SDNode<"X86ISD::VSRLDQ", X86vshiftimm>;
def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
@@ -164,15 +165,16 @@ def X86CmpMaskCC :
SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>]>;
def X86CmpMaskCCRound :
SDTypeProfile<1, 4, [SDTCisVec<0>,SDTCVecEltisVT<0, i1>,
- SDTCisVec<1>, SDTCisSameAs<2, 1>,
+ SDTCisVec<1>, SDTCisFP<1>, SDTCisSameAs<2, 1>,
SDTCisSameNumEltsAs<0, 1>, SDTCisVT<3, i8>,
SDTCisVT<4, i32>]>;
def X86CmpMaskCCScalar :
- SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+ SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>]>;
def X86CmpMaskCCScalarRound :
- SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>,
- SDTCisVT<4, i32>]>;
+ SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisFP<1>, SDTCisSameAs<1, 2>,
+ SDTCisVT<3, i8>, SDTCisVT<4, i32>]>;
def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
def X86cmpmRnd : SDNode<"X86ISD::CMPM_RND", X86CmpMaskCCRound>;
@@ -180,23 +182,25 @@ def X86cmpmu : SDNode<"X86ISD::CMPMU", X86CmpMaskCC>;
def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
def X86cmpmsRnd : SDNode<"X86ISD::FSETCCM_RND", X86CmpMaskCCScalarRound>;
-def X86vshl : SDNode<"X86ISD::VSHL",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisVec<2>]>>;
-def X86vsrl : SDNode<"X86ISD::VSRL",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisVec<2>]>>;
-def X86vsra : SDNode<"X86ISD::VSRA",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisVec<2>]>>;
+def X86phminpos: SDNode<"X86ISD::PHMINPOS",
+ SDTypeProfile<1, 1, [SDTCisVT<0, v8i16>, SDTCisVT<1, v8i16>]>>;
+
+def X86vshiftuniform : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisVec<2>, SDTCisInt<0>,
+ SDTCisInt<1>]>;
+
+def X86vshl : SDNode<"X86ISD::VSHL", X86vshiftuniform>;
+def X86vsrl : SDNode<"X86ISD::VSRL", X86vshiftuniform>;
+def X86vsra : SDNode<"X86ISD::VSRA", X86vshiftuniform>;
-def X86vsrav : SDNode<"X86ISD::VSRAV" ,
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>]>>;
+def X86vshiftvariable : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>, SDTCisInt<0>]>;
-def X86vshli : SDNode<"X86ISD::VSHLI", SDTIntShiftOp>;
-def X86vsrli : SDNode<"X86ISD::VSRLI", SDTIntShiftOp>;
-def X86vsrai : SDNode<"X86ISD::VSRAI", SDTIntShiftOp>;
+def X86vsrav : SDNode<"X86ISD::VSRAV", X86vshiftvariable>;
+
+def X86vshli : SDNode<"X86ISD::VSHLI", X86vshiftimm>;
+def X86vsrli : SDNode<"X86ISD::VSRLI", X86vshiftimm>;
+def X86vsrai : SDNode<"X86ISD::VSRAI", X86vshiftimm>;
def X86kshiftl : SDNode<"X86ISD::KSHIFTL",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
@@ -207,31 +211,20 @@ def X86kshiftr : SDNode<"X86ISD::KSHIFTR",
SDTCisSameAs<0, 1>,
SDTCisVT<2, i8>]>>;
-def X86vrotli : SDNode<"X86ISD::VROTLI", SDTIntShiftOp>;
-def X86vrotri : SDNode<"X86ISD::VROTRI", SDTIntShiftOp>;
-
-def X86vprot : SDNode<"X86ISD::VPROT",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>]>>;
-def X86vproti : SDNode<"X86ISD::VPROTI",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisVT<2, i8>]>>;
+def X86vrotli : SDNode<"X86ISD::VROTLI", X86vshiftimm>;
+def X86vrotri : SDNode<"X86ISD::VROTRI", X86vshiftimm>;
-def X86vpshl : SDNode<"X86ISD::VPSHL",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>]>>;
-def X86vpsha : SDNode<"X86ISD::VPSHA",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
- SDTCisSameAs<0,2>]>>;
+def X86vpshl : SDNode<"X86ISD::VPSHL", X86vshiftvariable>;
+def X86vpsha : SDNode<"X86ISD::VPSHA", X86vshiftvariable>;
def X86vpcom : SDNode<"X86ISD::VPCOM",
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
- SDTCisVT<3, i8>]>>;
+ SDTCisVT<3, i8>, SDTCisInt<0>]>>;
def X86vpcomu : SDNode<"X86ISD::VPCOMU",
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
- SDTCisVT<3, i8>]>>;
+ SDTCisVT<3, i8>, SDTCisInt<0>]>>;
def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
@@ -267,12 +260,6 @@ def X86testnm : SDNode<"X86ISD::TESTNM", SDTX86Testm, [SDNPCommutative]>;
def X86movmsk : SDNode<"X86ISD::MOVMSK",
SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVec<1>]>>;
-def X86select : SDNode<"X86ISD::SELECT",
- SDTypeProfile<1, 3, [SDTCVecEltisVT<1, i1>,
- SDTCisSameAs<0, 2>,
- SDTCisSameAs<2, 3>,
- SDTCisSameNumEltsAs<0, 1>]>>;
-
def X86selects : SDNode<"X86ISD::SELECTS",
SDTypeProfile<1, 3, [SDTCisVT<1, v1i1>,
SDTCisSameAs<0, 2>,
@@ -314,6 +301,10 @@ def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
+def SDTFPBinOpImm: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i32>]>;
def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>,
SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>,
@@ -326,6 +317,9 @@ def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
SDTCisSameNumEltsAs<0, 3>,
SDTCisVT<4, i32>,
SDTCisVT<5, i32>]>;
+def SDTFPUnaryOpImm: SDTypeProfile<1, 2, [SDTCisFP<0>, SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisVT<2, i32>]>;
def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
SDTCisSameAs<0,1>,
SDTCisVT<2, i32>,
@@ -352,9 +346,26 @@ def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
SDTCisFP<0>, SDTCisVT<4, i32>]>;
-def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
+def X86PAlignr : SDNode<"X86ISD::PALIGNR",
+ SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i8>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisVT<3, i8>]>>;
def X86VAlign : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
+def X86VShld : SDNode<"X86ISD::VSHLD", SDTShuff3OpI>;
+def X86VShrd : SDNode<"X86ISD::VSHRD", SDTShuff3OpI>;
+def X86VShldv : SDNode<"X86ISD::VSHLDV",
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>]>>;
+def X86VShrdv : SDNode<"X86ISD::VSHRDV",
+ SDTypeProfile<1, 3, [SDTCisVec<0>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>,
+ SDTCisSameAs<0,3>]>>;
+
def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
def X86PShufd : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
@@ -431,10 +442,14 @@ def X86VPerm2x128 : SDNode<"X86ISD::VPERM2X128", SDTShuff3OpI>;
def X86VFixupimm : SDNode<"X86ISD::VFIXUPIMM", SDTFPTernaryOpImmRound>;
def X86VFixupimmScalar : SDNode<"X86ISD::VFIXUPIMMS", SDTFPTernaryOpImmRound>;
-def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImmRound>;
-def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImmRound>;
-def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>;
-def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImmRound>;
+def X86VRange : SDNode<"X86ISD::VRANGE", SDTFPBinOpImm>;
+def X86VRangeRnd : SDNode<"X86ISD::VRANGE_RND", SDTFPBinOpImmRound>;
+def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImm>;
+def X86VReduceRnd : SDNode<"X86ISD::VREDUCE_RND", SDTFPUnaryOpImmRound>;
+def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImm>;
+def X86VRndScaleRnd: SDNode<"X86ISD::VRNDSCALE_RND", SDTFPUnaryOpImmRound>;
+def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImm>;
+def X86VGetMantRnd : SDNode<"X86ISD::VGETMANT_RND", SDTFPUnaryOpImmRound>;
def X86Vfpclass : SDNode<"X86ISD::VFPCLASS",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
SDTCisFP<1>,
@@ -450,9 +465,10 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
-def X86Vextract : SDNode<"X86ISD::VEXTRACT", SDTypeProfile<1, 2,
- [SDTCisVec<1>,
- SDTCisPtrTy<2>]>, []>;
+def X86kextract : SDNode<"ISD::EXTRACT_VECTOR_ELT",
+ SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
+ SDTCVecEltisVT<1, i1>,
+ SDTCisPtrTy<2>]>>;
def X86Blendi : SDNode<"X86ISD::BLENDI", SDTBlend>;
@@ -477,19 +493,31 @@ def X86fsqrtRnds : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>;
def X86fgetexpRnd : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
def X86fgetexpRnds : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>;
-def X86Fmadd : SDNode<"X86ISD::FMADD", SDTFPTernaryOp>;
-def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp>;
-def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp>;
-def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp>;
-def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp>;
-def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp>;
+def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp, [SDNPCommutative]>;
+
+def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound, [SDNPCommutative]>;
+
+// Scalar FMA4 intrinsics which zero the non-scalar bits.
+def X86Fmadd4s : SDNode<"X86ISD::FMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fnmadd4s : SDNode<"X86ISD::FNMADD4S", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fmsub4s : SDNode<"X86ISD::FMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fnmsub4s : SDNode<"X86ISD::FNMSUB4S", SDTFPTernaryOp, [SDNPCommutative]>;
-def X86FmaddRnd : SDNode<"X86ISD::FMADD_RND", SDTFmaRound>;
-def X86FnmaddRnd : SDNode<"X86ISD::FNMADD_RND", SDTFmaRound>;
-def X86FmsubRnd : SDNode<"X86ISD::FMSUB_RND", SDTFmaRound>;
-def X86FnmsubRnd : SDNode<"X86ISD::FNMSUB_RND", SDTFmaRound>;
-def X86FmaddsubRnd : SDNode<"X86ISD::FMADDSUB_RND", SDTFmaRound>;
-def X86FmsubaddRnd : SDNode<"X86ISD::FMSUBADD_RND", SDTFmaRound>;
+// Scalar FMA intrinsics with passthru bits in operand 1.
+def X86Fmadds1 : SDNode<"X86ISD::FMADDS1", SDTFPTernaryOp>;
+def X86Fnmadds1 : SDNode<"X86ISD::FNMADDS1", SDTFPTernaryOp>;
+def X86Fmsubs1 : SDNode<"X86ISD::FMSUBS1", SDTFPTernaryOp>;
+def X86Fnmsubs1 : SDNode<"X86ISD::FNMSUBS1", SDTFPTernaryOp>;
// Scalar FMA intrinsics with passthru bits in operand 1.
def X86FmaddRnds1 : SDNode<"X86ISD::FMADDS1_RND", SDTFmaRound>;
@@ -497,26 +525,49 @@ def X86FnmaddRnds1 : SDNode<"X86ISD::FNMADDS1_RND", SDTFmaRound>;
def X86FmsubRnds1 : SDNode<"X86ISD::FMSUBS1_RND", SDTFmaRound>;
def X86FnmsubRnds1 : SDNode<"X86ISD::FNMSUBS1_RND", SDTFmaRound>;
+def X86Fmadds3 : SDNode<"X86ISD::FMADDS3", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fnmadds3 : SDNode<"X86ISD::FNMADDS3", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fmsubs3 : SDNode<"X86ISD::FMSUBS3", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fnmsubs3 : SDNode<"X86ISD::FNMSUBS3", SDTFPTernaryOp, [SDNPCommutative]>;
+
// Scalar FMA intrinsics with passthru bits in operand 3.
-def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound>;
-def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound>;
-def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound>;
-def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound>;
+def X86FmaddRnds3 : SDNode<"X86ISD::FMADDS3_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FnmaddRnds3 : SDNode<"X86ISD::FNMADDS3_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FmsubRnds3 : SDNode<"X86ISD::FMSUBS3_RND", SDTFmaRound, [SDNPCommutative]>;
+def X86FnmsubRnds3 : SDNode<"X86ISD::FNMSUBS3_RND", SDTFmaRound, [SDNPCommutative]>;
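The comments above distinguish three scalar-FMA flavours: the 4S nodes zero the non-scalar lanes, while the S1/S3 nodes pass the upper lanes through operand 1 or operand 3. A hedged C++ sketch of the element-0 semantics implied by those names (illustrative only; the multiply/add operand ordering is simplified):

struct V4 { float e[4]; };

// FMADD4S-style: scalar FMA, non-scalar lanes zeroed.
static V4 fmadd4s_sketch(V4 a, V4 b, V4 c) {
  V4 r = {{a.e[0] * b.e[0] + c.e[0], 0.f, 0.f, 0.f}};
  return r;
}

// FMADDS1-style: upper lanes pass through operand 1.
static V4 fmadds1_sketch(V4 a, V4 b, V4 c) {
  V4 r = a;
  r.e[0] = a.e[0] * b.e[0] + c.e[0];
  return r;
}

// FMADDS3-style: upper lanes pass through operand 3.
static V4 fmadds3_sketch(V4 a, V4 b, V4 c) {
  V4 r = c;
  r.e[0] = a.e[0] * b.e[0] + c.e[0];
  return r;
}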
def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
-def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma>;
-def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma>;
+def x86vpmadd52l : SDNode<"X86ISD::VPMADD52L", SDTIFma, [SDNPCommutative]>;
+def x86vpmadd52h : SDNode<"X86ISD::VPMADD52H", SDTIFma, [SDNPCommutative]>;
+
+def X86rsqrt14 : SDNode<"X86ISD::RSQRT14", SDTFPUnaryOp>;
+def X86rcp14 : SDNode<"X86ISD::RCP14", SDTFPUnaryOp>;
+
+// VNNI
+def SDTVnni : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
+ SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def X86Vpdpbusd : SDNode<"X86ISD::VPDPBUSD", SDTVnni>;
+def X86Vpdpbusds : SDNode<"X86ISD::VPDPBUSDS", SDTVnni>;
+def X86Vpdpwssd : SDNode<"X86ISD::VPDPWSSD", SDTVnni>;
+def X86Vpdpwssds : SDNode<"X86ISD::VPDPWSSDS", SDTVnni>;
def X86rsqrt28 : SDNode<"X86ISD::RSQRT28", SDTFPUnaryOpRound>;
def X86rcp28 : SDNode<"X86ISD::RCP28", SDTFPUnaryOpRound>;
def X86exp2 : SDNode<"X86ISD::EXP2", SDTFPUnaryOpRound>;
+def X86rsqrt14s : SDNode<"X86ISD::RSQRT14S", SDTFPBinOp>;
+def X86rcp14s : SDNode<"X86ISD::RCP14S", SDTFPBinOp>;
def X86rsqrt28s : SDNode<"X86ISD::RSQRT28S", SDTFPBinOpRound>;
def X86rcp28s : SDNode<"X86ISD::RCP28S", SDTFPBinOpRound>;
-def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImmRound>;
-def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImmRound>;
-def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImmRound>;
+def X86Ranges : SDNode<"X86ISD::VRANGES", SDTFPBinOpImm>;
+def X86RndScales : SDNode<"X86ISD::VRNDSCALES", SDTFPBinOpImm>;
+def X86Reduces : SDNode<"X86ISD::VREDUCES", SDTFPBinOpImm>;
+def X86GetMants : SDNode<"X86ISD::VGETMANTS", SDTFPBinOpImm>;
+def X86RangesRnd : SDNode<"X86ISD::VRANGES_RND", SDTFPBinOpImmRound>;
+def X86RndScalesRnd : SDNode<"X86ISD::VRNDSCALES_RND", SDTFPBinOpImmRound>;
+def X86ReducesRnd : SDNode<"X86ISD::VREDUCES_RND", SDTFPBinOpImmRound>;
+def X86GetMantsRnd : SDNode<"X86ISD::VGETMANTS_RND", SDTFPBinOpImmRound>;
def SDT_PCMPISTRI : SDTypeProfile<2, 3, [SDTCisVT<0, i32>, SDTCisVT<1, i32>,
SDTCisVT<2, v16i8>, SDTCisVT<3, v16i8>,
@@ -534,6 +585,13 @@ def X86compress: SDNode<"X86ISD::COMPRESS", SDTypeProfile<1, 1,
def X86expand : SDNode<"X86ISD::EXPAND", SDTypeProfile<1, 1,
[SDTCisSameAs<0, 1>, SDTCisVec<1>]>, []>;
+// vpshufbitqmb
+def X86Vpshufbitqmb : SDNode<"X86ISD::VPSHUFBITQMB",
+ SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<1,2>,
+ SDTCVecEltisVT<0,i1>,
+ SDTCisSameNumEltsAs<0,1>]>>;
+
def SDTintToFPRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisFP<0>,
SDTCisSameAs<0,1>, SDTCisInt<2>,
SDTCisVT<3, i32>]>;
@@ -588,7 +646,12 @@ def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>;
def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>;
def X86cvtp2UInt : SDNode<"X86ISD::CVTP2UI", SDTFloatToInt>;
+
def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>]> >;
+
+def X86cvtph2psRnd : SDNode<"X86ISD::CVTPH2PS_RND",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, i16>,
SDTCisVT<2, i32>]> >;
@@ -610,6 +673,11 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
def X86cvt2mask : SDNode<"X86ISD::CVT2MASK", SDTIntTruncOp>;
+// Galois field arithmetic
+def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
+def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>;
+def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>;
+
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//
@@ -643,109 +711,82 @@ def sdmem : Operand<v2f64> {
// Vector load wrappers to prevent folding of non-temporal aligned loads on
// supporting targets.
-def vec128load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return !Subtarget->hasSSE41() || !cast<LoadSDNode>(N)->isNonTemporal() ||
- cast<LoadSDNode>(N)->getAlignment() < 16;
-}]>;
-def vec256load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return !Subtarget->hasAVX2() || !cast<LoadSDNode>(N)->isNonTemporal() ||
- cast<LoadSDNode>(N)->getAlignment() < 32;
-}]>;
-def vec512load : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return !Subtarget->hasAVX512() || !cast<LoadSDNode>(N)->isNonTemporal() ||
- cast<LoadSDNode>(N)->getAlignment() < 64;
+def vecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ return !useNonTemporalLoad(cast<LoadSDNode>(N));
}]>;
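vecload folds the old vec128load/vec256load/vec512load predicates into one fragment gated by a useNonTemporalLoad() helper. A hedged reconstruction of what that check amounts to, derived from the removed per-width fragments (parameter names are illustrative, not the real helper's signature):

static bool wouldUseNonTemporalLoad(bool IsNonTemporal, unsigned AlignBytes,
                                    unsigned SizeInBits, bool HasSSE41,
                                    bool HasAVX2, bool HasAVX512) {
  if (!IsNonTemporal || AlignBytes * 8 < SizeInBits)
    return false;                     // NT loads must be naturally aligned
  if (SizeInBits >= 512) return HasAVX512;
  if (SizeInBits >= 256) return HasAVX2;
  return HasSSE41;                    // 128-bit MOVNTDQA
}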
// 128-bit load pattern fragments
// NOTE: all 128-bit integer vector loads are promoted to v2i64
-def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vec128load node:$ptr))>;
-def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vec128load node:$ptr))>;
-def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vec128load node:$ptr))>;
+def loadv4f32 : PatFrag<(ops node:$ptr), (v4f32 (vecload node:$ptr))>;
+def loadv2f64 : PatFrag<(ops node:$ptr), (v2f64 (vecload node:$ptr))>;
+def loadv2i64 : PatFrag<(ops node:$ptr), (v2i64 (vecload node:$ptr))>;
// 256-bit load pattern fragments
// NOTE: all 256-bit integer vector loads are promoted to v4i64
-def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vec256load node:$ptr))>;
-def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vec256load node:$ptr))>;
-def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vec256load node:$ptr))>;
+def loadv8f32 : PatFrag<(ops node:$ptr), (v8f32 (vecload node:$ptr))>;
+def loadv4f64 : PatFrag<(ops node:$ptr), (v4f64 (vecload node:$ptr))>;
+def loadv4i64 : PatFrag<(ops node:$ptr), (v4i64 (vecload node:$ptr))>;
// 512-bit load pattern fragments
-def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vec512load node:$ptr))>;
-def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vec512load node:$ptr))>;
-def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vec512load node:$ptr))>;
+def loadv16f32 : PatFrag<(ops node:$ptr), (v16f32 (vecload node:$ptr))>;
+def loadv8f64 : PatFrag<(ops node:$ptr), (v8f64 (vecload node:$ptr))>;
+def loadv8i64 : PatFrag<(ops node:$ptr), (v8i64 (vecload node:$ptr))>;
// 128-/256-/512-bit extload pattern fragments
def extloadv2f32 : PatFrag<(ops node:$ptr), (v2f64 (extloadvf32 node:$ptr))>;
def extloadv4f32 : PatFrag<(ops node:$ptr), (v4f64 (extloadvf32 node:$ptr))>;
def extloadv8f32 : PatFrag<(ops node:$ptr), (v8f64 (extloadvf32 node:$ptr))>;
-// Like 'store', but always requires 128-bit vector alignment.
+// Like 'store', but always requires vector size alignment.
def alignedstore : PatFrag<(ops node:$val, node:$ptr),
(store node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-// Like 'store', but always requires 256-bit vector alignment.
-def alignedstore256 : PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAlignment() >= 32;
-}]>;
-
-// Like 'store', but always requires 512-bit vector alignment.
-def alignedstore512 : PatFrag<(ops node:$val, node:$ptr),
- (store node:$val, node:$ptr), [{
- return cast<StoreSDNode>(N)->getAlignment() >= 64;
+ auto *St = cast<StoreSDNode>(N);
+ return St->getAlignment() >= St->getMemoryVT().getStoreSize();
}]>;
// Like 'load', but always requires 128-bit vector alignment.
-def alignedload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() >= 16;
-}]>;
-
-// Like 'load', but always requires 256-bit vector alignment.
-def alignedload256 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() >= 32;
-}]>;
-
-// Like 'load', but always requires 512-bit vector alignment.
-def alignedload512 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() >= 64;
+def alignedvecload : PatFrag<(ops node:$ptr), (load node:$ptr), [{
+ auto *Ld = cast<LoadSDNode>(N);
+ return Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize() &&
+ !useNonTemporalLoad(cast<LoadSDNode>(N));
}]>;
// 128-bit aligned load pattern fragments
// NOTE: all 128-bit integer vector loads are promoted to v2i64
def alignedloadv4f32 : PatFrag<(ops node:$ptr),
- (v4f32 (alignedload node:$ptr))>;
+ (v4f32 (alignedvecload node:$ptr))>;
def alignedloadv2f64 : PatFrag<(ops node:$ptr),
- (v2f64 (alignedload node:$ptr))>;
+ (v2f64 (alignedvecload node:$ptr))>;
def alignedloadv2i64 : PatFrag<(ops node:$ptr),
- (v2i64 (alignedload node:$ptr))>;
+ (v2i64 (alignedvecload node:$ptr))>;
// 256-bit aligned load pattern fragments
// NOTE: all 256-bit integer vector loads are promoted to v4i64
def alignedloadv8f32 : PatFrag<(ops node:$ptr),
- (v8f32 (alignedload256 node:$ptr))>;
+ (v8f32 (alignedvecload node:$ptr))>;
def alignedloadv4f64 : PatFrag<(ops node:$ptr),
- (v4f64 (alignedload256 node:$ptr))>;
+ (v4f64 (alignedvecload node:$ptr))>;
def alignedloadv4i64 : PatFrag<(ops node:$ptr),
- (v4i64 (alignedload256 node:$ptr))>;
+ (v4i64 (alignedvecload node:$ptr))>;
// 512-bit aligned load pattern fragments
def alignedloadv16f32 : PatFrag<(ops node:$ptr),
- (v16f32 (alignedload512 node:$ptr))>;
+ (v16f32 (alignedvecload node:$ptr))>;
def alignedloadv8f64 : PatFrag<(ops node:$ptr),
- (v8f64 (alignedload512 node:$ptr))>;
+ (v8f64 (alignedvecload node:$ptr))>;
def alignedloadv8i64 : PatFrag<(ops node:$ptr),
- (v8i64 (alignedload512 node:$ptr))>;
+ (v8i64 (alignedvecload node:$ptr))>;
-// Like 'vec128load', but uses special alignment checks suitable for use in
+// Like 'vecload', but uses special alignment checks suitable for use in
// memory operands in most SSE instructions, which are required to
// be naturally aligned on some targets but not on others. If the subtarget
// allows unaligned accesses, match any load, though this may require
// setting a feature bit in the processor (on startup, for example).
// Opteron 10h and later implement such a feature.
-def memop : PatFrag<(ops node:$ptr), (vec128load node:$ptr), [{
+def memop : PatFrag<(ops node:$ptr), (vecload node:$ptr), [{
+ auto *Ld = cast<LoadSDNode>(N);
return Subtarget->hasSSEUnalignedMem() ||
- cast<LoadSDNode>(N)->getAlignment() >= 16;
+ Ld->getAlignment() >= Ld->getMemoryVT().getStoreSize();
}]>;
// 128-bit memop pattern fragments
@@ -754,117 +795,87 @@ def memopv4f32 : PatFrag<(ops node:$ptr), (v4f32 (memop node:$ptr))>;
def memopv2f64 : PatFrag<(ops node:$ptr), (v2f64 (memop node:$ptr))>;
def memopv2i64 : PatFrag<(ops node:$ptr), (v2i64 (memop node:$ptr))>;
-// SSSE3 uses MMX registers for some instructions. They aren't aligned on a
-// 16-byte boundary.
-// FIXME: 8 byte alignment for mmx reads is not required
-def memop64 : PatFrag<(ops node:$ptr), (load node:$ptr), [{
- return cast<LoadSDNode>(N)->getAlignment() >= 8;
-}]>;
-
-def memopmmx : PatFrag<(ops node:$ptr), (x86mmx (memop64 node:$ptr))>;
-
-def X86masked_gather : SDNode<"X86ISD::MGATHER", SDTMaskedGather,
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+def X86masked_gather : SDNode<"X86ISD::MGATHER",
+ SDTypeProfile<2, 3, [SDTCisVec<0>,
+ SDTCisVec<1>, SDTCisInt<1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisSameAs<1, 3>,
+ SDTCisPtrTy<4>]>,
+ [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
+
+def X86masked_scatter : SDNode<"X86ISD::MSCATTER",
+ SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisSameAs<0, 2>,
+ SDTCVecEltisVT<0, i1>,
+ SDTCisPtrTy<3>]>,
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_gather node:$src1, node:$src2, node:$src3) , [{
- if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
- return (Mgt->getIndex().getValueType() == MVT::v4i32 ||
- Mgt->getBasePtr().getValueType() == MVT::v4i32);
- return false;
+ (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
+ return Mgt->getIndex().getValueType() == MVT::v4i32;
}]>;
def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_gather node:$src1, node:$src2, node:$src3) , [{
- if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
- return (Mgt->getIndex().getValueType() == MVT::v8i32 ||
- Mgt->getBasePtr().getValueType() == MVT::v8i32);
- return false;
+ (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
+ return Mgt->getIndex().getValueType() == MVT::v8i32;
}]>;
def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_gather node:$src1, node:$src2, node:$src3) , [{
- if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
- return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
- Mgt->getBasePtr().getValueType() == MVT::v2i64);
- return false;
-}]>;
-def X86mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- if (X86MaskedGatherSDNode *Mgt = dyn_cast<X86MaskedGatherSDNode>(N))
- return (Mgt->getIndex().getValueType() == MVT::v2i64 ||
- Mgt->getBasePtr().getValueType() == MVT::v2i64) &&
- (Mgt->getMemoryVT() == MVT::v2i32 ||
- Mgt->getMemoryVT() == MVT::v2f32);
- return false;
+ X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
+ return Mgt->getIndex().getValueType() == MVT::v2i64;
}]>;
def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_gather node:$src1, node:$src2, node:$src3) , [{
- if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
- return (Mgt->getIndex().getValueType() == MVT::v4i64 ||
- Mgt->getBasePtr().getValueType() == MVT::v4i64);
- return false;
+ (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
+ return Mgt->getIndex().getValueType() == MVT::v4i64;
}]>;
def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_gather node:$src1, node:$src2, node:$src3) , [{
- if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
- return (Mgt->getIndex().getValueType() == MVT::v8i64 ||
- Mgt->getBasePtr().getValueType() == MVT::v8i64);
- return false;
+ (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
+ return Mgt->getIndex().getValueType() == MVT::v8i64;
}]>;
def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_gather node:$src1, node:$src2, node:$src3) , [{
- if (MaskedGatherSDNode *Mgt = dyn_cast<MaskedGatherSDNode>(N))
- return (Mgt->getIndex().getValueType() == MVT::v16i32 ||
- Mgt->getBasePtr().getValueType() == MVT::v16i32);
- return false;
+ (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
+ return Mgt->getIndex().getValueType() == MVT::v16i32;
}]>;
def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_scatter node:$src1, node:$src2, node:$src3) , [{
- if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
- return (Sc->getIndex().getValueType() == MVT::v2i64 ||
- Sc->getBasePtr().getValueType() == MVT::v2i64);
- return false;
+ (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
+ return Sc->getIndex().getValueType() == MVT::v2i64;
}]>;
def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_scatter node:$src1, node:$src2, node:$src3) , [{
- if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
- return (Sc->getIndex().getValueType() == MVT::v4i32 ||
- Sc->getBasePtr().getValueType() == MVT::v4i32);
- return false;
+ (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
+ return Sc->getIndex().getValueType() == MVT::v4i32;
}]>;
def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_scatter node:$src1, node:$src2, node:$src3) , [{
- if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
- return (Sc->getIndex().getValueType() == MVT::v4i64 ||
- Sc->getBasePtr().getValueType() == MVT::v4i64);
- return false;
+ (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
+ return Sc->getIndex().getValueType() == MVT::v4i64;
}]>;
def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_scatter node:$src1, node:$src2, node:$src3) , [{
- if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
- return (Sc->getIndex().getValueType() == MVT::v8i32 ||
- Sc->getBasePtr().getValueType() == MVT::v8i32);
- return false;
+ (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
+ return Sc->getIndex().getValueType() == MVT::v8i32;
}]>;
def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_scatter node:$src1, node:$src2, node:$src3) , [{
- if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
- return (Sc->getIndex().getValueType() == MVT::v8i64 ||
- Sc->getBasePtr().getValueType() == MVT::v8i64);
- return false;
+ (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
+ return Sc->getIndex().getValueType() == MVT::v8i64;
}]>;
def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_scatter node:$src1, node:$src2, node:$src3) , [{
- if (MaskedScatterSDNode *Sc = dyn_cast<MaskedScatterSDNode>(N))
- return (Sc->getIndex().getValueType() == MVT::v16i32 ||
- Sc->getBasePtr().getValueType() == MVT::v16i32);
- return false;
+ (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
+ X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
+ return Sc->getIndex().getValueType() == MVT::v16i32;
}]>;
// 128-bit bitconvert pattern fragments
@@ -927,53 +938,48 @@ def BYTE_imm : SDNodeXForm<imm, [{
// EXTRACT_get_vextract128_imm xform function: convert extract_subvector index
// to VEXTRACTF128/VEXTRACTI128 imm.
def EXTRACT_get_vextract128_imm : SDNodeXForm<extract_subvector, [{
- return getI8Imm(X86::getExtractVEXTRACT128Immediate(N), SDLoc(N));
+ return getExtractVEXTRACTImmediate(N, 128, SDLoc(N));
}]>;
// INSERT_get_vinsert128_imm xform function: convert insert_subvector index to
// VINSERTF128/VINSERTI128 imm.
def INSERT_get_vinsert128_imm : SDNodeXForm<insert_subvector, [{
- return getI8Imm(X86::getInsertVINSERT128Immediate(N), SDLoc(N));
+ return getInsertVINSERTImmediate(N, 128, SDLoc(N));
}]>;
// EXTRACT_get_vextract256_imm xform function: convert extract_subvector index
// to VEXTRACTF64x4 imm.
def EXTRACT_get_vextract256_imm : SDNodeXForm<extract_subvector, [{
- return getI8Imm(X86::getExtractVEXTRACT256Immediate(N), SDLoc(N));
+ return getExtractVEXTRACTImmediate(N, 256, SDLoc(N));
}]>;
// INSERT_get_vinsert256_imm xform function: convert insert_subvector index to
// VINSERTF64x4 imm.
def INSERT_get_vinsert256_imm : SDNodeXForm<insert_subvector, [{
- return getI8Imm(X86::getInsertVINSERT256Immediate(N), SDLoc(N));
+ return getInsertVINSERTImmediate(N, 256, SDLoc(N));
}]>;
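The four xforms above now funnel into width-parameterised helpers. A hedged sketch of the immediate computation they imply (the real getExtractVEXTRACTImmediate/getInsertVINSERTImmediate helpers live elsewhere in the backend and may differ in detail):

// Illustrative only: subvector element index -> VEXTRACT/VINSERT immediate.
static unsigned subvectorImmSketch(unsigned EltIdx, unsigned EltSizeInBits,
                                   unsigned ChunkWidthInBits /* 128 or 256 */) {
  unsigned EltsPerChunk = ChunkWidthInBits / EltSizeInBits;
  return EltIdx / EltsPerChunk;       // which 128/256-bit chunk is addressed
}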
def vextract128_extract : PatFrag<(ops node:$bigvec, node:$index),
(extract_subvector node:$bigvec,
- node:$index), [{
- return X86::isVEXTRACT128Index(N);
-}], EXTRACT_get_vextract128_imm>;
+ node:$index), [{}],
+ EXTRACT_get_vextract128_imm>;
def vinsert128_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index),
(insert_subvector node:$bigvec, node:$smallvec,
- node:$index), [{
- return X86::isVINSERT128Index(N);
-}], INSERT_get_vinsert128_imm>;
-
+ node:$index), [{}],
+ INSERT_get_vinsert128_imm>;
def vextract256_extract : PatFrag<(ops node:$bigvec, node:$index),
(extract_subvector node:$bigvec,
- node:$index), [{
- return X86::isVEXTRACT256Index(N);
-}], EXTRACT_get_vextract256_imm>;
+ node:$index), [{}],
+ EXTRACT_get_vextract256_imm>;
def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
node:$index),
(insert_subvector node:$bigvec, node:$smallvec,
- node:$index), [{
- return X86::isVINSERT256Index(N);
-}], INSERT_get_vinsert256_imm>;
+ node:$index), [{}],
+ INSERT_get_vinsert256_imm>;
def X86mload : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(masked_load node:$src1, node:$src2, node:$src3), [{
@@ -1136,8 +1142,3 @@ def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3)
(X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
-
-def assertzext_i1 :
- PatFrag<(ops node:$src), (assertzext node:$src), [{
- return cast<VTSDNode>(N->getOperand(1))->getVT() == MVT::i1;
-}]>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 34d4816a2518..7ca1c58184f6 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -47,8 +47,9 @@ using namespace llvm;
#include "X86GenInstrInfo.inc"
static cl::opt<bool>
-NoFusing("disable-spill-fusing",
- cl::desc("Disable fusing of spill code into instructions"));
+ NoFusing("disable-spill-fusing",
+ cl::desc("Disable fusing of spill code into instructions"),
+ cl::Hidden);
static cl::opt<bool>
PrintFailedFusing("print-failed-fuse-candidates",
cl::desc("Print instructions that the allocator wants to"
@@ -122,12 +123,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
Subtarget(STI), RI(STI.getTargetTriple()) {
static const X86MemoryFoldTableEntry MemoryFoldTable2Addr[] = {
+ { X86::ADC16ri, X86::ADC16mi, 0 },
+ { X86::ADC16ri8, X86::ADC16mi8, 0 },
+ { X86::ADC16rr, X86::ADC16mr, 0 },
{ X86::ADC32ri, X86::ADC32mi, 0 },
{ X86::ADC32ri8, X86::ADC32mi8, 0 },
{ X86::ADC32rr, X86::ADC32mr, 0 },
{ X86::ADC64ri32, X86::ADC64mi32, 0 },
{ X86::ADC64ri8, X86::ADC64mi8, 0 },
{ X86::ADC64rr, X86::ADC64mr, 0 },
+ { X86::ADC8ri, X86::ADC8mi, 0 },
+ { X86::ADC8ri8, X86::ADC8mi8, 0 },
+ { X86::ADC8rr, X86::ADC8mr, 0 },
{ X86::ADD16ri, X86::ADD16mi, 0 },
{ X86::ADD16ri8, X86::ADD16mi8, 0 },
{ X86::ADD16ri_DB, X86::ADD16mi, TB_NO_REVERSE },
@@ -147,6 +154,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::ADD64rr, X86::ADD64mr, 0 },
{ X86::ADD64rr_DB, X86::ADD64mr, TB_NO_REVERSE },
{ X86::ADD8ri, X86::ADD8mi, 0 },
+ { X86::ADD8ri8, X86::ADD8mi8, 0 },
{ X86::ADD8rr, X86::ADD8mr, 0 },
{ X86::AND16ri, X86::AND16mi, 0 },
{ X86::AND16ri8, X86::AND16mi8, 0 },
@@ -158,7 +166,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::AND64ri8, X86::AND64mi8, 0 },
{ X86::AND64rr, X86::AND64mr, 0 },
{ X86::AND8ri, X86::AND8mi, 0 },
+ { X86::AND8ri8, X86::AND8mi8, 0 },
{ X86::AND8rr, X86::AND8mr, 0 },
+ { X86::BTC16ri8, X86::BTC16mi8, 0 },
+ { X86::BTC32ri8, X86::BTC32mi8, 0 },
+ { X86::BTC64ri8, X86::BTC64mi8, 0 },
+ { X86::BTR16ri8, X86::BTR16mi8, 0 },
+ { X86::BTR32ri8, X86::BTR32mi8, 0 },
+ { X86::BTR64ri8, X86::BTR64mi8, 0 },
+ { X86::BTS16ri8, X86::BTS16mi8, 0 },
+ { X86::BTS32ri8, X86::BTS32mi8, 0 },
+ { X86::BTS64ri8, X86::BTS64mi8, 0 },
{ X86::DEC16r, X86::DEC16m, 0 },
{ X86::DEC32r, X86::DEC32m, 0 },
{ X86::DEC64r, X86::DEC64m, 0 },
@@ -185,7 +203,32 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::OR64ri8, X86::OR64mi8, 0 },
{ X86::OR64rr, X86::OR64mr, 0 },
{ X86::OR8ri, X86::OR8mi, 0 },
+ { X86::OR8ri8, X86::OR8mi8, 0 },
{ X86::OR8rr, X86::OR8mr, 0 },
+ { X86::RCL16r1, X86::RCL16m1, 0 },
+ { X86::RCL16rCL, X86::RCL16mCL, 0 },
+ { X86::RCL16ri, X86::RCL16mi, 0 },
+ { X86::RCL32r1, X86::RCL32m1, 0 },
+ { X86::RCL32rCL, X86::RCL32mCL, 0 },
+ { X86::RCL32ri, X86::RCL32mi, 0 },
+ { X86::RCL64r1, X86::RCL64m1, 0 },
+ { X86::RCL64rCL, X86::RCL64mCL, 0 },
+ { X86::RCL64ri, X86::RCL64mi, 0 },
+ { X86::RCL8r1, X86::RCL8m1, 0 },
+ { X86::RCL8rCL, X86::RCL8mCL, 0 },
+ { X86::RCL8ri, X86::RCL8mi, 0 },
+ { X86::RCR16r1, X86::RCR16m1, 0 },
+ { X86::RCR16rCL, X86::RCR16mCL, 0 },
+ { X86::RCR16ri, X86::RCR16mi, 0 },
+ { X86::RCR32r1, X86::RCR32m1, 0 },
+ { X86::RCR32rCL, X86::RCR32mCL, 0 },
+ { X86::RCR32ri, X86::RCR32mi, 0 },
+ { X86::RCR64r1, X86::RCR64m1, 0 },
+ { X86::RCR64rCL, X86::RCR64mCL, 0 },
+ { X86::RCR64ri, X86::RCR64mi, 0 },
+ { X86::RCR8r1, X86::RCR8m1, 0 },
+ { X86::RCR8rCL, X86::RCR8mCL, 0 },
+ { X86::RCR8ri, X86::RCR8mi, 0 },
{ X86::ROL16r1, X86::ROL16m1, 0 },
{ X86::ROL16rCL, X86::ROL16mCL, 0 },
{ X86::ROL16ri, X86::ROL16mi, 0 },
@@ -222,12 +265,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::SAR8r1, X86::SAR8m1, 0 },
{ X86::SAR8rCL, X86::SAR8mCL, 0 },
{ X86::SAR8ri, X86::SAR8mi, 0 },
+ { X86::SBB16ri, X86::SBB16mi, 0 },
+ { X86::SBB16ri8, X86::SBB16mi8, 0 },
+ { X86::SBB16rr, X86::SBB16mr, 0 },
{ X86::SBB32ri, X86::SBB32mi, 0 },
{ X86::SBB32ri8, X86::SBB32mi8, 0 },
{ X86::SBB32rr, X86::SBB32mr, 0 },
{ X86::SBB64ri32, X86::SBB64mi32, 0 },
{ X86::SBB64ri8, X86::SBB64mi8, 0 },
{ X86::SBB64rr, X86::SBB64mr, 0 },
+ { X86::SBB8ri, X86::SBB8mi, 0 },
+ { X86::SBB8ri8, X86::SBB8mi8, 0 },
+ { X86::SBB8rr, X86::SBB8mr, 0 },
{ X86::SHL16r1, X86::SHL16m1, 0 },
{ X86::SHL16rCL, X86::SHL16mCL, 0 },
{ X86::SHL16ri, X86::SHL16mi, 0 },
@@ -274,6 +323,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::SUB64ri8, X86::SUB64mi8, 0 },
{ X86::SUB64rr, X86::SUB64mr, 0 },
{ X86::SUB8ri, X86::SUB8mi, 0 },
+ { X86::SUB8ri8, X86::SUB8mi8, 0 },
{ X86::SUB8rr, X86::SUB8mr, 0 },
{ X86::XOR16ri, X86::XOR16mi, 0 },
{ X86::XOR16ri8, X86::XOR16mi8, 0 },
@@ -285,6 +335,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::XOR64ri8, X86::XOR64mi8, 0 },
{ X86::XOR64rr, X86::XOR64mr, 0 },
{ X86::XOR8ri, X86::XOR8mi, 0 },
+ { X86::XOR8ri8, X86::XOR8mi8, 0 },
{ X86::XOR8rr, X86::XOR8mr, 0 }
};
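Each row in these fold tables pairs a register-form opcode with its memory-form counterpart plus folding constraints. A hedged sketch of the entry shape being populated (field names are illustrative; see the real X86MemoryFoldTableEntry declaration for the exact layout):

#include <cstdint>

struct FoldEntrySketch {
  uint16_t RegOp;   // e.g. X86::ADC8rr
  uint16_t MemOp;   // e.g. X86::ADC8mr
  uint16_t Flags;   // 0, or TB_* bits such as TB_ALIGN_16 / TB_NO_REVERSE / TB_FOLDED_LOAD
};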
@@ -375,9 +426,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::TAILJMPr64, X86::TAILJMPm64, TB_FOLDED_LOAD },
{ X86::TAILJMPr64_REX, X86::TAILJMPm64_REX, TB_FOLDED_LOAD },
{ X86::TEST16ri, X86::TEST16mi, TB_FOLDED_LOAD },
+ { X86::TEST16rr, X86::TEST16mr, TB_FOLDED_LOAD },
{ X86::TEST32ri, X86::TEST32mi, TB_FOLDED_LOAD },
+ { X86::TEST32rr, X86::TEST32mr, TB_FOLDED_LOAD },
{ X86::TEST64ri32, X86::TEST64mi32, TB_FOLDED_LOAD },
+ { X86::TEST64rr, X86::TEST64mr, TB_FOLDED_LOAD },
{ X86::TEST8ri, X86::TEST8mi, TB_FOLDED_LOAD },
+ { X86::TEST8rr, X86::TEST8mr, TB_FOLDED_LOAD },
// AVX 128-bit versions of foldable instructions
{ X86::VEXTRACTPSrr,X86::VEXTRACTPSmr, TB_FOLDED_STORE },
@@ -504,14 +559,30 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::CMP32rr, X86::CMP32rm, 0 },
{ X86::CMP64rr, X86::CMP64rm, 0 },
{ X86::CMP8rr, X86::CMP8rm, 0 },
+ { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
+ { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
+ { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
+ { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
+ { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
+ { X86::CVTSD2SI64rr_Int, X86::CVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSD2SIrr_Int, X86::CVTSD2SIrm_Int, TB_NO_REVERSE },
{ X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
- { X86::CVTSI2SD64rr, X86::CVTSI2SD64rm, 0 },
+ { X86::CVTSI642SDrr, X86::CVTSI642SDrm, 0 },
{ X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
- { X86::CVTSI2SS64rr, X86::CVTSI2SS64rm, 0 },
+ { X86::CVTSI642SSrr, X86::CVTSI642SSrm, 0 },
{ X86::CVTSI2SSrr, X86::CVTSI2SSrm, 0 },
{ X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTSS2SI64rr_Int, X86::CVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSS2SIrr_Int, X86::CVTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
+ { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
{ X86::CVTTSD2SI64rr, X86::CVTTSD2SI64rm, 0 },
+ { X86::CVTTSD2SI64rr_Int,X86::CVTTSD2SI64rm_Int, TB_NO_REVERSE },
{ X86::CVTTSD2SIrr, X86::CVTTSD2SIrm, 0 },
+ { X86::CVTTSD2SIrr_Int, X86::CVTTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::CVTTSS2SI64rr_Int,X86::CVTTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTTSS2SIrr_Int, X86::CVTTSS2SIrm_Int, TB_NO_REVERSE },
{ X86::CVTTSS2SI64rr, X86::CVTTSS2SI64rm, 0 },
{ X86::CVTTSS2SIrr, X86::CVTTSS2SIrm, 0 },
{ X86::IMUL16rri, X86::IMUL16rmi, 0 },
@@ -522,22 +593,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::IMUL64rri8, X86::IMUL64rmi8, 0 },
{ X86::Int_COMISDrr, X86::Int_COMISDrm, TB_NO_REVERSE },
{ X86::Int_COMISSrr, X86::Int_COMISSrm, TB_NO_REVERSE },
- { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, TB_NO_REVERSE },
- { X86::CVTSD2SIrr, X86::CVTSD2SIrm, TB_NO_REVERSE },
- { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, TB_NO_REVERSE },
- { X86::CVTSS2SIrr, X86::CVTSS2SIrm, TB_NO_REVERSE },
- { X86::CVTDQ2PDrr, X86::CVTDQ2PDrm, TB_NO_REVERSE },
- { X86::CVTDQ2PSrr, X86::CVTDQ2PSrm, TB_ALIGN_16 },
- { X86::CVTPD2DQrr, X86::CVTPD2DQrm, TB_ALIGN_16 },
- { X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
- { X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
- { X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
- { X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
- { X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
- { X86::Int_CVTTSD2SI64rr,X86::Int_CVTTSD2SI64rm, TB_NO_REVERSE },
- { X86::Int_CVTTSD2SIrr, X86::Int_CVTTSD2SIrm, TB_NO_REVERSE },
- { X86::Int_CVTTSS2SI64rr,X86::Int_CVTTSS2SI64rm, TB_NO_REVERSE },
- { X86::Int_CVTTSS2SIrr, X86::Int_CVTTSS2SIrm, TB_NO_REVERSE },
{ X86::Int_UCOMISDrr, X86::Int_UCOMISDrm, TB_NO_REVERSE },
{ X86::Int_UCOMISSrr, X86::Int_UCOMISSrm, TB_NO_REVERSE },
{ X86::MOV16rr, X86::MOV16rm, 0 },
@@ -608,10 +663,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::SQRTSDr_Int, X86::SQRTSDm_Int, TB_NO_REVERSE },
{ X86::SQRTSSr, X86::SQRTSSm, 0 },
{ X86::SQRTSSr_Int, X86::SQRTSSm_Int, TB_NO_REVERSE },
- { X86::TEST16rr, X86::TEST16rm, 0 },
- { X86::TEST32rr, X86::TEST32rm, 0 },
- { X86::TEST64rr, X86::TEST64rm, 0 },
- { X86::TEST8rr, X86::TEST8rm, 0 },
// FIXME: TEST*rr EAX,EAX ---> CMP [mem], 0
{ X86::UCOMISDrr, X86::UCOMISDrm, 0 },
{ X86::UCOMISSrr, X86::UCOMISSrm, 0 },
@@ -643,17 +694,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::Int_VUCOMISDrr, X86::Int_VUCOMISDrm, TB_NO_REVERSE },
{ X86::Int_VUCOMISSrr, X86::Int_VUCOMISSrm, TB_NO_REVERSE },
{ X86::VCVTTSD2SI64rr, X86::VCVTTSD2SI64rm, 0 },
- { X86::Int_VCVTTSD2SI64rr,X86::Int_VCVTTSD2SI64rm,TB_NO_REVERSE },
+ { X86::VCVTTSD2SI64rr_Int,X86::VCVTTSD2SI64rm_Int,TB_NO_REVERSE },
{ X86::VCVTTSD2SIrr, X86::VCVTTSD2SIrm, 0 },
- { X86::Int_VCVTTSD2SIrr,X86::Int_VCVTTSD2SIrm, TB_NO_REVERSE },
+ { X86::VCVTTSD2SIrr_Int,X86::VCVTTSD2SIrm_Int, TB_NO_REVERSE },
{ X86::VCVTTSS2SI64rr, X86::VCVTTSS2SI64rm, 0 },
- { X86::Int_VCVTTSS2SI64rr,X86::Int_VCVTTSS2SI64rm,TB_NO_REVERSE },
+ { X86::VCVTTSS2SI64rr_Int,X86::VCVTTSS2SI64rm_Int,TB_NO_REVERSE },
{ X86::VCVTTSS2SIrr, X86::VCVTTSS2SIrm, 0 },
- { X86::Int_VCVTTSS2SIrr,X86::Int_VCVTTSS2SIrm, TB_NO_REVERSE },
- { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, TB_NO_REVERSE },
- { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, TB_NO_REVERSE },
- { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, TB_NO_REVERSE },
- { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, TB_NO_REVERSE },
+ { X86::VCVTTSS2SIrr_Int,X86::VCVTTSS2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SI64rr_Int, X86::VCVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIrr_Int, X86::VCVTSD2SIrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64rr_Int, X86::VCVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIrr_Int, X86::VCVTSS2SIrm_Int, TB_NO_REVERSE },
{ X86::VCVTDQ2PDrr, X86::VCVTDQ2PDrm, TB_NO_REVERSE },
{ X86::VCVTDQ2PSrr, X86::VCVTDQ2PSrm, 0 },
{ X86::VCVTPD2DQrr, X86::VCVTPD2DQrm, 0 },
@@ -714,12 +765,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VUCOMISSrr, X86::VUCOMISSrm, 0 },
// AVX 256-bit foldable instructions
- { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDYrr, X86::VCVTDQ2PDYrm, 0 },
{ X86::VCVTDQ2PSYrr, X86::VCVTDQ2PSYrm, 0 },
{ X86::VCVTPD2DQYrr, X86::VCVTPD2DQYrm, 0 },
{ X86::VCVTPD2PSYrr, X86::VCVTPD2PSYrm, 0 },
{ X86::VCVTPS2DQYrr, X86::VCVTPS2DQYrm, 0 },
- { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, TB_NO_REVERSE },
+ { X86::VCVTPS2PDYrr, X86::VCVTPS2PDYrm, 0 },
{ X86::VCVTTPD2DQYrr, X86::VCVTTPD2DQYrm, 0 },
{ X86::VCVTTPS2DQYrr, X86::VCVTTPS2DQYrm, 0 },
{ X86::VMOVAPDYrr, X86::VMOVAPDYrm, TB_ALIGN_32 },
@@ -879,6 +930,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 foldable instructions
{ X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
{ X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZrr, X86::VCVTDQ2PDZrm, 0 },
+ { X86::VCVTPD2PSZrr, X86::VCVTPD2PSZrm, 0 },
+ { X86::VCVTUDQ2PDZrr, X86::VCVTUDQ2PDZrm, 0 },
{ X86::VMOV64toPQIZrr, X86::VMOVQI2PQIZrm, 0 },
{ X86::VMOV64toSDZrr, X86::VMOV64toSDZrm, 0 },
{ X86::VMOVDI2PDIZrr, X86::VMOVDI2PDIZrm, 0 },
@@ -923,14 +977,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPSHUFDZri, X86::VPSHUFDZmi, 0 },
{ X86::VPSHUFHWZri, X86::VPSHUFHWZmi, 0 },
{ X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
- { X86::VPSLLDQZ512rr, X86::VPSLLDQZ512rm, 0 },
+ { X86::VPSLLDQZrr, X86::VPSLLDQZrm, 0 },
{ X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
{ X86::VPSLLQZri, X86::VPSLLQZmi, 0 },
{ X86::VPSLLWZri, X86::VPSLLWZmi, 0 },
{ X86::VPSRADZri, X86::VPSRADZmi, 0 },
{ X86::VPSRAQZri, X86::VPSRAQZmi, 0 },
{ X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
- { X86::VPSRLDQZ512rr, X86::VPSRLDQZ512rm, 0 },
+ { X86::VPSRLDQZrr, X86::VPSRLDQZrm, 0 },
{ X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
{ X86::VPSRLQZri, X86::VPSRLQZmi, 0 },
{ X86::VPSRLWZri, X86::VPSRLWZmi, 0 },
@@ -938,6 +992,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 foldable instructions (256-bit versions)
{ X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
{ X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ256rr, X86::VCVTDQ2PDZ256rm, 0 },
+ { X86::VCVTPD2PSZ256rr, X86::VCVTPD2PSZ256rm, 0 },
+ { X86::VCVTUDQ2PDZ256rr, X86::VCVTUDQ2PDZ256rm, 0 },
{ X86::VMOVAPDZ256rr, X86::VMOVAPDZ256rm, TB_ALIGN_32 },
{ X86::VMOVAPSZ256rr, X86::VMOVAPSZ256rm, TB_ALIGN_32 },
{ X86::VMOVDQA32Z256rr, X86::VMOVDQA32Z256rm, TB_ALIGN_32 },
@@ -989,6 +1046,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
// AVX-512 foldable instructions (128-bit versions)
{ X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
+ { X86::VCVTDQ2PDZ128rr, X86::VCVTDQ2PDZ128rm, TB_NO_REVERSE },
+ { X86::VCVTPD2PSZ128rr, X86::VCVTPD2PSZ128rm, 0 },
+ { X86::VCVTUDQ2PDZ128rr, X86::VCVTUDQ2PDZ128rm, TB_NO_REVERSE },
{ X86::VMOVAPDZ128rr, X86::VMOVAPDZ128rm, TB_ALIGN_16 },
{ X86::VMOVAPSZ128rr, X86::VMOVAPSZ128rm, TB_ALIGN_16 },
{ X86::VMOVDQA32Z128rr, X86::VMOVDQA32Z128rm, TB_ALIGN_16 },
@@ -1135,9 +1195,13 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::CMPPDrri, X86::CMPPDrmi, TB_ALIGN_16 },
{ X86::CMPPSrri, X86::CMPPSrmi, TB_ALIGN_16 },
{ X86::CMPSDrr, X86::CMPSDrm, 0 },
+ { X86::CMPSDrr_Int, X86::CMPSDrm_Int, TB_NO_REVERSE },
{ X86::CMPSSrr, X86::CMPSSrm, 0 },
+ { X86::CMPSSrr_Int, X86::CMPSSrm_Int, TB_NO_REVERSE },
{ X86::CRC32r32r32, X86::CRC32r32m32, 0 },
{ X86::CRC32r64r64, X86::CRC32r64m64, 0 },
+ { X86::CVTSD2SSrr_Int, X86::CVTSD2SSrm_Int, TB_NO_REVERSE },
+ { X86::CVTSS2SDrr_Int, X86::CVTSS2SDrm_Int, TB_NO_REVERSE },
{ X86::DIVPDrr, X86::DIVPDrm, TB_ALIGN_16 },
{ X86::DIVPSrr, X86::DIVPSrm, TB_ALIGN_16 },
{ X86::DIVSDrr, X86::DIVSDrm, 0 },
@@ -1153,14 +1217,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::IMUL16rr, X86::IMUL16rm, 0 },
{ X86::IMUL32rr, X86::IMUL32rm, 0 },
{ X86::IMUL64rr, X86::IMUL64rm, 0 },
- { X86::Int_CMPSDrr, X86::Int_CMPSDrm, TB_NO_REVERSE },
- { X86::Int_CMPSSrr, X86::Int_CMPSSrm, TB_NO_REVERSE },
- { X86::Int_CVTSD2SSrr, X86::Int_CVTSD2SSrm, TB_NO_REVERSE },
- { X86::Int_CVTSI2SD64rr,X86::Int_CVTSI2SD64rm, 0 },
- { X86::Int_CVTSI2SDrr, X86::Int_CVTSI2SDrm, 0 },
- { X86::Int_CVTSI2SS64rr,X86::Int_CVTSI2SS64rm, 0 },
- { X86::Int_CVTSI2SSrr, X86::Int_CVTSI2SSrm, 0 },
- { X86::Int_CVTSS2SDrr, X86::Int_CVTSS2SDrm, TB_NO_REVERSE },
+ { X86::CVTSI642SDrr_Int,X86::CVTSI642SDrm_Int, 0 },
+ { X86::CVTSI2SDrr_Int, X86::CVTSI2SDrm_Int, 0 },
+ { X86::CVTSI642SSrr_Int,X86::CVTSI642SSrm_Int, 0 },
+ { X86::CVTSI2SSrr_Int, X86::CVTSI2SSrm_Int, 0 },
{ X86::MAXPDrr, X86::MAXPDrm, TB_ALIGN_16 },
{ X86::MAXCPDrr, X86::MAXCPDrm, TB_ALIGN_16 },
{ X86::MAXPSrr, X86::MAXPSrm, TB_ALIGN_16 },
@@ -1405,14 +1465,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::PMULHRWrr, X86::PMULHRWrm, 0 },
// AVX 128-bit versions of foldable instructions
- { X86::VCVTSI2SD64rr, X86::VCVTSI2SD64rm, 0 },
- { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm, 0 },
+ { X86::VCVTSI642SDrr, X86::VCVTSI642SDrm, 0 },
+ { X86::VCVTSI642SDrr_Int, X86::VCVTSI642SDrm_Int, 0 },
{ X86::VCVTSI2SDrr, X86::VCVTSI2SDrm, 0 },
- { X86::Int_VCVTSI2SDrr, X86::Int_VCVTSI2SDrm, 0 },
- { X86::VCVTSI2SS64rr, X86::VCVTSI2SS64rm, 0 },
- { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm, 0 },
+ { X86::VCVTSI2SDrr_Int, X86::VCVTSI2SDrm_Int, 0 },
+ { X86::VCVTSI642SSrr, X86::VCVTSI642SSrm, 0 },
+ { X86::VCVTSI642SSrr_Int, X86::VCVTSI642SSrm_Int, 0 },
{ X86::VCVTSI2SSrr, X86::VCVTSI2SSrm, 0 },
- { X86::Int_VCVTSI2SSrr, X86::Int_VCVTSI2SSrm, 0 },
+ { X86::VCVTSI2SSrr_Int, X86::VCVTSI2SSrm_Int, 0 },
{ X86::VADDPDrr, X86::VADDPDrm, 0 },
{ X86::VADDPSrr, X86::VADDPSrm, 0 },
{ X86::VADDSDrr, X86::VADDSDrm, 0 },
@@ -1432,7 +1492,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VCMPPDrri, X86::VCMPPDrmi, 0 },
{ X86::VCMPPSrri, X86::VCMPPSrmi, 0 },
{ X86::VCMPSDrr, X86::VCMPSDrm, 0 },
+ { X86::VCMPSDrr_Int, X86::VCMPSDrm_Int, TB_NO_REVERSE },
{ X86::VCMPSSrr, X86::VCMPSSrm, 0 },
+ { X86::VCMPSSrr_Int, X86::VCMPSSrm_Int, TB_NO_REVERSE },
{ X86::VDIVPDrr, X86::VDIVPDrm, 0 },
{ X86::VDIVPSrr, X86::VDIVPSrm, 0 },
{ X86::VDIVSDrr, X86::VDIVSDrm, 0 },
@@ -1445,8 +1507,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VHADDPSrr, X86::VHADDPSrm, 0 },
{ X86::VHSUBPDrr, X86::VHSUBPDrm, 0 },
{ X86::VHSUBPSrr, X86::VHSUBPSrm, 0 },
- { X86::Int_VCMPSDrr, X86::Int_VCMPSDrm, TB_NO_REVERSE },
- { X86::Int_VCMPSSrr, X86::Int_VCMPSSrm, TB_NO_REVERSE },
{ X86::VMAXCPDrr, X86::VMAXCPDrm, 0 },
{ X86::VMAXCPSrr, X86::VMAXCPSrm, 0 },
{ X86::VMAXCSDrr, X86::VMAXCSDrm, 0 },
@@ -1982,7 +2042,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPMULUDQZrr, X86::VPMULUDQZrm, 0 },
{ X86::VPORDZrr, X86::VPORDZrm, 0 },
{ X86::VPORQZrr, X86::VPORQZrm, 0 },
- { X86::VPSADBWZ512rr, X86::VPSADBWZ512rm, 0 },
+ { X86::VPSADBWZrr, X86::VPSADBWZrm, 0 },
{ X86::VPSHUFBZrr, X86::VPSHUFBZrm, 0 },
{ X86::VPSLLDZrr, X86::VPSLLDZrm, 0 },
{ X86::VPSLLQZrr, X86::VPSLLQZrm, 0 },
@@ -2528,6 +2588,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMT2PDrr, X86::VPERMT2PDrm, 0 },
{ X86::VPERMT2Qrr, X86::VPERMT2Qrm, 0 },
{ X86::VPERMT2Wrr, X86::VPERMT2Wrm, 0 },
+ { X86::VPMADD52HUQZr, X86::VPMADD52HUQZm, 0 },
+ { X86::VPMADD52LUQZr, X86::VPMADD52LUQZm, 0 },
{ X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmi, 0 },
{ X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmi, 0 },
@@ -2544,6 +2606,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMT2PS256rr, X86::VPERMT2PS256rm, 0 },
{ X86::VPERMT2Q256rr, X86::VPERMT2Q256rm, 0 },
{ X86::VPERMT2W256rr, X86::VPERMT2W256rm, 0 },
+ { X86::VPMADD52HUQZ256r, X86::VPMADD52HUQZ256m, 0 },
+ { X86::VPMADD52LUQZ256r, X86::VPMADD52LUQZ256m, 0 },
{ X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmi, 0 },
{ X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmi, 0 },
@@ -2560,6 +2624,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMT2PS128rr, X86::VPERMT2PS128rm, 0 },
{ X86::VPERMT2Q128rr, X86::VPERMT2Q128rm, 0 },
{ X86::VPERMT2W128rr, X86::VPERMT2W128rm, 0 },
+ { X86::VPMADD52HUQZ128r, X86::VPMADD52HUQZ128m, 0 },
+ { X86::VPMADD52LUQZ128r, X86::VPMADD52LUQZ128m, 0 },
{ X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmi, 0 },
{ X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmi, 0 },
@@ -3234,6 +3300,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMT2Qrrk, X86::VPERMT2Qrmk, 0 },
{ X86::VPERMT2Wrrk, X86::VPERMT2Wrmk, 0 },
{ X86::VPERMWZrrk, X86::VPERMWZrmk, 0 },
+ { X86::VPMADD52HUQZrk, X86::VPMADD52HUQZmk, 0 },
+ { X86::VPMADD52LUQZrk, X86::VPMADD52LUQZmk, 0 },
{ X86::VPMADDUBSWZrrk, X86::VPMADDUBSWZrmk, 0 },
{ X86::VPMADDWDZrrk, X86::VPMADDWDZrmk, 0 },
{ X86::VPMAXSBZrrk, X86::VPMAXSBZrmk, 0 },
@@ -3376,6 +3444,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMT2Q256rrk, X86::VPERMT2Q256rmk, 0 },
{ X86::VPERMT2W256rrk, X86::VPERMT2W256rmk, 0 },
{ X86::VPERMWZ256rrk, X86::VPERMWZ256rmk, 0 },
+ { X86::VPMADD52HUQZ256rk, X86::VPMADD52HUQZ256mk, 0 },
+ { X86::VPMADD52LUQZ256rk, X86::VPMADD52LUQZ256mk, 0 },
{ X86::VPMADDUBSWZ256rrk, X86::VPMADDUBSWZ256rmk, 0 },
{ X86::VPMADDWDZ256rrk, X86::VPMADDWDZ256rmk, 0 },
{ X86::VPMAXSBZ256rrk, X86::VPMAXSBZ256rmk, 0 },
@@ -3509,6 +3579,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMT2Q128rrk, X86::VPERMT2Q128rmk, 0 },
{ X86::VPERMT2W128rrk, X86::VPERMT2W128rmk, 0 },
{ X86::VPERMWZ128rrk, X86::VPERMWZ128rmk, 0 },
+ { X86::VPMADD52HUQZ128rk, X86::VPMADD52HUQZ128mk, 0 },
+ { X86::VPMADD52LUQZ128rk, X86::VPMADD52LUQZ128mk, 0 },
{ X86::VPMADDUBSWZ128rrk, X86::VPMADDUBSWZ128rmk, 0 },
{ X86::VPMADDWDZ128rrk, X86::VPMADDWDZ128rmk, 0 },
{ X86::VPMAXSBZ128rrk, X86::VPMAXSBZ128rmk, 0 },
@@ -3597,6 +3669,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMT2PDrrkz, X86::VPERMT2PDrmkz, 0 },
{ X86::VPERMT2Qrrkz, X86::VPERMT2Qrmkz, 0 },
{ X86::VPERMT2Wrrkz, X86::VPERMT2Wrmkz, 0 },
+ { X86::VPMADD52HUQZrkz, X86::VPMADD52HUQZmkz, 0 },
+ { X86::VPMADD52LUQZrkz, X86::VPMADD52LUQZmkz, 0 },
{ X86::VPTERNLOGDZrrikz, X86::VPTERNLOGDZrmikz, 0 },
{ X86::VPTERNLOGQZrrikz, X86::VPTERNLOGQZrmikz, 0 },
@@ -3613,6 +3687,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMT2PS256rrkz, X86::VPERMT2PS256rmkz, 0 },
{ X86::VPERMT2Q256rrkz, X86::VPERMT2Q256rmkz, 0 },
{ X86::VPERMT2W256rrkz, X86::VPERMT2W256rmkz, 0 },
+ { X86::VPMADD52HUQZ256rkz, X86::VPMADD52HUQZ256mkz, 0 },
+ { X86::VPMADD52LUQZ256rkz, X86::VPMADD52LUQZ256mkz, 0 },
{ X86::VPTERNLOGDZ256rrikz,X86::VPTERNLOGDZ256rmikz, 0 },
{ X86::VPTERNLOGQZ256rrikz,X86::VPTERNLOGQZ256rmikz, 0 },
@@ -3629,6 +3705,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
{ X86::VPERMT2PS128rrkz, X86::VPERMT2PS128rmkz, 0 },
{ X86::VPERMT2Q128rrkz, X86::VPERMT2Q128rmkz, 0 },
{ X86::VPERMT2W128rrkz, X86::VPERMT2W128rmkz, 0 },
+ { X86::VPMADD52HUQZ128rkz, X86::VPMADD52HUQZ128mkz, 0 },
+ { X86::VPMADD52LUQZ128rkz, X86::VPMADD52LUQZ128mkz, 0 },
{ X86::VPTERNLOGDZ128rrikz,X86::VPTERNLOGDZ128rmikz, 0 },
{ X86::VPTERNLOGQZ128rrikz,X86::VPTERNLOGQZ128rmikz, 0 },
};
@@ -4391,7 +4469,7 @@ MachineInstr *X86InstrInfo::convertToThreeAddressWithLEA(
unsigned leaInReg2 = 0;
MachineInstr *InsMI2 = nullptr;
if (Src == Src2) {
- // ADD16rr %reg1028<kill>, %reg1028
+ // ADD16rr killed %reg1028, %reg1028
// just a single insert_subreg.
addRegReg(MIB, leaInReg, true, leaInReg, false);
} else {
@@ -5119,7 +5197,6 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
WorkingMI.setDesc(get(Opc));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
- break;
}
case X86::BLENDPDrri:
case X86::BLENDPSrri:
@@ -5171,24 +5248,18 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VMOVSSrr: Opc = X86::VBLENDPSrri; Mask = 0x0E; break;
}
- // MOVSD/MOVSS's 2nd operand is a FR64/FR32 reg class - we need to copy
- // this over to a VR128 class like the 1st operand to use a BLENDPD/BLENDPS.
- auto &MRI = MI.getParent()->getParent()->getRegInfo();
- auto VR128RC = MRI.getRegClass(MI.getOperand(1).getReg());
- unsigned VR128 = MRI.createVirtualRegister(VR128RC);
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(TargetOpcode::COPY),
- VR128)
- .addReg(MI.getOperand(2).getReg());
-
auto &WorkingMI = cloneIfNew(MI);
WorkingMI.setDesc(get(Opc));
- WorkingMI.getOperand(2).setReg(VR128);
WorkingMI.addOperand(MachineOperand::CreateImm(Mask));
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
case X86::PCLMULQDQrr:
- case X86::VPCLMULQDQrr:{
+ case X86::VPCLMULQDQrr:
+ case X86::VPCLMULQDQYrr:
+ case X86::VPCLMULQDQZrr:
+ case X86::VPCLMULQDQZ128rr:
+ case X86::VPCLMULQDQZ256rr: {
// SRC1 64bits = Imm[0] ? SRC1[127:64] : SRC1[63:0]
// SRC2 64bits = Imm[4] ? SRC2[127:64] : SRC2[63:0]
unsigned Imm = MI.getOperand(3).getImm();
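[Editor's note, not part of the patch] The two comments above spell out how the PCLMULQDQ immediate picks a 64-bit half of each source: Imm[0] selects the half of SRC1 and Imm[4] the half of SRC2, so commuting the two vector sources amounts to exchanging those two bits. A minimal standalone C++ sketch of that bit swap (the helper name is invented for illustration; this is not the code the patch adds):

    #include <cassert>
    #include <cstdint>

    // Exchange the SRC1/SRC2 half-selector bits of a PCLMULQDQ immediate.
    static uint8_t commutePclmulImm(uint8_t Imm) {
      uint8_t Src1Sel = Imm & 0x01;        // Imm[0]: which half of SRC1
      uint8_t Src2Sel = (Imm >> 4) & 0x01; // Imm[4]: which half of SRC2
      return uint8_t((Imm & 0xEE) | Src2Sel | (Src1Sel << 4));
    }

    int main() {
      assert(commutePclmulImm(0x00) == 0x00); // low x low stays low x low
      assert(commutePclmulImm(0x01) == 0x10); // SRC1's high half becomes SRC2's
      assert(commutePclmulImm(0x10) == 0x01);
      assert(commutePclmulImm(0x11) == 0x11);
    }

After the operands are swapped, an immediate of 0x01 (high half of SRC1, low half of SRC2) becomes 0x10, which selects the same pair of 64-bit inputs.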
@@ -5631,6 +5702,41 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
case X86::VPTERNLOGQZ256rmbikz:
case X86::VPTERNLOGQZrmbikz:
return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+ case X86::VPMADD52HUQZ128r:
+ case X86::VPMADD52HUQZ128rk:
+ case X86::VPMADD52HUQZ128rkz:
+ case X86::VPMADD52HUQZ256r:
+ case X86::VPMADD52HUQZ256rk:
+ case X86::VPMADD52HUQZ256rkz:
+ case X86::VPMADD52HUQZr:
+ case X86::VPMADD52HUQZrk:
+ case X86::VPMADD52HUQZrkz:
+ case X86::VPMADD52LUQZ128r:
+ case X86::VPMADD52LUQZ128rk:
+ case X86::VPMADD52LUQZ128rkz:
+ case X86::VPMADD52LUQZ256r:
+ case X86::VPMADD52LUQZ256rk:
+ case X86::VPMADD52LUQZ256rkz:
+ case X86::VPMADD52LUQZr:
+ case X86::VPMADD52LUQZrk:
+ case X86::VPMADD52LUQZrkz: {
+ unsigned CommutableOpIdx1 = 2;
+ unsigned CommutableOpIdx2 = 3;
+ if (Desc.TSFlags & X86II::EVEX_K) {
+ // Skip the mask register.
+ ++CommutableOpIdx1;
+ ++CommutableOpIdx2;
+ }
+ if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2,
+ CommutableOpIdx1, CommutableOpIdx2))
+ return false;
+ if (!MI.getOperand(SrcOpIdx1).isReg() ||
+ !MI.getOperand(SrcOpIdx2).isReg())
+ // No idea.
+ return false;
+ return true;
+ }
+
default:
const X86InstrFMA3Group *FMA3Group =
X86InstrFMA3Info::getFMA3Group(MI.getOpcode());
@@ -7113,16 +7219,20 @@ inline static bool isDefConvertible(MachineInstr &MI) {
case X86::OR8ri: case X86::OR64rr: case X86::OR32rr:
case X86::OR16rr: case X86::OR8rr: case X86::OR64rm:
case X86::OR32rm: case X86::OR16rm: case X86::OR8rm:
+ case X86::ADC64ri32: case X86::ADC64ri8: case X86::ADC32ri:
+ case X86::ADC32ri8: case X86::ADC16ri: case X86::ADC16ri8:
+ case X86::ADC8ri: case X86::ADC64rr: case X86::ADC32rr:
+ case X86::ADC16rr: case X86::ADC8rr: case X86::ADC64rm:
+ case X86::ADC32rm: case X86::ADC16rm: case X86::ADC8rm:
+ case X86::SBB64ri32: case X86::SBB64ri8: case X86::SBB32ri:
+ case X86::SBB32ri8: case X86::SBB16ri: case X86::SBB16ri8:
+ case X86::SBB8ri: case X86::SBB64rr: case X86::SBB32rr:
+ case X86::SBB16rr: case X86::SBB8rr: case X86::SBB64rm:
+ case X86::SBB32rm: case X86::SBB16rm: case X86::SBB8rm:
case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1:case X86::SAR64r1:
case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1:case X86::SHR64r1:
case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1:case X86::SHL64r1:
- case X86::ADC32ri: case X86::ADC32ri8:
- case X86::ADC32rr: case X86::ADC64ri32:
- case X86::ADC64ri8: case X86::ADC64rr:
- case X86::SBB32ri: case X86::SBB32ri8:
- case X86::SBB32rr: case X86::SBB64ri32:
- case X86::SBB64ri8: case X86::SBB64rr:
case X86::ANDN32rr: case X86::ANDN32rm:
case X86::ANDN64rr: case X86::ANDN64rm:
case X86::BEXTR32rr: case X86::BEXTR64rr:
@@ -7144,6 +7254,22 @@ inline static bool isDefConvertible(MachineInstr &MI) {
case X86::TZCNT16rr: case X86::TZCNT16rm:
case X86::TZCNT32rr: case X86::TZCNT32rm:
case X86::TZCNT64rr: case X86::TZCNT64rm:
+ case X86::BEXTRI32ri: case X86::BEXTRI32mi:
+ case X86::BEXTRI64ri: case X86::BEXTRI64mi:
+ case X86::BLCFILL32rr: case X86::BLCFILL32rm:
+ case X86::BLCFILL64rr: case X86::BLCFILL64rm:
+ case X86::BLCI32rr: case X86::BLCI32rm:
+ case X86::BLCI64rr: case X86::BLCI64rm:
+ case X86::BLCIC32rr: case X86::BLCIC32rm:
+ case X86::BLCIC64rr: case X86::BLCIC64rm:
+ case X86::BLCMSK32rr: case X86::BLCMSK32rm:
+ case X86::BLCMSK64rr: case X86::BLCMSK64rm:
+ case X86::BLCS32rr: case X86::BLCS32rm:
+ case X86::BLCS64rr: case X86::BLCS64rm:
+ case X86::BLSFILL32rr: case X86::BLSFILL32rm:
+ case X86::BLSFILL64rr: case X86::BLSFILL64rm:
+ case X86::BLSIC32rr: case X86::BLSIC32rm:
+ case X86::BLSIC64rr: case X86::BLSIC64rm:
return true;
}
}
@@ -7349,6 +7475,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
}
if (OldCC == X86::COND_INVALID) return false;
}
+ X86::CondCode ReplacementCC = X86::COND_INVALID;
if (IsCmpZero) {
switch (OldCC) {
default: break;
@@ -7368,31 +7495,32 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
default:
return false;
case X86::COND_E:
+ ReplacementCC = NewCC;
break;
case X86::COND_NE:
- NewCC = GetOppositeBranchCondition(NewCC);
+ ReplacementCC = GetOppositeBranchCondition(NewCC);
break;
}
} else if (IsSwapped) {
// If we have SUB(r1, r2) and CMP(r2, r1), the condition code needs
// to be changed from r2 > r1 to r1 < r2, from r2 < r1 to r1 > r2, etc.
// We swap the condition code and synthesize the new opcode.
- NewCC = getSwappedCondition(OldCC);
- if (NewCC == X86::COND_INVALID) return false;
+ ReplacementCC = getSwappedCondition(OldCC);
+ if (ReplacementCC == X86::COND_INVALID) return false;
}
- if ((ShouldUpdateCC || IsSwapped) && NewCC != OldCC) {
+ if ((ShouldUpdateCC || IsSwapped) && ReplacementCC != OldCC) {
// Synthesize the new opcode.
bool HasMemoryOperand = Instr.hasOneMemOperand();
unsigned NewOpc;
if (Instr.isBranch())
- NewOpc = GetCondBranchFromCond(NewCC);
+ NewOpc = GetCondBranchFromCond(ReplacementCC);
else if(OpcIsSET)
- NewOpc = getSETFromCond(NewCC, HasMemoryOperand);
+ NewOpc = getSETFromCond(ReplacementCC, HasMemoryOperand);
else {
unsigned DstReg = Instr.getOperand(0).getReg();
const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
- NewOpc = getCMovFromCond(NewCC, TRI->getRegSizeInBits(*DstRC)/8,
+ NewOpc = getCMovFromCond(ReplacementCC, TRI->getRegSizeInBits(*DstRC)/8,
HasMemoryOperand);
}
@@ -7504,7 +7632,7 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
/// This is used for mapping:
/// %xmm4 = V_SET0
/// to:
-/// %xmm4 = PXORrr %xmm4<undef>, %xmm4<undef>
+/// %xmm4 = PXORrr undef %xmm4, undef %xmm4
///
static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
const MCInstrDesc &Desc) {
@@ -7597,7 +7725,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
bool NeedsDwarfCFI =
!IsWin64Prologue &&
- (MF.getMMI().hasDebugInfo() || MF.getFunction()->needsUnwindTableEntry());
+ (MF.getMMI().hasDebugInfo() || MF.getFunction().needsUnwindTableEntry());
bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
if (EmitCFI) {
TFL->BuildCFI(MBB, I, DL,
@@ -7633,6 +7761,18 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
MIB.addReg(Reg, RegState::Kill).addImm(1).addReg(0).addImm(0).addReg(0);
}
+static bool expandXorFP(MachineInstrBuilder &MIB, const TargetInstrInfo &TII) {
+ MachineBasicBlock &MBB = *MIB->getParent();
+ MachineFunction &MF = *MBB.getParent();
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+ const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
+ unsigned XorOp =
+ MIB->getOpcode() == X86::XOR64_FP ? X86::XOR64rr : X86::XOR32rr;
+ MIB->setDesc(TII.get(XorOp));
+ MIB.addReg(TRI->getFrameRegister(MF), RegState::Undef);
+ return true;
+}
+
// This is used to handle spills for 128/256-bit registers when we have AVX512,
// but not VLX. If it uses an extended register we need to use an instruction
// that loads the lower 128/256-bit, but is available with only AVX512F.
@@ -7705,9 +7845,16 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::FsFLD0SS:
case X86::FsFLD0SD:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VXORPSrr : X86::XORPSrr));
- case X86::AVX_SET0:
+ case X86::AVX_SET0: {
assert(HasAVX && "AVX not supported");
- return Expand2AddrUndef(MIB, get(X86::VXORPSYrr));
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ unsigned SrcReg = MIB->getOperand(0).getReg();
+ unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
+ MIB->getOperand(0).setReg(XReg);
+ Expand2AddrUndef(MIB, get(X86::VXORPSrr));
+ MIB.addReg(SrcReg, RegState::ImplicitDefine);
+ return true;
+ }
case X86::AVX512_128_SET0:
case X86::AVX512_FsFLD0SS:
case X86::AVX512_FsFLD0SD: {
@@ -7718,24 +7865,26 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return Expand2AddrUndef(MIB,
get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
// Extended register without VLX. Use a larger XOR.
- SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
+ SrcReg =
+ TRI->getMatchingSuperReg(SrcReg, X86::sub_xmm, &X86::VR512RegClass);
MIB->getOperand(0).setReg(SrcReg);
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
}
- case X86::AVX512_256_SET0: {
+ case X86::AVX512_256_SET0:
+ case X86::AVX512_512_SET0: {
bool HasVLX = Subtarget.hasVLX();
unsigned SrcReg = MIB->getOperand(0).getReg();
const TargetRegisterInfo *TRI = &getRegisterInfo();
- if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
- return Expand2AddrUndef(MIB,
- get(HasVLX ? X86::VPXORDZ256rr : X86::VXORPSYrr));
- // Extended register without VLX. Use a larger XOR.
- SrcReg = TRI->getMatchingSuperReg(SrcReg, X86::sub_ymm, &X86::VR512RegClass);
- MIB->getOperand(0).setReg(SrcReg);
+ if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
+ unsigned XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
+ MIB->getOperand(0).setReg(XReg);
+ Expand2AddrUndef(MIB,
+ get(HasVLX ? X86::VPXORDZ128rr : X86::VXORPSrr));
+ MIB.addReg(SrcReg, RegState::ImplicitDefine);
+ return true;
+ }
return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
}
- case X86::AVX512_512_SET0:
- return Expand2AddrUndef(MIB, get(X86::VPXORDZrr));
case X86::V_SETALLONES:
return Expand2AddrUndef(MIB, get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
case X86::AVX2_SETALLONES:
@@ -7818,10 +7967,287 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case TargetOpcode::LOAD_STACK_GUARD:
expandLoadStackGuard(MIB, *this);
return true;
+ case X86::XOR64_FP:
+ case X86::XOR32_FP:
+ return expandXorFP(MIB, *this);
}
return false;
}
+/// Return true for all instructions that only update
+/// the first 32 or 64-bits of the destination register and leave the rest
+/// unmodified. This can be used to avoid folding loads if the instructions
+/// only update part of the destination register, and the non-updated part is
+/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
+/// instructions breaks the partial register dependency and it can improve
+/// performance. e.g.:
+///
+/// movss (%rdi), %xmm0
+/// cvtss2sd %xmm0, %xmm0
+///
+/// Instead of
+/// cvtss2sd (%rdi), %xmm0
+///
+/// FIXME: This should be turned into a TSFlags.
+///
+static bool hasPartialRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::CVTSI2SSrr:
+ case X86::CVTSI2SSrm:
+ case X86::CVTSI642SSrr:
+ case X86::CVTSI642SSrm:
+ case X86::CVTSI2SDrr:
+ case X86::CVTSI2SDrm:
+ case X86::CVTSI642SDrr:
+ case X86::CVTSI642SDrm:
+ case X86::CVTSD2SSrr:
+ case X86::CVTSD2SSrm:
+ case X86::CVTSS2SDrr:
+ case X86::CVTSS2SDrm:
+ case X86::MOVHPDrm:
+ case X86::MOVHPSrm:
+ case X86::MOVLPDrm:
+ case X86::MOVLPSrm:
+ case X86::RCPSSr:
+ case X86::RCPSSm:
+ case X86::RCPSSr_Int:
+ case X86::RCPSSm_Int:
+ case X86::ROUNDSDr:
+ case X86::ROUNDSDm:
+ case X86::ROUNDSSr:
+ case X86::ROUNDSSm:
+ case X86::RSQRTSSr:
+ case X86::RSQRTSSm:
+ case X86::RSQRTSSr_Int:
+ case X86::RSQRTSSm_Int:
+ case X86::SQRTSSr:
+ case X86::SQRTSSm:
+ case X86::SQRTSSr_Int:
+ case X86::SQRTSSm_Int:
+ case X86::SQRTSDr:
+ case X86::SQRTSDm:
+ case X86::SQRTSDr_Int:
+ case X86::SQRTSDm_Int:
+ return true;
+ }
+
+ return false;
+}
+
+/// Inform the ExecutionDepsFix pass how many idle
+/// instructions we would like before a partial register update.
+unsigned X86InstrInfo::getPartialRegUpdateClearance(
+ const MachineInstr &MI, unsigned OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
+ return 0;
+
+ // If MI is marked as reading Reg, the partial register update is wanted.
+ const MachineOperand &MO = MI.getOperand(0);
+ unsigned Reg = MO.getReg();
+ if (TargetRegisterInfo::isVirtualRegister(Reg)) {
+ if (MO.readsReg() || MI.readsVirtualRegister(Reg))
+ return 0;
+ } else {
+ if (MI.readsRegister(Reg, TRI))
+ return 0;
+ }
+
+ // If any instructions in the clearance range are reading Reg, insert a
+ // dependency breaking instruction, which is inexpensive and is likely to
+  // be hidden in other instructions' cycles.
+ return PartialRegUpdateClearance;
+}
+
+// Return true for any instruction that copies the high bits of the first source
+// operand into the unused high bits of the destination operand.
+static bool hasUndefRegUpdate(unsigned Opcode) {
+ switch (Opcode) {
+ case X86::VCVTSI2SSrr:
+ case X86::VCVTSI2SSrm:
+ case X86::VCVTSI2SSrr_Int:
+ case X86::VCVTSI2SSrm_Int:
+ case X86::VCVTSI642SSrr:
+ case X86::VCVTSI642SSrm:
+ case X86::VCVTSI642SSrr_Int:
+ case X86::VCVTSI642SSrm_Int:
+ case X86::VCVTSI2SDrr:
+ case X86::VCVTSI2SDrm:
+ case X86::VCVTSI2SDrr_Int:
+ case X86::VCVTSI2SDrm_Int:
+ case X86::VCVTSI642SDrr:
+ case X86::VCVTSI642SDrm:
+ case X86::VCVTSI642SDrr_Int:
+ case X86::VCVTSI642SDrm_Int:
+ case X86::VCVTSD2SSrr:
+ case X86::VCVTSD2SSrm:
+ case X86::VCVTSD2SSrr_Int:
+ case X86::VCVTSD2SSrm_Int:
+ case X86::VCVTSS2SDrr:
+ case X86::VCVTSS2SDrm:
+ case X86::VCVTSS2SDrr_Int:
+ case X86::VCVTSS2SDrm_Int:
+ case X86::VRCPSSr:
+ case X86::VRCPSSr_Int:
+ case X86::VRCPSSm:
+ case X86::VRCPSSm_Int:
+ case X86::VROUNDSDr:
+ case X86::VROUNDSDm:
+ case X86::VROUNDSDr_Int:
+ case X86::VROUNDSDm_Int:
+ case X86::VROUNDSSr:
+ case X86::VROUNDSSm:
+ case X86::VROUNDSSr_Int:
+ case X86::VROUNDSSm_Int:
+ case X86::VRSQRTSSr:
+ case X86::VRSQRTSSr_Int:
+ case X86::VRSQRTSSm:
+ case X86::VRSQRTSSm_Int:
+ case X86::VSQRTSSr:
+ case X86::VSQRTSSr_Int:
+ case X86::VSQRTSSm:
+ case X86::VSQRTSSm_Int:
+ case X86::VSQRTSDr:
+ case X86::VSQRTSDr_Int:
+ case X86::VSQRTSDm:
+ case X86::VSQRTSDm_Int:
+ // AVX-512
+ case X86::VCVTSI2SSZrr:
+ case X86::VCVTSI2SSZrm:
+ case X86::VCVTSI2SSZrr_Int:
+ case X86::VCVTSI2SSZrrb_Int:
+ case X86::VCVTSI2SSZrm_Int:
+ case X86::VCVTSI642SSZrr:
+ case X86::VCVTSI642SSZrm:
+ case X86::VCVTSI642SSZrr_Int:
+ case X86::VCVTSI642SSZrrb_Int:
+ case X86::VCVTSI642SSZrm_Int:
+ case X86::VCVTSI2SDZrr:
+ case X86::VCVTSI2SDZrm:
+ case X86::VCVTSI2SDZrr_Int:
+ case X86::VCVTSI2SDZrrb_Int:
+ case X86::VCVTSI2SDZrm_Int:
+ case X86::VCVTSI642SDZrr:
+ case X86::VCVTSI642SDZrm:
+ case X86::VCVTSI642SDZrr_Int:
+ case X86::VCVTSI642SDZrrb_Int:
+ case X86::VCVTSI642SDZrm_Int:
+ case X86::VCVTUSI2SSZrr:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI2SSZrr_Int:
+ case X86::VCVTUSI2SSZrrb_Int:
+ case X86::VCVTUSI2SSZrm_Int:
+ case X86::VCVTUSI642SSZrr:
+ case X86::VCVTUSI642SSZrm:
+ case X86::VCVTUSI642SSZrr_Int:
+ case X86::VCVTUSI642SSZrrb_Int:
+ case X86::VCVTUSI642SSZrm_Int:
+ case X86::VCVTUSI2SDZrr:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI2SDZrr_Int:
+ case X86::VCVTUSI2SDZrm_Int:
+ case X86::VCVTUSI642SDZrr:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI642SDZrr_Int:
+ case X86::VCVTUSI642SDZrrb_Int:
+ case X86::VCVTUSI642SDZrm_Int:
+ case X86::VCVTSD2SSZrr:
+ case X86::VCVTSD2SSZrr_Int:
+ case X86::VCVTSD2SSZrrb_Int:
+ case X86::VCVTSD2SSZrm:
+ case X86::VCVTSD2SSZrm_Int:
+ case X86::VCVTSS2SDZrr:
+ case X86::VCVTSS2SDZrr_Int:
+ case X86::VCVTSS2SDZrrb_Int:
+ case X86::VCVTSS2SDZrm:
+ case X86::VCVTSS2SDZrm_Int:
+ case X86::VRNDSCALESDr:
+ case X86::VRNDSCALESDr_Int:
+ case X86::VRNDSCALESDrb_Int:
+ case X86::VRNDSCALESDm:
+ case X86::VRNDSCALESDm_Int:
+ case X86::VRNDSCALESSr:
+ case X86::VRNDSCALESSr_Int:
+ case X86::VRNDSCALESSrb_Int:
+ case X86::VRNDSCALESSm:
+ case X86::VRNDSCALESSm_Int:
+ case X86::VRCP14SSrr:
+ case X86::VRCP14SSrm:
+ case X86::VRSQRT14SSrr:
+ case X86::VRSQRT14SSrm:
+ case X86::VSQRTSSZr:
+ case X86::VSQRTSSZr_Int:
+ case X86::VSQRTSSZrb_Int:
+ case X86::VSQRTSSZm:
+ case X86::VSQRTSSZm_Int:
+ case X86::VSQRTSDZr:
+ case X86::VSQRTSDZr_Int:
+ case X86::VSQRTSDZrb_Int:
+ case X86::VSQRTSDZm:
+ case X86::VSQRTSDZm_Int:
+ return true;
+ }
+
+ return false;
+}
+
+/// Inform the ExecutionDepsFix pass how many idle instructions we would like
+/// before certain undef register reads.
+///
+/// This catches the VCVTSI2SD family of instructions:
+///
+/// vcvtsi2sdq %rax, undef %xmm0, %xmm14
+///
+/// We should be careful *not* to catch VXOR idioms which are presumably
+/// handled specially in the pipeline:
+///
+/// vxorps undef %xmm1, undef %xmm1, %xmm1
+///
+/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
+/// high bits that are passed-through are not live.
+unsigned
+X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
+ const TargetRegisterInfo *TRI) const {
+ if (!hasUndefRegUpdate(MI.getOpcode()))
+ return 0;
+
+ // Set the OpNum parameter to the first source operand.
+ OpNum = 1;
+
+ const MachineOperand &MO = MI.getOperand(OpNum);
+ if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
+ return UndefRegClearance;
+ }
+ return 0;
+}
+
+void X86InstrInfo::breakPartialRegDependency(
+ MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
+ unsigned Reg = MI.getOperand(OpNum).getReg();
+ // If MI kills this register, the false dependence is already broken.
+ if (MI.killsRegister(Reg, TRI))
+ return;
+
+ if (X86::VR128RegClass.contains(Reg)) {
+ // These instructions are all floating point domain, so xorps is the best
+ // choice.
+ unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
+ .addReg(Reg, RegState::Undef)
+ .addReg(Reg, RegState::Undef);
+ MI.addRegisterKilled(Reg, TRI, true);
+ } else if (X86::VR256RegClass.contains(Reg)) {
+ // Use vxorps to clear the full ymm register.
+ // It wants to read and write the xmm sub-register.
+ unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
+ .addReg(XReg, RegState::Undef)
+ .addReg(XReg, RegState::Undef)
+ .addReg(Reg, RegState::ImplicitDefine);
+ MI.addRegisterKilled(Reg, TRI, true);
+ }
+}
+
static void addOperands(MachineInstrBuilder &MIB, ArrayRef<MachineOperand> MOs,
int PtrOffset = 0) {
unsigned NumAddrOps = MOs.size();
@@ -7976,18 +8402,23 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
unsigned Size, unsigned Align, bool AllowCommute) const {
const DenseMap<unsigned,
std::pair<uint16_t, uint16_t> > *OpcodeTablePtr = nullptr;
- bool isCallRegIndirect = Subtarget.callRegIndirect();
+ bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
bool isTwoAddrFold = false;
// For CPUs that favor the register form of a call or push,
// do not fold loads into calls or pushes, unless optimizing for size
// aggressively.
- if (isCallRegIndirect && !MF.getFunction()->optForMinSize() &&
+ if (isSlowTwoMemOps && !MF.getFunction().optForMinSize() &&
(MI.getOpcode() == X86::CALL32r || MI.getOpcode() == X86::CALL64r ||
MI.getOpcode() == X86::PUSH16r || MI.getOpcode() == X86::PUSH32r ||
MI.getOpcode() == X86::PUSH64r))
return nullptr;
+ // Avoid partial register update stalls unless optimizing for size.
+ // TODO: we should block undef reg update as well.
+ if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+ return nullptr;
+
unsigned NumOps = MI.getDesc().getNumOperands();
bool isTwoAddr =
NumOps > 1 && MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) != -1;
@@ -8142,276 +8573,6 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
return nullptr;
}
-/// Return true for all instructions that only update
-/// the first 32 or 64-bits of the destination register and leave the rest
-/// unmodified. This can be used to avoid folding loads if the instructions
-/// only update part of the destination register, and the non-updated part is
-/// not needed. e.g. cvtss2sd, sqrtss. Unfolding the load from these
-/// instructions breaks the partial register dependency and it can improve
-/// performance. e.g.:
-///
-/// movss (%rdi), %xmm0
-/// cvtss2sd %xmm0, %xmm0
-///
-/// Instead of
-/// cvtss2sd (%rdi), %xmm0
-///
-/// FIXME: This should be turned into a TSFlags.
-///
-static bool hasPartialRegUpdate(unsigned Opcode) {
- switch (Opcode) {
- case X86::CVTSI2SSrr:
- case X86::CVTSI2SSrm:
- case X86::CVTSI2SS64rr:
- case X86::CVTSI2SS64rm:
- case X86::CVTSI2SDrr:
- case X86::CVTSI2SDrm:
- case X86::CVTSI2SD64rr:
- case X86::CVTSI2SD64rm:
- case X86::CVTSD2SSrr:
- case X86::CVTSD2SSrm:
- case X86::CVTSS2SDrr:
- case X86::CVTSS2SDrm:
- case X86::MOVHPDrm:
- case X86::MOVHPSrm:
- case X86::MOVLPDrm:
- case X86::MOVLPSrm:
- case X86::RCPSSr:
- case X86::RCPSSm:
- case X86::RCPSSr_Int:
- case X86::RCPSSm_Int:
- case X86::ROUNDSDr:
- case X86::ROUNDSDm:
- case X86::ROUNDSSr:
- case X86::ROUNDSSm:
- case X86::RSQRTSSr:
- case X86::RSQRTSSm:
- case X86::RSQRTSSr_Int:
- case X86::RSQRTSSm_Int:
- case X86::SQRTSSr:
- case X86::SQRTSSm:
- case X86::SQRTSSr_Int:
- case X86::SQRTSSm_Int:
- case X86::SQRTSDr:
- case X86::SQRTSDm:
- case X86::SQRTSDr_Int:
- case X86::SQRTSDm_Int:
- return true;
- }
-
- return false;
-}
-
-/// Inform the ExecutionDepsFix pass how many idle
-/// instructions we would like before a partial register update.
-unsigned X86InstrInfo::getPartialRegUpdateClearance(
- const MachineInstr &MI, unsigned OpNum,
- const TargetRegisterInfo *TRI) const {
- if (OpNum != 0 || !hasPartialRegUpdate(MI.getOpcode()))
- return 0;
-
- // If MI is marked as reading Reg, the partial register update is wanted.
- const MachineOperand &MO = MI.getOperand(0);
- unsigned Reg = MO.getReg();
- if (TargetRegisterInfo::isVirtualRegister(Reg)) {
- if (MO.readsReg() || MI.readsVirtualRegister(Reg))
- return 0;
- } else {
- if (MI.readsRegister(Reg, TRI))
- return 0;
- }
-
- // If any instructions in the clearance range are reading Reg, insert a
- // dependency breaking instruction, which is inexpensive and is likely to
- // be hidden in other instruction's cycles.
- return PartialRegUpdateClearance;
-}
-
-// Return true for any instruction the copies the high bits of the first source
-// operand into the unused high bits of the destination operand.
-static bool hasUndefRegUpdate(unsigned Opcode) {
- switch (Opcode) {
- case X86::VCVTSI2SSrr:
- case X86::VCVTSI2SSrm:
- case X86::Int_VCVTSI2SSrr:
- case X86::Int_VCVTSI2SSrm:
- case X86::VCVTSI2SS64rr:
- case X86::VCVTSI2SS64rm:
- case X86::Int_VCVTSI2SS64rr:
- case X86::Int_VCVTSI2SS64rm:
- case X86::VCVTSI2SDrr:
- case X86::VCVTSI2SDrm:
- case X86::Int_VCVTSI2SDrr:
- case X86::Int_VCVTSI2SDrm:
- case X86::VCVTSI2SD64rr:
- case X86::VCVTSI2SD64rm:
- case X86::Int_VCVTSI2SD64rr:
- case X86::Int_VCVTSI2SD64rm:
- case X86::VCVTSD2SSrr:
- case X86::VCVTSD2SSrm:
- case X86::Int_VCVTSD2SSrr:
- case X86::Int_VCVTSD2SSrm:
- case X86::VCVTSS2SDrr:
- case X86::VCVTSS2SDrm:
- case X86::Int_VCVTSS2SDrr:
- case X86::Int_VCVTSS2SDrm:
- case X86::VRCPSSr:
- case X86::VRCPSSr_Int:
- case X86::VRCPSSm:
- case X86::VRCPSSm_Int:
- case X86::VROUNDSDr:
- case X86::VROUNDSDm:
- case X86::VROUNDSDr_Int:
- case X86::VROUNDSDm_Int:
- case X86::VROUNDSSr:
- case X86::VROUNDSSm:
- case X86::VROUNDSSr_Int:
- case X86::VROUNDSSm_Int:
- case X86::VRSQRTSSr:
- case X86::VRSQRTSSr_Int:
- case X86::VRSQRTSSm:
- case X86::VRSQRTSSm_Int:
- case X86::VSQRTSSr:
- case X86::VSQRTSSr_Int:
- case X86::VSQRTSSm:
- case X86::VSQRTSSm_Int:
- case X86::VSQRTSDr:
- case X86::VSQRTSDr_Int:
- case X86::VSQRTSDm:
- case X86::VSQRTSDm_Int:
- // AVX-512
- case X86::VCVTSI2SSZrr:
- case X86::VCVTSI2SSZrm:
- case X86::VCVTSI2SSZrr_Int:
- case X86::VCVTSI2SSZrrb_Int:
- case X86::VCVTSI2SSZrm_Int:
- case X86::VCVTSI642SSZrr:
- case X86::VCVTSI642SSZrm:
- case X86::VCVTSI642SSZrr_Int:
- case X86::VCVTSI642SSZrrb_Int:
- case X86::VCVTSI642SSZrm_Int:
- case X86::VCVTSI2SDZrr:
- case X86::VCVTSI2SDZrm:
- case X86::VCVTSI2SDZrr_Int:
- case X86::VCVTSI2SDZrrb_Int:
- case X86::VCVTSI2SDZrm_Int:
- case X86::VCVTSI642SDZrr:
- case X86::VCVTSI642SDZrm:
- case X86::VCVTSI642SDZrr_Int:
- case X86::VCVTSI642SDZrrb_Int:
- case X86::VCVTSI642SDZrm_Int:
- case X86::VCVTUSI2SSZrr:
- case X86::VCVTUSI2SSZrm:
- case X86::VCVTUSI2SSZrr_Int:
- case X86::VCVTUSI2SSZrrb_Int:
- case X86::VCVTUSI2SSZrm_Int:
- case X86::VCVTUSI642SSZrr:
- case X86::VCVTUSI642SSZrm:
- case X86::VCVTUSI642SSZrr_Int:
- case X86::VCVTUSI642SSZrrb_Int:
- case X86::VCVTUSI642SSZrm_Int:
- case X86::VCVTUSI2SDZrr:
- case X86::VCVTUSI2SDZrm:
- case X86::VCVTUSI2SDZrr_Int:
- case X86::VCVTUSI2SDZrm_Int:
- case X86::VCVTUSI642SDZrr:
- case X86::VCVTUSI642SDZrm:
- case X86::VCVTUSI642SDZrr_Int:
- case X86::VCVTUSI642SDZrrb_Int:
- case X86::VCVTUSI642SDZrm_Int:
- case X86::VCVTSD2SSZrr:
- case X86::VCVTSD2SSZrr_Int:
- case X86::VCVTSD2SSZrrb_Int:
- case X86::VCVTSD2SSZrm:
- case X86::VCVTSD2SSZrm_Int:
- case X86::VCVTSS2SDZrr:
- case X86::VCVTSS2SDZrr_Int:
- case X86::VCVTSS2SDZrrb_Int:
- case X86::VCVTSS2SDZrm:
- case X86::VCVTSS2SDZrm_Int:
- case X86::VRNDSCALESDr:
- case X86::VRNDSCALESDrb:
- case X86::VRNDSCALESDm:
- case X86::VRNDSCALESSr:
- case X86::VRNDSCALESSrb:
- case X86::VRNDSCALESSm:
- case X86::VRCP14SSrr:
- case X86::VRCP14SSrm:
- case X86::VRSQRT14SSrr:
- case X86::VRSQRT14SSrm:
- case X86::VSQRTSSZr:
- case X86::VSQRTSSZr_Int:
- case X86::VSQRTSSZrb_Int:
- case X86::VSQRTSSZm:
- case X86::VSQRTSSZm_Int:
- case X86::VSQRTSDZr:
- case X86::VSQRTSDZr_Int:
- case X86::VSQRTSDZrb_Int:
- case X86::VSQRTSDZm:
- case X86::VSQRTSDZm_Int:
- return true;
- }
-
- return false;
-}
-
-/// Inform the ExecutionDepsFix pass how many idle instructions we would like
-/// before certain undef register reads.
-///
-/// This catches the VCVTSI2SD family of instructions:
-///
-/// vcvtsi2sdq %rax, %xmm0<undef>, %xmm14
-///
-/// We should to be careful *not* to catch VXOR idioms which are presumably
-/// handled specially in the pipeline:
-///
-/// vxorps %xmm1<undef>, %xmm1<undef>, %xmm1
-///
-/// Like getPartialRegUpdateClearance, this makes a strong assumption that the
-/// high bits that are passed-through are not live.
-unsigned
-X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
- const TargetRegisterInfo *TRI) const {
- if (!hasUndefRegUpdate(MI.getOpcode()))
- return 0;
-
- // Set the OpNum parameter to the first source operand.
- OpNum = 1;
-
- const MachineOperand &MO = MI.getOperand(OpNum);
- if (MO.isUndef() && TargetRegisterInfo::isPhysicalRegister(MO.getReg())) {
- return UndefRegClearance;
- }
- return 0;
-}
-
-void X86InstrInfo::breakPartialRegDependency(
- MachineInstr &MI, unsigned OpNum, const TargetRegisterInfo *TRI) const {
- unsigned Reg = MI.getOperand(OpNum).getReg();
- // If MI kills this register, the false dependence is already broken.
- if (MI.killsRegister(Reg, TRI))
- return;
-
- if (X86::VR128RegClass.contains(Reg)) {
- // These instructions are all floating point domain, so xorps is the best
- // choice.
- unsigned Opc = Subtarget.hasAVX() ? X86::VXORPSrr : X86::XORPSrr;
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(Opc), Reg)
- .addReg(Reg, RegState::Undef)
- .addReg(Reg, RegState::Undef);
- MI.addRegisterKilled(Reg, TRI, true);
- } else if (X86::VR256RegClass.contains(Reg)) {
- // Use vxorps to clear the full ymm register.
- // It wants to read and write the xmm sub-register.
- unsigned XReg = TRI->getSubReg(Reg, X86::sub_xmm);
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), get(X86::VXORPSrr), XReg)
- .addReg(XReg, RegState::Undef)
- .addReg(XReg, RegState::Undef)
- .addReg(Reg, RegState::ImplicitDefine);
- MI.addRegisterKilled(Reg, TRI, true);
- }
-}
-
MachineInstr *
X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
ArrayRef<unsigned> Ops,
@@ -8423,7 +8584,8 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
// Unless optimizing for size, don't fold to avoid partial
// register update stalls
- if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+ // TODO: we should block undef reg update as well.
+ if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
return nullptr;
// Don't fold subreg spills, or reloads that use a high subreg.
@@ -8498,7 +8660,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
// instruction isn't scalar (SS).
switch (UserOpc) {
case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
- case X86::Int_CMPSSrr: case X86::Int_VCMPSSrr: case X86::VCMPSSZrr_Int:
+ case X86::CMPSSrr_Int: case X86::VCMPSSrr_Int: case X86::VCMPSSZrr_Int:
case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
@@ -8549,7 +8711,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
// instruction isn't scalar (SD).
switch (UserOpc) {
case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
- case X86::Int_CMPSDrr: case X86::Int_VCMPSDrr: case X86::VCMPSDZrr_Int:
+ case X86::CMPSDrr_Int: case X86::VCMPSDrr_Int: case X86::VCMPSDZrr_Int:
case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
@@ -8621,7 +8783,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (NoFusing) return nullptr;
// Avoid partial register update stalls unless optimizing for size.
- if (!MF.getFunction()->optForSize() && hasPartialRegUpdate(MI.getOpcode()))
+ // TODO: we should block undef reg update as well.
+ if (!MF.getFunction().optForSize() && hasPartialRegUpdate(MI.getOpcode()))
return nullptr;
// Determine the alignment of the load.
@@ -8717,16 +8880,16 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
Type *Ty;
unsigned Opc = LoadMI.getOpcode();
if (Opc == X86::FsFLD0SS || Opc == X86::AVX512_FsFLD0SS)
- Ty = Type::getFloatTy(MF.getFunction()->getContext());
+ Ty = Type::getFloatTy(MF.getFunction().getContext());
else if (Opc == X86::FsFLD0SD || Opc == X86::AVX512_FsFLD0SD)
- Ty = Type::getDoubleTy(MF.getFunction()->getContext());
+ Ty = Type::getDoubleTy(MF.getFunction().getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8);
else
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
+ Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
Opc == X86::AVX512_512_SETALLONES ||
@@ -9301,6 +9464,16 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::ORPSrr, X86::ORPDrr, X86::PORrr },
{ X86::XORPSrm, X86::XORPDrm, X86::PXORrm },
{ X86::XORPSrr, X86::XORPDrr, X86::PXORrr },
+ { X86::UNPCKLPDrm, X86::UNPCKLPDrm, X86::PUNPCKLQDQrm },
+ { X86::MOVLHPSrr, X86::UNPCKLPDrr, X86::PUNPCKLQDQrr },
+ { X86::UNPCKHPDrm, X86::UNPCKHPDrm, X86::PUNPCKHQDQrm },
+ { X86::UNPCKHPDrr, X86::UNPCKHPDrr, X86::PUNPCKHQDQrr },
+ { X86::UNPCKLPSrm, X86::UNPCKLPSrm, X86::PUNPCKLDQrm },
+ { X86::UNPCKLPSrr, X86::UNPCKLPSrr, X86::PUNPCKLDQrr },
+ { X86::UNPCKHPSrm, X86::UNPCKHPSrm, X86::PUNPCKHDQrm },
+ { X86::UNPCKHPSrr, X86::UNPCKHPSrr, X86::PUNPCKHDQrr },
+ { X86::EXTRACTPSmr, X86::EXTRACTPSmr, X86::PEXTRDmr },
+ { X86::EXTRACTPSrr, X86::EXTRACTPSrr, X86::PEXTRDrr },
// AVX 128-bit support
{ X86::VMOVAPSmr, X86::VMOVAPDmr, X86::VMOVDQAmr },
{ X86::VMOVAPSrm, X86::VMOVAPDrm, X86::VMOVDQArm },
@@ -9321,6 +9494,16 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VORPSrr, X86::VORPDrr, X86::VPORrr },
{ X86::VXORPSrm, X86::VXORPDrm, X86::VPXORrm },
{ X86::VXORPSrr, X86::VXORPDrr, X86::VPXORrr },
+ { X86::VUNPCKLPDrm, X86::VUNPCKLPDrm, X86::VPUNPCKLQDQrm },
+ { X86::VMOVLHPSrr, X86::VUNPCKLPDrr, X86::VPUNPCKLQDQrr },
+ { X86::VUNPCKHPDrm, X86::VUNPCKHPDrm, X86::VPUNPCKHQDQrm },
+ { X86::VUNPCKHPDrr, X86::VUNPCKHPDrr, X86::VPUNPCKHQDQrr },
+ { X86::VUNPCKLPSrm, X86::VUNPCKLPSrm, X86::VPUNPCKLDQrm },
+ { X86::VUNPCKLPSrr, X86::VUNPCKLPSrr, X86::VPUNPCKLDQrr },
+ { X86::VUNPCKHPSrm, X86::VUNPCKHPSrm, X86::VPUNPCKHDQrm },
+ { X86::VUNPCKHPSrr, X86::VUNPCKHPSrr, X86::VPUNPCKHDQrr },
+ { X86::VEXTRACTPSmr, X86::VEXTRACTPSmr, X86::VPEXTRDmr },
+ { X86::VEXTRACTPSrr, X86::VEXTRACTPSrr, X86::VPEXTRDrr },
// AVX 256-bit support
{ X86::VMOVAPSYmr, X86::VMOVAPDYmr, X86::VMOVDQAYmr },
{ X86::VMOVAPSYrm, X86::VMOVAPDYrm, X86::VMOVDQAYrm },
@@ -9328,6 +9511,10 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr },
{ X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm },
{ X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr },
+ { X86::VPERMPSYrm, X86::VPERMPSYrm, X86::VPERMDYrm },
+ { X86::VPERMPSYrr, X86::VPERMPSYrr, X86::VPERMDYrr },
+ { X86::VPERMPDYmi, X86::VPERMPDYmi, X86::VPERMQYmi },
+ { X86::VPERMPDYri, X86::VPERMPDYri, X86::VPERMQYri },
// AVX512 support
{ X86::VMOVLPSZ128mr, X86::VMOVLPDZ128mr, X86::VMOVPQI2QIZmr },
{ X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
@@ -9347,6 +9534,76 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
{ X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr },
{ X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm },
+ { X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrr, X86::VINSERTI32x4Zrr },
+ { X86::VINSERTF32x4Zrm, X86::VINSERTF32x4Zrm, X86::VINSERTI32x4Zrm },
+ { X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrr, X86::VINSERTI32x8Zrr },
+ { X86::VINSERTF32x8Zrm, X86::VINSERTF32x8Zrm, X86::VINSERTI32x8Zrm },
+ { X86::VINSERTF64x2Zrr, X86::VINSERTF64x2Zrr, X86::VINSERTI64x2Zrr },
+ { X86::VINSERTF64x2Zrm, X86::VINSERTF64x2Zrm, X86::VINSERTI64x2Zrm },
+ { X86::VINSERTF64x4Zrr, X86::VINSERTF64x4Zrr, X86::VINSERTI64x4Zrr },
+ { X86::VINSERTF64x4Zrm, X86::VINSERTF64x4Zrm, X86::VINSERTI64x4Zrm },
+ { X86::VINSERTF32x4Z256rr,X86::VINSERTF32x4Z256rr,X86::VINSERTI32x4Z256rr },
+ { X86::VINSERTF32x4Z256rm,X86::VINSERTF32x4Z256rm,X86::VINSERTI32x4Z256rm },
+ { X86::VINSERTF64x2Z256rr,X86::VINSERTF64x2Z256rr,X86::VINSERTI64x2Z256rr },
+ { X86::VINSERTF64x2Z256rm,X86::VINSERTF64x2Z256rm,X86::VINSERTI64x2Z256rm },
+ { X86::VEXTRACTF32x4Zrr, X86::VEXTRACTF32x4Zrr, X86::VEXTRACTI32x4Zrr },
+ { X86::VEXTRACTF32x4Zmr, X86::VEXTRACTF32x4Zmr, X86::VEXTRACTI32x4Zmr },
+ { X86::VEXTRACTF32x8Zrr, X86::VEXTRACTF32x8Zrr, X86::VEXTRACTI32x8Zrr },
+ { X86::VEXTRACTF32x8Zmr, X86::VEXTRACTF32x8Zmr, X86::VEXTRACTI32x8Zmr },
+ { X86::VEXTRACTF64x2Zrr, X86::VEXTRACTF64x2Zrr, X86::VEXTRACTI64x2Zrr },
+ { X86::VEXTRACTF64x2Zmr, X86::VEXTRACTF64x2Zmr, X86::VEXTRACTI64x2Zmr },
+ { X86::VEXTRACTF64x4Zrr, X86::VEXTRACTF64x4Zrr, X86::VEXTRACTI64x4Zrr },
+ { X86::VEXTRACTF64x4Zmr, X86::VEXTRACTF64x4Zmr, X86::VEXTRACTI64x4Zmr },
+ { X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTF32x4Z256rr,X86::VEXTRACTI32x4Z256rr },
+ { X86::VEXTRACTF32x4Z256mr,X86::VEXTRACTF32x4Z256mr,X86::VEXTRACTI32x4Z256mr },
+ { X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTF64x2Z256rr,X86::VEXTRACTI64x2Z256rr },
+ { X86::VEXTRACTF64x2Z256mr,X86::VEXTRACTF64x2Z256mr,X86::VEXTRACTI64x2Z256mr },
+ { X86::VPERMILPSmi, X86::VPERMILPSmi, X86::VPSHUFDmi },
+ { X86::VPERMILPSri, X86::VPERMILPSri, X86::VPSHUFDri },
+ { X86::VPERMILPSZ128mi, X86::VPERMILPSZ128mi, X86::VPSHUFDZ128mi },
+ { X86::VPERMILPSZ128ri, X86::VPERMILPSZ128ri, X86::VPSHUFDZ128ri },
+ { X86::VPERMILPSZ256mi, X86::VPERMILPSZ256mi, X86::VPSHUFDZ256mi },
+ { X86::VPERMILPSZ256ri, X86::VPERMILPSZ256ri, X86::VPSHUFDZ256ri },
+ { X86::VPERMILPSZmi, X86::VPERMILPSZmi, X86::VPSHUFDZmi },
+ { X86::VPERMILPSZri, X86::VPERMILPSZri, X86::VPSHUFDZri },
+ { X86::VPERMPSZ256rm, X86::VPERMPSZ256rm, X86::VPERMDZ256rm },
+ { X86::VPERMPSZ256rr, X86::VPERMPSZ256rr, X86::VPERMDZ256rr },
+ { X86::VPERMPDZ256mi, X86::VPERMPDZ256mi, X86::VPERMQZ256mi },
+ { X86::VPERMPDZ256ri, X86::VPERMPDZ256ri, X86::VPERMQZ256ri },
+ { X86::VPERMPDZ256rm, X86::VPERMPDZ256rm, X86::VPERMQZ256rm },
+ { X86::VPERMPDZ256rr, X86::VPERMPDZ256rr, X86::VPERMQZ256rr },
+ { X86::VPERMPSZrm, X86::VPERMPSZrm, X86::VPERMDZrm },
+ { X86::VPERMPSZrr, X86::VPERMPSZrr, X86::VPERMDZrr },
+ { X86::VPERMPDZmi, X86::VPERMPDZmi, X86::VPERMQZmi },
+ { X86::VPERMPDZri, X86::VPERMPDZri, X86::VPERMQZri },
+ { X86::VPERMPDZrm, X86::VPERMPDZrm, X86::VPERMQZrm },
+ { X86::VPERMPDZrr, X86::VPERMPDZrr, X86::VPERMQZrr },
+ { X86::VUNPCKLPDZ256rm, X86::VUNPCKLPDZ256rm, X86::VPUNPCKLQDQZ256rm },
+ { X86::VUNPCKLPDZ256rr, X86::VUNPCKLPDZ256rr, X86::VPUNPCKLQDQZ256rr },
+ { X86::VUNPCKHPDZ256rm, X86::VUNPCKHPDZ256rm, X86::VPUNPCKHQDQZ256rm },
+ { X86::VUNPCKHPDZ256rr, X86::VUNPCKHPDZ256rr, X86::VPUNPCKHQDQZ256rr },
+ { X86::VUNPCKLPSZ256rm, X86::VUNPCKLPSZ256rm, X86::VPUNPCKLDQZ256rm },
+ { X86::VUNPCKLPSZ256rr, X86::VUNPCKLPSZ256rr, X86::VPUNPCKLDQZ256rr },
+ { X86::VUNPCKHPSZ256rm, X86::VUNPCKHPSZ256rm, X86::VPUNPCKHDQZ256rm },
+ { X86::VUNPCKHPSZ256rr, X86::VUNPCKHPSZ256rr, X86::VPUNPCKHDQZ256rr },
+ { X86::VUNPCKLPDZ128rm, X86::VUNPCKLPDZ128rm, X86::VPUNPCKLQDQZ128rm },
+ { X86::VMOVLHPSZrr, X86::VUNPCKLPDZ128rr, X86::VPUNPCKLQDQZ128rr },
+ { X86::VUNPCKHPDZ128rm, X86::VUNPCKHPDZ128rm, X86::VPUNPCKHQDQZ128rm },
+ { X86::VUNPCKHPDZ128rr, X86::VUNPCKHPDZ128rr, X86::VPUNPCKHQDQZ128rr },
+ { X86::VUNPCKLPSZ128rm, X86::VUNPCKLPSZ128rm, X86::VPUNPCKLDQZ128rm },
+ { X86::VUNPCKLPSZ128rr, X86::VUNPCKLPSZ128rr, X86::VPUNPCKLDQZ128rr },
+ { X86::VUNPCKHPSZ128rm, X86::VUNPCKHPSZ128rm, X86::VPUNPCKHDQZ128rm },
+ { X86::VUNPCKHPSZ128rr, X86::VUNPCKHPSZ128rr, X86::VPUNPCKHDQZ128rr },
+ { X86::VUNPCKLPDZrm, X86::VUNPCKLPDZrm, X86::VPUNPCKLQDQZrm },
+ { X86::VUNPCKLPDZrr, X86::VUNPCKLPDZrr, X86::VPUNPCKLQDQZrr },
+ { X86::VUNPCKHPDZrm, X86::VUNPCKHPDZrm, X86::VPUNPCKHQDQZrm },
+ { X86::VUNPCKHPDZrr, X86::VUNPCKHPDZrr, X86::VPUNPCKHQDQZrr },
+ { X86::VUNPCKLPSZrm, X86::VUNPCKLPSZrm, X86::VPUNPCKLDQZrm },
+ { X86::VUNPCKLPSZrr, X86::VUNPCKLPSZrr, X86::VPUNPCKLDQZrr },
+ { X86::VUNPCKHPSZrm, X86::VUNPCKHPSZrm, X86::VPUNPCKHDQZrm },
+ { X86::VUNPCKHPSZrr, X86::VUNPCKHPSZrr, X86::VPUNPCKHDQZrr },
+ { X86::VEXTRACTPSZmr, X86::VEXTRACTPSZmr, X86::VPEXTRDZmr },
+ { X86::VEXTRACTPSZrr, X86::VEXTRACTPSZrr, X86::VPEXTRDZrr },
};
static const uint16_t ReplaceableInstrsAVX2[][3] = {
@@ -9368,6 +9625,20 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrr, X86::VPBROADCASTQYrr},
{ X86::VBROADCASTSDYrm, X86::VBROADCASTSDYrm, X86::VPBROADCASTQYrm},
{ X86::VBROADCASTF128, X86::VBROADCASTF128, X86::VBROADCASTI128 },
+ { X86::VBLENDPSrri, X86::VBLENDPSrri, X86::VPBLENDDrri },
+ { X86::VBLENDPSrmi, X86::VBLENDPSrmi, X86::VPBLENDDrmi },
+ { X86::VBLENDPSYrri, X86::VBLENDPSYrri, X86::VPBLENDDYrri },
+ { X86::VBLENDPSYrmi, X86::VBLENDPSYrmi, X86::VPBLENDDYrmi },
+ { X86::VPERMILPSYmi, X86::VPERMILPSYmi, X86::VPSHUFDYmi },
+ { X86::VPERMILPSYri, X86::VPERMILPSYri, X86::VPSHUFDYri },
+ { X86::VUNPCKLPDYrm, X86::VUNPCKLPDYrm, X86::VPUNPCKLQDQYrm },
+ { X86::VUNPCKLPDYrr, X86::VUNPCKLPDYrr, X86::VPUNPCKLQDQYrr },
+ { X86::VUNPCKHPDYrm, X86::VUNPCKHPDYrm, X86::VPUNPCKHQDQYrm },
+ { X86::VUNPCKHPDYrr, X86::VUNPCKHPDYrr, X86::VPUNPCKHQDQYrr },
+ { X86::VUNPCKLPSYrm, X86::VUNPCKLPSYrm, X86::VPUNPCKLDQYrm },
+ { X86::VUNPCKLPSYrr, X86::VUNPCKLPSYrr, X86::VPUNPCKLDQYrr },
+ { X86::VUNPCKHPSYrm, X86::VUNPCKHPSYrm, X86::VPUNPCKHDQYrm },
+ { X86::VUNPCKHPSYrr, X86::VUNPCKHPSYrr, X86::VPUNPCKHDQYrr },
};
static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = {
@@ -9787,9 +10058,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
case X86::VDIVPDZ256rr:
case X86::VDIVPDZ256rrk:
case X86::VDIVPDZ256rrkz:
- case X86::VDIVPDZrb:
- case X86::VDIVPDZrbk:
- case X86::VDIVPDZrbkz:
+ case X86::VDIVPDZrrb:
+ case X86::VDIVPDZrrbk:
+ case X86::VDIVPDZrrbkz:
case X86::VDIVPDZrm:
case X86::VDIVPDZrmb:
case X86::VDIVPDZrmbk:
@@ -9817,9 +10088,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
case X86::VDIVPSZ256rr:
case X86::VDIVPSZ256rrk:
case X86::VDIVPSZ256rrkz:
- case X86::VDIVPSZrb:
- case X86::VDIVPSZrbk:
- case X86::VDIVPSZrbkz:
+ case X86::VDIVPSZrrb:
+ case X86::VDIVPSZrrbk:
+ case X86::VDIVPSZrrbkz:
case X86::VDIVPSZrm:
case X86::VDIVPSZrmb:
case X86::VDIVPSZrmbk:
@@ -9837,9 +10108,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
case X86::VDIVSDZrr_Int:
case X86::VDIVSDZrr_Intk:
case X86::VDIVSDZrr_Intkz:
- case X86::VDIVSDZrrb:
- case X86::VDIVSDZrrbk:
- case X86::VDIVSDZrrbkz:
+ case X86::VDIVSDZrrb_Int:
+ case X86::VDIVSDZrrb_Intk:
+ case X86::VDIVSDZrrb_Intkz:
case X86::VDIVSSZrm:
case X86::VDIVSSZrr:
case X86::VDIVSSZrm_Int:
@@ -9848,9 +10119,9 @@ bool X86InstrInfo::isHighLatencyDef(int opc) const {
case X86::VDIVSSZrr_Int:
case X86::VDIVSSZrr_Intk:
case X86::VDIVSSZrr_Intkz:
- case X86::VDIVSSZrrb:
- case X86::VDIVSSZrrbk:
- case X86::VDIVSSZrrbkz:
+ case X86::VDIVSSZrrb_Int:
+ case X86::VDIVSSZrrb_Intk:
+ case X86::VDIVSSZrrb_Intkz:
case X86::VSQRTPDZ128m:
case X86::VSQRTPDZ128mb:
case X86::VSQRTPDZ128mbk:
@@ -10419,7 +10690,7 @@ namespace {
LDTLSCleanup() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
X86MachineFunctionInfo *MFI = MF.getInfo<X86MachineFunctionInfo>();
@@ -10528,29 +10799,72 @@ char LDTLSCleanup::ID = 0;
FunctionPass*
llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
-unsigned X86InstrInfo::getOutliningBenefit(size_t SequenceSize,
- size_t Occurrences,
- bool CanBeTailCall) const {
- unsigned NotOutlinedSize = SequenceSize * Occurrences;
- unsigned OutlinedSize;
-
- // Is it a tail call?
- if (CanBeTailCall) {
- // If yes, we don't have to include a return instruction-- it's already in
- // our sequence. So we have one occurrence of the sequence + #Occurrences
- // calls.
- OutlinedSize = SequenceSize + Occurrences;
- } else {
- // If not, add one for the return instruction.
- OutlinedSize = (SequenceSize + 1) + Occurrences;
- }
+/// Constants defining how certain sequences should be outlined.
+///
+/// \p MachineOutlinerDefault implies that the function is called with a call
+/// instruction, and a return must be emitted for the outlined function frame.
+///
+/// That is,
+///
+/// I1 OUTLINED_FUNCTION:
+/// I2 --> call OUTLINED_FUNCTION I1
+/// I3 I2
+/// I3
+/// ret
+///
+/// * Call construction overhead: 1 (call instruction)
+/// * Frame construction overhead: 1 (return instruction)
+///
+/// \p MachineOutlinerTailCall implies that the function is being tail called.
+/// A jump is emitted instead of a call, and the return is already present in
+/// the outlined sequence. That is,
+///
+/// I1 OUTLINED_FUNCTION:
+/// I2 --> jmp OUTLINED_FUNCTION I1
+/// ret I2
+/// ret
+///
+/// * Call construction overhead: 1 (jump instruction)
+/// * Frame construction overhead: 0 (don't need to return)
+///
+enum MachineOutlinerClass {
+ MachineOutlinerDefault,
+ MachineOutlinerTailCall
+};
- // Return the number of instructions saved by outlining this sequence.
- return NotOutlinedSize > OutlinedSize ? NotOutlinedSize - OutlinedSize : 0;
+X86GenInstrInfo::MachineOutlinerInfo
+X86InstrInfo::getOutlininingCandidateInfo(
+ std::vector<
+ std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
+ &RepeatedSequenceLocs) const {
+
+ if (RepeatedSequenceLocs[0].second->isTerminator())
+ return MachineOutlinerInfo(1, // Number of instructions to emit call.
+ 0, // Number of instructions to emit frame.
+ MachineOutlinerTailCall, // Type of call.
+ MachineOutlinerTailCall // Type of frame.
+ );
+
+ return MachineOutlinerInfo(1, 1, MachineOutlinerDefault,
+ MachineOutlinerDefault);
}
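[Editor's note, not part of the patch] The per-candidate overheads returned here (one instruction to construct the call, plus zero or one instructions for the outlined frame) feed the same size trade-off that the removed getOutliningBenefit computed by hand. A standalone sketch of that trade-off, with hypothetical names:

    // Instructions saved by outlining a sequence of SequenceSize instructions
    // that occurs Occurrences times, given the per-call-site and per-frame costs.
    static unsigned outliningBenefit(unsigned SequenceSize, unsigned Occurrences,
                                     unsigned CallOverhead,
                                     unsigned FrameOverhead) {
      unsigned NotOutlined = SequenceSize * Occurrences;
      unsigned Outlined = Occurrences * CallOverhead      // one call or jmp per site
                          + SequenceSize + FrameOverhead; // one shared outlined body
      return NotOutlined > Outlined ? NotOutlined - Outlined : 0;
    }

With the tail-call numbers (CallOverhead = 1, FrameOverhead = 0) this reduces to the old "SequenceSize + Occurrences" cost; the default case pays one extra instruction for the ret that insertOutlinerEpilogue appends.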
-bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const {
- return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF,
+ bool OutlineFromLinkOnceODRs) const {
+ const Function &F = MF.getFunction();
+
+ // Does the function use a red zone? If it does, then we can't risk messing
+ // with the stack.
+ if (!F.hasFnAttribute(Attribute::NoRedZone))
+ return false;
+
+ // If we *don't* want to outline from things that could potentially be deduped
+ // then return false.
+ if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
+ return false;
+
+ // This function is viable for outlining, so return true.
+ return true;
}
X86GenInstrInfo::MachineOutlinerInstrType
@@ -10580,7 +10894,7 @@ X86InstrInfo::getOutliningType(MachineInstr &MI) const {
// FIXME: There are instructions which are being manually built without
// explicit uses/defs so we also have to check the MCInstrDesc. We should be
// able to remove the extra checks once those are fixed up. For example,
- // sometimes we might get something like %RAX<def> = POP64r 1. This won't be
+ // sometimes we might get something like %rax = POP64r 1. This won't be
// caught by modifiesRegister or readsRegister even though the instruction
// really ought to be formed so that modifiesRegister/readsRegister would
// catch it.
@@ -10610,10 +10924,10 @@ X86InstrInfo::getOutliningType(MachineInstr &MI) const {
void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
MachineFunction &MF,
- bool IsTailCall) const {
-
+ const MachineOutlinerInfo &MInfo)
+ const {
// If we're a tail call, we already have a return, so don't do anything.
- if (IsTailCall)
+ if (MInfo.FrameConstructionID == MachineOutlinerTailCall)
return;
// We're a normal call, so our sequence doesn't have a return instruction.
@@ -10624,15 +10938,16 @@ void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
MachineFunction &MF,
- bool IsTailCall) const {}
+ const MachineOutlinerInfo &MInfo)
+ const {}
MachineBasicBlock::iterator
X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &It,
MachineFunction &MF,
- bool IsTailCall) const {
+ const MachineOutlinerInfo &MInfo) const {
// Is it a tail call?
- if (IsTailCall) {
+ if (MInfo.CallConstructionID == MachineOutlinerTailCall) {
// Yes, just insert a JMP.
It = MBB.insert(It,
BuildMI(MF, DebugLoc(), get(X86::JMP_1))
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index e64876073ccf..02a09c340cef 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -18,19 +18,19 @@
#include "X86InstrFMA3Info.h"
#include "X86RegisterInfo.h"
#include "llvm/ADT/DenseMap.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#define GET_INSTRINFO_HEADER
#include "X86GenInstrInfo.inc"
namespace llvm {
- class MachineInstrBuilder;
- class X86RegisterInfo;
- class X86Subtarget;
+class MachineInstrBuilder;
+class X86RegisterInfo;
+class X86Subtarget;
namespace X86 {
- // X86 specific condition code. These correspond to X86_*_COND in
- // X86InstrInfo.td. They must be kept in synch.
+// X86 specific condition code. These correspond to X86_*_COND in
+// X86InstrInfo.td. They must be kept in synch.
enum CondCode {
COND_A = 0,
COND_AE = 1,
@@ -83,18 +83,17 @@ CondCode getCondFromCMovOpc(unsigned Opc);
/// GetOppositeBranchCondition - Return the inverse of the specified cond,
/// e.g. turning COND_E to COND_NE.
CondCode GetOppositeBranchCondition(CondCode CC);
-} // end namespace X86;
-
+} // namespace X86
/// isGlobalStubReference - Return true if the specified TargetFlag operand is
/// a reference to a stub for a global, not the global itself.
inline static bool isGlobalStubReference(unsigned char TargetFlag) {
switch (TargetFlag) {
- case X86II::MO_DLLIMPORT: // dllimport stub.
- case X86II::MO_GOTPCREL: // rip-relative GOT reference.
- case X86II::MO_GOT: // normal GOT reference.
- case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
- case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DLLIMPORT: // dllimport stub.
+ case X86II::MO_GOTPCREL: // rip-relative GOT reference.
+ case X86II::MO_GOT: // normal GOT reference.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Normal $non_lazy_ptr ref.
+ case X86II::MO_DARWIN_NONLAZY: // Normal $non_lazy_ptr ref.
return true;
default:
return false;
@@ -106,11 +105,11 @@ inline static bool isGlobalStubReference(unsigned char TargetFlag) {
/// is true, the addressing mode has the PIC base register added in (e.g. EBX).
inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
switch (TargetFlag) {
- case X86II::MO_GOTOFF: // isPICStyleGOT: local global.
- case X86II::MO_GOT: // isPICStyleGOT: other global.
- case X86II::MO_PIC_BASE_OFFSET: // Darwin local global.
- case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global.
- case X86II::MO_TLVP: // ??? Pretty sure..
+ case X86II::MO_GOTOFF: // isPICStyleGOT: local global.
+ case X86II::MO_GOT: // isPICStyleGOT: other global.
+ case X86II::MO_PIC_BASE_OFFSET: // Darwin local global.
+ case X86II::MO_DARWIN_NONLAZY_PIC_BASE: // Darwin/32 external global.
+ case X86II::MO_TLVP: // ??? Pretty sure..
return true;
default:
return false;
@@ -118,9 +117,8 @@ inline static bool isGlobalRelativeToPICBase(unsigned char TargetFlag) {
}
inline static bool isScale(const MachineOperand &MO) {
- return MO.isImm() &&
- (MO.getImm() == 1 || MO.getImm() == 2 ||
- MO.getImm() == 4 || MO.getImm() == 8);
+ return MO.isImm() && (MO.getImm() == 1 || MO.getImm() == 2 ||
+ MO.getImm() == 4 || MO.getImm() == 8);
}
inline static bool isLeaMem(const MachineInstr &MI, unsigned Op) {
@@ -150,8 +148,8 @@ class X86InstrInfo final : public X86GenInstrInfo {
/// RegOp2MemOpTable3Addr, RegOp2MemOpTable0, RegOp2MemOpTable1,
/// RegOp2MemOpTable2, RegOp2MemOpTable3 - Load / store folding opcode maps.
///
- typedef DenseMap<unsigned,
- std::pair<uint16_t, uint16_t> > RegOp2MemOpTableType;
+ typedef DenseMap<unsigned, std::pair<uint16_t, uint16_t>>
+ RegOp2MemOpTableType;
RegOp2MemOpTableType RegOp2MemOpTable2Addr;
RegOp2MemOpTableType RegOp2MemOpTable0;
RegOp2MemOpTableType RegOp2MemOpTable1;
@@ -161,13 +159,13 @@ class X86InstrInfo final : public X86GenInstrInfo {
/// MemOp2RegOpTable - Load / store unfolding opcode map.
///
- typedef DenseMap<unsigned,
- std::pair<uint16_t, uint16_t> > MemOp2RegOpTableType;
+ typedef DenseMap<unsigned, std::pair<uint16_t, uint16_t>>
+ MemOp2RegOpTableType;
MemOp2RegOpTableType MemOp2RegOpTable;
static void AddTableEntry(RegOp2MemOpTableType &R2MTable,
- MemOp2RegOpTableType &M2RTable,
- uint16_t RegOp, uint16_t MemOp, uint16_t Flags);
+ MemOp2RegOpTableType &M2RTable, uint16_t RegOp,
+ uint16_t MemOp, uint16_t Flags);
virtual void anchor();
@@ -216,9 +214,8 @@ public:
/// true, then it's expected the pre-extension value is available as a subreg
/// of the result register. This also returns the sub-register index in
/// SubIdx.
- bool isCoalescableExtInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
- unsigned &SubIdx) const override;
+ bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
+ unsigned &DstReg, unsigned &SubIdx) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
@@ -253,8 +250,8 @@ public:
/// operand to the LEA instruction.
bool classifyLEAReg(MachineInstr &MI, const MachineOperand &Src,
unsigned LEAOpcode, bool AllowSP, unsigned &NewSrc,
- bool &isKill, bool &isUndef,
- MachineOperand &ImplicitOp, LiveVariables *LV) const;
+ bool &isKill, bool &isUndef, MachineOperand &ImplicitOp,
+ LiveVariables *LV) const;
/// convertToThreeAddress - This method must be implemented by targets that
/// set the M_CONVERTIBLE_TO_3_ADDR flag. When this flag is set, the target
@@ -312,8 +309,7 @@ public:
/// FMA213 #1, #2, #3
/// results into instruction with adjusted opcode:
/// FMA231 #3, #2, #1
- bool findFMA3CommutedOpIndices(const MachineInstr &MI,
- unsigned &SrcOpIdx1,
+ bool findFMA3CommutedOpIndices(const MachineInstr &MI, unsigned &SrcOpIdx1,
unsigned &SrcOpIdx2,
const X86InstrFMA3Group &FMA3Group) const;
@@ -332,10 +328,10 @@ public:
/// FMA213 #1, #2, #3
/// results into instruction with adjusted opcode:
/// FMA231 #3, #2, #1
- unsigned getFMA3OpcodeToCommuteOperands(const MachineInstr &MI,
- unsigned SrcOpIdx1,
- unsigned SrcOpIdx2,
- const X86InstrFMA3Group &FMA3Group) const;
+ unsigned
+ getFMA3OpcodeToCommuteOperands(const MachineInstr &MI, unsigned SrcOpIdx1,
+ unsigned SrcOpIdx2,
+ const X86InstrFMA3Group &FMA3Group) const;
// Branch analysis.
bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
@@ -364,8 +360,8 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
- bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
- unsigned, unsigned, int&, int&, int&) const override;
+ bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
+ unsigned, unsigned, int &, int &, int &) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, unsigned DstReg,
ArrayRef<MachineOperand> Cond, unsigned TrueReg,
@@ -374,8 +370,8 @@ public:
const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ MachineBasicBlock::iterator MI, unsigned SrcReg,
+ bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -384,12 +380,11 @@ public:
const TargetRegisterClass *RC,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
+ SmallVectorImpl<MachineInstr *> &NewMIs) const;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
- const TargetRegisterClass *RC,
+ MachineBasicBlock::iterator MI, unsigned DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromAddr(MachineFunction &MF, unsigned DestReg,
@@ -397,7 +392,7 @@ public:
const TargetRegisterClass *RC,
MachineInstr::mmo_iterator MMOBegin,
MachineInstr::mmo_iterator MMOEnd,
- SmallVectorImpl<MachineInstr*> &NewMIs) const;
+ SmallVectorImpl<MachineInstr *> &NewMIs) const;
bool expandPostRAPseudo(MachineInstr &MI) const override;
@@ -434,7 +429,7 @@ public:
SmallVectorImpl<MachineInstr *> &NewMIs) const override;
bool unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
- SmallVectorImpl<SDNode*> &NewNodes) const override;
+ SmallVectorImpl<SDNode *> &NewNodes) const override;
/// getOpcodeAfterMemoryUnfold - Returns the opcode of the would be new
/// instruction after load / store are unfolded from an instruction of the
@@ -442,9 +437,9 @@ public:
/// possible. If LoadRegIndex is non-null, it is filled in with the operand
/// index of the operand which will hold the register holding the loaded
/// value.
- unsigned getOpcodeAfterMemoryUnfold(unsigned Opc,
- bool UnfoldLoad, bool UnfoldStore,
- unsigned *LoadRegIndex = nullptr) const override;
+ unsigned
+ getOpcodeAfterMemoryUnfold(unsigned Opc, bool UnfoldLoad, bool UnfoldStore,
+ unsigned *LoadRegIndex = nullptr) const override;
/// areLoadsFromSameBasePtr - This is used by the pre-regalloc scheduler
/// to determine if two loads are loading from the same base address. It
@@ -455,15 +450,15 @@ public:
int64_t &Offset2) const override;
/// shouldScheduleLoadsNear - This is a used by the pre-regalloc scheduler to
- /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads should
- /// be scheduled togther. On some targets if two loads are loading from
+ /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
+  /// should be scheduled together. On some targets if two loads are loading from
/// addresses in the same cache line, it's better if they are scheduled
/// together. This function takes two integers that represent the load offsets
/// from the common base address. It returns true if it decides it's desirable
/// to schedule the two loads together. "NumLoads" is the number of loads that
/// have already been scheduled after Load1.
- bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
- int64_t Offset1, int64_t Offset2,
+ bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2, int64_t Offset1,
+ int64_t Offset2,
unsigned NumLoads) const override;
void getNoop(MCInst &NopInst) const override;
@@ -520,9 +515,7 @@ public:
const MachineInstr &UseMI,
unsigned UseIdx) const override;
- bool useMachineCombiner() const override {
- return true;
- }
+ bool useMachineCombiner() const override { return true; }
bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
@@ -566,28 +559,28 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableDirectMachineOperandTargetFlags() const override;
- unsigned getOutliningBenefit(size_t SequenceSize,
- size_t Occurrences,
- bool CanBeTailCall) const override;
+ virtual MachineOutlinerInfo getOutlininingCandidateInfo(
+ std::vector<
+ std::pair<MachineBasicBlock::iterator, MachineBasicBlock::iterator>>
+ &RepeatedSequenceLocs) const override;
- bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override;
+ bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
+ bool OutlineFromLinkOnceODRs) const override;
llvm::X86GenInstrInfo::MachineOutlinerInstrType
getOutliningType(MachineInstr &MI) const override;
- void insertOutlinerEpilogue(MachineBasicBlock &MBB,
- MachineFunction &MF,
- bool IsTailCall) const override;
+ void insertOutlinerEpilogue(MachineBasicBlock &MBB, MachineFunction &MF,
+ const MachineOutlinerInfo &MInfo) const override;
- void insertOutlinerPrologue(MachineBasicBlock &MBB,
- MachineFunction &MF,
- bool isTailCall) const override;
+ void insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF,
+ const MachineOutlinerInfo &MInfo) const override;
MachineBasicBlock::iterator
insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &It,
- MachineFunction &MF,
- bool IsTailCall) const override;
+ MachineBasicBlock::iterator &It, MachineFunction &MF,
+ const MachineOutlinerInfo &MInfo) const override;
+
protected:
/// Commutes the operands in the given instruction by changing the operands
/// order and/or changing the instruction's opcode and/or the immediate value
@@ -643,6 +636,6 @@ private:
unsigned &SrcOpIdx2) const;
};
-} // End llvm namespace
+} // namespace llvm
#endif
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index fab70e918b8a..42e89cb4831d 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -82,6 +82,9 @@ def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
SDTCisPtrTy<1>,
SDTCisInt<2>]>;
+def SDTLockUnaryArithWithFlags : SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
+ SDTCisPtrTy<1>]>;
+
def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>;
def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>,
@@ -271,7 +274,12 @@ def X86lock_and : SDNode<"X86ISD::LAND", SDTLockBinaryArithWithFlags,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad,
SDNPMemOperand]>;
-def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
+def X86lock_inc : SDNode<"X86ISD::LINC", SDTLockUnaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
+def X86lock_dec : SDNode<"X86ISD::LDEC", SDTLockUnaryArithWithFlags,
+ [SDNPHasChain, SDNPMayStore, SDNPMayLoad,
+ SDNPMemOperand]>;
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
@@ -592,19 +600,11 @@ def SSECC : Operand<i8> {
let OperandType = "OPERAND_IMMEDIATE";
}
-def i8immZExt3 : ImmLeaf<i8, [{
- return Imm >= 0 && Imm < 8;
-}]>;
-
def AVXCC : Operand<i8> {
let PrintMethod = "printSSEAVXCC";
let OperandType = "OPERAND_IMMEDIATE";
}
-def i8immZExt5 : ImmLeaf<i8, [{
- return Imm >= 0 && Imm < 32;
-}]>;
-
def AVX512ICC : Operand<i8> {
let PrintMethod = "printSSEAVXCC";
let OperandType = "OPERAND_IMMEDIATE";
@@ -803,6 +803,7 @@ def UseSSE41 : Predicate<"Subtarget->hasSSE41() && !Subtarget->hasAVX()">;
def HasSSE42 : Predicate<"Subtarget->hasSSE42()">;
def UseSSE42 : Predicate<"Subtarget->hasSSE42() && !Subtarget->hasAVX()">;
def HasSSE4A : Predicate<"Subtarget->hasSSE4A()">;
+def NoAVX : Predicate<"!Subtarget->hasAVX()">;
def HasAVX : Predicate<"Subtarget->hasAVX()">;
def HasAVX2 : Predicate<"Subtarget->hasAVX2()">;
def HasAVX1Only : Predicate<"Subtarget->hasAVX() && !Subtarget->hasAVX2()">;
@@ -831,30 +832,41 @@ def NoVLX : Predicate<"!Subtarget->hasVLX()">;
def NoVLX_Or_NoBWI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasBWI()">;
def NoVLX_Or_NoDQI : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasDQI()">;
def PKU : Predicate<"Subtarget->hasPKU()">;
+def HasVNNI : Predicate<"Subtarget->hasVNNI()">;
+def HasBITALG : Predicate<"Subtarget->hasBITALG()">;
def HasPOPCNT : Predicate<"Subtarget->hasPOPCNT()">;
def HasAES : Predicate<"Subtarget->hasAES()">;
+def HasVAES : Predicate<"Subtarget->hasVAES()">;
+def NoVLX_Or_NoVAES : Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVAES()">;
def HasFXSR : Predicate<"Subtarget->hasFXSR()">;
def HasXSAVE : Predicate<"Subtarget->hasXSAVE()">;
def HasXSAVEOPT : Predicate<"Subtarget->hasXSAVEOPT()">;
def HasXSAVEC : Predicate<"Subtarget->hasXSAVEC()">;
def HasXSAVES : Predicate<"Subtarget->hasXSAVES()">;
def HasPCLMUL : Predicate<"Subtarget->hasPCLMUL()">;
+def NoVLX_Or_NoVPCLMULQDQ :
+ Predicate<"!Subtarget->hasVLX() || !Subtarget->hasVPCLMULQDQ()">;
+def HasVPCLMULQDQ : Predicate<"Subtarget->hasVPCLMULQDQ()">;
+def HasGFNI : Predicate<"Subtarget->hasGFNI()">;
def HasFMA : Predicate<"Subtarget->hasFMA()">;
def HasFMA4 : Predicate<"Subtarget->hasFMA4()">;
+def NoFMA4 : Predicate<"!Subtarget->hasFMA4()">;
def HasXOP : Predicate<"Subtarget->hasXOP()">;
def HasTBM : Predicate<"Subtarget->hasTBM()">;
+def NoTBM : Predicate<"!Subtarget->hasTBM()">;
def HasLWP : Predicate<"Subtarget->hasLWP()">;
def HasMOVBE : Predicate<"Subtarget->hasMOVBE()">;
def HasRDRAND : Predicate<"Subtarget->hasRDRAND()">;
def HasF16C : Predicate<"Subtarget->hasF16C()">;
-def NoF16C : Predicate<"!Subtarget->hasF16C()">;
def HasFSGSBase : Predicate<"Subtarget->hasFSGSBase()">;
def HasLZCNT : Predicate<"Subtarget->hasLZCNT()">;
def HasBMI : Predicate<"Subtarget->hasBMI()">;
def HasBMI2 : Predicate<"Subtarget->hasBMI2()">;
+def NoBMI2 : Predicate<"!Subtarget->hasBMI2()">;
def HasVBMI : Predicate<"Subtarget->hasVBMI()">,
AssemblerPredicate<"FeatureVBMI", "AVX-512 VBMI ISA">;
+def HasVBMI2 : Predicate<"Subtarget->hasVBMI2()">;
def HasIFMA : Predicate<"Subtarget->hasIFMA()">,
AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">;
def HasRTM : Predicate<"Subtarget->hasRTM()">;
@@ -869,7 +881,10 @@ def HasCLZERO : Predicate<"Subtarget->hasCLZERO()">;
def FPStackf32 : Predicate<"!Subtarget->hasSSE1()">;
def FPStackf64 : Predicate<"!Subtarget->hasSSE2()">;
def HasMPX : Predicate<"Subtarget->hasMPX()">;
+def HasSHSTK : Predicate<"Subtarget->hasSHSTK()">;
+def HasIBT : Predicate<"Subtarget->hasIBT()">;
def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
+def HasCLWB : Predicate<"Subtarget->hasCLWB()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
@@ -903,15 +918,15 @@ def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
// the Function object through the <Target>Subtarget and objections were raised
// to that (see post-commit review comments for r301750).
let RecomputePerFunction = 1 in {
- def OptForSize : Predicate<"MF->getFunction()->optForSize()">;
- def OptForMinSize : Predicate<"MF->getFunction()->optForMinSize()">;
- def OptForSpeed : Predicate<"!MF->getFunction()->optForSize()">;
+ def OptForSize : Predicate<"MF->getFunction().optForSize()">;
+ def OptForMinSize : Predicate<"MF->getFunction().optForMinSize()">;
+ def OptForSpeed : Predicate<"!MF->getFunction().optForSize()">;
+ def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
+ "MF->getFunction().optForSize()">;
}
-def FastBTMem : Predicate<"!Subtarget->isBTMemSlow()">;
def CallImmAddr : Predicate<"Subtarget->isLegalToCallImmediateAddr()">;
-def FavorMemIndirectCall : Predicate<"!Subtarget->callRegIndirect()">;
-def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
+def FavorMemIndirectCall : Predicate<"!Subtarget->slowTwoMemOps()">;
def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
@@ -1108,6 +1123,17 @@ let hasSideEffects = 0, SchedRW = [WriteZero] in {
"nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
def NOOPL : I<0x1f, MRMXm, (outs), (ins i32mem:$zero),
"nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
+ def NOOPQ : RI<0x1f, MRMXm, (outs), (ins i64mem:$zero),
+ "nop{q}\t$zero", [], IIC_NOP>, TB,
+ Requires<[In64BitMode]>;
+ // Also allow register so we can assemble/disassemble
+ def NOOPWr : I<0x1f, MRMXr, (outs), (ins GR16:$zero),
+ "nop{w}\t$zero", [], IIC_NOP>, TB, OpSize16;
+ def NOOPLr : I<0x1f, MRMXr, (outs), (ins GR32:$zero),
+ "nop{l}\t$zero", [], IIC_NOP>, TB, OpSize32;
+ def NOOPQr : RI<0x1f, MRMXr, (outs), (ins GR64:$zero),
+ "nop{q}\t$zero", [], IIC_NOP>, TB,
+ Requires<[In64BitMode]>;
}
@@ -1131,7 +1157,8 @@ def LEAVE64 : I<0xC9, RawFrm,
// Miscellaneous Instructions.
//
-let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1 in
+let isBarrier = 1, hasSideEffects = 1, usesCustomInserter = 1,
+ SchedRW = [WriteSystem] in
def Int_eh_sjlj_setup_dispatch
: PseudoI<(outs), (ins), [(X86eh_sjlj_setup_dispatch)]>;
@@ -1461,7 +1488,8 @@ def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
[(store (i32 imm32_su:$src), addr:$dst)], IIC_MOV_MEM>, OpSize32;
def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>;
+ [(store i64immSExt32_su:$src, addr:$dst)], IIC_MOV_MEM>,
+ Requires<[In64BitMode]>;
} // SchedRW
let hasSideEffects = 0 in {
@@ -1535,33 +1563,39 @@ def MOV32o16a : Ii16<0xA3, RawFrmMemOffs, (outs), (ins offset16_32:$dst),
let mayLoad = 1 in {
let Defs = [AL] in
def MOV8ao64 : RIi64_NOREX<0xA0, RawFrmMemOffs, (outs), (ins offset64_8:$src),
- "movabs{b}\t{$src, %al|al, $src}", []>, AdSize64;
+ "movabs{b}\t{$src, %al|al, $src}", [], IIC_MOV_MEM>,
+ AdSize64;
let Defs = [AX] in
def MOV16ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_16:$src),
- "movabs{w}\t{$src, %ax|ax, $src}", []>, OpSize16, AdSize64;
+ "movabs{w}\t{$src, %ax|ax, $src}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize64;
let Defs = [EAX] in
def MOV32ao64 : RIi64_NOREX<0xA1, RawFrmMemOffs, (outs), (ins offset64_32:$src),
- "movabs{l}\t{$src, %eax|eax, $src}", []>, OpSize32,
- AdSize64;
+ "movabs{l}\t{$src, %eax|eax, $src}", [], IIC_MOV_MEM>,
+ OpSize32, AdSize64;
let Defs = [RAX] in
def MOV64ao64 : RIi64<0xA1, RawFrmMemOffs, (outs), (ins offset64_64:$src),
- "movabs{q}\t{$src, %rax|rax, $src}", []>, AdSize64;
+ "movabs{q}\t{$src, %rax|rax, $src}", [], IIC_MOV_MEM>,
+ AdSize64;
}
let mayStore = 1 in {
let Uses = [AL] in
def MOV8o64a : RIi64_NOREX<0xA2, RawFrmMemOffs, (outs), (ins offset64_8:$dst),
- "movabs{b}\t{%al, $dst|$dst, al}", []>, AdSize64;
+ "movabs{b}\t{%al, $dst|$dst, al}", [], IIC_MOV_MEM>,
+ AdSize64;
let Uses = [AX] in
def MOV16o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_16:$dst),
- "movabs{w}\t{%ax, $dst|$dst, ax}", []>, OpSize16, AdSize64;
+ "movabs{w}\t{%ax, $dst|$dst, ax}", [], IIC_MOV_MEM>,
+ OpSize16, AdSize64;
let Uses = [EAX] in
def MOV32o64a : RIi64_NOREX<0xA3, RawFrmMemOffs, (outs), (ins offset64_32:$dst),
- "movabs{l}\t{%eax, $dst|$dst, eax}", []>, OpSize32,
- AdSize64;
+ "movabs{l}\t{%eax, $dst|$dst, eax}", [], IIC_MOV_MEM>,
+ OpSize32, AdSize64;
let Uses = [RAX] in
def MOV64o64a : RIi64<0xA3, RawFrmMemOffs, (outs), (ins offset64_64:$dst),
- "movabs{q}\t{%rax, $dst|$dst, rax}", []>, AdSize64;
+ "movabs{q}\t{%rax, $dst|$dst, rax}", [], IIC_MOV_MEM>,
+ AdSize64;
}
} // hasSideEffects = 0
@@ -1654,40 +1688,36 @@ let SchedRW = [WriteALU] in {
def BT16rr : I<0xA3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR16:$src1, GR16:$src2))], IIC_BT_RR>,
- OpSize16, TB;
+ OpSize16, TB, NotMemoryFoldable;
def BT32rr : I<0xA3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt GR32:$src1, GR32:$src2))], IIC_BT_RR>,
- OpSize32, TB;
+ OpSize32, TB, NotMemoryFoldable;
def BT64rr : RI<0xA3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB;
+ [(set EFLAGS, (X86bt GR64:$src1, GR64:$src2))], IIC_BT_RR>, TB,
+ NotMemoryFoldable;
} // SchedRW
// Unlike with the register+register form, the memory+register form of the
// bt instruction does not ignore the high bits of the index. From ISel's
// perspective, this is pretty bizarre. Make these instructions disassembly
-// only for now.
+// only for now. These instructions are also slow on modern CPUs so that's
+// another reason to avoid generating them.
let mayLoad = 1, hasSideEffects = 0, SchedRW = [WriteALULd] in {
def BT16mr : I<0xA3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
- // [(X86bt (loadi16 addr:$src1), GR16:$src2),
- // (implicit EFLAGS)]
[], IIC_BT_MR
- >, OpSize16, TB, Requires<[FastBTMem]>;
+ >, OpSize16, TB, NotMemoryFoldable;
def BT32mr : I<0xA3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"bt{l}\t{$src2, $src1|$src1, $src2}",
- // [(X86bt (loadi32 addr:$src1), GR32:$src2),
- // (implicit EFLAGS)]
[], IIC_BT_MR
- >, OpSize32, TB, Requires<[FastBTMem]>;
+ >, OpSize32, TB, NotMemoryFoldable;
def BT64mr : RI<0xA3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
- // [(X86bt (loadi64 addr:$src1), GR64:$src2),
- // (implicit EFLAGS)]
[], IIC_BT_MR
- >, TB;
+ >, TB, NotMemoryFoldable;
}
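The comment above is easier to see with a minimal C++ model (illustration only, not LLVM or patch code; helper names are hypothetical and a 32-bit operand size is assumed): the register form masks the bit index to the operand width, while the memory form uses the index to pick which word of memory is read, so the high bits change which address is touched.

#include <cstdint>
#include <cstdio>

// bt reg, reg: the bit index is taken modulo the operand width.
bool bt_reg(uint32_t value, int64_t bitIndex) {
  return (value >> (bitIndex & 31)) & 1;
}

// bt mem, reg: the bit index also selects which dword relative to the
// address is read, so a large index reaches past the addressed word.
bool bt_mem(const uint32_t *base, int64_t bitIndex) {
  const uint32_t *word = base + (bitIndex >> 5); // may be far from *base
  return (*word >> (bitIndex & 31)) & 1;
}

int main() {
  uint32_t buf[2] = {0x0, 0x1};
  // Register form wraps: index 32 tests bit 0 of buf[0] -> 0.
  // Memory form does not: index 32 tests bit 0 of buf[1] -> 1.
  printf("%d %d\n", bt_reg(buf[0], 32), bt_mem(buf, 32));
}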
let SchedRW = [WriteALU] in {
@@ -1705,9 +1735,8 @@ def BT64ri8 : RIi8<0xBA, MRM4r, (outs), (ins GR64:$src1, i64i8imm:$src2),
IIC_BT_RI>, TB;
} // SchedRW
-// Note that these instructions don't need FastBTMem because that
-// only applies when the other operand is in a register. When it's
-// an immediate, bt is still fast.
+// Note that these instructions aren't slow because that only applies when the
+// other operand is in a register. When it's an immediate, bt is still fast.
let SchedRW = [WriteALU] in {
def BT16mi8 : Ii8<0xBA, MRM4m, (outs), (ins i16mem:$src1, i16i8imm:$src2),
"bt{w}\t{$src2, $src1|$src1, $src2}",
@@ -1720,40 +1749,43 @@ def BT32mi8 : Ii8<0xBA, MRM4m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
def BT64mi8 : RIi8<0xBA, MRM4m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
"bt{q}\t{$src2, $src1|$src1, $src2}",
[(set EFLAGS, (X86bt (loadi64 addr:$src1),
- i64immSExt8:$src2))], IIC_BT_MI>, TB;
+ i64immSExt8:$src2))], IIC_BT_MI>, TB,
+ Requires<[In64BitMode]>;
} // SchedRW
let hasSideEffects = 0 in {
-let SchedRW = [WriteALU] in {
-def BTC16rr : I<0xBB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
+def BTC16rr : I<0xBB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
- OpSize16, TB;
-def BTC32rr : I<0xBB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ OpSize16, TB, NotMemoryFoldable;
+def BTC32rr : I<0xBB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
- OpSize32, TB;
-def BTC64rr : RI<0xBB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+ OpSize32, TB, NotMemoryFoldable;
+def BTC64rr : RI<0xBB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB,
+ NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTC16mr : I<0xBB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
- OpSize16, TB;
+ OpSize16, TB, NotMemoryFoldable;
def BTC32mr : I<0xBB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
- OpSize32, TB;
+ OpSize32, TB, NotMemoryFoldable;
def BTC64mr : RI<0xBB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB,
+ NotMemoryFoldable;
}
-let SchedRW = [WriteALU] in {
-def BTC16ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
+def BTC16ri8 : Ii8<0xBA, MRM7r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"btc{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize16, TB;
-def BTC32ri8 : Ii8<0xBA, MRM7r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+def BTC32ri8 : Ii8<0xBA, MRM7r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize32, TB;
-def BTC64ri8 : RIi8<0xBA, MRM7r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+def BTC64ri8 : RIi8<0xBA, MRM7r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
"btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
} // SchedRW
@@ -1765,39 +1797,41 @@ def BTC32mi8 : Ii8<0xBA, MRM7m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
"btc{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
OpSize32, TB;
def BTC64mi8 : RIi8<0xBA, MRM7m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+ "btc{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB,
+ Requires<[In64BitMode]>;
}
-let SchedRW = [WriteALU] in {
-def BTR16rr : I<0xB3, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
+def BTR16rr : I<0xB3, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
- OpSize16, TB;
-def BTR32rr : I<0xB3, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ OpSize16, TB, NotMemoryFoldable;
+def BTR32rr : I<0xB3, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
- OpSize32, TB;
-def BTR64rr : RI<0xB3, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB;
+ OpSize32, TB, NotMemoryFoldable;
+def BTR64rr : RI<0xB3, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "btr{q}\t{$src2, $src1|$src1, $src2}", []>, TB, NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTR16mr : I<0xB3, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
- OpSize16, TB;
+ OpSize16, TB, NotMemoryFoldable;
def BTR32mr : I<0xB3, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
- OpSize32, TB;
+ OpSize32, TB, NotMemoryFoldable;
def BTR64mr : RI<0xB3, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB,
+ NotMemoryFoldable;
}
-let SchedRW = [WriteALU] in {
-def BTR16ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
+def BTR16ri8 : Ii8<0xBA, MRM6r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"btr{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize16, TB;
-def BTR32ri8 : Ii8<0xBA, MRM6r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+def BTR32ri8 : Ii8<0xBA, MRM6r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize32, TB;
-def BTR64ri8 : RIi8<0xBA, MRM6r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+def BTR64ri8 : RIi8<0xBA, MRM6r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
"btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
} // SchedRW
@@ -1809,39 +1843,42 @@ def BTR32mi8 : Ii8<0xBA, MRM6m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
"btr{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
OpSize32, TB;
def BTR64mi8 : RIi8<0xBA, MRM6m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+ "btr{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB,
+ Requires<[In64BitMode]>;
}
-let SchedRW = [WriteALU] in {
-def BTS16rr : I<0xAB, MRMDestReg, (outs), (ins GR16:$src1, GR16:$src2),
+let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
+def BTS16rr : I<0xAB, MRMDestReg, (outs GR16:$dst), (ins GR16:$src1, GR16:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
- OpSize16, TB;
-def BTS32rr : I<0xAB, MRMDestReg, (outs), (ins GR32:$src1, GR32:$src2),
+ OpSize16, TB, NotMemoryFoldable;
+def BTS32rr : I<0xAB, MRMDestReg, (outs GR32:$dst), (ins GR32:$src1, GR32:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>,
- OpSize32, TB;
-def BTS64rr : RI<0xAB, MRMDestReg, (outs), (ins GR64:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB;
+ OpSize32, TB, NotMemoryFoldable;
+def BTS64rr : RI<0xAB, MRMDestReg, (outs GR64:$dst), (ins GR64:$src1, GR64:$src2),
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RR>, TB,
+ NotMemoryFoldable;
} // SchedRW
let mayLoad = 1, mayStore = 1, SchedRW = [WriteALULd, WriteRMW] in {
def BTS16mr : I<0xAB, MRMDestMem, (outs), (ins i16mem:$src1, GR16:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
- OpSize16, TB;
+ OpSize16, TB, NotMemoryFoldable;
def BTS32mr : I<0xAB, MRMDestMem, (outs), (ins i32mem:$src1, GR32:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>,
- OpSize32, TB;
+ OpSize32, TB, NotMemoryFoldable;
def BTS64mr : RI<0xAB, MRMDestMem, (outs), (ins i64mem:$src1, GR64:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MR>, TB,
+ NotMemoryFoldable;
}
-let SchedRW = [WriteALU] in {
-def BTS16ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR16:$src1, i16i8imm:$src2),
+let SchedRW = [WriteALU], Constraints = "$src1 = $dst" in {
+def BTS16ri8 : Ii8<0xBA, MRM5r, (outs GR16:$dst), (ins GR16:$src1, i16i8imm:$src2),
"bts{w}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize16, TB;
-def BTS32ri8 : Ii8<0xBA, MRM5r, (outs), (ins GR32:$src1, i32i8imm:$src2),
+def BTS32ri8 : Ii8<0xBA, MRM5r, (outs GR32:$dst), (ins GR32:$src1, i32i8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>,
OpSize32, TB;
-def BTS64ri8 : RIi8<0xBA, MRM5r, (outs), (ins GR64:$src1, i64i8imm:$src2),
+def BTS64ri8 : RIi8<0xBA, MRM5r, (outs GR64:$dst), (ins GR64:$src1, i64i8imm:$src2),
"bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_RI>, TB;
} // SchedRW
@@ -1853,7 +1890,8 @@ def BTS32mi8 : Ii8<0xBA, MRM5m, (outs), (ins i32mem:$src1, i32i8imm:$src2),
"bts{l}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>,
OpSize32, TB;
def BTS64mi8 : RIi8<0xBA, MRM5m, (outs), (ins i64mem:$src1, i64i8imm:$src2),
- "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB;
+ "bts{q}\t{$src2, $src1|$src1, $src2}", [], IIC_BTX_MI>, TB,
+ Requires<[In64BitMode]>;
}
} // hasSideEffects = 0
} // Defs = [EFLAGS]
@@ -2000,35 +2038,38 @@ def CMPXCHG8B : I<0xC7, MRM1m, (outs), (ins i64mem:$dst),
let Defs = [RAX, RDX, EFLAGS], Uses = [RAX, RBX, RCX, RDX] in
def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
"cmpxchg16b\t$dst", [], IIC_CMPXCHG_16B>,
- TB, Requires<[HasCmpxchg16b]>;
+ TB, Requires<[HasCmpxchg16b, In64BitMode]>;
} // SchedRW
// Lock instruction prefix
+let SchedRW = [WriteMicrocoded] in
def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
+let SchedRW = [WriteNop] in {
+
// Rex64 instruction prefix
-def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
+def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", [], IIC_NOP>,
Requires<[In64BitMode]>;
// Data16 instruction prefix
-def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>,
+def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", [], IIC_NOP>,
Requires<[Not16BitMode]>;
// Data instruction prefix
-def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", []>,
+def DATA32_PREFIX : I<0x66, RawFrm, (outs), (ins), "data32", [], IIC_NOP>,
Requires<[In16BitMode]>;
+} // SchedRW
// Repeat string operation instruction prefixes
-// These uses the DF flag in the EFLAGS register to inc or dec ECX
-let Defs = [ECX], Uses = [ECX,EFLAGS] in {
+// These use the DF flag in the EFLAGS register to inc or dec ECX
+let Defs = [ECX], Uses = [ECX,EFLAGS], SchedRW = [WriteMicrocoded] in {
// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>;
// Repeat while not equal (used with CMPS and SCAS)
def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
}
-
// String manipulation instructions
let SchedRW = [WriteMicrocoded] in {
// These uses the DF flag in the EFLAGS register to inc or dec EDI and ESI
@@ -2174,31 +2215,35 @@ let Predicates = [HasMOVBE] in {
//===----------------------------------------------------------------------===//
// RDRAND Instruction
//
-let Predicates = [HasRDRAND], Defs = [EFLAGS] in {
+let Predicates = [HasRDRAND], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
def RDRAND16r : I<0xC7, MRM6r, (outs GR16:$dst), (ins),
"rdrand{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86rdrand))]>, OpSize16, TB;
+ [(set GR16:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>,
+ OpSize16, PS;
def RDRAND32r : I<0xC7, MRM6r, (outs GR32:$dst), (ins),
"rdrand{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86rdrand))]>, OpSize32, TB;
+ [(set GR32:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>,
+ OpSize32, PS;
def RDRAND64r : RI<0xC7, MRM6r, (outs GR64:$dst), (ins),
"rdrand{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86rdrand))]>, TB;
+ [(set GR64:$dst, EFLAGS, (X86rdrand))], IIC_RDRAND>, PS;
}
//===----------------------------------------------------------------------===//
// RDSEED Instruction
//
-let Predicates = [HasRDSEED], Defs = [EFLAGS] in {
+let Predicates = [HasRDSEED], Defs = [EFLAGS], SchedRW = [WriteSystem] in {
def RDSEED16r : I<0xC7, MRM7r, (outs GR16:$dst), (ins),
"rdseed{w}\t$dst",
- [(set GR16:$dst, EFLAGS, (X86rdseed))]>, OpSize16, TB;
+ [(set GR16:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>,
+ OpSize16, PS;
def RDSEED32r : I<0xC7, MRM7r, (outs GR32:$dst), (ins),
"rdseed{l}\t$dst",
- [(set GR32:$dst, EFLAGS, (X86rdseed))]>, OpSize32, TB;
+ [(set GR32:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>,
+ OpSize32, PS;
def RDSEED64r : RI<0xC7, MRM7r, (outs GR64:$dst), (ins),
"rdseed{q}\t$dst",
- [(set GR64:$dst, EFLAGS, (X86rdseed))]>, TB;
+ [(set GR64:$dst, EFLAGS, (X86rdseed))], IIC_RDSEED>, PS;
}
//===----------------------------------------------------------------------===//
@@ -2207,30 +2252,33 @@ let Predicates = [HasRDSEED], Defs = [EFLAGS] in {
let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
def LZCNT16rr : I<0xBD, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)]>, XS,
- OpSize16;
+ [(set GR16:$dst, (ctlz GR16:$src)), (implicit EFLAGS)],
+ IIC_LZCNT_RR>, XS, OpSize16, Sched<[WriteIMul]>;
def LZCNT16rm : I<0xBD, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"lzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (ctlz (loadi16 addr:$src))),
- (implicit EFLAGS)]>, XS, OpSize16;
+ (implicit EFLAGS)], IIC_LZCNT_RM>, XS, OpSize16,
+ Sched<[WriteIMulLd]>;
def LZCNT32rr : I<0xBD, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)]>, XS,
- OpSize32;
+ [(set GR32:$dst, (ctlz GR32:$src)), (implicit EFLAGS)],
+ IIC_LZCNT_RR>, XS, OpSize32, Sched<[WriteIMul]>;
def LZCNT32rm : I<0xBD, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"lzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (ctlz (loadi32 addr:$src))),
- (implicit EFLAGS)]>, XS, OpSize32;
+ (implicit EFLAGS)], IIC_LZCNT_RM>, XS, OpSize32,
+ Sched<[WriteIMulLd]>;
def LZCNT64rr : RI<0xBD, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)]>,
- XS;
+ [(set GR64:$dst, (ctlz GR64:$src)), (implicit EFLAGS)],
+ IIC_LZCNT_RR>, XS, Sched<[WriteIMul]>;
def LZCNT64rm : RI<0xBD, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"lzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (ctlz (loadi64 addr:$src))),
- (implicit EFLAGS)]>, XS;
+ (implicit EFLAGS)], IIC_LZCNT_RM>, XS,
+ Sched<[WriteIMulLd]>;
}
//===----------------------------------------------------------------------===//
@@ -2239,30 +2287,33 @@ let Predicates = [HasLZCNT], Defs = [EFLAGS] in {
let Predicates = [HasBMI], Defs = [EFLAGS] in {
def TZCNT16rr : I<0xBC, MRMSrcReg, (outs GR16:$dst), (ins GR16:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
- [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)]>, XS,
- OpSize16;
+ [(set GR16:$dst, (cttz GR16:$src)), (implicit EFLAGS)],
+ IIC_TZCNT_RR>, XS, OpSize16, Sched<[WriteIMul]>;
def TZCNT16rm : I<0xBC, MRMSrcMem, (outs GR16:$dst), (ins i16mem:$src),
"tzcnt{w}\t{$src, $dst|$dst, $src}",
[(set GR16:$dst, (cttz (loadi16 addr:$src))),
- (implicit EFLAGS)]>, XS, OpSize16;
+ (implicit EFLAGS)], IIC_TZCNT_RM>, XS, OpSize16,
+ Sched<[WriteIMulLd]>;
def TZCNT32rr : I<0xBC, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)]>, XS,
- OpSize32;
+ [(set GR32:$dst, (cttz GR32:$src)), (implicit EFLAGS)],
+ IIC_TZCNT_RR>, XS, OpSize32, Sched<[WriteIMul]>;
def TZCNT32rm : I<0xBC, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
"tzcnt{l}\t{$src, $dst|$dst, $src}",
[(set GR32:$dst, (cttz (loadi32 addr:$src))),
- (implicit EFLAGS)]>, XS, OpSize32;
+ (implicit EFLAGS)], IIC_TZCNT_RM>, XS, OpSize32,
+ Sched<[WriteIMulLd]>;
def TZCNT64rr : RI<0xBC, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)]>,
- XS;
+ [(set GR64:$dst, (cttz GR64:$src)), (implicit EFLAGS)],
+ IIC_TZCNT_RR>, XS, Sched<[WriteIMul]>;
def TZCNT64rm : RI<0xBC, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
"tzcnt{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, (cttz (loadi64 addr:$src))),
- (implicit EFLAGS)]>, XS;
+ (implicit EFLAGS)], IIC_TZCNT_RM>, XS,
+ Sched<[WriteIMulLd]>;
}
multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
@@ -2270,11 +2321,11 @@ multiclass bmi_bls<string mnemonic, Format RegMRM, Format MemMRM,
let hasSideEffects = 0 in {
def rr : I<0xF3, RegMRM, (outs RC:$dst), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
- []>, T8PS, VEX_4V;
+ [], IIC_UNARY_REG>, T8PS, VEX_4V, Sched<[WriteALU]>;
let mayLoad = 1 in
def rm : I<0xF3, MemMRM, (outs RC:$dst), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst|$dst, $src}"),
- []>, T8PS, VEX_4V;
+ [], IIC_UNARY_MEM>, T8PS, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>;
}
}
@@ -2309,18 +2360,18 @@ let Predicates = [HasBMI] in {
(BLSI64rr GR64:$src)>;
}
-
multiclass bmi_bextr_bzhi<bits<8> opc, string mnemonic, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int,
PatFrag ld_frag> {
def rr : I<opc, MRMSrcReg4VOp3, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)]>,
- T8PS, VEX;
+ [(set RC:$dst, (Int RC:$src1, RC:$src2)), (implicit EFLAGS)], IIC_BIN_NONMEM>,
+ T8PS, VEX, Sched<[WriteALU]>;
def rm : I<opc, MRMSrcMem4VOp3, (outs RC:$dst), (ins x86memop:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst, (Int (ld_frag addr:$src1), RC:$src2)),
- (implicit EFLAGS)]>, T8PS, VEX;
+ (implicit EFLAGS)], IIC_BIN_MEM>, T8PS, VEX,
+ Sched<[WriteALULd, ReadAfterLd]>;
}
let Predicates = [HasBMI], Defs = [EFLAGS] in {
@@ -2337,22 +2388,45 @@ let Predicates = [HasBMI2], Defs = [EFLAGS] in {
int_x86_bmi_bzhi_64, loadi64>, VEX_W;
}
-
def CountTrailingOnes : SDNodeXForm<imm, [{
// Count the trailing ones in the immediate.
return getI8Imm(countTrailingOnes(N->getZExtValue()), SDLoc(N));
}]>;
-def BZHIMask : ImmLeaf<i64, [{
- return isMask_64(Imm) && (countTrailingOnes<uint64_t>(Imm) > 32);
+def BEXTRMaskXForm : SDNodeXForm<imm, [{
+ unsigned Length = countTrailingOnes(N->getZExtValue());
+ return getI32Imm(Length << 8, SDLoc(N));
}]>;
-let Predicates = [HasBMI2] in {
- def : Pat<(and GR64:$src, BZHIMask:$mask),
+def AndMask64 : ImmLeaf<i64, [{
+ return isMask_64(Imm) && Imm > UINT32_MAX;
+}]>;
+
+// Use BEXTR for 64-bit 'and' with large immediate 'mask'.
+let Predicates = [HasBMI, NoBMI2, NoTBM] in {
+ def : Pat<(and GR64:$src, AndMask64:$mask),
+ (BEXTR64rr GR64:$src,
+ (SUBREG_TO_REG (i64 0),
+ (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
+ def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
+ (BEXTR64rm addr:$src,
+ (SUBREG_TO_REG (i64 0),
+ (MOV32ri (BEXTRMaskXForm imm:$mask)), sub_32bit))>;
+}
+
+// Use BZHI for 64-bit 'and' with large immediate 'mask'.
+let Predicates = [HasBMI2, NoTBM] in {
+ def : Pat<(and GR64:$src, AndMask64:$mask),
(BZHI64rr GR64:$src,
(INSERT_SUBREG (i64 (IMPLICIT_DEF)),
(MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
+ def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
+ (BZHI64rm addr:$src,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
+}
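A small self-contained C++ sketch of the arithmetic behind these patterns (illustrative models of BEXTR/BZHI, not LLVM or patch code): a 64-bit 'and' with a contiguous low mask wider than 32 bits, which would otherwise need the mask materialized with movabsq, equals BEXTR with control (Length << 8) and start 0, or BZHI with the mask length, where Length is countTrailingOnes(mask) as in CountTrailingOnes/BEXTRMaskXForm above.

#include <cassert>
#include <cstdint>

// BEXTR: bits 7:0 of the control are the start position, bits 15:8 the length.
uint64_t bextr(uint64_t src, uint64_t ctrl) {
  unsigned start = ctrl & 0xff, len = (ctrl >> 8) & 0xff;
  if (start >= 64) return 0;
  uint64_t v = src >> start;
  return len >= 64 ? v : v & ((1ULL << len) - 1);
}

// BZHI: zero all bits at positions >= index (index taken from bits 7:0).
uint64_t bzhi(uint64_t src, uint64_t index) {
  unsigned n = index & 0xff;
  return n >= 64 ? src : src & ((1ULL << n) - 1);
}

int main() {
  uint64_t x = 0x123456789abcdef0ULL;
  // A contiguous low mask wider than 32 bits, e.g. (1 << 40) - 1,
  // matches AndMask64: countTrailingOnes(mask) == 40.
  uint64_t mask = (1ULL << 40) - 1;
  unsigned len = 40;                        // CountTrailingOnes
  assert((x & mask) == bextr(x, len << 8)); // BEXTRMaskXForm: Length << 8
  assert((x & mask) == bzhi(x, len));
  return 0;
}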
+let Predicates = [HasBMI2] in {
def : Pat<(and GR32:$src, (add (shl 1, GR8:$lz), -1)),
(BZHI32rr GR32:$src,
(INSERT_SUBREG (i32 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>;
@@ -2402,27 +2476,17 @@ let Predicates = [HasBMI2] in {
(INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>;
} // HasBMI2
-let Predicates = [HasBMI] in {
- def : Pat<(X86bextr GR32:$src1, GR32:$src2),
- (BEXTR32rr GR32:$src1, GR32:$src2)>;
- def : Pat<(X86bextr (loadi32 addr:$src1), GR32:$src2),
- (BEXTR32rm addr:$src1, GR32:$src2)>;
- def : Pat<(X86bextr GR64:$src1, GR64:$src2),
- (BEXTR64rr GR64:$src1, GR64:$src2)>;
- def : Pat<(X86bextr (loadi64 addr:$src1), GR64:$src2),
- (BEXTR64rm addr:$src1, GR64:$src2)>;
-} // HasBMI
-
multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
X86MemOperand x86memop, Intrinsic Int,
PatFrag ld_frag> {
def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
- VEX_4V;
+ [(set RC:$dst, (Int RC:$src1, RC:$src2))], IIC_BIN_NONMEM>,
+ VEX_4V, Sched<[WriteALU]>;
def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>, VEX_4V;
+ [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))],
+ IIC_BIN_MEM>, VEX_4V, Sched<[WriteALULd, ReadAfterLd]>;
}
let Predicates = [HasBMI2] in {
@@ -2448,14 +2512,14 @@ multiclass tbm_ternary_imm_intr<bits<8> opc, RegisterClass RC, string OpcodeStr,
def ri : Ii32<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
- [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))]>,
- XOP, XOPA;
+ [(set RC:$dst, (Int RC:$src1, immoperator:$cntl))],
+ IIC_BIN_NONMEM>, XOP, XOPA, Sched<[WriteALU]>;
def mi : Ii32<opc, MRMSrcMem, (outs RC:$dst),
(ins x86memop:$src1, immtype:$cntl),
!strconcat(OpcodeStr,
"\t{$cntl, $src1, $dst|$dst, $src1, $cntl}"),
- [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))]>,
- XOP, XOPA;
+ [(set RC:$dst, (Int (ld_frag addr:$src1), immoperator:$cntl))],
+ IIC_BIN_MEM>, XOP, XOPA, Sched<[WriteALULd, ReadAfterLd]>;
}
defm BEXTRI32 : tbm_ternary_imm_intr<0x10, GR32, "bextr", i32mem, loadi32,
@@ -2471,11 +2535,11 @@ multiclass tbm_binary_rm<bits<8> opc, Format FormReg, Format FormMem,
let hasSideEffects = 0 in {
def rr : I<opc, FormReg, (outs RC:$dst), (ins RC:$src),
!strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
- []>, XOP_4V, XOP9;
+ [], IIC_BIN_NONMEM>, XOP_4V, XOP9, Sched<[WriteALU]>;
let mayLoad = 1 in
def rm : I<opc, FormMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr,"\t{$src, $dst|$dst, $src}"),
- []>, XOP_4V, XOP9;
+ [], IIC_BIN_MEM>, XOP_4V, XOP9, Sched<[WriteALULd, ReadAfterLd]>;
}
}
@@ -2498,34 +2562,43 @@ defm T1MSKC : tbm_binary_intr<0x01, "t1mskc", MRM7r, MRM7m>;
defm TZMSK : tbm_binary_intr<0x01, "tzmsk", MRM4r, MRM4m>;
} // HasTBM, EFLAGS
+// Use BEXTRI for 64-bit 'and' with large immediate 'mask'.
+let Predicates = [HasTBM] in {
+ def : Pat<(and GR64:$src, AndMask64:$mask),
+ (BEXTRI64ri GR64:$src, (BEXTRMaskXForm imm:$mask))>;
+
+ def : Pat<(and (loadi64 addr:$src), AndMask64:$mask),
+ (BEXTRI64mi addr:$src, (BEXTRMaskXForm imm:$mask))>;
+}
+
//===----------------------------------------------------------------------===//
// Lightweight Profiling Instructions
-let Predicates = [HasLWP] in {
+let Predicates = [HasLWP], SchedRW = [WriteSystem] in {
def LLWPCB : I<0x12, MRM0r, (outs), (ins GR32:$src), "llwpcb\t$src",
[(int_x86_llwpcb GR32:$src)], IIC_LWP>,
- XOP, XOP9, Requires<[Not64BitMode]>;
+ XOP, XOP9;
def SLWPCB : I<0x12, MRM1r, (outs GR32:$dst), (ins), "slwpcb\t$dst",
[(set GR32:$dst, (int_x86_slwpcb))], IIC_LWP>,
- XOP, XOP9, Requires<[Not64BitMode]>;
+ XOP, XOP9;
def LLWPCB64 : I<0x12, MRM0r, (outs), (ins GR64:$src), "llwpcb\t$src",
[(int_x86_llwpcb GR64:$src)], IIC_LWP>,
- XOP, XOP9, VEX_W, Requires<[In64BitMode]>;
+ XOP, XOP9, VEX_W;
def SLWPCB64 : I<0x12, MRM1r, (outs GR64:$dst), (ins), "slwpcb\t$dst",
[(set GR64:$dst, (int_x86_slwpcb))], IIC_LWP>,
- XOP, XOP9, VEX_W, Requires<[In64BitMode]>;
+ XOP, XOP9, VEX_W;
multiclass lwpins_intr<RegisterClass RC> {
def rri : Ii32<0x12, MRM0r, (outs), (ins RC:$src0, GR32:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))]>,
+ [(set EFLAGS, (X86lwpins RC:$src0, GR32:$src1, imm:$cntl))], IIC_LWP>,
XOP_4V, XOPA;
let mayLoad = 1 in
def rmi : Ii32<0x12, MRM0m, (outs), (ins RC:$src0, i32mem:$src1, i32imm:$cntl),
"lwpins\t{$cntl, $src1, $src0|$src0, $src1, $cntl}",
- [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))]>,
+ [(set EFLAGS, (X86lwpins RC:$src0, (loadi32 addr:$src1), imm:$cntl))], IIC_LWP>,
XOP_4V, XOPA;
}
@@ -2549,7 +2622,7 @@ multiclass lwpval_intr<RegisterClass RC, Intrinsic Int> {
defm LWPVAL32 : lwpval_intr<GR32, int_x86_lwpval32>;
defm LWPVAL64 : lwpval_intr<GR64, int_x86_lwpval64>, VEX_W;
-} // HasLWP
+} // HasLWP, SchedRW
//===----------------------------------------------------------------------===//
// MONITORX/MWAITX Instructions
@@ -2605,15 +2678,6 @@ def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
let Predicates = [HasTBM] in {
- def : Pat<(X86bextr GR32:$src1, (i32 imm:$src2)),
- (BEXTRI32ri GR32:$src1, imm:$src2)>;
- def : Pat<(X86bextr (loadi32 addr:$src1), (i32 imm:$src2)),
- (BEXTRI32mi addr:$src1, imm:$src2)>;
- def : Pat<(X86bextr GR64:$src1, i64immSExt32:$src2),
- (BEXTRI64ri GR64:$src1, i64immSExt32:$src2)>;
- def : Pat<(X86bextr (loadi64 addr:$src1), i64immSExt32:$src2),
- (BEXTRI64mi addr:$src1, i64immSExt32:$src2)>;
-
// FIXME: patterns for the load versions are not implemented
def : Pat<(and GR32:$src, (add GR32:$src, 1)),
(BLCFILL32rr GR32:$src)>;
@@ -2671,11 +2735,14 @@ let Predicates = [HasTBM] in {
// Memory Instructions
//
-let Predicates = [HasCLFLUSHOPT] in
+let Predicates = [HasCLFLUSHOPT], SchedRW = [WriteLoad] in
def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
- "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
-def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD;
+ "clflushopt\t$src", [(int_x86_clflushopt addr:$src)],
+ IIC_SSE_PREFETCH>, PD;
+let Predicates = [HasCLWB], SchedRW = [WriteLoad] in
+def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src",
+ [(int_x86_clwb addr:$src)], IIC_SSE_PREFETCH>, PD;
//===----------------------------------------------------------------------===//
// Subsystems.
@@ -2719,6 +2786,7 @@ include "X86InstrSystem.td"
// Compiler Pseudo Instructions and Pat Patterns
include "X86InstrCompiler.td"
+include "X86InstrVecCompiler.td"
//===----------------------------------------------------------------------===//
// Assembler Mnemonic Aliases
@@ -2751,6 +2819,7 @@ def : MnemonicAlias<"pop", "popq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"popf", "popfw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"popf", "popfl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"popf", "popfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"popf", "popfq", "intel">, Requires<[In64BitMode]>;
def : MnemonicAlias<"popfd", "popfl", "att">;
// FIXME: This is wrong for "push reg". "push %bx" should turn into pushw in
@@ -2762,6 +2831,7 @@ def : MnemonicAlias<"push", "pushq", "att">, Requires<[In64BitMode]>;
def : MnemonicAlias<"pushf", "pushfw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"pushf", "pushfl", "att">, Requires<[In32BitMode]>;
def : MnemonicAlias<"pushf", "pushfq", "att">, Requires<[In64BitMode]>;
+def : MnemonicAlias<"pushf", "pushfq", "intel">, Requires<[In64BitMode]>;
def : MnemonicAlias<"pushfd", "pushfl", "att">;
def : MnemonicAlias<"popad", "popal", "intel">, Requires<[Not64BitMode]>;
@@ -2804,6 +2874,10 @@ def : MnemonicAlias<"smovq", "movsq", "att">;
def : MnemonicAlias<"ud2a", "ud2", "att">;
def : MnemonicAlias<"verrw", "verr", "att">;
+// MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release'
+def : MnemonicAlias<"acquire", "xacquire", "intel">;
+def : MnemonicAlias<"release", "xrelease", "intel">;
+
// System instruction aliases.
def : MnemonicAlias<"iret", "iretw", "att">, Requires<[In16BitMode]>;
def : MnemonicAlias<"iret", "iretl", "att">, Requires<[Not16BitMode]>;
@@ -3122,8 +3196,8 @@ def : InstAlias<"jmpl\t$seg, $off", (FARJMP32i i32imm:$off, i16imm:$seg)>, Req
// Force mov without a suffix with a segment and mem to prefer the 'l' form of
// the move. All segment/mem forms are equivalent, this has the shortest
// encoding.
-def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV32sm SEGMENT_REG:$seg, i32mem:$mem), 0>;
-def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV32ms i32mem:$mem, SEGMENT_REG:$seg), 0>;
+def : InstAlias<"mov\t{$mem, $seg|$seg, $mem}", (MOV16sm SEGMENT_REG:$seg, i16mem:$mem), 0>;
+def : InstAlias<"mov\t{$seg, $mem|$mem, $seg}", (MOV16ms i16mem:$mem, SEGMENT_REG:$seg), 0>;
// Match 'movq <largeimm>, <reg>' as an alias for movabsq.
def : InstAlias<"mov{q}\t{$imm, $reg|$reg, $imm}", (MOV64ri GR64:$reg, i64imm:$imm), 0>;
@@ -3209,14 +3283,14 @@ defm : ShiftRotateByOneAlias<"ror", "ROR">;
FIXME */
// test: We accept "testX <reg>, <mem>" and "testX <mem>, <reg>" as synonyms.
-def : InstAlias<"test{b}\t{$val, $mem|$mem, $val}",
- (TEST8rm GR8 :$val, i8mem :$mem), 0>;
-def : InstAlias<"test{w}\t{$val, $mem|$mem, $val}",
- (TEST16rm GR16:$val, i16mem:$mem), 0>;
-def : InstAlias<"test{l}\t{$val, $mem|$mem, $val}",
- (TEST32rm GR32:$val, i32mem:$mem), 0>;
-def : InstAlias<"test{q}\t{$val, $mem|$mem, $val}",
- (TEST64rm GR64:$val, i64mem:$mem), 0>;
+def : InstAlias<"test{b}\t{$mem, $val|$val, $mem}",
+ (TEST8mr i8mem :$mem, GR8 :$val), 0>;
+def : InstAlias<"test{w}\t{$mem, $val|$val, $mem}",
+ (TEST16mr i16mem:$mem, GR16:$val), 0>;
+def : InstAlias<"test{l}\t{$mem, $val|$val, $mem}",
+ (TEST32mr i32mem:$mem, GR32:$val), 0>;
+def : InstAlias<"test{q}\t{$mem, $val|$val, $mem}",
+ (TEST64mr i64mem:$mem, GR64:$val), 0>;
// xchg: We accept "xchgX <reg>, <mem>" and "xchgX <mem>, <reg>" as synonyms.
def : InstAlias<"xchg{b}\t{$mem, $val|$val, $mem}",
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 2c047722db24..039b4a248544 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -143,7 +143,7 @@ multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
def rm64 : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR64:$dst,
- (IntId64 (bitconvert (memopmmx addr:$src))))],
+ (IntId64 (bitconvert (load_mmx addr:$src))))],
itins.rm>, Sched<[itins.Sched.Folded]>;
}
@@ -163,7 +163,7 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst,
(IntId64 VR64:$src1,
- (bitconvert (memopmmx addr:$src2))))], itins.rm>,
+ (bitconvert (load_mmx addr:$src2))))], itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
@@ -616,7 +616,8 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(ins VR64:$src),
"pmovmskb\t{$src, $dst|$dst, $src}",
[(set GR32orGR64:$dst,
- (int_x86_mmx_pmovmskb VR64:$src))]>;
+ (int_x86_mmx_pmovmskb VR64:$src))],
+ IIC_MMX_MOVMSK>, Sched<[WriteVecLogic]>;
// Low word of XMM to MMX.
def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
diff --git a/lib/Target/X86/X86InstrMPX.td b/lib/Target/X86/X86InstrMPX.td
index 104ba2a174db..cb2b47b4f0c9 100644
--- a/lib/Target/X86/X86InstrMPX.td
+++ b/lib/Target/X86/X86InstrMPX.td
@@ -13,13 +13,16 @@
//
//===----------------------------------------------------------------------===//
+// FIXME: Investigate a better scheduler itinerary once MPX is used inside LLVM.
+let SchedRW = [WriteSystem] in {
+
multiclass mpx_bound_make<bits<8> opc, string OpcodeStr> {
let mayLoad = 1 in {
def 32rm: I<opc, MRMSrcMem, (outs BNDR:$dst), (ins i32mem:$src),
- OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
+ OpcodeStr#"\t{$src, $dst|$dst, $src}", [], IIC_MPX>,
Requires<[HasMPX, Not64BitMode]>;
def 64rm: RI<opc, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
- OpcodeStr#"\t{$src, $dst|$dst, $src}", []>,
+ OpcodeStr#"\t{$src, $dst|$dst, $src}", [], IIC_MPX>,
Requires<[HasMPX, In64BitMode]>;
}
}
@@ -29,17 +32,17 @@ defm BNDMK : mpx_bound_make<0x1B, "bndmk">, XS;
multiclass mpx_bound_check<bits<8> opc, string OpcodeStr> {
let mayLoad = 1 in {
def 32rm: I<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i32mem:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
Requires<[HasMPX, Not64BitMode]>;
def 64rm: RI<opc, MRMSrcMem, (outs), (ins BNDR:$src1, i64mem:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
Requires<[HasMPX, In64BitMode]>;
}
def 32rr: I<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR32:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
Requires<[HasMPX, Not64BitMode]>;
def 64rr: RI<opc, MRMSrcReg, (outs), (ins BNDR:$src1, GR64:$src2),
- OpcodeStr#"\t{$src2, $src1|$src1, $src2}", []>,
+ OpcodeStr#"\t{$src2, $src1|$src1, $src2}", [], IIC_MPX>,
Requires<[HasMPX, In64BitMode]>;
}
defm BNDCL : mpx_bound_check<0x1A, "bndcl">, XS;
@@ -47,32 +50,33 @@ defm BNDCU : mpx_bound_check<0x1A, "bndcu">, XD;
defm BNDCN : mpx_bound_check<0x1B, "bndcn">, XD;
def BNDMOVRMrr : I<0x1A, MRMSrcReg, (outs BNDR:$dst), (ins BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
Requires<[HasMPX]>;
let mayLoad = 1 in {
def BNDMOVRM32rm : I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
Requires<[HasMPX, Not64BitMode]>;
def BNDMOVRM64rm : RI<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i128mem:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
Requires<[HasMPX, In64BitMode]>;
}
def BNDMOVMRrr : I<0x1B, MRMDestReg, (outs BNDR:$dst), (ins BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
Requires<[HasMPX]>;
let mayStore = 1 in {
def BNDMOVMR32mr : I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
Requires<[HasMPX, Not64BitMode]>;
def BNDMOVMR64mr : RI<0x1B, MRMDestMem, (outs), (ins i128mem:$dst, BNDR:$src),
- "bndmov\t{$src, $dst|$dst, $src}", []>, PD,
+ "bndmov\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PD,
Requires<[HasMPX, In64BitMode]>;
def BNDSTXmr: I<0x1B, MRMDestMem, (outs), (ins i64mem:$dst, BNDR:$src),
- "bndstx\t{$src, $dst|$dst, $src}", []>, PS,
+ "bndstx\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PS,
Requires<[HasMPX]>;
}
let mayLoad = 1 in
-def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins i64mem:$src),
- "bndldx\t{$src, $dst|$dst, $src}", []>, PS,
+def BNDLDXrm: I<0x1A, MRMSrcMem, (outs BNDR:$dst), (ins anymem:$src),
+ "bndldx\t{$src, $dst|$dst, $src}", [], IIC_MPX>, PS,
Requires<[HasMPX]>;
+} // SchedRW
diff --git a/lib/Target/X86/X86InstrSGX.td b/lib/Target/X86/X86InstrSGX.td
index 84119ad5eb35..f4331c5e2d93 100644
--- a/lib/Target/X86/X86InstrSGX.td
+++ b/lib/Target/X86/X86InstrSGX.td
@@ -15,6 +15,7 @@
//===----------------------------------------------------------------------===//
// SGX instructions
+let SchedRW = [WriteSystem] in {
// ENCLS - Execute an Enclave System Function of Specified Leaf Number
def ENCLS : I<0x01, MRM_CF, (outs), (ins),
"encls", []>, TB;
@@ -22,3 +23,4 @@ def ENCLS : I<0x01, MRM_CF, (outs), (ins),
// ENCLU - Execute an Enclave User Function of Specified Leaf Number
def ENCLU : I<0x01, MRM_D7, (outs), (ins),
"enclu", []>, TB;
+} // SchedRW
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 650e4fc8716c..a86a0bfc168d 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -25,9 +25,15 @@ class SizeItins<OpndItins arg_s, OpndItins arg_d> {
OpndItins d = arg_d;
}
+class MoveLoadStoreItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
+ InstrItinClass arg_mr> {
+ InstrItinClass rr = arg_rr;
+ InstrItinClass rm = arg_rm;
+ InstrItinClass mr = arg_mr;
+}
class ShiftOpndItins<InstrItinClass arg_rr, InstrItinClass arg_rm,
- InstrItinClass arg_ri> {
+ InstrItinClass arg_ri> {
InstrItinClass rr = arg_rr;
InstrItinClass rm = arg_rm;
InstrItinClass ri = arg_ri;
@@ -120,10 +126,6 @@ def SSE_DIV_ITINS_P : SizeItins<
>;
let Sched = WriteVecLogic in
-def SSE_VEC_BIT_ITINS_P : OpndItins<
- IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
->;
-
def SSE_BIT_ITINS_P : OpndItins<
IIC_SSE_BIT_P_RR, IIC_SSE_BIT_P_RM
>;
@@ -143,6 +145,11 @@ def SSE_INTMUL_ITINS_P : OpndItins<
IIC_SSE_INTMUL_P_RR, IIC_SSE_INTMUL_P_RM
>;
+// FIXME: Merge SSE_INTSHIFT_P + SSE_INTSHIFT_ITINS_P.
+def SSE_INTSHIFT_P : OpndItins<
+ IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM
+>;
+
def SSE_INTSHIFT_ITINS_P : ShiftOpndItins<
IIC_SSE_INTSH_P_RR, IIC_SSE_INTSH_P_RM, IIC_SSE_INTSH_P_RI
>;
@@ -151,10 +158,18 @@ def SSE_MOVA_ITINS : OpndItins<
IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM
>;
+def SSE_MOVA : MoveLoadStoreItins<
+ IIC_SSE_MOVA_P_RR, IIC_SSE_MOVA_P_RM, IIC_SSE_MOVA_P_MR
+>;
+
def SSE_MOVU_ITINS : OpndItins<
IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM
>;
+def SSE_MOVU : MoveLoadStoreItins<
+ IIC_SSE_MOVU_P_RR, IIC_SSE_MOVU_P_RM, IIC_SSE_MOVU_P_MR
+>;
+
def SSE_DPPD_ITINS : OpndItins<
IIC_SSE_DPPD_RR, IIC_SSE_DPPD_RM
>;
@@ -203,6 +218,11 @@ def SSE_INTALU_ITINS_SHUFF_P : OpndItins<
IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
>;
+let Sched = WriteShuffle in
+def SSE_PACK : OpndItins<
+ IIC_SSE_PACK, IIC_SSE_PACK
+>;
+
let Sched = WriteMPSAD in
def DEFAULT_ITINS_MPSADSCHED : OpndItins<
IIC_ALU_NONMEM, IIC_ALU_MEM
@@ -312,134 +332,17 @@ multiclass sse12_fp_packed_logical_rm<bits<8> opc, RegisterClass RC, Domain d,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- pat_rr, NoItinerary, d>,
+ pat_rr, IIC_SSE_BIT_P_RR, d>,
Sched<[WriteVecLogic]>;
+ let hasSideEffects = 0, mayLoad = 1 in
def rm : PI<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- pat_rm, NoItinerary, d>,
+ pat_rm, IIC_SSE_BIT_P_RM, d>,
Sched<[WriteVecLogicLd, ReadAfterLd]>;
}
-//===----------------------------------------------------------------------===//
-// Non-instruction patterns
-//===----------------------------------------------------------------------===//
-
-// A vector extract of the first f32/f64 position is a subregister copy
-def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
- (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
-def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
- (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
-
-// A 128-bit subvector extract from the first 256-bit vector position
-// is a subregister copy that needs no instruction.
-def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
- (v4i32 (EXTRACT_SUBREG (v8i32 VR256:$src), sub_xmm))>;
-def : Pat<(v4f32 (extract_subvector (v8f32 VR256:$src), (iPTR 0))),
- (v4f32 (EXTRACT_SUBREG (v8f32 VR256:$src), sub_xmm))>;
-
-def : Pat<(v2i64 (extract_subvector (v4i64 VR256:$src), (iPTR 0))),
- (v2i64 (EXTRACT_SUBREG (v4i64 VR256:$src), sub_xmm))>;
-def : Pat<(v2f64 (extract_subvector (v4f64 VR256:$src), (iPTR 0))),
- (v2f64 (EXTRACT_SUBREG (v4f64 VR256:$src), sub_xmm))>;
-
-def : Pat<(v8i16 (extract_subvector (v16i16 VR256:$src), (iPTR 0))),
- (v8i16 (EXTRACT_SUBREG (v16i16 VR256:$src), sub_xmm))>;
-def : Pat<(v16i8 (extract_subvector (v32i8 VR256:$src), (iPTR 0))),
- (v16i8 (EXTRACT_SUBREG (v32i8 VR256:$src), sub_xmm))>;
-
-// A 128-bit subvector insert to the first 256-bit vector position
-// is a subregister copy that needs no instruction.
-let AddedComplexity = 25 in { // to give priority over vinsertf128rm
-def : Pat<(insert_subvector undef, (v2i64 VR128:$src), (iPTR 0)),
- (INSERT_SUBREG (v4i64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v2f64 VR128:$src), (iPTR 0)),
- (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v4i32 VR128:$src), (iPTR 0)),
- (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v4f32 VR128:$src), (iPTR 0)),
- (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v8i16 VR128:$src), (iPTR 0)),
- (INSERT_SUBREG (v16i16 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-def : Pat<(insert_subvector undef, (v16i8 VR128:$src), (iPTR 0)),
- (INSERT_SUBREG (v32i8 (IMPLICIT_DEF)), VR128:$src, sub_xmm)>;
-}
-
-// Implicitly promote a 32-bit scalar to a vector.
-def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
- (COPY_TO_REGCLASS FR32:$src, VR128)>;
-// Implicitly promote a 64-bit scalar to a vector.
-def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
- (COPY_TO_REGCLASS FR64:$src, VR128)>;
-
-// Bitcasts between 128-bit vector types. Return the original type since
-// no instruction is needed for the conversion
-def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
-def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
-def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
-def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
-def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
-def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
-def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
-def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
-def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
-def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
-def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
-def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
-def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
-
-// Bitcasts between 256-bit vector types. Return the original type since
-// no instruction is needed for the conversion
-def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
-def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
-def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
-def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
-def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
-def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
-def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
-def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
-def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
-def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
-def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
-def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
-def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
-def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
-def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
-def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
-def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
-def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
-def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
-def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
-def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
-def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
-def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
-def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
-def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
-def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
-def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
-def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
-def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
-def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
// Alias instructions that map fld0 to xorps for sse or vxorps for avx.
// This is expanded by ExpandPostRAPseudos.
@@ -505,22 +408,20 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
// don't use movss/movsd for copies.
//===----------------------------------------------------------------------===//
-multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
+multiclass sse12_move_rr<SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string base_opc,
- string asm_opr, Domain d = GenericDomain,
- string Name> {
+ string asm_opr, Domain d, string Name> {
let isCommutable = 1 in
def rr : SI<0x10, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, RC:$src2),
+ (ins VR128:$src1, VR128:$src2),
!strconcat(base_opc, asm_opr),
- [(set VR128:$dst, (vt (OpNode VR128:$src1,
- (scalar_to_vector RC:$src2))))],
+ [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
IIC_SSE_MOV_S_RR, d>, Sched<[WriteFShuffle]>;
// For the disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : SI<0x11, MRMDestReg, (outs VR128:$dst),
- (ins VR128:$src1, RC:$src2),
+ (ins VR128:$src1, VR128:$src2),
!strconcat(base_opc, asm_opr),
[], IIC_SSE_MOV_S_RR>, Sched<[WriteFShuffle]>,
FoldGenData<Name#rr>;
@@ -528,9 +429,9 @@ multiclass sse12_move_rr<RegisterClass RC, SDNode OpNode, ValueType vt,
multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
X86MemOperand x86memop, string OpcodeStr,
- Domain d = GenericDomain, string Name> {
+ Domain d, string Name> {
// AVX
- defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+ defm V#NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}", d,
"V"#Name>,
VEX_4V, VEX_LIG, VEX_WIG;
@@ -541,7 +442,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
VEX, VEX_LIG, Sched<[WriteStore]>, VEX_WIG;
// SSE1 & 2
let Constraints = "$src1 = $dst" in {
- defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
+ defm NAME : sse12_move_rr<OpNode, vt, x86memop, OpcodeStr,
"\t{$src2, $dst|$dst, $src2}", d, Name>;
}
@@ -553,8 +454,7 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
// Loading from memory automatically zeroing upper bits.
multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
- PatFrag mem_pat, string OpcodeStr,
- Domain d = GenericDomain> {
+ PatFrag mem_pat, string OpcodeStr, Domain d> {
def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set RC:$dst, (mem_pat addr:$src))],
@@ -627,66 +527,40 @@ let Predicates = [UseAVX] in {
// Shuffle with VMOVSS
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (VMOVSSrr (v4i32 VR128:$src1),
- (COPY_TO_REGCLASS (v4i32 VR128:$src2), FR32))>;
- def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
- (VMOVSSrr (v4f32 VR128:$src1),
- (COPY_TO_REGCLASS (v4f32 VR128:$src2), FR32))>;
-
- // 256-bit variants
- def : Pat<(v8i32 (X86Movss VR256:$src1, VR256:$src2)),
- (SUBREG_TO_REG (i32 0),
- (VMOVSSrr (EXTRACT_SUBREG (v8i32 VR256:$src1), sub_xmm),
- (EXTRACT_SUBREG (v8i32 VR256:$src2), sub_xmm)),
- sub_xmm)>;
- def : Pat<(v8f32 (X86Movss VR256:$src1, VR256:$src2)),
- (SUBREG_TO_REG (i32 0),
- (VMOVSSrr (EXTRACT_SUBREG (v8f32 VR256:$src1), sub_xmm),
- (EXTRACT_SUBREG (v8f32 VR256:$src2), sub_xmm)),
- sub_xmm)>;
+ (VMOVSSrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
+ (VMOVSSrr VR128:$src1, (COPY_TO_REGCLASS FR32:$src2, VR128))>;
// Shuffle with VMOVSD
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
-
- // 256-bit variants
- def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
- (SUBREG_TO_REG (i32 0),
- (VMOVSDrr (EXTRACT_SUBREG (v4i64 VR256:$src1), sub_xmm),
- (EXTRACT_SUBREG (v4i64 VR256:$src2), sub_xmm)),
- sub_xmm)>;
- def : Pat<(v4f64 (X86Movsd VR256:$src1, VR256:$src2)),
- (SUBREG_TO_REG (i32 0),
- (VMOVSDrr (EXTRACT_SUBREG (v4f64 VR256:$src1), sub_xmm),
- (EXTRACT_SUBREG (v4f64 VR256:$src2), sub_xmm)),
- sub_xmm)>;
+ (VMOVSDrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
+ (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS FR64:$src2, VR128))>;
// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
// is during lowering, where it's not possible to recognize the fold because
// it has two uses through a bitcast. One use disappears at isel time and the
// fold opportunity reappears.
def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ (VMOVSDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ (VMOVSDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ (VMOVSDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ (VMOVSDrr VR128:$src1, VR128:$src2)>;
}
let Predicates = [UseSSE1] in {
let Predicates = [NoSSE41], AddedComplexity = 15 in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSS to the lower bits.
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
- (MOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
- (MOVSSrr (v4f32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+ (MOVSSrr (v4f32 (V_SET0)), VR128:$src)>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
- (MOVSSrr (v4i32 (V_SET0)), (COPY_TO_REGCLASS VR128:$src, FR32))>;
+ (MOVSSrr (v4i32 (V_SET0)), VR128:$src)>;
}
let AddedComplexity = 20 in {
@@ -708,9 +582,10 @@ let Predicates = [UseSSE1] in {
// Shuffle with MOVSS
def : Pat<(v4i32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
- def : Pat<(v4f32 (X86Movss VR128:$src1, VR128:$src2)),
- (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR32))>;
+ (MOVSSrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v4f32 (X86Movss VR128:$src1, (scalar_to_vector FR32:$src2))),
+ (MOVSSrr VR128:$src1, (COPY_TO_REGCLASS FR32:$src2, VR128))>;
}
let Predicates = [UseSSE2] in {
@@ -718,7 +593,7 @@ let Predicates = [UseSSE2] in {
// Move scalar to XMM zero-extended, zeroing a VR128 then do a
// MOVSD to the lower bits.
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
- (MOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+ (MOVSDrr (v2f64 (V_SET0)), (COPY_TO_REGCLASS FR64:$src, VR128))>;
}
let AddedComplexity = 20 in {
@@ -737,22 +612,23 @@ let Predicates = [UseSSE2] in {
// Shuffle with MOVSD
def : Pat<(v2i64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
- def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ (MOVSDrr VR128:$src1, VR128:$src2)>;
+
+ def : Pat<(v2f64 (X86Movsd VR128:$src1, (scalar_to_vector FR64:$src2))),
+ (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS FR64:$src2, VR128))>;
// FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
// is during lowering, where it's not possible to recognize the fold because
// it has two uses through a bitcast. One use disappears at isel time and the
// fold opportunity reappears.
def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ (MOVSDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ (MOVSDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ (MOVSDrr VR128:$src1, VR128:$src2)>;
def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)),
- (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
+ (MOVSDrr VR128:$src1, VR128:$src2)>;
}
// Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -845,11 +721,11 @@ def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movaps\t{$src, $dst|$dst, $src}",
- [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
+ [(alignedstore (v8f32 VR256:$src), addr:$dst)],
IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movapd\t{$src, $dst|$dst, $src}",
- [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
+ [(alignedstore (v4f64 VR256:$src), addr:$dst)],
IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
"movups\t{$src, $dst|$dst, $src}",
@@ -969,13 +845,13 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVAPSYrm addr:$src)>;
def : Pat<(loadv4i64 addr:$src),
(VMOVUPSYrm addr:$src)>;
- def : Pat<(alignedstore256 (v4i64 VR256:$src), addr:$dst),
+ def : Pat<(alignedstore (v4i64 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
+ def : Pat<(alignedstore (v8i32 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+ def : Pat<(alignedstore (v16i16 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
- def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+ def : Pat<(alignedstore (v32i8 VR256:$src), addr:$dst),
(VMOVAPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v4i64 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
@@ -985,22 +861,6 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVUPSYmr addr:$dst, VR256:$src)>;
def : Pat<(store (v32i8 VR256:$src), addr:$dst),
(VMOVUPSYmr addr:$dst, VR256:$src)>;
-
- // Special patterns for storing subvector extracts of lower 128-bits
- // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
- def : Pat<(alignedstore (v2f64 (extract_subvector
- (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVAPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v4f32 (extract_subvector
- (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-
- def : Pat<(store (v2f64 (extract_subvector
- (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVUPDmr addr:$dst, (v2f64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v4f32 (extract_subvector
- (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
// Use movaps / movups for SSE integer load / store (one byte shorter).
@@ -1103,14 +963,10 @@ let Predicates = [UseAVX] in {
// Shuffle with VMOVLPS
def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
(VMOVLPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
- (VMOVLPSrm VR128:$src1, addr:$src2)>;
// Shuffle with VMOVLPD
def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(VMOVLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
- (VMOVLPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Movsd VR128:$src1,
(v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
(VMOVLPDrm VR128:$src1, addr:$src2)>;
@@ -1119,15 +975,9 @@ let Predicates = [UseAVX] in {
def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
addr:$src1),
(VMOVLPSmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v4i32 (X86Movlps
- (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)), addr:$src1),
- (VMOVLPSmr addr:$src1, VR128:$src2)>;
def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
addr:$src1),
(VMOVLPDmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
- addr:$src1),
- (VMOVLPDmr addr:$src1, VR128:$src2)>;
}
let Predicates = [UseSSE1] in {
@@ -1139,8 +989,6 @@ let Predicates = [UseSSE1] in {
// Shuffle with MOVLPS
def : Pat<(v4f32 (X86Movlps VR128:$src1, (load addr:$src2))),
(MOVLPSrm VR128:$src1, addr:$src2)>;
- def : Pat<(v4i32 (X86Movlps VR128:$src1, (load addr:$src2))),
- (MOVLPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlps VR128:$src1,
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
(MOVLPSrm VR128:$src1, addr:$src2)>;
@@ -1149,18 +997,12 @@ let Predicates = [UseSSE1] in {
def : Pat<(store (v4f32 (X86Movlps (load addr:$src1), VR128:$src2)),
addr:$src1),
(MOVLPSmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v4i32 (X86Movlps
- (bc_v4i32 (loadv2i64 addr:$src1)), VR128:$src2)),
- addr:$src1),
- (MOVLPSmr addr:$src1, VR128:$src2)>;
}
let Predicates = [UseSSE2] in {
// Shuffle with MOVLPD
def : Pat<(v2f64 (X86Movlpd VR128:$src1, (load addr:$src2))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
- def : Pat<(v2i64 (X86Movlpd VR128:$src1, (load addr:$src2))),
- (MOVLPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Movsd VR128:$src1,
(v2f64 (scalar_to_vector (loadf64 addr:$src2))))),
(MOVLPDrm VR128:$src1, addr:$src2)>;
@@ -1169,9 +1011,6 @@ let Predicates = [UseSSE2] in {
def : Pat<(store (v2f64 (X86Movlpd (load addr:$src1), VR128:$src2)),
addr:$src1),
(MOVLPDmr addr:$src1, VR128:$src2)>;
- def : Pat<(store (v2i64 (X86Movlpd (load addr:$src1), VR128:$src2)),
- addr:$src1),
- (MOVLPDmr addr:$src1, VR128:$src2)>;
}
//===----------------------------------------------------------------------===//
@@ -1179,7 +1018,7 @@ let Predicates = [UseSSE2] in {
//===----------------------------------------------------------------------===//
let AddedComplexity = 20 in {
- defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Movlhpd, "movhp",
+ defm MOVH : sse12_mov_hilo_packed<0x16, X86Movlhps, X86Unpckl, "movhp",
IIC_SSE_MOV_LH>;
}
@@ -1218,19 +1057,9 @@ let Predicates = [UseAVX] in {
(bc_v4f32 (v2i64 (scalar_to_vector (loadi64 addr:$src2))))),
(VMOVHPSrm VR128:$src1, addr:$src2)>;
def : Pat<(X86Movlhps VR128:$src1,
- (bc_v4i32 (v2i64 (X86vzload addr:$src2)))),
+ (bc_v4f32 (v2i64 (X86vzload addr:$src2)))),
(VMOVHPSrm VR128:$src1, addr:$src2)>;
- // VMOVHPD patterns
-
- // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
- // is during lowering, where it's not possible to recognize the load fold
- // cause it has two uses through a bitcast. One use disappears at isel time
- // and the fold opportunity reappears.
- def : Pat<(v2f64 (X86Unpckl VR128:$src1,
- (scalar_to_vector (loadf64 addr:$src2)))),
- (VMOVHPDrm VR128:$src1, addr:$src2)>;
-
// Also handle an i64 load because that may get selected as a faster way to
// load the data.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1261,14 +1090,6 @@ let Predicates = [UseSSE1] in {
let Predicates = [UseSSE2] in {
// MOVHPD patterns
- // FIXME: Instead of X86Unpckl, there should be a X86Movlhpd here, the problem
- // is during lowering, where it's not possible to recognize the load fold
- // cause it has two uses through a bitcast. One use disappears at isel time
- // and the fold opportunity reappears.
- def : Pat<(v2f64 (X86Unpckl VR128:$src1,
- (scalar_to_vector (loadf64 addr:$src2)))),
- (MOVHPDrm VR128:$src1, addr:$src2)>;
-
// Also handle an i64 load because that may get selected as a faster way to
// load the data.
def : Pat<(v2f64 (X86Unpckl VR128:$src1,
@@ -1322,63 +1143,77 @@ let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
IIC_SSE_MOV_LH>, Sched<[WriteFShuffle]>;
}
-let Predicates = [UseAVX] in {
- // MOVLHPS patterns
- def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
- (VMOVLHPSrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
- (VMOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+//===----------------------------------------------------------------------===//
+// SSE 1 & 2 - Conversion Instructions
+//===----------------------------------------------------------------------===//
- // MOVHLPS patterns
- def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
- (VMOVHLPSrr VR128:$src1, VR128:$src2)>;
-}
+let Sched = WriteCvtF2I in {
+def SSE_CVT_SS2SI_32 : OpndItins<
+ IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
+>;
-let Predicates = [UseSSE1] in {
- // MOVLHPS patterns
- def : Pat<(v4i32 (X86Movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr VR128:$src1, VR128:$src2)>;
- def : Pat<(v2i64 (X86Movlhps VR128:$src1, VR128:$src2)),
- (MOVLHPSrr (v2i64 VR128:$src1), VR128:$src2)>;
+let Sched = WriteCvtF2I in
+def SSE_CVT_SS2SI_64 : OpndItins<
+ IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
+>;
- // MOVHLPS patterns
- def : Pat<(v4i32 (X86Movhlps VR128:$src1, VR128:$src2)),
- (MOVHLPSrr VR128:$src1, VR128:$src2)>;
-}
+def SSE_CVT_SD2SI : OpndItins<
+ IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
+>;
-//===----------------------------------------------------------------------===//
-// SSE 1 & 2 - Conversion Instructions
-//===----------------------------------------------------------------------===//
+def SSE_CVT_PS2I : OpndItins<
+ IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
+>;
-def SSE_CVT_PD : OpndItins<
+def SSE_CVT_PD2I : OpndItins<
IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;
+}
-let Sched = WriteCvtI2F in
-def SSE_CVT_PS : OpndItins<
+let Sched = WriteCvtI2F in {
+def SSE_CVT_SI2SS : OpndItins<
+ IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
+>;
+
+def SSE_CVT_SI2SD : OpndItins<
+ IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
+>;
+
+def SSE_CVT_I2PS : OpndItins<
IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
>;
-let Sched = WriteCvtI2F in
-def SSE_CVT_Scalar : OpndItins<
+def SSE_CVT_I2PD : OpndItins<
+ IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
+>;
+}
+
+let Sched = WriteCvtF2F in {
+def SSE_CVT_SD2SS : OpndItins<
IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;
-let Sched = WriteCvtF2I in
-def SSE_CVT_SS2SI_32 : OpndItins<
- IIC_SSE_CVT_SS2SI32_RR, IIC_SSE_CVT_SS2SI32_RM
+def SSE_CVT_SS2SD : OpndItins<
+ IIC_SSE_CVT_Scalar_RR, IIC_SSE_CVT_Scalar_RM
>;
-let Sched = WriteCvtF2I in
-def SSE_CVT_SS2SI_64 : OpndItins<
- IIC_SSE_CVT_SS2SI64_RR, IIC_SSE_CVT_SS2SI64_RM
+def SSE_CVT_PD2PS : OpndItins<
+ IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;
-let Sched = WriteCvtF2I in
-def SSE_CVT_SD2SI : OpndItins<
- IIC_SSE_CVT_SD2SI_RR, IIC_SSE_CVT_SD2SI_RM
+def SSE_CVT_PS2PD : OpndItins<
+ IIC_SSE_CVT_PD_RR, IIC_SSE_CVT_PD_RM
>;
+def SSE_CVT_PH2PS : OpndItins<
+ IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
+>;
+
+def SSE_CVT_PS2PH : OpndItins<
+ IIC_SSE_CVT_PS_RR, IIC_SSE_CVT_PS_RM
+>;
+}
+
// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false dependencies (see sse_fp_unop_s for details)
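// A minimal sketch of that approach (illustrative only, not part of this
// change; it assumes the existing OptForSize predicate and the SSE1 form):
//   let Predicates = [UseSSE1, OptForSize] in
//   def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
//             (CVTSI2SSrm addr:$src)>;
// mirroring how sse_fp_unop_s keeps its scalar load-folding patterns behind
// OptForSize.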
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
@@ -1410,16 +1245,16 @@ let hasSideEffects = 0 in {
// FIXME: We probably want to match the rm form only when optimizing for
// size, to avoid false dependencies (see sse_fp_unop_s for details)
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
- X86MemOperand x86memop, string asm> {
+ X86MemOperand x86memop, string asm, OpndItins itins> {
let hasSideEffects = 0, Predicates = [UseAVX] in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
- !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- Sched<[WriteCvtI2F]>;
+ !strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), [],
+ itins.rr>, Sched<[itins.Sched]>;
let mayLoad = 1 in
def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
(ins DstRC:$src1, x86memop:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
- Sched<[WriteCvtI2FLd, ReadAfterLd]>;
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
} // hasSideEffects = 0
}
@@ -1462,14 +1297,14 @@ def : InstAlias<"vcvttsd2si{q}\t{$src, $dst|$dst, $src}",
// register, but the same isn't true when only using memory operands,
// provide other assembly "l" and "q" forms to address this explicitly
// where appropriate to do so.
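// For example (illustrative AT&T-syntax assembly, assuming the usual
// source-first operand order), the suffixed mnemonics disambiguate the
// memory form:
//   vcvtsi2ssl (%rax), %xmm1, %xmm0   // 32-bit integer in memory
//   vcvtsi2ssq (%rax), %xmm1, %xmm0   // 64-bit integer in memory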
-defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}">,
- XS, VEX_4V, VEX_LIG;
-defm VCVTSI2SS64 : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}">,
- XS, VEX_4V, VEX_W, VEX_LIG;
-defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}">,
- XD, VEX_4V, VEX_LIG;
-defm VCVTSI2SD64 : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}">,
- XD, VEX_4V, VEX_W, VEX_LIG;
+defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss{l}",
+ SSE_CVT_SI2SS>, XS, VEX_4V, VEX_LIG;
+defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss{q}",
+ SSE_CVT_SI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
+defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd{l}",
+ SSE_CVT_SI2SD>, XD, VEX_4V, VEX_LIG;
+defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd{q}",
+ SSE_CVT_SI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
@@ -1480,20 +1315,20 @@ let Predicates = [UseAVX] in {
def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
- (VCVTSI2SS64rm (f32 (IMPLICIT_DEF)), addr:$src)>;
+ (VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
- (VCVTSI2SD64rm (f64 (IMPLICIT_DEF)), addr:$src)>;
+ (VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
def : Pat<(f32 (sint_to_fp GR32:$src)),
(VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f32 (sint_to_fp GR64:$src)),
- (VCVTSI2SS64rr (f32 (IMPLICIT_DEF)), GR64:$src)>;
+ (VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
def : Pat<(f64 (sint_to_fp GR32:$src)),
(VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f64 (sint_to_fp GR64:$src)),
- (VCVTSI2SD64rr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+ (VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
@@ -1510,16 +1345,16 @@ defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
SSE_CVT_SD2SI>, XD, REX_W;
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
"cvtsi2ss{l}\t{$src, $dst|$dst, $src}",
- SSE_CVT_Scalar>, XS;
-defm CVTSI2SS64 : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
+ SSE_CVT_SI2SS>, XS;
+defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
"cvtsi2ss{q}\t{$src, $dst|$dst, $src}",
- SSE_CVT_Scalar>, XS, REX_W;
+ SSE_CVT_SI2SS>, XS, REX_W;
defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
"cvtsi2sd{l}\t{$src, $dst|$dst, $src}",
- SSE_CVT_Scalar>, XD;
-defm CVTSI2SD64 : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
+ SSE_CVT_SI2SD>, XD;
+defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
"cvtsi2sd{q}\t{$src, $dst|$dst, $src}",
- SSE_CVT_Scalar>, XD, REX_W;
+ SSE_CVT_SI2SD>, XD, REX_W;
def : InstAlias<"cvttss2si{l}\t{$src, $dst|$dst, $src}",
(CVTTSS2SIrr GR32:$dst, FR32:$src), 0>;
@@ -1551,33 +1386,33 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
Intrinsic Int, Operand memop, ComplexPattern mem_cpat,
string asm, OpndItins itins> {
- def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
- Sched<[itins.Sched]>;
- def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
- !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
- Sched<[itins.Sched.Folded]>;
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int SrcRC:$src))], itins.rr>,
+ Sched<[itins.Sched]>;
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
+ !strconcat(asm, "\t{$src, $dst|$dst, $src}"),
+ [(set DstRC:$dst, (Int mem_cpat:$src))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
}
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
RegisterClass DstRC, Intrinsic Int, X86MemOperand x86memop,
PatFrag ld_frag, string asm, OpndItins itins,
bit Is2Addr = 1> {
- def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
- !if(Is2Addr,
- !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
- itins.rr>, Sched<[itins.Sched]>;
- def rm : SI<opc, MRMSrcMem, (outs DstRC:$dst),
- (ins DstRC:$src1, x86memop:$src2),
- !if(Is2Addr,
- !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
- itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set DstRC:$dst, (Int DstRC:$src1, SrcRC:$src2))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst),
+ (ins DstRC:$src1, x86memop:$src2),
+ !if(Is2Addr,
+ !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
+ !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
+ [(set DstRC:$dst, (Int DstRC:$src1, (ld_frag addr:$src2)))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Predicates = [UseAVX] in {
@@ -1596,34 +1431,34 @@ defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse2_cvtsd2si64,
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
- defm Int_VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse_cvtsi2ss, i32mem, loadi32, "cvtsi2ss{l}",
- SSE_CVT_Scalar, 0>, XS, VEX_4V;
- defm Int_VCVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ SSE_CVT_SI2SS, 0>, XS, VEX_4V;
+ defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse_cvtsi642ss, i64mem, loadi64, "cvtsi2ss{q}",
- SSE_CVT_Scalar, 0>, XS, VEX_4V,
+ SSE_CVT_SI2SS, 0>, XS, VEX_4V,
VEX_W;
- defm Int_VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse2_cvtsi2sd, i32mem, loadi32, "cvtsi2sd{l}",
- SSE_CVT_Scalar, 0>, XD, VEX_4V;
- defm Int_VCVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ SSE_CVT_SI2SD, 0>, XD, VEX_4V;
+ defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse2_cvtsi642sd, i64mem, loadi64, "cvtsi2sd{q}",
- SSE_CVT_Scalar, 0>, XD,
+ SSE_CVT_SI2SD, 0>, XD,
VEX_4V, VEX_W;
}
let Constraints = "$src1 = $dst" in {
- defm Int_CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse_cvtsi2ss, i32mem, loadi32,
- "cvtsi2ss{l}", SSE_CVT_Scalar>, XS;
- defm Int_CVTSI2SS64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ "cvtsi2ss{l}", SSE_CVT_SI2SS>, XS;
+ defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse_cvtsi642ss, i64mem, loadi64,
- "cvtsi2ss{q}", SSE_CVT_Scalar>, XS, REX_W;
- defm Int_CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
+ "cvtsi2ss{q}", SSE_CVT_SI2SS>, XS, REX_W;
+ defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
int_x86_sse2_cvtsi2sd, i32mem, loadi32,
- "cvtsi2sd{l}", SSE_CVT_Scalar>, XD;
- defm Int_CVTSI2SD64 : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
+ "cvtsi2sd{l}", SSE_CVT_SI2SD>, XD;
+ defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
int_x86_sse2_cvtsi642sd, i64mem, loadi64,
- "cvtsi2sd{q}", SSE_CVT_Scalar>, XD, REX_W;
+ "cvtsi2sd{q}", SSE_CVT_SI2SD>, XD, REX_W;
}
} // isCodeGenOnly = 1
@@ -1632,31 +1467,31 @@ let isCodeGenOnly = 1 in {
// Aliases for intrinsics
let isCodeGenOnly = 1 in {
let Predicates = [UseAVX] in {
-defm Int_VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
- ssmem, sse_load_f32, "cvttss2si",
- SSE_CVT_SS2SI_32>, XS, VEX;
-defm Int_VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
- int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
- "cvttss2si", SSE_CVT_SS2SI_64>,
- XS, VEX, VEX_W;
-defm Int_VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
- sdmem, sse_load_f64, "cvttsd2si",
- SSE_CVT_SD2SI>, XD, VEX;
-defm Int_VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
- int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
- "cvttsd2si", SSE_CVT_SD2SI>,
- XD, VEX, VEX_W;
-}
-defm Int_CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
+ ssmem, sse_load_f32, "cvttss2si",
+ SSE_CVT_SS2SI_32>, XS, VEX;
+defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
+ "cvttss2si", SSE_CVT_SS2SI_64>,
+ XS, VEX, VEX_W;
+defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+ sdmem, sse_load_f64, "cvttsd2si",
+ SSE_CVT_SD2SI>, XD, VEX;
+defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+ int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
+ "cvttsd2si", SSE_CVT_SD2SI>,
+ XD, VEX, VEX_W;
+}
+defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse_cvttss2si,
ssmem, sse_load_f32, "cvttss2si",
SSE_CVT_SS2SI_32>, XS;
-defm Int_CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse_cvttss2si64, ssmem, sse_load_f32,
"cvttss2si", SSE_CVT_SS2SI_64>, XS, REX_W;
-defm Int_CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
+defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, int_x86_sse2_cvttsd2si,
sdmem, sse_load_f64, "cvttsd2si",
SSE_CVT_SD2SI>, XD;
-defm Int_CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
+defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64,
int_x86_sse2_cvttsd2si64, sdmem, sse_load_f64,
"cvttsd2si", SSE_CVT_SD2SI>, XD, REX_W;
} // isCodeGenOnly = 1
@@ -1678,53 +1513,53 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_PS>,
+ SSEPackedSingle, SSE_CVT_I2PS>,
PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm VCVTDQ2PSY : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_PS>,
+ SSEPackedSingle, SSE_CVT_I2PS>,
PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
- SSEPackedSingle, SSE_CVT_PS>,
+ SSEPackedSingle, SSE_CVT_I2PS>,
PS, Requires<[UseSSE2]>;
let Predicates = [UseAVX] in {
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+ (VCVTSS2SIrr_Int GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+ (VCVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+ (VCVTSD2SIrr_Int GR32:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+ (VCVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+ (VCVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtss2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+ (VCVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+ (VCVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0>;
def : InstAlias<"vcvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (VCVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+ (VCVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0>;
}
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSS2SIrr GR32:$dst, VR128:$src), 0>;
+ (CVTSS2SIrr_Int GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSS2SIrm GR32:$dst, ssmem:$src), 0>;
+ (CVTSS2SIrm_Int GR32:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSD2SIrr GR32:$dst, VR128:$src), 0>;
+ (CVTSD2SIrr_Int GR32:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{l}\t{$src, $dst|$dst, $src}",
- (CVTSD2SIrm GR32:$dst, sdmem:$src), 0>;
+ (CVTSD2SIrm_Int GR32:$dst, sdmem:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSS2SI64rr GR64:$dst, VR128:$src), 0>;
+ (CVTSS2SI64rr_Int GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtss2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSS2SI64rm GR64:$dst, ssmem:$src), 0>;
+ (CVTSS2SI64rm_Int GR64:$dst, ssmem:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSD2SI64rr GR64:$dst, VR128:$src), 0>;
+ (CVTSD2SI64rr_Int GR64:$dst, VR128:$src), 0>;
def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
- (CVTSD2SI64rm GR64:$dst, sdmem:$src), 0>;
+ (CVTSD2SI64rm_Int GR64:$dst, sdmem:$src), 0>;
/// SSE 2 Only
@@ -1734,18 +1569,17 @@ def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2F]>, VEX_WIG;
+ Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR32:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RM>,
- XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
+ [], IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
}
def : Pat<(f32 (fpround FR64:$src)),
- (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
+ (VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
Requires<[UseAVX]>;
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
@@ -1760,14 +1594,14 @@ def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
Requires<[UseSSE2, OptForSize]>, Sched<[WriteCvtF2FLd]>;
let isCodeGenOnly = 1 in {
-def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
+def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, VEX_WIG,
Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
-def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
+def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
@@ -1776,14 +1610,14 @@ def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in {
-def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
+def CVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XD, Requires<[UseSSE2]>,
Sched<[WriteCvtF2F]>;
-def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
+def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst, (int_x86_sse2_cvtsd2ss
@@ -1799,20 +1633,18 @@ let hasSideEffects = 0, Predicates = [UseAVX] in {
def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RR>,
- XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
- Sched<[WriteCvtF2F]>, VEX_WIG;
+ [], IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2F]>, VEX_WIG, NotMemoryFoldable;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR64:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [], IIC_SSE_CVT_Scalar_RM>,
- XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
- Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
+ [], IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_LIG,
+ Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG, NotMemoryFoldable;
}
def : Pat<(f64 (fpextend FR32:$src)),
- (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>;
+ (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
def : Pat<(fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
@@ -1845,14 +1677,14 @@ def : Pat<(extloadf32 addr:$src),
(CVTSS2SDrr (MOVSSrm addr:$src))>, Requires<[UseSSE2, OptForSpeed]>;
let isCodeGenOnly = 1 in {
-def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
+def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_WIG,
Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
-def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
+def VCVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
@@ -1860,14 +1692,14 @@ def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_WIG,
Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
-def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
+def CVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
(int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
IIC_SSE_CVT_Scalar_RR>, XS, Requires<[UseSSE2]>,
Sched<[WriteCvtF2F]>;
-def Int_CVTSS2SDrm: I<0x5A, MRMSrcMem,
+def CVTSS2SDrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
"cvtss2sd\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
@@ -1885,33 +1717,33 @@ def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector
(f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
- (Int_VCVTSD2SSrr VR128:$dst, VR128:$src)>;
+ (VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector
(f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
- (Int_VCVTSS2SDrr VR128:$dst, VR128:$src)>;
+ (VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
- (Int_VCVTSI2SS64rr VR128:$dst, GR64:$src)>;
+ (VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
- (Int_VCVTSI2SSrr VR128:$dst, GR32:$src)>;
+ (VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
- (Int_VCVTSI2SD64rr VR128:$dst, GR64:$src)>;
+ (VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
- (Int_VCVTSI2SDrr VR128:$dst, GR32:$src)>;
+ (VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
} // Predicates = [UseAVX]
let Predicates = [UseSSE2] in {
@@ -1919,35 +1751,35 @@ def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector
(f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
- (Int_CVTSD2SSrr VR128:$dst, VR128:$src)>;
+ (CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector
(f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
- (Int_CVTSS2SDrr VR128:$dst, VR128:$src)>;
+ (CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
- (Int_CVTSI2SD64rr VR128:$dst, GR64:$src)>;
+ (CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
- (Int_CVTSI2SDrr VR128:$dst, GR32:$src)>;
+ (CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
} // Predicates = [UseSSE2]
let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
- (Int_CVTSI2SS64rr VR128:$dst, GR64:$src)>;
+ (CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
- (Int_CVTSI2SSrr VR128:$dst, GR32:$src)>;
+ (CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
} // Predicates = [UseSSE1]
// Convert packed single/double fp to doubleword
@@ -2115,10 +1947,16 @@ let Predicates = [HasAVX, NoVLX] in {
(v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
(VCVTPD2DQrr VR128:$src)>;
def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (loadv2f64 addr:$src)))))),
+ (VCVTPD2DQrm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
(VCVTTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (loadv2f64 addr:$src)))))),
+ (VCVTTPD2DQrm addr:$src)>;
}
-} // Predicates = [HasAVX]
+} // Predicates = [HasAVX, NoVLX]
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
@@ -2137,8 +1975,14 @@ let Predicates = [UseSSE2] in {
(v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
(CVTPD2DQrr VR128:$src)>;
def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvtp2Int (memopv2f64 addr:$src)))))),
+ (CVTPD2DQrm addr:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
(v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
(CVTTPD2DQrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2i64 (bitconvert
+ (v4i32 (X86cvttp2si (memopv2f64 addr:$src)))))),
+ (CVTTPD2DQrm addr:$src)>;
}
} // Predicates = [UseSSE2]
@@ -2180,7 +2024,7 @@ let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))]>,
+ (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
VEX, Sched<[WriteCvtI2FLd]>, VEX_WIG;
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2203,7 +2047,7 @@ let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))],
+ (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2215,12 +2059,16 @@ def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTDQ2PDrm addr:$src)>;
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ (VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]
// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(CVTDQ2PDrm addr:$src)>;
+ def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))),
+ (CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]
// Convert packed double to packed single
@@ -2275,38 +2123,51 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
let Predicates = [HasAVX, NoVLX] in {
// Match fpround and fpextend for 128/256-bit conversions
- let AddedComplexity = 15 in
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
- (VCVTPD2PSrr VR128:$src)>;
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+ (VCVTPD2PSrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (loadv2f64 addr:$src)))))),
+ (VCVTPD2PSrm addr:$src)>;
+ }
}
let Predicates = [UseSSE2] in {
// Match fpround and fpextend for 128 conversions
- let AddedComplexity = 15 in
- def : Pat<(X86vzmovl (v2f64 (bitconvert
- (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
- (CVTPD2PSrr VR128:$src)>;
+ let AddedComplexity = 15 in {
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (v2f64 VR128:$src)))))),
+ (CVTPD2PSrr VR128:$src)>;
+ def : Pat<(X86vzmovl (v2f64 (bitconvert
+ (v4f32 (X86vfpround (memopv2f64 addr:$src)))))),
+ (CVTPD2PSrm addr:$src)>;
+ }
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
//===----------------------------------------------------------------------===//
+let Sched = WriteFAdd in
+def SSE_COMIS : OpndItins<
+ IIC_SSE_COMIS_RR, IIC_SSE_COMIS_RM
+>;
+
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
Operand CC, SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm, string asm_alt,
- OpndItins itins, ImmLeaf immLeaf> {
+ OpndItins itins> {
let isCommutable = 1 in
def rr : SIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
- [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, immLeaf:$cc))],
+ [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, imm:$cc))],
itins.rr>, Sched<[itins.Sched]>;
def rm : SIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
[(set RC:$dst, (OpNode (VT RC:$src1),
- (ld_frag addr:$src2), immLeaf:$cc))],
+ (ld_frag addr:$src2), imm:$cc))],
itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
@@ -2327,41 +2188,41 @@ let ExeDomain = SSEPackedSingle in
defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+ SSE_ALU_F32S>, XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
+ SSE_ALU_F32S>, // same latency as 32 bit compare
XD, VEX_4V, VEX_LIG, VEX_WIG;
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
"cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
- "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
- i8immZExt3>, XS;
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S>,
+ XS;
let ExeDomain = SSEPackedDouble in
defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
"cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
"cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSE_ALU_F64S, i8immZExt3>, XD;
+ SSE_ALU_F64S>, XD;
}
multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
Intrinsic Int, string asm, OpndItins itins,
- ImmLeaf immLeaf, ComplexPattern mem_cpat> {
- def rr : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
+ ComplexPattern mem_cpat> {
+ def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- VR128:$src, immLeaf:$cc))],
+ VR128:$src, imm:$cc))],
itins.rr>,
Sched<[itins.Sched]>;
let mayLoad = 1 in
- def rm : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
+ def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, memop:$src, CC:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
- mem_cpat:$src, immLeaf:$cc))],
+ mem_cpat:$src, imm:$cc))],
itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -2369,25 +2230,23 @@ let mayLoad = 1 in
let isCodeGenOnly = 1 in {
// Aliases to match intrinsics which expect XMM operand(s).
let ExeDomain = SSEPackedSingle in
- defm Int_VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
+ defm VCMPSS : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S, i8immZExt5, sse_load_f32>,
- XS, VEX_4V;
+ SSE_ALU_F32S, sse_load_f32>, XS, VEX_4V;
let ExeDomain = SSEPackedDouble in
- defm Int_VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
+ defm VCMPSD : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
- SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32
+ SSE_ALU_F32S, sse_load_f64>, // same latency as f32
XD, VEX_4V;
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
- defm Int_CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
+ defm CMPSS : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
"cmp${cc}ss\t{$src, $dst|$dst, $src}",
- SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS;
+ SSE_ALU_F32S, sse_load_f32>, XS;
let ExeDomain = SSEPackedDouble in
- defm Int_CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
+ defm CMPSD : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
"cmp${cc}sd\t{$src, $dst|$dst, $src}",
- SSE_ALU_F64S, i8immZExt3, sse_load_f64>,
- XD;
+ SSE_ALU_F64S, sse_load_f64>, XD;
}
}
@@ -2395,102 +2254,106 @@ let isCodeGenOnly = 1 in {
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, X86MemOperand x86memop,
- PatFrag ld_frag, string OpcodeStr> {
+ PatFrag ld_frag, string OpcodeStr,
+ OpndItins itins> {
+let hasSideEffects = 0 in {
def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
- IIC_SSE_COMIS_RR>,
- Sched<[WriteFAdd]>;
+ itins.rr>,
+ Sched<[itins.Sched]>;
let mayLoad = 1 in
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
(ld_frag addr:$src2)))],
- IIC_SSE_COMIS_RM>,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
}
// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, Operand memop,
- ComplexPattern mem_cpat, string OpcodeStr> {
+ ComplexPattern mem_cpat, string OpcodeStr,
+ OpndItins itins> {
def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))],
- IIC_SSE_COMIS_RR>,
- Sched<[WriteFAdd]>;
+ itins.rr>,
+ Sched<[itins.Sched]>;
let mayLoad = 1 in
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
mem_cpat:$src2))],
- IIC_SSE_COMIS_RM>,
- Sched<[WriteFAddLd, ReadAfterLd]>;
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Defs = [EFLAGS] in {
defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss">, PS, VEX, VEX_LIG, VEX_WIG;
+ "ucomiss", SSE_COMIS>, PS, VEX, VEX_LIG, VEX_WIG;
defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd">, PD, VEX, VEX_LIG, VEX_WIG;
+ "ucomisd", SSE_COMIS>, PD, VEX, VEX_LIG, VEX_WIG;
let Pattern = []<dag> in {
defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
- "comiss">, PS, VEX, VEX_LIG, VEX_WIG;
+ "comiss", SSE_COMIS>, PS, VEX, VEX_LIG, VEX_WIG;
defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
- "comisd">, PD, VEX, VEX_LIG, VEX_WIG;
+ "comisd", SSE_COMIS>, PD, VEX, VEX_LIG, VEX_WIG;
}
let isCodeGenOnly = 1 in {
defm Int_VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss">, PS, VEX, VEX_WIG;
+ sse_load_f32, "ucomiss", SSE_COMIS>, PS, VEX, VEX_WIG;
defm Int_VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd">, PD, VEX, VEX_WIG;
+ sse_load_f64, "ucomisd", SSE_COMIS>, PD, VEX, VEX_WIG;
defm Int_VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss">, PS, VEX, VEX_WIG;
+ sse_load_f32, "comiss", SSE_COMIS>, PS, VEX, VEX_WIG;
defm Int_VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd">, PD, VEX, VEX_WIG;
+ sse_load_f64, "comisd", SSE_COMIS>, PD, VEX, VEX_WIG;
}
defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss">, PS;
+ "ucomiss", SSE_COMIS>, PS;
defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd">, PD;
+ "ucomisd", SSE_COMIS>, PD;
let Pattern = []<dag> in {
defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
- "comiss">, PS;
+ "comiss", SSE_COMIS>, PS;
defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
- "comisd">, PD;
+ "comisd", SSE_COMIS>, PD;
}
let isCodeGenOnly = 1 in {
defm Int_UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss">, PS;
+ sse_load_f32, "ucomiss", SSE_COMIS>, PS;
defm Int_UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd">, PD;
+ sse_load_f64, "ucomisd", SSE_COMIS>, PD;
defm Int_COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss">, PS;
+ sse_load_f32, "comiss", SSE_COMIS>, PS;
defm Int_COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd">, PD;
+ sse_load_f64, "comisd", SSE_COMIS>, PD;
}
} // Defs = [EFLAGS]
// sse12_cmp_packed - sse 1 & 2 compare packed instructions
multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
Operand CC, ValueType VT, string asm,
- string asm_alt, Domain d, ImmLeaf immLeaf,
+ string asm_alt, Domain d,
PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
- [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, immLeaf:$cc)))],
+ [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, imm:$cc)))],
itins.rr, d>,
Sched<[WriteFAdd]>;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
[(set RC:$dst,
- (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), immLeaf:$cc)))],
+ (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), imm:$cc)))],
itins.rm, d>,
Sched<[WriteFAddLd, ReadAfterLd]>;
@@ -2510,181 +2373,200 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V, VEX_WIG;
+ SSEPackedSingle, loadv4f32>, PS, VEX_4V, VEX_WIG;
defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V, VEX_WIG;
+ SSEPackedDouble, loadv2f64>, PD, VEX_4V, VEX_WIG;
defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
"cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
+ SSEPackedSingle, loadv8f32>, PS, VEX_4V, VEX_L;
defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
"cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
"cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
+ SSEPackedDouble, loadv4f64>, PD, VEX_4V, VEX_L;
let Constraints = "$src1 = $dst" in {
defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
"cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
"cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
+ SSEPackedSingle, memopv4f32, SSE_ALU_F32P>, PS;
defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
"cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
"cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
+ SSEPackedDouble, memopv2f64, SSE_ALU_F64P>, PD;
+}
+
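+// 0x00 (EQ), 0x03 (UNORD), 0x04 (NEQ) and 0x07 (ORD) compare symmetrically,
+// so the operands can be swapped to fold a load into the second (memory)
+// operand in the patterns below.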
+def CommutableCMPCC : PatLeaf<(imm), [{
+ return (N->getZExtValue() == 0x00 || N->getZExtValue() == 0x03 ||
+ N->getZExtValue() == 0x04 || N->getZExtValue() == 0x07);
+}]>;
+
+// Patterns to select compares with loads in first operand.
+let Predicates = [HasAVX] in {
+ def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+
+ def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPPSYrmi VR256:$src1, addr:$src2, imm:$cc)>;
+
+ def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+ def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+ def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
+
+ def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
+ CommutableCMPCC:$cc)),
+ (VCMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
+ (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+ def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
+ CommutableCMPCC:$cc)),
+ (CMPSDrm FR64:$src1, addr:$src2, imm:$cc)>;
+}
+
+let Predicates = [UseSSE1] in {
+ def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
+ (CMPPSrmi VR128:$src1, addr:$src2, imm:$cc)>;
+
+ def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
+ CommutableCMPCC:$cc)),
+ (CMPSSrm FR32:$src1, addr:$src2, imm:$cc)>;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Shuffle Instructions
//===----------------------------------------------------------------------===//
+let Sched = WriteFShuffle in
+def SSE_SHUFP : OpndItins<
+ IIC_SSE_SHUFP, IIC_SSE_SHUFP
+>;
+
/// sse12_shuffle - sse 1 & 2 fp shuffle instructions
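/// (shufps/shufpd fill the low half of $dst with elements of $src1 and the
/// high half with elements of $src2; the imm8 selects which element of each
/// source is used.)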
multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
ValueType vt, string asm, PatFrag mem_frag,
- Domain d> {
+ OpndItins itins, Domain d> {
def rmi : PIi8<0xC6, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, (mem_frag addr:$src2),
- (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ (i8 imm:$src3))))], itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
def rri : PIi8<0xC6, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), asm,
[(set RC:$dst, (vt (X86Shufp RC:$src1, RC:$src2,
- (i8 imm:$src3))))], IIC_SSE_SHUFP, d>,
- Sched<[WriteFShuffle]>;
+ (i8 imm:$src3))))], itins.rr, d>,
+ Sched<[itins.Sched]>;
}
let Predicates = [HasAVX, NoVLX] in {
defm VSHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f32, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+ loadv4f32, SSE_SHUFP, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
"shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+ loadv8f32, SSE_SHUFP, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VSHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv2f64, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+ loadv2f64, SSE_SHUFP, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
"shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+ loadv4f64, SSE_SHUFP, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
"shufps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv4f32, SSEPackedSingle>, PS;
+ memopv4f32, SSE_SHUFP, SSEPackedSingle>, PS;
defm SHUFPD : sse12_shuffle<VR128, f128mem, v2f64,
"shufpd\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- memopv2f64, SSEPackedDouble>, PD;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4i32 (X86Shufp VR128:$src1,
- (bc_v4i32 (loadv2i64 addr:$src2)), (i8 imm:$imm))),
- (VSHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VSHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
-
- def : Pat<(v2i64 (X86Shufp VR128:$src1,
- (loadv2i64 addr:$src2), (i8 imm:$imm))),
- (VSHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VSHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
-
- // 256-bit patterns
- def : Pat<(v8i32 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VSHUFPSYrri VR256:$src1, VR256:$src2, imm:$imm)>;
- def : Pat<(v8i32 (X86Shufp VR256:$src1,
- (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
- (VSHUFPSYrmi VR256:$src1, addr:$src2, imm:$imm)>;
-
- def : Pat<(v4i64 (X86Shufp VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VSHUFPDYrri VR256:$src1, VR256:$src2, imm:$imm)>;
- def : Pat<(v4i64 (X86Shufp VR256:$src1,
- (loadv4i64 addr:$src2), (i8 imm:$imm))),
- (VSHUFPDYrmi VR256:$src1, addr:$src2, imm:$imm)>;
-}
-
-let Predicates = [UseSSE1] in {
- def : Pat<(v4i32 (X86Shufp VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)), (i8 imm:$imm))),
- (SHUFPSrmi VR128:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v4i32 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (SHUFPSrri VR128:$src1, VR128:$src2, imm:$imm)>;
-}
-
-let Predicates = [UseSSE2] in {
- // Generic SHUFPD patterns
- def : Pat<(v2i64 (X86Shufp VR128:$src1,
- (memopv2i64 addr:$src2), (i8 imm:$imm))),
- (SHUFPDrmi VR128:$src1, addr:$src2, imm:$imm)>;
- def : Pat<(v2i64 (X86Shufp VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (SHUFPDrri VR128:$src1, VR128:$src2, imm:$imm)>;
+ memopv2f64, SSE_SHUFP, SSEPackedDouble>, PD;
}
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Unpack FP Instructions
//===----------------------------------------------------------------------===//
+let Sched = WriteFShuffle in
+def SSE_UNPCK : OpndItins<
+ IIC_SSE_UNPCK, IIC_SSE_UNPCK
+>;
+
/// sse12_unpack_interleave - sse 1 & 2 fp unpack and interleave
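/// (unpckl* interleaves the elements of the low halves of the two sources,
/// unpckh* the elements of the high halves.)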
multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
PatFrag mem_frag, RegisterClass RC,
X86MemOperand x86memop, string asm,
- Domain d, bit IsCommutable = 0> {
+ OpndItins itins, Domain d, bit IsCommutable = 0> {
let isCommutable = IsCommutable in
def rr : PI<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
asm, [(set RC:$dst,
(vt (OpNode RC:$src1, RC:$src2)))],
- IIC_SSE_UNPCK, d>, Sched<[WriteFShuffle]>;
+ itins.rr, d>, Sched<[itins.Sched]>;
def rm : PI<opc, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
asm, [(set RC:$dst,
(vt (OpNode RC:$src1,
(mem_frag addr:$src2))))],
- IIC_SSE_UNPCK, d>,
- Sched<[WriteFShuffleLd, ReadAfterLd]>;
+ itins.rm, d>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX] in {
defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+ SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+ SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
+ SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
+ SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+ SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+ SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
+ SSE_UNPCK, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
+ SSE_UNPCK, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
}// Predicates = [HasAVX, NoVLX]
+
let Constraints = "$src1 = $dst" in {
defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
VR128, f128mem, "unpckhps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, PS;
+ SSE_UNPCK, SSEPackedSingle>, PS;
defm UNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, memopv2f64,
VR128, f128mem, "unpckhpd\t{$src2, $dst|$dst, $src2}",
- SSEPackedDouble, 1>, PD;
+ SSE_UNPCK, SSEPackedDouble, 1>, PD;
defm UNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, memopv4f32,
VR128, f128mem, "unpcklps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, PS;
+ SSE_UNPCK, SSEPackedSingle>, PS;
defm UNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, memopv2f64,
VR128, f128mem, "unpcklpd\t{$src2, $dst|$dst, $src2}",
- SSEPackedDouble>, PD;
+ SSE_UNPCK, SSEPackedDouble>, PD;
} // Constraints = "$src1 = $dst"
let Predicates = [HasAVX1Only] in {
@@ -2787,13 +2669,13 @@ let Predicates = [HasAVX2, prd] in
// These are ordered here for pattern ordering requirements with the fp versions
defm PAND : PDI_binop_all<0xDB, "pand", and, v2i64, v4i64,
- SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
+ SSE_BIT_ITINS_P, 1, NoVLX>;
defm POR : PDI_binop_all<0xEB, "por", or, v2i64, v4i64,
- SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
+ SSE_BIT_ITINS_P, 1, NoVLX>;
defm PXOR : PDI_binop_all<0xEF, "pxor", xor, v2i64, v4i64,
- SSE_VEC_BIT_ITINS_P, 1, NoVLX>;
+ SSE_BIT_ITINS_P, 1, NoVLX>;
defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
- SSE_VEC_BIT_ITINS_P, 0, NoVLX>;
+ SSE_BIT_ITINS_P, 0, NoVLX>;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Logical Instructions
@@ -2801,54 +2683,36 @@ defm PANDN : PDI_binop_all<0xDF, "pandn", X86andnp, v2i64, v4i64,
/// sse12_fp_packed_logical - SSE 1 & 2 packed FP logical ops
///
+/// There are no patterns here because isel prefers integer versions for SSE2
+/// and later. There are SSE1 v4f32 patterns later.
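+/// (With SSE2 the integer PAND/POR/PXOR/PANDN forms defined above are selected
+/// instead; the execution-domain fixup pass can still rewrite them to the
+/// PS/PD forms when that avoids a domain crossing.)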
multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
SDNode OpNode> {
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PSY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f256mem,
- [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
- (bc_v4i64 (v8f32 VR256:$src2))))],
- [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
- (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
+ [], [], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f256mem,
- [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
- (bc_v4i64 (v4f64 VR256:$src2))))],
- [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
- (loadv4i64 addr:$src2)))], 0>,
- PD, VEX_4V, VEX_L, VEX_WIG;
+ [], [], 0>, PD, VEX_4V, VEX_L, VEX_WIG;
defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem,
- [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (bc_v2i64 (v4f32 VR128:$src2))))],
- [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_WIG;
+ [], [], 0>, PS, VEX_4V, VEX_WIG;
defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem,
- [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
- (bc_v2i64 (v2f64 VR128:$src2))))],
- [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
- (loadv2i64 addr:$src2)))], 0>,
- PD, VEX_4V, VEX_WIG;
+ [], [], 0>, PD, VEX_4V, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
!strconcat(OpcodeStr, "ps"), f128mem,
- [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (bc_v2i64 (v4f32 VR128:$src2))))],
- [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
- (memopv2i64 addr:$src2)))]>, PS;
+ [], []>, PS;
defm PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
!strconcat(OpcodeStr, "pd"), f128mem,
- [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
- (bc_v2i64 (v2f64 VR128:$src2))))],
- [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
- (memopv2i64 addr:$src2)))]>, PD;
+ [], []>, PD;
}
}
@@ -3146,22 +3010,6 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
(!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
}
- // With SSE 4.1, blendi is preferred to movsd, so match that too.
- let Predicates = [UseSSE41] in {
- // extracted scalar math op with insert via blend
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (i8 1))),
- (!cast<I>(OpcPrefix#SSrr_Int) v4f32:$dst,
- (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
- // vector math op with insert via blend
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
- (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
- (!cast<I>(OpcPrefix#SSrr_Int)v4f32:$dst, v4f32:$src)>;
-
- }
-
// Repeat everything for AVX.
let Predicates = [UseAVX] in {
// extracted scalar math op with insert via movss
@@ -3171,22 +3019,10 @@ multiclass scalar_math_f32_patterns<SDNode Op, string OpcPrefix> {
(!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
(COPY_TO_REGCLASS FR32:$src, VR128))>;
- // extracted scalar math op with insert via blend
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst), (v4f32 (scalar_to_vector
- (Op (f32 (extractelt (v4f32 VR128:$dst), (iPTR 0))),
- FR32:$src))), (i8 1))),
- (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst,
- (COPY_TO_REGCLASS FR32:$src, VR128))>;
-
// vector math op with insert via movss
def : Pat<(v4f32 (X86Movss (v4f32 VR128:$dst),
(Op (v4f32 VR128:$dst), (v4f32 VR128:$src)))),
(!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
-
- // vector math op with insert via blend
- def : Pat<(v4f32 (X86Blendi (v4f32 VR128:$dst),
- (Op (v4f32 VR128:$dst), (v4f32 VR128:$src)), (i8 1))),
- (!cast<I>("V"#OpcPrefix#SSrr_Int) v4f32:$dst, v4f32:$src)>;
}
}
@@ -3210,21 +3046,6 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
(!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
}
- // With SSE 4.1, blendi is preferred to movsd, so match those too.
- let Predicates = [UseSSE41] in {
- // extracted scalar math op with insert via blend
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (i8 1))),
- (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst,
- (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
- // vector math op with insert via blend
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
- (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
- (!cast<I>(OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
- }
-
// Repeat everything for AVX.
let Predicates = [UseAVX] in {
// extracted scalar math op with insert via movsd
@@ -3234,22 +3055,10 @@ multiclass scalar_math_f64_patterns<SDNode Op, string OpcPrefix> {
(!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
(COPY_TO_REGCLASS FR64:$src, VR128))>;
- // extracted scalar math op with insert via blend
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst), (v2f64 (scalar_to_vector
- (Op (f64 (extractelt (v2f64 VR128:$dst), (iPTR 0))),
- FR64:$src))), (i8 1))),
- (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst,
- (COPY_TO_REGCLASS FR64:$src, VR128))>;
-
// vector math op with insert via movsd
def : Pat<(v2f64 (X86Movsd (v2f64 VR128:$dst),
(Op (v2f64 VR128:$dst), (v2f64 VR128:$src)))),
(!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
-
- // vector math op with insert via blend
- def : Pat<(v2f64 (X86Blendi (v2f64 VR128:$dst),
- (Op (v2f64 VR128:$dst), (v2f64 VR128:$src)), (i8 1))),
- (!cast<I>("V"#OpcPrefix#SDrr_Int) v2f64:$dst, v2f64:$src)>;
}
}
@@ -3295,6 +3104,14 @@ def SSE_RSQRTSS : OpndItins<
>;
}
+def SSE_RSQRT_P : SizeItins<
+ SSE_RSQRTPS, SSE_RSQRTPS
+>;
+
+def SSE_RSQRT_S : SizeItins<
+ SSE_RSQRTSS, SSE_RSQRTSS
+>;
+
let Sched = WriteFRcp in {
def SSE_RCPP : OpndItins<
IIC_SSE_RCPP_RR, IIC_SSE_RCPP_RM
@@ -3305,12 +3122,21 @@ def SSE_RCPS : OpndItins<
>;
}
+def SSE_RCP_P : SizeItins<
+ SSE_RCPP, SSE_RCPP
+>;
+
+def SSE_RCP_S : SizeItins<
+ SSE_RCPS, SSE_RCPS
+>;
+
/// sse_fp_unop_s - SSE1 unops in scalar form
/// For the non-AVX defs, we need $src1 to be tied to $dst because
/// the HW instructions are 2 operand / destructive.
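/// (e.g. "rcpss %xmm1, %xmm0" writes only the low 32 bits of %xmm0 and leaves
/// the upper bits unchanged, so the destination is also a source.)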
multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType vt, ValueType ScalarVT,
X86MemOperand x86memop,
+ Operand intmemop, ComplexPattern int_cpat,
Intrinsic Intr,
SDNode OpNode, Domain d, OpndItins itins,
Predicate target, string Suffix> {
@@ -3331,7 +3157,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
let mayLoad = 1 in
- def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, x86memop:$src2),
+ def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, intmemop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -3351,7 +3177,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// which has a clobber before the rcp, vs.
// rcpss mem, %xmm0
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
+ def : Pat<(Intr int_cpat:$src2),
(!cast<Instruction>(NAME#Suffix##m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
}
@@ -3360,8 +3186,9 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
ValueType vt, ValueType ScalarVT,
X86MemOperand x86memop,
+ Operand intmemop, ComplexPattern int_cpat,
Intrinsic Intr, SDNode OpNode, Domain d,
- OpndItins itins, string Suffix> {
+ OpndItins itins, Predicate target, string Suffix> {
let hasSideEffects = 0 in {
def r : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3377,7 +3204,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
[]>, Sched<[itins.Sched.Folded]>;
let mayLoad = 1 in
def m_Int : I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, x86memop:$src2),
+ (ins VR128:$src1, intmemop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -3392,21 +3219,17 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
// vrcpss mem, %xmm0, %xmm0
// TODO: In theory, we could fold the load, and avoid the stall caused by
// the partial register store, either in ExecutionDepsFix or with smarter RA.
- let Predicates = [UseAVX] in {
+ let Predicates = [target] in {
def : Pat<(OpNode RC:$src), (!cast<Instruction>("V"#NAME#Suffix##r)
(ScalarVT (IMPLICIT_DEF)), RC:$src)>;
- }
- let Predicates = [HasAVX] in {
def : Pat<(Intr VR128:$src),
(!cast<Instruction>("V"#NAME#Suffix##r_Int) VR128:$src,
VR128:$src)>;
}
- let Predicates = [HasAVX, OptForSize] in {
- def : Pat<(Intr (scalar_to_vector (ScalarVT (load addr:$src2)))),
+ let Predicates = [target, OptForSize] in {
+ def : Pat<(Intr int_cpat:$src2),
(!cast<Instruction>("V"#NAME#Suffix##m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
- }
- let Predicates = [UseAVX, OptForSize] in {
def : Pat<(ScalarVT (OpNode (load addr:$src))),
(!cast<Instruction>("V"#NAME#Suffix##m) (ScalarVT (IMPLICIT_DEF)),
addr:$src)>;
@@ -3452,7 +3275,7 @@ let Predicates = prds in {
/// sse2_fp_unop_p - SSE2 unops in vector forms.
multiclass sse2_fp_unop_p<bits<8> opc, string OpcodeStr,
SDNode OpNode, OpndItins itins> {
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def V#NAME#PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat("v", OpcodeStr,
"pd\t{$src, $dst|$dst, $src}"),
@@ -3486,40 +3309,43 @@ let Predicates = [HasAVX] in {
}
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
+ OpndItins itins, Predicate AVXTarget> {
defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, v4f32, f32, f32mem,
+ ssmem, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
SSEPackedSingle, itins, UseSSE1, "SS">, XS;
defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
- f32mem,
+ f32mem, ssmem, sse_load_f32,
!cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
- SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG;
+ SSEPackedSingle, itins, AVXTarget, "SS">, XS, VEX_4V,
+ VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
- OpndItins itins> {
+ OpndItins itins, Predicate AVXTarget> {
defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, v2f64, f64, f64mem,
+ sdmem, sse_load_f64,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
OpNode, SSEPackedDouble, itins, UseSSE2, "SD">, XD;
defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, v2f64, f64,
- f64mem,
+ f64mem, sdmem, sse_load_f64,
!cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
- OpNode, SSEPackedDouble, itins, "SD">,
- XD, VEX_4V, VEX_LIG, VEX_WIG;
+ OpNode, SSEPackedDouble, itins, AVXTarget, "SD">,
+ XD, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
// Square root.
-defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX]>,
- sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD>,
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSS, UseAVX>,
+ sse1_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPS, [HasAVX, NoVLX]>,
+ sse2_fp_unop_s<0x51, "sqrt", fsqrt, SSE_SQRTSD, UseAVX>,
sse2_fp_unop_p<0x51, "sqrt", fsqrt, SSE_SQRTPD>;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
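// (The hardware estimates are only accurate to roughly 12 bits; one
// Newton-Raphson step, x1 = x0*(2 - a*x0) for rcp and
// x1 = x0*(1.5 - 0.5*a*x0*x0) for rsqrt, roughly doubles the precision.)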
-defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS>,
- sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX, NoVLX] >;
-defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS>,
- sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX, NoVLX]>;
+defm RSQRT : sse1_fp_unop_s<0x52, "rsqrt", X86frsqrt, SSE_RSQRTSS, HasAVX>,
+ sse1_fp_unop_p<0x52, "rsqrt", X86frsqrt, SSE_RSQRTPS, [HasAVX]>;
+defm RCP : sse1_fp_unop_s<0x53, "rcp", X86frcp, SSE_RCPS, HasAVX>,
+ sse1_fp_unop_p<0x53, "rcp", X86frcp, SSE_RCPP, [HasAVX]>;
// There is no f64 version of the reciprocal approximation instructions.
@@ -3535,19 +3361,10 @@ multiclass scalar_unary_math_patterns<Intrinsic Intr, string OpcPrefix,
(!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
}
- // With SSE 4.1, blendi is preferred to movs*, so match that too.
- let Predicates = [UseSSE41] in {
- def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
- (!cast<I>(OpcPrefix#r_Int) VT:$dst, VT:$src)>;
- }
-
// Repeat for AVX versions of the instructions.
let Predicates = [HasAVX] in {
def : Pat<(VT (Move VT:$dst, (Intr VT:$src))),
(!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
-
- def : Pat<(VT (X86Blendi VT:$dst, (Intr VT:$src), (i8 1))),
- (!cast<I>("V"#OpcPrefix#r_Int) VT:$dst, VT:$src)>;
}
}
@@ -3893,34 +3710,6 @@ let Predicates = [HasAVX, NoVLX] in {
(VMOVDQUmr addr:$dst, VR128:$src)>;
def : Pat<(store (v16i8 VR128:$src), addr:$dst),
(VMOVDQUmr addr:$dst, VR128:$src)>;
-
- // Special patterns for storing subvector extracts of lower 128-bits
- // Its cheaper to just use VMOVDQA/VMOVDQU instead of VEXTRACTF128mr
- def : Pat<(alignedstore (v2i64 (extract_subvector
- (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVDQAmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v4i32 (extract_subvector
- (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVDQAmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v8i16 (extract_subvector
- (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVDQAmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(alignedstore (v16i8 (extract_subvector
- (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVDQAmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-
- def : Pat<(store (v2i64 (extract_subvector
- (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVDQUmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v4i32 (extract_subvector
- (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVDQUmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v8i16 (extract_subvector
- (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVDQUmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
- def : Pat<(store (v16i8 (extract_subvector
- (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
- (VMOVDQUmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
}
//===---------------------------------------------------------------------===//
@@ -4166,9 +3955,14 @@ defm PCMPGTD : PDI_binop_all<0x66, "pcmpgtd", X86pcmpgt, v4i32, v8i32,
// SSE2 - Packed Integer Shuffle Instructions
//===---------------------------------------------------------------------===//
+let Sched = WriteShuffle in
+def SSE_PSHUF : OpndItins<
+ IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
+>;
+
let ExeDomain = SSEPackedInt in {
multiclass sse2_pshuffle<string OpcodeStr, ValueType vt128, ValueType vt256,
- SDNode OpNode, Predicate prd> {
+ SDNode OpNode, OpndItins itins, Predicate prd> {
let Predicates = [HasAVX, prd] in {
def V#NAME#ri : Ii8<0x70, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, u8imm:$src2),
@@ -4176,15 +3970,15 @@ let Predicates = [HasAVX, prd] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>, VEX_WIG;
+ itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
- (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
- Sched<[WriteShuffleLd]>, VEX_WIG;
+ (i8 imm:$src2))))], itins.rm>, VEX,
+ Sched<[itins.Sched.Folded]>, VEX_WIG;
}
let Predicates = [HasAVX2, prd] in {
@@ -4194,15 +3988,15 @@ let Predicates = [HasAVX2, prd] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>, VEX_WIG;
+ itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src1, u8imm:$src2),
!strconcat("v", OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst,
(vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
- (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
- Sched<[WriteShuffleLd]>, VEX_WIG;
+ (i8 imm:$src2))))], itins.rm>, VEX, VEX_L,
+ Sched<[itins.Sched.Folded]>, VEX_WIG;
}
let Predicates = [UseSSE2] in {
@@ -4212,23 +4006,24 @@ let Predicates = [UseSSE2] in {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
- IIC_SSE_PSHUF_RI>, Sched<[WriteShuffle]>;
+ itins.rr>, Sched<[itins.Sched]>;
def mi : Ii8<0x70, MRMSrcMem,
(outs VR128:$dst), (ins i128mem:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (bitconvert (memopv2i64 addr:$src1)),
- (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ (i8 imm:$src2))))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
}
}
} // ExeDomain = SSEPackedInt
-defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, NoVLX>, PD;
-defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
+defm PSHUFD : sse2_pshuffle<"pshufd", v4i32, v8i32, X86PShufd, SSE_PSHUF,
+ NoVLX>, PD;
+defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw, SSE_PSHUF,
NoVLX_Or_NoBWI>, XS;
-defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
+defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw, SSE_PSHUF,
NoVLX_Or_NoBWI>, XD;
//===---------------------------------------------------------------------===//
@@ -4237,126 +4032,94 @@ defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
let ExeDomain = SSEPackedInt in {
multiclass sse2_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
- ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
+ ValueType ArgVT, SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop, OpndItins itins, PatFrag ld_frag,
bit Is2Addr = 1> {
def rr : PDI<opc, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst,
- (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
- Sched<[WriteShuffle]>;
+ [(set RC:$dst,
+ (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))],
+ itins.rr>, Sched<[itins.Sched]>;
def rm : PDI<opc, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst,
- (OutVT (OpNode (ArgVT VR128:$src1),
- (bitconvert (ld_frag addr:$src2)))))]>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
-}
-
-multiclass sse2_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
- ValueType ArgVT, SDNode OpNode> {
- def Yrr : PDI<opc, MRMSrcReg,
- (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst,
- (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
- Sched<[WriteShuffle]>;
- def Yrm : PDI<opc, MRMSrcMem,
- (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst,
- (OutVT (OpNode (ArgVT VR256:$src1),
- (bitconvert (loadv4i64 addr:$src2)))))]>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ [(set RC:$dst,
+ (OutVT (OpNode (ArgVT RC:$src1),
+ (bitconvert (ld_frag addr:$src2)))))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass sse4_pack<bits<8> opc, string OpcodeStr, ValueType OutVT,
- ValueType ArgVT, SDNode OpNode, PatFrag ld_frag,
+ ValueType ArgVT, SDNode OpNode, RegisterClass RC,
+ X86MemOperand x86memop, OpndItins itins, PatFrag ld_frag,
bit Is2Addr = 1> {
def rr : SS48I<opc, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst,
- (OutVT (OpNode (ArgVT VR128:$src1), VR128:$src2)))]>,
- Sched<[WriteShuffle]>;
+ [(set RC:$dst,
+ (OutVT (OpNode (ArgVT RC:$src1), RC:$src2)))],
+ itins.rr>, Sched<[itins.Sched]>;
def rm : SS48I<opc, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst,
- (OutVT (OpNode (ArgVT VR128:$src1),
- (bitconvert (ld_frag addr:$src2)))))]>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
-}
-
-multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
- ValueType ArgVT, SDNode OpNode> {
- def Yrr : SS48I<opc, MRMSrcReg,
- (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst,
- (OutVT (OpNode (ArgVT VR256:$src1), VR256:$src2)))]>,
- Sched<[WriteShuffle]>;
- def Yrm : SS48I<opc, MRMSrcMem,
- (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
- !strconcat(OpcodeStr,
- "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst,
- (OutVT (OpNode (ArgVT VR256:$src1),
- (bitconvert (loadv4i64 addr:$src2)))))]>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ [(set RC:$dst,
+ (OutVT (OpNode (ArgVT RC:$src1),
+ (bitconvert (ld_frag addr:$src2)))))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
- defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
- defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
+ defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss, VR128,
+ i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss, VR128,
+ i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG;
- defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
- defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
- loadv2i64, 0>, VEX_4V;
+ defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus, VR128,
+ i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus, VR128,
+ i128mem, SSE_PACK, loadv2i64, 0>, VEX_4V;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>,
- VEX_4V, VEX_L, VEX_WIG;
- defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>,
- VEX_4V, VEX_L, VEX_WIG;
+ defm VPACKSSWBY : sse2_pack<0x63, "vpacksswb", v32i8, v16i16, X86Packss,
+ VR256, i256mem, SSE_PACK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPACKSSDWY : sse2_pack<0x6B, "vpackssdw", v16i16, v8i32, X86Packss,
+ VR256, i256mem, SSE_PACK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
- defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>,
- VEX_4V, VEX_L, VEX_WIG;
- defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>,
- VEX_4V, VEX_L;
+ defm VPACKUSWBY : sse2_pack<0x67, "vpackuswb", v32i8, v16i16, X86Packus,
+                               VR256, i256mem, SSE_PACK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPACKUSDWY : sse4_pack<0x2B, "vpackusdw", v16i16, v8i32, X86Packus,
+ VR256, i256mem, SSE_PACK, loadv4i64, 0>,
+ VEX_4V, VEX_L;
}
let Constraints = "$src1 = $dst" in {
- defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss,
- memopv2i64>;
- defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss,
- memopv2i64>;
+ defm PACKSSWB : sse2_pack<0x63, "packsswb", v16i8, v8i16, X86Packss, VR128,
+ i128mem, SSE_PACK, memopv2i64>;
+ defm PACKSSDW : sse2_pack<0x6B, "packssdw", v8i16, v4i32, X86Packss, VR128,
+ i128mem, SSE_PACK, memopv2i64>;
- defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus,
- memopv2i64>;
+ defm PACKUSWB : sse2_pack<0x67, "packuswb", v16i8, v8i16, X86Packus, VR128,
+ i128mem, SSE_PACK, memopv2i64>;
- defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus,
- memopv2i64>;
+ defm PACKUSDW : sse4_pack<0x2B, "packusdw", v8i16, v4i32, X86Packus, VR128,
+ i128mem, SSE_PACK, memopv2i64>;
}
} // ExeDomain = SSEPackedInt
@@ -4364,103 +4127,107 @@ let Constraints = "$src1 = $dst" in {
// SSE2 - Packed Integer Unpack Instructions
//===---------------------------------------------------------------------===//
+let Sched = WriteShuffle in
+def SSE_PUNPCK : OpndItins<
+ IIC_SSE_UNPCK, IIC_SSE_UNPCK
+>;
+
let ExeDomain = SSEPackedInt in {
multiclass sse2_unpack<bits<8> opc, string OpcodeStr, ValueType vt,
- SDNode OpNode, PatFrag ld_frag, bit Is2Addr = 1> {
+ SDNode OpNode, RegisterClass RC, X86MemOperand x86memop,
+ OpndItins itins, PatFrag ld_frag, bit Is2Addr = 1> {
def rr : PDI<opc, MRMSrcReg,
- (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
+ (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (vt (OpNode VR128:$src1, VR128:$src2)))],
- IIC_SSE_UNPCK>, Sched<[WriteShuffle]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))],
+ itins.rr>, Sched<[itins.Sched]>;
def rm : PDI<opc, MRMSrcMem,
- (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2),
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr,"\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (vt (OpNode VR128:$src1,
+ [(set RC:$dst, (vt (OpNode RC:$src1,
(bitconvert (ld_frag addr:$src2)))))],
- IIC_SSE_UNPCK>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
-}
-
-multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
- SDNode OpNode> {
- def Yrr : PDI<opc, MRMSrcReg,
- (outs VR256:$dst), (ins VR256:$src1, VR256:$src2),
- !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (vt (OpNode VR256:$src1, VR256:$src2)))]>,
- Sched<[WriteShuffle]>;
- def Yrm : PDI<opc, MRMSrcMem,
- (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2),
- !strconcat(OpcodeStr,"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (vt (OpNode VR256:$src1,
- (bitconvert (loadv4i64 addr:$src2)))))]>,
- Sched<[WriteShuffleLd, ReadAfterLd]>;
+ itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
-
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
- defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
- defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
- defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
- defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
+ defm VPUNPCKLBW : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl, VR128,
+ i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ defm VPUNPCKLWD : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl, VR128,
+ i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh, VR128,
+ i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ defm VPUNPCKHWD : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh, VR128,
+ i128mem, SSE_PUNPCK, loadv2i64, 0>, VEX_4V, VEX_WIG;
}
+
let Predicates = [HasAVX, NoVLX] in {
- defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
- defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
- defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
- defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
- loadv2i64, 0>, VEX_4V, VEX_WIG;
+ defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl, VR128,
+ i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl, VR128,
+ i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh, VR128,
+ i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
+ defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh, VR128,
+ i128mem, SSE_PUNPCK, loadv2i64, 0>,
+ VEX_4V, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- defm VPUNPCKLBW : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>,
- VEX_4V, VEX_L, VEX_WIG;
- defm VPUNPCKLWD : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>,
- VEX_4V, VEX_L, VEX_WIG;
- defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>,
- VEX_4V, VEX_L, VEX_WIG;
- defm VPUNPCKHWD : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>,
- VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKLBWY : sse2_unpack<0x60, "vpunpcklbw", v32i8, X86Unpckl, VR256,
+ i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKLWDY : sse2_unpack<0x61, "vpunpcklwd", v16i16, X86Unpckl, VR256,
+ i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKHBWY : sse2_unpack<0x68, "vpunpckhbw", v32i8, X86Unpckh, VR256,
+ i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKHWDY : sse2_unpack<0x69, "vpunpckhwd", v16i16, X86Unpckh, VR256,
+ i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
}
+
let Predicates = [HasAVX2, NoVLX] in {
- defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>,
- VEX_4V, VEX_L, VEX_WIG;
- defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>,
- VEX_4V, VEX_L, VEX_WIG;
- defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>,
- VEX_4V, VEX_L, VEX_WIG;
- defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>,
- VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKLDQY : sse2_unpack<0x62, "vpunpckldq", v8i32, X86Unpckl, VR256,
+ i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKLQDQY : sse2_unpack<0x6C, "vpunpcklqdq", v4i64, X86Unpckl, VR256,
+ i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKHDQY : sse2_unpack<0x6A, "vpunpckhdq", v8i32, X86Unpckh, VR256,
+ i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPUNPCKHQDQY : sse2_unpack<0x6D, "vpunpckhqdq", v4i64, X86Unpckh, VR256,
+ i256mem, SSE_PUNPCK, loadv4i64, 0>,
+ VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
- defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl,
- memopv2i64>;
- defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl,
- memopv2i64>;
- defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl,
- memopv2i64>;
- defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl,
- memopv2i64>;
-
- defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh,
- memopv2i64>;
- defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh,
- memopv2i64>;
- defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh,
- memopv2i64>;
- defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh,
- memopv2i64>;
+ defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Unpckl, VR128,
+ i128mem, SSE_PUNPCK, memopv2i64>;
+ defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Unpckl, VR128,
+ i128mem, SSE_PUNPCK, memopv2i64>;
+ defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Unpckl, VR128,
+ i128mem, SSE_PUNPCK, memopv2i64>;
+ defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Unpckl, VR128,
+ i128mem, SSE_PUNPCK, memopv2i64>;
+
+ defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Unpckh, VR128,
+ i128mem, SSE_PUNPCK, memopv2i64>;
+ defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Unpckh, VR128,
+ i128mem, SSE_PUNPCK, memopv2i64>;
+ defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Unpckh, VR128,
+ i128mem, SSE_PUNPCK, memopv2i64>;
+ defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Unpckh, VR128,
+ i128mem, SSE_PUNPCK, memopv2i64>;
}
} // ExeDomain = SSEPackedInt
@@ -5014,6 +4781,12 @@ let Predicates = [UseSSE3] in {
// SSE3 - Replicate Double FP - MOVDDUP
//===---------------------------------------------------------------------===//
+// FIXME: Improve MOVDDUP/BROADCAST reg/mem scheduling itineraries.
+let Sched = WriteFShuffle in
+def SSE_MOVDDUP : OpndItins<
+ IIC_SSE_MOV_LH, IIC_SSE_MOV_LH
+>;
+
multiclass sse3_replicate_dfp<string OpcodeStr> {
def rr : S3DI<0x12, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
@@ -5051,23 +4824,11 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup">;
let Predicates = [HasAVX, NoVLX] in {
def : Pat<(X86Movddup (loadv2f64 addr:$src)),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
-
- // 256-bit version
- def : Pat<(X86Movddup (loadv4i64 addr:$src)),
- (VMOVDDUPYrm addr:$src)>;
- def : Pat<(X86Movddup (v4i64 VR256:$src)),
- (VMOVDDUPYrr VR256:$src)>;
}
-let Predicates = [HasAVX, NoVLX] in
-def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
- (VMOVDDUPrm addr:$src)>;
-let Predicates = [HasAVX1Only] in
-def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
- (VMOVDDUPrm addr:$src)>;
-
let Predicates = [UseSSE3] in {
- def : Pat<(X86Movddup (memopv2f64 addr:$src)),
+ // No need for aligned memory as this only loads 64-bits.
+ def : Pat<(X86Movddup (loadv2f64 addr:$src)),
(MOVDDUPrm addr:$src)>;
}
@@ -5095,7 +4856,7 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
// SSE3 - Arithmetic
//===---------------------------------------------------------------------===//
-multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
+multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
X86MemOperand x86memop, OpndItins itins,
PatFrag ld_frag, bit Is2Addr = 1> {
def rr : I<0xD0, MRMSrcReg,
@@ -5103,147 +4864,124 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr>,
+ [(set RC:$dst, (vt (X86Addsub RC:$src1, RC:$src2)))], itins.rr>,
Sched<[itins.Sched]>;
def rm : I<0xD0, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))], itins.rr>,
- Sched<[itins.Sched.Folded, ReadAfterLd]>;
+ [(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))],
+                 itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
- defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
- f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V, VEX_WIG;
- defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
- f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L, VEX_WIG;
+ defm VADDSUBPS : sse3_addsub<"vaddsubps", v4f32, VR128, f128mem,
+ SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V,
+ VEX_WIG;
+ defm VADDSUBPSY : sse3_addsub<"vaddsubps", v8f32, VR256, f256mem,
+ SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V,
+ VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
- defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
- f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V, VEX_WIG;
- defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
- f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
+ defm VADDSUBPD : sse3_addsub<"vaddsubpd", v2f64, VR128, f128mem,
+ SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V,
+ VEX_WIG;
+ defm VADDSUBPDY : sse3_addsub<"vaddsubpd", v4f64, VR256, f256mem,
+ SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V,
+ VEX_L, VEX_WIG;
}
}
let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
let ExeDomain = SSEPackedSingle in
- defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128,
- f128mem, SSE_ALU_F32P, memopv4f32>, XD;
+ defm ADDSUBPS : sse3_addsub<"addsubps", v4f32, VR128, f128mem, SSE_ALU_F32P,
+ memopv4f32>, XD;
let ExeDomain = SSEPackedDouble in
- defm ADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128,
- f128mem, SSE_ALU_F64P, memopv2f64>, PD;
-}
-
-// Patterns used to select 'addsub' instructions.
-let Predicates = [HasAVX] in {
- def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
- (VADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
- def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (loadv4f32 addr:$rhs))),
- (VADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
- def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
- (VADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
- def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (loadv2f64 addr:$rhs))),
- (VADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
-
- def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (v8f32 VR256:$rhs))),
- (VADDSUBPSYrr VR256:$lhs, VR256:$rhs)>;
- def : Pat<(v8f32 (X86Addsub (v8f32 VR256:$lhs), (loadv8f32 addr:$rhs))),
- (VADDSUBPSYrm VR256:$lhs, f256mem:$rhs)>;
- def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (v4f64 VR256:$rhs))),
- (VADDSUBPDYrr VR256:$lhs, VR256:$rhs)>;
- def : Pat<(v4f64 (X86Addsub (v4f64 VR256:$lhs), (loadv4f64 addr:$rhs))),
- (VADDSUBPDYrm VR256:$lhs, f256mem:$rhs)>;
-}
-
-let Predicates = [UseSSE3] in {
- def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (v4f32 VR128:$rhs))),
- (ADDSUBPSrr VR128:$lhs, VR128:$rhs)>;
- def : Pat<(v4f32 (X86Addsub (v4f32 VR128:$lhs), (memopv4f32 addr:$rhs))),
- (ADDSUBPSrm VR128:$lhs, f128mem:$rhs)>;
- def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (v2f64 VR128:$rhs))),
- (ADDSUBPDrr VR128:$lhs, VR128:$rhs)>;
- def : Pat<(v2f64 (X86Addsub (v2f64 VR128:$lhs), (memopv2f64 addr:$rhs))),
- (ADDSUBPDrm VR128:$lhs, f128mem:$rhs)>;
+ defm ADDSUBPD : sse3_addsub<"addsubpd", v2f64, VR128, f128mem, SSE_ALU_F64P,
+ memopv2f64>, PD;
}
//===---------------------------------------------------------------------===//
// SSE3 Instructions
//===---------------------------------------------------------------------===//
+let Sched = WriteFHAdd in
+def SSE_HADDSUB : OpndItins<
+ IIC_SSE_HADDSUB_RR, IIC_SSE_HADDSUB_RM
+>;
+
// Horizontal ops
multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
- X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
- bit Is2Addr = 1> {
+ X86MemOperand x86memop, SDNode OpNode, OpndItins itins,
+ PatFrag ld_frag, bit Is2Addr = 1> {
def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
- Sched<[WriteFHAdd]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+ Sched<[itins.Sched]>;
def rm : S3DI<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
- IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>;
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
- X86MemOperand x86memop, SDNode OpNode, PatFrag ld_frag,
- bit Is2Addr = 1> {
+ X86MemOperand x86memop, SDNode OpNode, OpndItins itins,
+ PatFrag ld_frag, bit Is2Addr = 1> {
def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], IIC_SSE_HADDSUB_RR>,
- Sched<[WriteFHAdd]>;
+ [(set RC:$dst, (vt (OpNode RC:$src1, RC:$src2)))], itins.rr>,
+ Sched<[itins.Sched]>;
def rm : S3I<o, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))],
- IIC_SSE_HADDSUB_RM>, Sched<[WriteFHAddLd, ReadAfterLd]>;
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
- X86fhadd, loadv4f32, 0>, VEX_4V, VEX_WIG;
+ X86fhadd, SSE_HADDSUB, loadv4f32, 0>, VEX_4V, VEX_WIG;
defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
- X86fhsub, loadv4f32, 0>, VEX_4V, VEX_WIG;
+ X86fhsub, SSE_HADDSUB, loadv4f32, 0>, VEX_4V, VEX_WIG;
defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
- X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
+ X86fhadd, SSE_HADDSUB, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
- X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
+ X86fhsub, SSE_HADDSUB, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
}
let ExeDomain = SSEPackedDouble in {
defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
- X86fhadd, loadv2f64, 0>, VEX_4V, VEX_WIG;
+ X86fhadd, SSE_HADDSUB, loadv2f64, 0>, VEX_4V, VEX_WIG;
defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
- X86fhsub, loadv2f64, 0>, VEX_4V, VEX_WIG;
+ X86fhsub, SSE_HADDSUB, loadv2f64, 0>, VEX_4V, VEX_WIG;
defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
- X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
+ X86fhadd, SSE_HADDSUB, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
- X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
+ X86fhsub, SSE_HADDSUB, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
}
}
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in {
defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd,
- memopv4f32>;
+ SSE_HADDSUB, memopv4f32>;
defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub,
- memopv4f32>;
+ SSE_HADDSUB, memopv4f32>;
}
let ExeDomain = SSEPackedDouble in {
defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd,
- memopv2f64>;
+ SSE_HADDSUB, memopv2f64>;
defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub,
- memopv2f64>;
+ SSE_HADDSUB, memopv2f64>;
}
}
@@ -5251,59 +4989,63 @@ let Constraints = "$src1 = $dst" in {
// SSSE3 - Packed Absolute Instructions
//===---------------------------------------------------------------------===//
+let Sched = WriteVecALU in
+def SSE_PABS : OpndItins<
+ IIC_SSE_PABS_RR, IIC_SSE_PABS_RM
+>;
/// SS3I_unop_rm - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm<bits<8> opc, string OpcodeStr, ValueType vt,
- SDNode OpNode, PatFrag ld_frag> {
+ SDNode OpNode, OpndItins itins, PatFrag ld_frag> {
def rr : SS38I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (vt (OpNode VR128:$src)))],
- IIC_SSE_PABS_RR>, Sched<[WriteVecALU]>;
+ itins.rr>, Sched<[itins.Sched]>;
def rm : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
(vt (OpNode (bitconvert (ld_frag addr:$src)))))],
- IIC_SSE_PABS_RM>, Sched<[WriteVecALULd]>;
+ itins.rm>, Sched<[itins.Sched.Folded]>;
}
/// SS3I_unop_rm_y - Simple SSSE3 unary op whose type can be v*{i8,i16,i32}.
multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
- SDNode OpNode> {
+ SDNode OpNode, OpndItins itins> {
def Yrr : SS38I<opc, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (vt (OpNode VR256:$src)))]>,
- Sched<[WriteVecALU]>;
+ [(set VR256:$dst, (vt (OpNode VR256:$src)))], itins.rr>,
+ Sched<[itins.Sched]>;
def Yrm : SS38I<opc, MRMSrcMem, (outs VR256:$dst),
(ins i256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR256:$dst,
- (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))]>,
- Sched<[WriteVecALULd]>;
+ (vt (OpNode (bitconvert (loadv4i64 addr:$src)))))], itins.rm>,
+ Sched<[itins.Sched.Folded]>;
}
let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
- defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, loadv2i64>, VEX, VEX_WIG;
- defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, loadv2i64>, VEX, VEX_WIG;
+ defm VPABSB : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX, NoVLX] in {
- defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, loadv2i64>, VEX, VEX_WIG;
+ defm VPABSD : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, SSE_PABS, loadv2i64>, VEX, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs>, VEX, VEX_L, VEX_WIG;
- defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs>, VEX, VEX_L, VEX_WIG;
+ defm VPABSB : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG;
+ defm VPABSW : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG;
}
let Predicates = [HasAVX2, NoVLX] in {
- defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs>, VEX, VEX_L, VEX_WIG;
+ defm VPABSD : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs, SSE_PABS>, VEX, VEX_L, VEX_WIG;
}
-defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, memopv2i64>;
-defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, memopv2i64>;
-defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, memopv2i64>;
+defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, SSE_PABS, memopv2i64>;
+defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, SSE_PABS, memopv2i64>;
+defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, SSE_PABS, memopv2i64>;
//===---------------------------------------------------------------------===//
// SSSE3 - Packed Binary Operator Instructions
@@ -5367,7 +5109,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
+ [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))], itins.rr>,
Sched<[itins.Sched]>;
def rm128 : SS38I<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
@@ -5376,7 +5118,7 @@ multiclass SS3I_binop_rm_int<bits<8> opc, string OpcodeStr,
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
[(set VR128:$dst,
(IntId128 VR128:$src1,
- (bitconvert (ld_frag addr:$src2))))]>,
+ (bitconvert (ld_frag addr:$src2))))], itins.rm>,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
@@ -5523,81 +5265,46 @@ defm PMULHRSW : SS3I_binop_rm<0x0B, "pmulhrsw", X86mulhrs, v8i16, v8i16,
// SSSE3 - Packed Align Instruction Patterns
//===---------------------------------------------------------------------===//
-multiclass ssse3_palignr<string asm, bit Is2Addr = 1> {
+let Sched = WriteShuffle in
+def SSE_PALIGN : OpndItins<
+ IIC_SSE_PALIGNRR, IIC_SSE_PALIGNRM
+>;
+
+multiclass ssse3_palignr<string asm, ValueType VT, RegisterClass RC,
+ PatFrag memop_frag, X86MemOperand x86memop,
+ OpndItins itins, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
- def rri : SS3AI<0x0F, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ def rri : SS3AI<0x0F, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
!if(Is2Addr,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [], IIC_SSE_PALIGNRR>, Sched<[WriteShuffle]>;
+ [(set RC:$dst, (VT (X86PAlignr RC:$src1, RC:$src2, (i8 imm:$src3))))],
+ itins.rr>, Sched<[itins.Sched]>;
let mayLoad = 1 in
- def rmi : SS3AI<0x0F, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ def rmi : SS3AI<0x0F, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
!if(Is2Addr,
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(asm,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [], IIC_SSE_PALIGNRM>, Sched<[WriteShuffleLd, ReadAfterLd]>;
- }
-}
-
-multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
- let hasSideEffects = 0 in {
- def Yrri : SS3AI<0x0F, MRMSrcReg, (outs VR256:$dst),
- (ins VR256:$src1, VR256:$src2, u8imm:$src3),
- !strconcat(asm,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, Sched<[WriteShuffle]>;
- let mayLoad = 1 in
- def Yrmi : SS3AI<0x0F, MRMSrcMem, (outs VR256:$dst),
- (ins VR256:$src1, i256mem:$src2, u8imm:$src3),
- !strconcat(asm,
- "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, Sched<[WriteShuffleLd, ReadAfterLd]>;
+ [(set RC:$dst, (VT (X86PAlignr RC:$src1,
+ (bitconvert (memop_frag addr:$src2)),
+ (i8 imm:$src3))))],
+ itins.rm>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
}
-let Predicates = [HasAVX] in
- defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V, VEX_WIG;
-let Predicates = [HasAVX2] in
- defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L, VEX_WIG;
+let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
+ defm VPALIGNR : ssse3_palignr<"vpalignr", v16i8, VR128, loadv2i64,
+ i128mem, SSE_PALIGN, 0>, VEX_4V, VEX_WIG;
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
+ defm VPALIGNRY : ssse3_palignr<"vpalignr", v32i8, VR256, loadv4i64,
+ i256mem, SSE_PALIGN, 0>, VEX_4V, VEX_L, VEX_WIG;
let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
- defm PALIGNR : ssse3_palignr<"palignr">;
-
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
-def : Pat<(v8i32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v8f32 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v16i16 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v32i8 (X86PAlignr VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPALIGNRYrri VR256:$src1, VR256:$src2, imm:$imm)>;
-}
-
-let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
-def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
-def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
-def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (VPALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
-}
-
-let Predicates = [UseSSSE3] in {
-def : Pat<(v4i32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
-def : Pat<(v4f32 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
-def : Pat<(v8i16 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
-def : Pat<(v16i8 (X86PAlignr VR128:$src1, VR128:$src2, (i8 imm:$imm))),
- (PALIGNRrri VR128:$src1, VR128:$src2, imm:$imm)>;
-}
+ defm PALIGNR : ssse3_palignr<"palignr", v16i8, VR128, memopv2i64,
+ i128mem, SSE_PALIGN>;
//===---------------------------------------------------------------------===//
// SSSE3 - Thread synchronization
@@ -5911,8 +5618,8 @@ multiclass SS41I_extract8<bits<8> opc, string OpcodeStr> {
(ins i8mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(store (i8 (trunc (assertzext (X86pextrb (v16i8 VR128:$src1),
- imm:$src2)))), addr:$dst)]>;
+ [(store (i8 (trunc (X86pextrb (v16i8 VR128:$src1), imm:$src2))),
+ addr:$dst)]>;
}
let Predicates = [HasAVX, NoBWI] in
@@ -5936,8 +5643,8 @@ multiclass SS41I_extract16<bits<8> opc, string OpcodeStr> {
(ins i16mem:$dst, VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr,
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(store (i16 (trunc (assertzext (X86pextrw (v8i16 VR128:$src1),
- imm:$src2)))), addr:$dst)]>;
+ [(store (i16 (trunc (X86pextrw (v8i16 VR128:$src1), imm:$src2))),
+ addr:$dst)]>;
}
let Predicates = [HasAVX, NoBWI] in
@@ -6147,18 +5854,6 @@ let ExeDomain = SSEPackedSingle in {
defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
}
-let Predicates = [UseSSE41] in {
- // If we're inserting an element from a load or a null pshuf of a load,
- // fold the load into the insertps instruction.
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd (v4f32
- (scalar_to_vector (loadf32 addr:$src2))), (i8 0)),
- imm:$src3)),
- (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
- def : Pat<(v4f32 (X86insertps (v4f32 VR128:$src1), (X86PShufd
- (loadv4f32 addr:$src2), (i8 0)), imm:$src3)),
- (INSERTPSrm VR128:$src1, addr:$src2, imm:$src3)>;
-}
-
let Predicates = [UseAVX] in {
// If we're inserting an element from a vbroadcast of a load, fold the
// load into the X86insertps instruction.
@@ -6176,8 +5871,9 @@ let Predicates = [UseAVX] in {
multiclass sse41_fp_unop_p<bits<8> opcps, bits<8> opcpd, string OpcodeStr,
X86MemOperand x86memop, RegisterClass RC,
+ ValueType VT32, ValueType VT64,
PatFrag mem_frag32, PatFrag mem_frag64,
- Intrinsic V4F32Int, Intrinsic V2F64Int> {
+ SDNode OpNode> {
let ExeDomain = SSEPackedSingle in {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
@@ -6185,7 +5881,7 @@ let ExeDomain = SSEPackedSingle in {
(outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (V4F32Int RC:$src1, imm:$src2))],
+ [(set RC:$dst, (VT32 (OpNode RC:$src1, imm:$src2)))],
IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
// Vector intrinsic operation, mem
@@ -6194,7 +5890,7 @@ let ExeDomain = SSEPackedSingle in {
!strconcat(OpcodeStr,
"ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (V4F32Int (mem_frag32 addr:$src1),imm:$src2))],
+ (VT32 (OpNode (mem_frag32 addr:$src1),imm:$src2)))],
IIC_SSE_ROUNDPS_MEM>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedSingle
@@ -6204,8 +5900,8 @@ let ExeDomain = SSEPackedDouble in {
(outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
"pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (V2F64Int RC:$src1, imm:$src2))],
- IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAdd]>;
+ [(set RC:$dst, (VT64 (OpNode RC:$src1, imm:$src2)))],
+ IIC_SSE_ROUNDPD_REG>, Sched<[WriteFAdd]>;
// Vector intrinsic operation, mem
def PDm : SS4AIi8<opcpd, MRMSrcMem,
@@ -6213,14 +5909,14 @@ let ExeDomain = SSEPackedDouble in {
!strconcat(OpcodeStr,
"pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set RC:$dst,
- (V2F64Int (mem_frag64 addr:$src1),imm:$src2))],
- IIC_SSE_ROUNDPS_REG>, Sched<[WriteFAddLd]>;
+ (VT64 (OpNode (mem_frag64 addr:$src1),imm:$src2)))],
+ IIC_SSE_ROUNDPD_REG>, Sched<[WriteFAddLd]>;
} // ExeDomain = SSEPackedDouble
}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
string OpcodeStr> {
-let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, FR32:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
@@ -6233,7 +5929,9 @@ let ExeDomain = GenericDomain, hasSideEffects = 0 in {
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
+let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
def SDr : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, FR64:$src2, i32u8imm:$src3),
!strconcat(OpcodeStr,
@@ -6246,12 +5944,12 @@ let ExeDomain = GenericDomain, hasSideEffects = 0 in {
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[]>, Sched<[WriteFAddLd, ReadAfterLd]>;
-} // ExeDomain = GenericDomain, hasSideEffects = 0
+} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr> {
-let ExeDomain = GenericDomain, hasSideEffects = 0 in {
+let ExeDomain = SSEPackedSingle, hasSideEffects = 0 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
@@ -6264,7 +5962,9 @@ let ExeDomain = GenericDomain, hasSideEffects = 0 in {
!strconcat(OpcodeStr,
"ss\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = SSEPackedSingle, hasSideEffects = 0
+let ExeDomain = SSEPackedDouble, hasSideEffects = 0 in {
def SDr : SS4AIi8<opcsd, MRMSrcReg,
(outs FR64:$dst), (ins FR64:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
@@ -6277,14 +5977,13 @@ let ExeDomain = GenericDomain, hasSideEffects = 0 in {
!strconcat(OpcodeStr,
"sd\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[]>, Sched<[WriteFAddLd, ReadAfterLd]>;
-} // ExeDomain = GenericDomain, hasSideEffects = 0
+} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
- string OpcodeStr,
- Intrinsic F32Int,
- Intrinsic F64Int, bit Is2Addr = 1> {
-let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
+ string OpcodeStr, ValueType VT32, ValueType VT64,
+ SDNode OpNode, bit Is2Addr = 1> {
+let ExeDomain = SSEPackedSingle, isCodeGenOnly = 1 in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -6292,7 +5991,7 @@ let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
"ss\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set VR128:$dst, (F32Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ [(set VR128:$dst, (VT32 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
Sched<[WriteFAdd]>;
def SSm_Int : SS4AIi8<opcss, MRMSrcMem,
@@ -6303,9 +6002,11 @@ let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (F32Int VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
+ (OpNode VR128:$src1, sse_load_f32:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
+} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
+let ExeDomain = SSEPackedDouble, isCodeGenOnly = 1 in {
def SDr_Int : SS4AIi8<opcsd, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
!if(Is2Addr,
@@ -6313,7 +6014,7 @@ let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
"sd\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
- [(set VR128:$dst, (F64Int VR128:$src1, VR128:$src2, imm:$src3))]>,
+ [(set VR128:$dst, (VT64 (OpNode VR128:$src1, VR128:$src2, imm:$src3)))]>,
Sched<[WriteFAdd]>;
def SDm_Int : SS4AIi8<opcsd, MRMSrcMem,
@@ -6324,26 +6025,25 @@ let ExeDomain = GenericDomain, isCodeGenOnly = 1 in {
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
+ (OpNode VR128:$src1, sse_load_f64:$src2, imm:$src3))]>,
Sched<[WriteFAddLd, ReadAfterLd]>;
-} // ExeDomain = GenericDomain, isCodeGenOnly = 1
+} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
// FP round - roundss, roundps, roundsd, roundpd
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
// Intrinsic form
- defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
- loadv4f32, loadv2f64,
- int_x86_sse41_round_ps,
- int_x86_sse41_round_pd>, VEX, VEX_WIG;
- defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
- loadv8f32, loadv4f64,
- int_x86_avx_round_ps_256,
- int_x86_avx_round_pd_256>, VEX, VEX_L, VEX_WIG;
- defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround",
- int_x86_sse41_round_ss,
- int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG, VEX_WIG;
- defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
+ defm VROUND : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128, v4f32,
+ v2f64, loadv4f32, loadv2f64, X86VRndScale>,
+ VEX, VEX_WIG;
+ defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256, v8f32,
+ v4f64, loadv8f32, loadv4f64, X86VRndScale>,
+ VEX, VEX_L, VEX_WIG;
+}
+let Predicates = [HasAVX, NoAVX512] in {
+ defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", v4f32, v2f64,
+ X86RndScales, 0>, VEX_4V, VEX_LIG, VEX_WIG;
+ defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG, VEX_WIG;
}
let Predicates = [UseAVX] in {
@@ -6369,7 +6069,7 @@ let Predicates = [UseAVX] in {
(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src, (i32 0xB))>;
}
-let Predicates = [HasAVX] in {
+let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v4f32 (ffloor VR128:$src)),
(VROUNDPSr VR128:$src, (i32 0x9))>;
def : Pat<(v4f32 (fnearbyint VR128:$src)),
@@ -6415,15 +6115,13 @@ let Predicates = [HasAVX] in {
(VROUNDYPDr VR256:$src, (i32 0xB))>;
}
-defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128,
- memopv4f32, memopv2f64, int_x86_sse41_round_ps,
- int_x86_sse41_round_pd>;
+defm ROUND : sse41_fp_unop_p<0x08, 0x09, "round", f128mem, VR128, v4f32, v2f64,
+ memopv4f32, memopv2f64, X86VRndScale>;
defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round">;
let Constraints = "$src1 = $dst" in
-defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round",
- int_x86_sse41_round_ss, int_x86_sse41_round_sd>;
+defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
def : Pat<(ffloor FR32:$src),
@@ -6474,6 +6172,11 @@ let Predicates = [UseSSE41] in {
// SSE4.1 - Packed Bit Test
//===----------------------------------------------------------------------===//
+let Sched = WriteVecLogic in
+def SSE_PTEST : OpndItins<
+ IIC_SSE_INTALU_P_RR, IIC_SSE_INTALU_P_RM
+>;
+
// ptest instruction - we'll lower to this in X86ISelLowering primarily from
// the Intel intrinsic that corresponds to it.
let Defs = [EFLAGS], Predicates = [HasAVX] in {
@@ -6572,22 +6275,20 @@ let Defs = [EFLAGS], Predicates = [HasPOPCNT] in {
Sched<[WriteFAddLd]>, XS;
}
-
-
// SS41I_unop_rm_int_v16 - SSE 4.1 unary operator whose type is v8i16.
multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
- Intrinsic IntId128, PatFrag ld_frag,
+ SDNode OpNode, PatFrag ld_frag,
X86FoldableSchedWrite Sched> {
def rr128 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (IntId128 VR128:$src))]>,
+ [(set VR128:$dst, (v8i16 (OpNode (v8i16 VR128:$src))))]>,
Sched<[Sched]>;
def rm128 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst,
- (IntId128 (bitconvert (ld_frag addr:$src))))]>,
+ (v8i16 (OpNode (v8i16 (bitconvert (ld_frag addr:$src))))))]>,
Sched<[Sched.Folded]>;
}
@@ -6595,10 +6296,10 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
// model, although the naming is misleading.
let Predicates = [HasAVX] in
defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
- int_x86_sse41_phminposuw, loadv2i64,
+ X86phminpos, loadv2i64,
WriteVecIMul>, VEX, VEX_WIG;
defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
- int_x86_sse41_phminposuw, memopv2i64,
+ X86phminpos, memopv2i64,
WriteVecIMul>;
/// SS48I_binop_rm - Simple SSE41 binary operator.
@@ -6763,8 +6464,8 @@ let Constraints = "$src1 = $dst" in {
/// SS41I_binop_rmi_int - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
Intrinsic IntId, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, bit Is2Addr = 1,
- OpndItins itins = DEFAULT_ITINS> {
+ X86MemOperand x86memop, bit Is2Addr,
+ OpndItins itins> {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -6791,8 +6492,8 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr,
/// SS41I_binop_rmi - SSE 4.1 binary operator with 8-bit immediate
multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop, bit Is2Addr = 1,
- OpndItins itins = DEFAULT_ITINS> {
+ X86MemOperand x86memop, bit Is2Addr,
+ OpndItins itins> {
let isCommutable = 1 in
def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -6816,6 +6517,21 @@ multiclass SS41I_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
+def BlendCommuteImm2 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue() & 0x03;
+ return getI8Imm(Imm ^ 0x03, SDLoc(N));
+}]>;
+
+def BlendCommuteImm4 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue() & 0x0f;
+ return getI8Imm(Imm ^ 0x0f, SDLoc(N));
+}]>;
+
+def BlendCommuteImm8 : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue() & 0xff;
+ return getI8Imm(Imm ^ 0xff, SDLoc(N));
+}]>;
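The three BlendCommuteImm transforms encode the fact that swapping the two sources of a blend makes every element come from the opposite operand, so the select immediate is simply complemented within the mask for the element count (2, 4 or 8). A minimal standalone C++ sketch of the same transform, outside of TableGen; the helper name and the main() harness are illustrative only, not part of the patch:

  #include <cassert>
  #include <cstdint>

  // Complement a blend-select immediate within its low NumElts bits,
  // mirroring BlendCommuteImm2/4/8 above (illustrative helper).
  static uint8_t commuteBlendImm(uint8_t Imm, unsigned NumElts) {
    uint8_t Mask = uint8_t((1u << NumElts) - 1);
    return uint8_t((Imm & Mask) ^ Mask);
  }

  int main() {
    assert(commuteBlendImm(0x05, 4) == 0x0a); // 0b0101 -> 0b1010 in a 4-element mask
    assert(commuteBlendImm(0x01, 2) == 0x02);
    assert(commuteBlendImm(0xc3, 8) == 0x3c);
    return 0;
  }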
+
let Predicates = [HasAVX] in {
let isCommutable = 0 in {
defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
@@ -6823,26 +6539,6 @@ let Predicates = [HasAVX] in {
DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG;
}
- let ExeDomain = SSEPackedSingle in {
- defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
- VR128, loadv4f32, f128mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
- defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
- VR256, loadv8f32, f256mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
- }
- let ExeDomain = SSEPackedDouble in {
- defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
- VR128, loadv2f64, f128mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
- defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
- VR256, loadv4f64, f256mem, 0,
- DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
- }
- defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
- VR128, loadv2i64, i128mem, 0,
- DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_WIG;
-
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, loadv4f32, f128mem, 0,
@@ -6863,9 +6559,6 @@ let Predicates = [HasAVX2] in {
VR256, loadv4i64, i256mem, 0,
DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
- defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
- VR256, loadv4i64, i256mem, 0,
- DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
@@ -6874,17 +6567,7 @@ let Constraints = "$src1 = $dst" in {
VR128, memopv2i64, i128mem,
1, SSE_MPSADBW_ITINS>;
}
- let ExeDomain = SSEPackedSingle in
- defm BLENDPS : SS41I_binop_rmi<0x0C, "blendps", X86Blendi, v4f32,
- VR128, memopv4f32, f128mem,
- 1, SSE_INTALU_ITINS_FBLEND_P>;
- let ExeDomain = SSEPackedDouble in
- defm BLENDPD : SS41I_binop_rmi<0x0D, "blendpd", X86Blendi, v2f64,
- VR128, memopv2f64, f128mem,
- 1, SSE_INTALU_ITINS_FBLEND_P>;
- defm PBLENDW : SS41I_binop_rmi<0x0E, "pblendw", X86Blendi, v8i16,
- VR128, memopv2i64, i128mem,
- 1, SSE_INTALU_ITINS_BLEND_P>;
+
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
VR128, memopv4f32, f128mem, 1,
@@ -6895,6 +6578,82 @@ let Constraints = "$src1 = $dst" in {
SSE_DPPD_ITINS>;
}
+/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
+multiclass SS41I_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
+ X86MemOperand x86memop, bit Is2Addr, Domain d,
+ OpndItins itins, SDNodeXForm commuteXForm> {
+let ExeDomain = d, Constraints = !if(Is2Addr, "$src1 = $dst", "") in {
+ let isCommutable = 1 in
+ def rri : SS4AIi8<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
+ itins.rr>, Sched<[itins.Sched]>;
+ def rmi : SS4AIi8<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, x86memop:$src2, u8imm:$src3),
+ !if(Is2Addr,
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
+ !strconcat(OpcodeStr,
+ "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
+ [(set RC:$dst,
+ (OpVT (OpNode RC:$src1,
+ (bitconvert (memop_frag addr:$src2)), imm:$src3)))], itins.rm>,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
+}
+
+ // Pattern to commute if load is in first source.
+ def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
+ RC:$src1, imm:$src3)),
+ (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
+ (commuteXForm imm:$src3))>;
+}
+
+let Predicates = [HasAVX] in {
+ defm VBLENDPS : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v4f32,
+ VR128, loadv4f32, f128mem, 0, SSEPackedSingle,
+ DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>,
+ VEX_4V, VEX_WIG;
+ defm VBLENDPSY : SS41I_blend_rmi<0x0C, "vblendps", X86Blendi, v8f32,
+ VR256, loadv8f32, f256mem, 0, SSEPackedSingle,
+ DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm8>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VBLENDPD : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
+ VR128, loadv2f64, f128mem, 0, SSEPackedDouble,
+ DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm2>,
+ VEX_4V, VEX_WIG;
+ defm VBLENDPDY : SS41I_blend_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
+ VR256, loadv4f64, f256mem, 0, SSEPackedDouble,
+ DEFAULT_ITINS_FBLENDSCHED, BlendCommuteImm4>,
+ VEX_4V, VEX_L, VEX_WIG;
+ defm VPBLENDW : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
+ VR128, loadv2i64, i128mem, 0, SSEPackedInt,
+ DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>,
+ VEX_4V, VEX_WIG;
+}
+
+let Predicates = [HasAVX2] in {
+ defm VPBLENDWY : SS41I_blend_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
+ VR256, loadv4i64, i256mem, 0, SSEPackedInt,
+ DEFAULT_ITINS_BLENDSCHED, BlendCommuteImm8>,
+ VEX_4V, VEX_L, VEX_WIG;
+}
+
+defm BLENDPS : SS41I_blend_rmi<0x0C, "blendps", X86Blendi, v4f32,
+ VR128, memopv4f32, f128mem, 1, SSEPackedSingle,
+ SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm4>;
+defm BLENDPD : SS41I_blend_rmi<0x0D, "blendpd", X86Blendi, v2f64,
+ VR128, memopv2f64, f128mem, 1, SSEPackedDouble,
+ SSE_INTALU_ITINS_FBLEND_P, BlendCommuteImm2>;
+defm PBLENDW : SS41I_blend_rmi<0x0E, "pblendw", X86Blendi, v8i16,
+ VR128, memopv2i64, i128mem, 1, SSEPackedInt,
+ SSE_INTALU_ITINS_BLEND_P, BlendCommuteImm8>;
+
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
let Predicates = [HasAVX] in {
@@ -7003,16 +6762,12 @@ let Predicates = [HasAVX2] in {
// movs[s/d] are 1-2 byte shorter instructions.
let Predicates = [UseAVX] in {
let AddedComplexity = 15 in {
- // Move scalar to XMM zero-extended, zeroing a VR128 then do a
- // MOVS{S,D} to the lower bits.
- def : Pat<(v4f32 (X86vzmovl (v4f32 (scalar_to_vector FR32:$src)))),
- (VMOVSSrr (v4f32 (V_SET0)), FR32:$src)>;
def : Pat<(v4f32 (X86vzmovl (v4f32 VR128:$src))),
(VBLENDPSrri (v4f32 (V_SET0)), VR128:$src, (i8 1))>;
def : Pat<(v4i32 (X86vzmovl (v4i32 VR128:$src))),
(VPBLENDWrri (v4i32 (V_SET0)), VR128:$src, (i8 3))>;
def : Pat<(v2f64 (X86vzmovl (v2f64 (scalar_to_vector FR64:$src)))),
- (VMOVSDrr (v2f64 (V_SET0)), FR64:$src)>;
+ (VMOVSDrr (v2f64 (V_SET0)), (COPY_TO_REGCLASS FR64:$src, VR128))>;
// Move low f32 and clear high bits.
def : Pat<(v8f32 (X86vzmovl (v8f32 VR256:$src))),
@@ -7049,7 +6804,7 @@ let Predicates = [UseSSE41], AddedComplexity = 15 in {
let Uses = [XMM0], Constraints = "$src1 = $dst" in {
multiclass SS41I_ternary_int<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
X86MemOperand x86memop, Intrinsic IntId,
- OpndItins itins = DEFAULT_ITINS> {
+ OpndItins itins> {
def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr,
@@ -7210,7 +6965,7 @@ multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
(bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}
-let Defs = [EFLAGS], usesCustomInserter = 1 in {
+let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
Requires<[HasAVX]>, VEX_WIG;
defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
@@ -7247,7 +7002,7 @@ multiclass pseudo_pcmpestrm<string asm, PatFrag ld_frag> {
(bc_v16i8 (ld_frag addr:$src3)), EDX, imm:$src5))]>;
}
-let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
defm VPCMPESTRM128 : pseudo_pcmpestrm<"#VPCMPESTRM128", loadv2i64>,
Requires<[HasAVX]>;
defm PCMPESTRM128 : pseudo_pcmpestrm<"#PCMPESTRM128", memopv2i64>,
@@ -7284,7 +7039,7 @@ multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
(bc_v16i8 (ld_frag addr:$src2)), imm:$src3))]>;
}
-let Defs = [EFLAGS], usesCustomInserter = 1 in {
+let Defs = [EFLAGS], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
Requires<[HasAVX]>, VEX_WIG;
defm PCMPISTRI : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
@@ -7322,7 +7077,7 @@ multiclass pseudo_pcmpestri<string asm, PatFrag ld_frag> {
imm:$src5))]>;
}
-let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1 in {
+let Defs = [EFLAGS], Uses = [EAX, EDX], usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
defm VPCMPESTRI : pseudo_pcmpestri<"#VPCMPESTRI", loadv2i64>,
Requires<[HasAVX]>;
defm PCMPESTRI : pseudo_pcmpestri<"#PCMPESTRI", memopv2i64>,
@@ -7400,8 +7155,9 @@ let Constraints = "$src1 = $dst" in {
// SHA-NI Instructions
//===----------------------------------------------------------------------===//
+// FIXME: Is there a better scheduler itinerary for SHA than WriteVecIMul?
multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
- bit UsesXMM0 = 0> {
+ OpndItins itins, bit UsesXMM0 = 0> {
def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2),
!if(UsesXMM0,
@@ -7409,7 +7165,8 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
[!if(UsesXMM0,
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
- (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
+ (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))], itins.rr>,
+ T8, Sched<[itins.Sched]>;
def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
@@ -7420,7 +7177,8 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
(set VR128:$dst, (IntId VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
(set VR128:$dst, (IntId VR128:$src1,
- (bc_v4i32 (memopv2i64 addr:$src2)))))]>, T8;
+ (bc_v4i32 (memopv2i64 addr:$src2)))))], itins.rm>, T8,
+ Sched<[itins.Sched.Folded, ReadAfterLd]>;
}
let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
@@ -7429,24 +7187,32 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
- (i8 imm:$src3)))]>, TA;
+ (i8 imm:$src3)))], IIC_SSE_INTMUL_P_RR>, TA,
+ Sched<[WriteVecIMul]>;
def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1,
(bc_v4i32 (memopv2i64 addr:$src2)),
- (i8 imm:$src3)))]>, TA;
+ (i8 imm:$src3)))], IIC_SSE_INTMUL_P_RM>, TA,
+ Sched<[WriteVecIMulLd, ReadAfterLd]>;
- defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte>;
- defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1>;
- defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2>;
+ defm SHA1NEXTE : SHAI_binop<0xC8, "sha1nexte", int_x86_sha1nexte,
+ SSE_INTMUL_ITINS_P>;
+ defm SHA1MSG1 : SHAI_binop<0xC9, "sha1msg1", int_x86_sha1msg1,
+ SSE_INTMUL_ITINS_P>;
+ defm SHA1MSG2 : SHAI_binop<0xCA, "sha1msg2", int_x86_sha1msg2,
+ SSE_INTMUL_ITINS_P>;
let Uses=[XMM0] in
- defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2, 1>;
+ defm SHA256RNDS2 : SHAI_binop<0xCB, "sha256rnds2", int_x86_sha256rnds2,
+ SSE_INTMUL_ITINS_P, 1>;
- defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1>;
- defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2>;
+ defm SHA256MSG1 : SHAI_binop<0xCC, "sha256msg1", int_x86_sha256msg1,
+ SSE_INTMUL_ITINS_P>;
+ defm SHA256MSG2 : SHAI_binop<0xCD, "sha256msg2", int_x86_sha256msg2,
+ SSE_INTMUL_ITINS_P>;
}
// Aliases with explicit %xmm0
@@ -7459,46 +7225,60 @@ def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
// AES-NI Instructions
//===----------------------------------------------------------------------===//
-multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
- PatFrag ld_frag, bit Is2Addr = 1> {
- def rr : AES8I<opc, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst, (IntId128 VR128:$src1, VR128:$src2))]>,
- Sched<[WriteAESDecEnc]>;
- def rm : AES8I<opc, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2),
- !if(Is2Addr,
- !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set VR128:$dst,
- (IntId128 VR128:$src1, (ld_frag addr:$src2)))]>,
- Sched<[WriteAESDecEncLd, ReadAfterLd]>;
+multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
+ Intrinsic IntId, PatFrag ld_frag,
+ bit Is2Addr = 0, RegisterClass RC = VR128,
+ X86MemOperand MemOp = i128mem> {
+ let AsmString = OpcodeStr##
+ !if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
+ "\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
+ def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2), "",
+ [(set RC:$dst, (IntId RC:$src1, RC:$src2))]>,
+ Sched<[WriteAESDecEnc]>;
+ def rm : AES8I<opc, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, MemOp:$src2), "",
+ [(set RC:$dst, (IntId RC:$src1, (ld_frag addr:$src2)))]>,
+ Sched<[WriteAESDecEncLd, ReadAfterLd]>;
+ }
}
// Perform One Round of an AES Encryption/Decryption Flow
-let Predicates = [HasAVX, HasAES] in {
+let Predicates = [HasAVX, NoVLX_Or_NoVAES, HasAES] in {
defm VAESENC : AESI_binop_rm_int<0xDC, "vaesenc",
- int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ int_x86_aesni_aesenc, loadv2i64>, VEX_4V, VEX_WIG;
defm VAESENCLAST : AESI_binop_rm_int<0xDD, "vaesenclast",
- int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ int_x86_aesni_aesenclast, loadv2i64>, VEX_4V, VEX_WIG;
defm VAESDEC : AESI_binop_rm_int<0xDE, "vaesdec",
- int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ int_x86_aesni_aesdec, loadv2i64>, VEX_4V, VEX_WIG;
defm VAESDECLAST : AESI_binop_rm_int<0xDF, "vaesdeclast",
- int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V, VEX_WIG;
+ int_x86_aesni_aesdeclast, loadv2i64>, VEX_4V, VEX_WIG;
+}
+
+let Predicates = [NoVLX, HasVAES] in {
+ defm VAESENCY : AESI_binop_rm_int<0xDC, "vaesenc",
+ int_x86_aesni_aesenc_256, loadv4i64, 0, VR256,
+ i256mem>, VEX_4V, VEX_L, VEX_WIG;
+ defm VAESENCLASTY : AESI_binop_rm_int<0xDD, "vaesenclast",
+ int_x86_aesni_aesenclast_256, loadv4i64, 0, VR256,
+ i256mem>, VEX_4V, VEX_L, VEX_WIG;
+ defm VAESDECY : AESI_binop_rm_int<0xDE, "vaesdec",
+ int_x86_aesni_aesdec_256, loadv4i64, 0, VR256,
+ i256mem>, VEX_4V, VEX_L, VEX_WIG;
+ defm VAESDECLASTY : AESI_binop_rm_int<0xDF, "vaesdeclast",
+ int_x86_aesni_aesdeclast_256, loadv4i64, 0, VR256,
+ i256mem>, VEX_4V, VEX_L, VEX_WIG;
}
let Constraints = "$src1 = $dst" in {
defm AESENC : AESI_binop_rm_int<0xDC, "aesenc",
- int_x86_aesni_aesenc, memopv2i64>;
+ int_x86_aesni_aesenc, memopv2i64, 1>;
defm AESENCLAST : AESI_binop_rm_int<0xDD, "aesenclast",
- int_x86_aesni_aesenclast, memopv2i64>;
+ int_x86_aesni_aesenclast, memopv2i64, 1>;
defm AESDEC : AESI_binop_rm_int<0xDE, "aesdec",
- int_x86_aesni_aesdec, memopv2i64>;
+ int_x86_aesni_aesdec, memopv2i64, 1>;
defm AESDECLAST : AESI_binop_rm_int<0xDF, "aesdeclast",
- int_x86_aesni_aesdeclast, memopv2i64>;
+ int_x86_aesni_aesdeclast, memopv2i64, 1>;
}
// Perform the AES InvMixColumn Transformation
@@ -7558,63 +7338,103 @@ def AESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
// PCLMUL Instructions
//===----------------------------------------------------------------------===//
-// AVX carry-less Multiplication instructions
-let isCommutable = 1 in
-def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR128:$dst,
- (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
- Sched<[WriteCLMul]>, VEX_WIG;
-
-def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
- (loadv2i64 addr:$src2), imm:$src3))]>,
- Sched<[WriteCLMulLd, ReadAfterLd]>, VEX_WIG;
-
-// Carry-less Multiplication instructions
-let Constraints = "$src1 = $dst" in {
-let isCommutable = 1 in
-def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src2, u8imm:$src3),
- "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set VR128:$dst,
- (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
- IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
-
-def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
- "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
- (memopv2i64 addr:$src2), imm:$src3))],
- IIC_SSE_PCLMULQDQ_RM>,
- Sched<[WriteCLMulLd, ReadAfterLd]>;
-} // Constraints = "$src1 = $dst"
+// Immediate transform to help with commuting.
+def PCLMULCommuteImm : SDNodeXForm<imm, [{
+ uint8_t Imm = N->getZExtValue();
+ return getI8Imm((uint8_t)((Imm >> 4) | (Imm << 4)), SDLoc(N));
+}]>;
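PCLMULCommuteImm works because bit 0 of the pclmulqdq immediate picks a quadword from the first source and bit 4 picks one from the second, so exchanging the sources amounts to exchanging the two nibbles. A small standalone C++ sketch of that swap; the function name is an assumption for illustration, not part of the patch:

  #include <cassert>
  #include <cstdint>

  // Swap the low and high nibbles of a pclmulqdq immediate, mirroring
  // PCLMULCommuteImm above (illustrative helper).
  static uint8_t commutePclmulImm(uint8_t Imm) {
    return uint8_t((Imm >> 4) | (Imm << 4));
  }

  int main() {
    assert(commutePclmulImm(0x01) == 0x10); // hqlq (0x01) <-> lqhq (0x10)
    assert(commutePclmulImm(0x11) == 0x11); // hqhq commutes to itself
    return 0;
  }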
-
-multiclass pclmul_alias<string asm, int immop> {
- def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
- (PCLMULQDQrr VR128:$dst, VR128:$src, immop), 0>;
-
- def : InstAlias<!strconcat("pclmul", asm, "dq {$src, $dst|$dst, $src}"),
- (PCLMULQDQrm VR128:$dst, i128mem:$src, immop), 0>;
-
- def : InstAlias<!strconcat("vpclmul", asm,
- "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
- (VPCLMULQDQrr VR128:$dst, VR128:$src1, VR128:$src2, immop),
- 0>;
-
- def : InstAlias<!strconcat("vpclmul", asm,
- "dq {$src2, $src1, $dst|$dst, $src1, $src2}"),
- (VPCLMULQDQrm VR128:$dst, VR128:$src1, i128mem:$src2, immop),
- 0>;
+// SSE carry-less Multiplication instructions
+let Predicates = [NoAVX, HasPCLMUL] in {
+ let Constraints = "$src1 = $dst" in {
+ let isCommutable = 1 in
+ def PCLMULQDQrr : PCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
+ (ins VR128:$src1, VR128:$src2, u8imm:$src3),
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))],
+ IIC_SSE_PCLMULQDQ_RR>, Sched<[WriteCLMul]>;
+
+ def PCLMULQDQrm : PCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
+ (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
+ "pclmulqdq\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ [(set VR128:$dst,
+ (int_x86_pclmulqdq VR128:$src1, (memopv2i64 addr:$src2),
+ imm:$src3))],
+                     IIC_SSE_PCLMULQDQ_RM>, Sched<[WriteCLMulLd, ReadAfterLd]>;
+ } // Constraints = "$src1 = $dst"
+
+ def : Pat<(int_x86_pclmulqdq (memopv2i64 addr:$src2), VR128:$src1,
+ (i8 imm:$src3)),
+ (PCLMULQDQrm VR128:$src1, addr:$src2,
+ (PCLMULCommuteImm imm:$src3))>;
+} // Predicates = [NoAVX, HasPCLMUL]
+
+// SSE aliases
+foreach HI = ["hq","lq"] in
+foreach LO = ["hq","lq"] in {
+ def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
+ (PCLMULQDQrr VR128:$dst, VR128:$src,
+ !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
+ def : InstAlias<"pclmul" # HI # LO # "dq\t{$src, $dst|$dst, $src}",
+ (PCLMULQDQrm VR128:$dst, i128mem:$src,
+ !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq"))), 0>;
}
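The !add(!shl(!eq(LO,"hq"),4),!eq(HI,"hq")) expression reproduces the immediates of the pclmul_alias defs removed below: the first suffix in the mnemonic drives bit 0 and the second drives bit 4, giving hqhq = 0x11, hqlq = 0x01, lqhq = 0x10 and lqlq = 0x00. A hedged C++ check of that encoding (the function name is illustrative only):

  #include <cassert>
  #include <cstdint>
  #include <string>

  // Compute the pclmulqdq alias immediate from the two "hq"/"lq" suffixes,
  // matching the TableGen expression in the foreach above (illustrative).
  static uint8_t pclmulAliasImm(const std::string &Hi, const std::string &Lo) {
    return uint8_t(((Lo == "hq") ? 0x10 : 0x00) | ((Hi == "hq") ? 0x01 : 0x00));
  }

  int main() {
    assert(pclmulAliasImm("hq", "hq") == 0x11);
    assert(pclmulAliasImm("hq", "lq") == 0x01);
    assert(pclmulAliasImm("lq", "hq") == 0x10);
    assert(pclmulAliasImm("lq", "lq") == 0x00);
    return 0;
  }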
-defm : pclmul_alias<"hqhq", 0x11>;
-defm : pclmul_alias<"hqlq", 0x01>;
-defm : pclmul_alias<"lqhq", 0x10>;
-defm : pclmul_alias<"lqlq", 0x00>;
+
+// AVX carry-less Multiplication instructions
+multiclass vpclmulqdq<RegisterClass RC, X86MemOperand MemOp,
+ PatFrag LdFrag, Intrinsic IntId> {
+ let isCommutable = 1 in
+ def rr : PCLMULIi8<0x44, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3),
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set RC:$dst,
+ (IntId RC:$src1, RC:$src2, imm:$src3))]>,
+ Sched<[WriteCLMul]>;
+
+ def rm : PCLMULIi8<0x44, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, MemOp:$src2, u8imm:$src3),
+ "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
+ [(set RC:$dst,
+ (IntId RC:$src1, (LdFrag addr:$src2), imm:$src3))]>,
+ Sched<[WriteCLMulLd, ReadAfterLd]>;
+
+ // We can commute a load in the first operand by swapping the sources and
+ // rotating the immediate.
+ def : Pat<(IntId (LdFrag addr:$src2), RC:$src1, (i8 imm:$src3)),
+ (!cast<Instruction>(NAME#"rm") RC:$src1, addr:$src2,
+ (PCLMULCommuteImm imm:$src3))>;
+}
+
+let Predicates = [HasAVX, NoVLX_Or_NoVPCLMULQDQ, HasPCLMUL] in
+defm VPCLMULQDQ : vpclmulqdq<VR128, i128mem, loadv2i64,
+ int_x86_pclmulqdq>, VEX_4V, VEX_WIG;
+
+let Predicates = [NoVLX, HasVPCLMULQDQ] in
+defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, loadv4i64,
+ int_x86_pclmulqdq_256>, VEX_4V, VEX_L, VEX_WIG;
+
+multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
+ X86MemOperand MemOp, string Hi, string Lo> {
+ def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
+ !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
+ def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ (!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
+ !add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
+}
+
+multiclass vpclmulqdq_aliases<string InstStr, RegisterClass RC,
+ X86MemOperand MemOp> {
+ defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "hq">;
+ defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "hq", "lq">;
+ defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "hq">;
+ defm : vpclmulqdq_aliases_impl<InstStr, RC, MemOp, "lq", "lq">;
+}
+
+// AVX aliases
+defm : vpclmulqdq_aliases<"VPCLMULQDQ", VR128, i128mem>;
+defm : vpclmulqdq_aliases<"VPCLMULQDQY", VR256, i256mem>;
//===----------------------------------------------------------------------===//
// SSE4A Instructions
@@ -7628,29 +7448,33 @@ def EXTRQI : Ii8<0x78, MRMXr, (outs VR128:$dst),
(ins VR128:$src, u8imm:$len, u8imm:$idx),
"extrq\t{$idx, $len, $src|$src, $len, $idx}",
[(set VR128:$dst, (X86extrqi VR128:$src, imm:$len,
- imm:$idx))]>, PD;
+ imm:$idx))], IIC_SSE_INTALU_P_RR>,
+ PD, Sched<[WriteVecALU]>;
def EXTRQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
"extrq\t{$mask, $src|$src, $mask}",
[(set VR128:$dst, (int_x86_sse4a_extrq VR128:$src,
- VR128:$mask))]>, PD;
+ VR128:$mask))], IIC_SSE_INTALU_P_RR>,
+ PD, Sched<[WriteVecALU]>;
def INSERTQI : Ii8<0x78, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$src2, u8imm:$len, u8imm:$idx),
"insertq\t{$idx, $len, $src2, $src|$src, $src2, $len, $idx}",
[(set VR128:$dst, (X86insertqi VR128:$src, VR128:$src2,
- imm:$len, imm:$idx))]>, XD;
+ imm:$len, imm:$idx))], IIC_SSE_INTALU_P_RR>,
+ XD, Sched<[WriteVecALU]>;
def INSERTQ : I<0x79, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src, VR128:$mask),
"insertq\t{$mask, $src|$src, $mask}",
[(set VR128:$dst, (int_x86_sse4a_insertq VR128:$src,
- VR128:$mask))]>, XD;
+ VR128:$mask))], IIC_SSE_INTALU_P_RR>,
+ XD, Sched<[WriteVecALU]>;
}
} // ExeDomain = SSEPackedInt
// Non-temporal (unaligned) scalar stores.
let AddedComplexity = 400 in { // Prefer non-temporal versions
-let mayStore = 1, SchedRW = [WriteStore] in {
+let hasSideEffects = 0, mayStore = 1, SchedRW = [WriteStore] in {
def MOVNTSS : I<0x2B, MRMDestMem, (outs), (ins f32mem:$dst, VR128:$src),
"movntss\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVNT>, XS;
@@ -7712,6 +7536,15 @@ let ExeDomain = SSEPackedDouble, Predicates = [HasAVX2, NoVLX] in
def VBROADCASTSDYrr : avx2_broadcast_rr<0x19, "vbroadcastsd", VR256,
v4f64, v2f64, WriteFShuffle256>, VEX_L;
+let Predicates = [HasAVX, NoVLX] in {
+ def : Pat<(v4f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (VBROADCASTSSrm addr:$src)>;
+ def : Pat<(v8f32 (X86VBroadcast (v4f32 (scalar_to_vector (loadf32 addr:$src))))),
+ (VBROADCASTSSYrm addr:$src)>;
+ def : Pat<(v4f64 (X86VBroadcast (v2f64 (scalar_to_vector (loadf64 addr:$src))))),
+ (VBROADCASTSDYrm addr:$src)>;
+}
+
//===----------------------------------------------------------------------===//
// VBROADCAST*128 - Load from memory and broadcast 128-bit vector to both
// halves of a 256-bit vector.
@@ -7852,21 +7685,23 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr,
def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, f128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))]>,
- VEX_4V;
+ [(set VR128:$dst, (IntLd addr:$src2, VR128:$src1))],
+ IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>;
def Yrm : AVX8I<opc_rm, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V, VEX_L;
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))],
+ IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>;
def mr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+ [(IntSt addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>,
+ VEX_4V, Sched<[WriteStore]>;
def Ymr : AVX8I<opc_mr, MRMDestMem, (outs),
(ins f256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>,
+ VEX_4V, VEX_L, Sched<[WriteStore]>;
}
let ExeDomain = SSEPackedSingle in
@@ -7885,6 +7720,17 @@ defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd",
//===----------------------------------------------------------------------===//
// VPERMIL - Permute Single and Double Floating-Point Values
//
+
+let Sched = WriteFShuffle in
+def AVX_VPERMILV : OpndItins<
+ IIC_SSE_SHUFP, IIC_SSE_SHUFP
+>;
+
+let Sched = WriteFShuffle in
+def AVX_VPERMIL : OpndItins<
+ IIC_SSE_SHUFP, IIC_SSE_SHUFP
+>;
+
multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr,
RegisterClass RC, X86MemOperand x86memop_f,
X86MemOperand x86memop_i, PatFrag i_frag,
@@ -7937,83 +7783,81 @@ let isCommutable = 1 in
def VPERM2F128rr : AVXAIi8<0x06, MRMSrcReg, (outs VR256:$dst),
(ins VR256:$src1, VR256:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR256:$dst, (v8f32 (X86VPerm2x128 VR256:$src1, VR256:$src2,
+ [(set VR256:$dst, (v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2,
(i8 imm:$src3))))]>, VEX_4V, VEX_L,
Sched<[WriteFShuffle]>;
def VPERM2F128rm : AVXAIi8<0x06, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, f256mem:$src2, u8imm:$src3),
"vperm2f128\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv8f32 addr:$src2),
+ [(set VR256:$dst, (X86VPerm2x128 VR256:$src1, (loadv4f64 addr:$src2),
(i8 imm:$src3)))]>, VEX_4V, VEX_L,
Sched<[WriteFShuffleLd, ReadAfterLd]>;
}
+// Immediate transform to help with commuting.
+def Perm2XCommuteImm : SDNodeXForm<imm, [{
+ return getI8Imm(N->getZExtValue() ^ 0x22, SDLoc(N));
+}]>;
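Perm2XCommuteImm relies on the vperm2f128 immediate layout: the immediate holds two 4-bit lane selectors (low and high destination lanes) and bit 1 of each selector picks between the first and second source, so commuting the sources just toggles that bit in both fields, i.e. XOR with 0x22. A minimal standalone C++ sketch; the helper name is assumed for illustration:

  #include <cassert>
  #include <cstdint>

  // Toggle the source-select bit in both 4-bit lane selectors of a
  // vperm2f128/vperm2i128 immediate, mirroring Perm2XCommuteImm above.
  static uint8_t commutePerm2Imm(uint8_t Imm) {
    return uint8_t(Imm ^ 0x22);
  }

  int main() {
    // 0x20 = low lane from src1[127:0], high lane from src2[127:0]; with the
    // sources swapped the same shuffle needs 0x02.
    assert(commutePerm2Imm(0x20) == 0x02);
    assert(commutePerm2Imm(0x31) == 0x13);
    return 0;
  }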
+
let Predicates = [HasAVX] in {
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v4f64 (X86VPerm2x128 VR256:$src1,
- (loadv4f64 addr:$src2), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+// Pattern with load in other operand.
+def : Pat<(v4f64 (X86VPerm2x128 (loadv4f64 addr:$src2),
+ VR256:$src1, (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
}
let Predicates = [HasAVX1Only] in {
-def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
(VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2F128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-
-def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1,
- (bc_v8i32 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
def : Pat<(v4i64 (X86VPerm2x128 VR256:$src1,
(loadv4i64 addr:$src2), (i8 imm:$imm))),
(VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1,
- (bc_v32i8 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
- (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
- (VPERM2F128rm VR256:$src1, addr:$src2, imm:$imm)>;
+// Pattern with load in other operand.
+def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
+ VR256:$src1, (i8 imm:$imm))),
+ (VPERM2F128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
}
//===----------------------------------------------------------------------===//
// VZERO - Zero YMM registers
//
// Note, these instructions do not affect YMM16-YMM31.
+let SchedRW = [WriteSystem] in {
let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
// Zero All YMM registers
def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
- [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>, VEX_WIG;
+ [(int_x86_avx_vzeroall)], IIC_AVX_ZERO>, PS, VEX, VEX_L,
+ Requires<[HasAVX]>, VEX_WIG;
// Zero Upper bits of YMM registers
def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
- [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>, VEX_WIG;
-}
+ [(int_x86_avx_vzeroupper)], IIC_AVX_ZERO>, PS, VEX,
+ Requires<[HasAVX]>, VEX_WIG;
+} // Defs
+} // SchedRW
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
-multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop> {
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
- [(set RC:$dst, (Int VR128:$src))]>,
+ [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
T8PD, VEX, Sched<[WriteCvtF2F]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
- "vcvtph2ps\t{$src, $dst|$dst, $src}", []>, T8PD, VEX,
- Sched<[WriteCvtF2FLd]>;
+ "vcvtph2ps\t{$src, $dst|$dst, $src}",
+ [(set RC:$dst, (X86cvtph2ps (bc_v8i16
+ (loadv2i64 addr:$src))))]>,
+ T8PD, VEX, Sched<[WriteCvtF2FLd]>;
}
-multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
+multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop> {
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (Int RC:$src1, imm:$src2))]>,
+ [(set VR128:$dst, (X86cvtps2ph RC:$src1, imm:$src2))]>,
TAPD, VEX, Sched<[WriteCvtF2F]>;
let hasSideEffects = 0, mayStore = 1,
SchedRW = [WriteCvtF2FLd, WriteRMW] in
@@ -8023,32 +7867,31 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop, Intrinsic Int> {
TAPD, VEX;
}
-let Predicates = [HasF16C] in {
- defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, int_x86_vcvtph2ps_128>;
- defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, int_x86_vcvtph2ps_256>, VEX_L;
- defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, int_x86_vcvtps2ph_128>;
- defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, int_x86_vcvtps2ph_256>, VEX_L;
+let Predicates = [HasF16C, NoVLX] in {
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem>;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem>, VEX_L;
+ defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem>;
+ defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem>, VEX_L;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(int_x86_vcvtph2ps_128 (vzmovl_v2i64 addr:$src)),
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzmovl_v2i64 addr:$src)))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(int_x86_vcvtph2ps_128 (vzload_v2i64 addr:$src)),
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (vzload_v2i64 addr:$src)))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(int_x86_vcvtph2ps_128 (bitconvert
- (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+ (v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(store (f64 (extractelt (bc_v2f64 (v8i16
- (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
- addr:$dst),
- (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
- def : Pat<(store (i64 (extractelt (bc_v2i64 (v8i16
- (int_x86_vcvtps2ph_128 VR128:$src1, i32:$src2))), (iPTR 0))),
- addr:$dst),
- (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
- def : Pat<(store (v8i16 (int_x86_vcvtps2ph_256 VR256:$src1, i32:$src2)),
- addr:$dst),
- (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
+ def : Pat<(store (f64 (extractelt
+ (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ def : Pat<(store (i64 (extractelt
+ (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, i32:$src2))),
+ (iPTR 0))), addr:$dst),
+ (VCVTPS2PHmr addr:$dst, VR128:$src1, imm:$src2)>;
+ def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, i32:$src2)), addr:$dst),
+ (VCVTPS2PHYmr addr:$dst, VR256:$src1, imm:$src2)>;
}
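For reference, the load-folding patterns above cover the common case where four packed half-precision values (64 bits) are loaded and widened to single precision. A small C++ illustration of that shape, assuming an F16C-enabled build (-mf16c); the function name is made up for the example:

  #include <immintrin.h>
  // Load 64 bits (four f16 values) and convert them to four f32 values;
  // this is the vcvtph2ps-with-memory-operand case matched above.
  __m128 load4_half_to_float(const void *p) {
    __m128i h = _mm_loadl_epi64((const __m128i *)p); // scalar 64-bit load
    return _mm_cvtph_ps(h);                          // vcvtph2ps
  }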
// Patterns for matching conversions from float to half-float and vice versa.
@@ -8075,10 +7918,10 @@ let Predicates = [HasF16C, NoVLX] in {
// AVX2 Instructions
//===----------------------------------------------------------------------===//
-/// AVX2_binop_rmi - AVX2 binary operator with 8-bit immediate
-multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
+/// AVX2_blend_rmi - AVX2 blend with 8-bit immediate
+multiclass AVX2_blend_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT, RegisterClass RC, PatFrag memop_frag,
- X86MemOperand x86memop> {
+ X86MemOperand x86memop, SDNodeXForm commuteXForm> {
let isCommutable = 1 in
def rri : AVX2AIi8<opc, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3),
@@ -8094,12 +7937,19 @@ multiclass AVX2_binop_rmi<bits<8> opc, string OpcodeStr, SDNode OpNode,
(OpVT (OpNode RC:$src1,
(bitconvert (memop_frag addr:$src2)), imm:$src3)))]>,
Sched<[WriteBlendLd, ReadAfterLd]>, VEX_4V;
+
+  // Pattern to commute if the load is in the first source.
+ def : Pat<(OpVT (OpNode (bitconvert (memop_frag addr:$src2)),
+ RC:$src1, imm:$src3)),
+ (!cast<Instruction>(NAME#"rmi") RC:$src1, addr:$src2,
+ (commuteXForm imm:$src3))>;
}
-defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
- VR128, loadv2i64, i128mem>;
-defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
- VR256, loadv4i64, i256mem>, VEX_L;
+defm VPBLENDD : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v4i32,
+ VR128, loadv2i64, i128mem, BlendCommuteImm4>;
+defm VPBLENDDY : AVX2_blend_rmi<0x02, "vpblendd", X86Blendi, v8i32,
+ VR256, loadv4i64, i256mem, BlendCommuteImm8>,
+ VEX_L;
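The BlendCommuteImm4/BlendCommuteImm8 arguments above adjust the blend control when the commuted pattern fires: each immediate bit picks src2 (1) or src1 (0) for one element, so swapping the sources inverts the mask over the element count. A C++ sketch of the equivalent computation (illustrative only, not the tablegen transform itself):

  // Invert a blend mask over NumElts elements; NumElts is 4 for the
  // 128-bit vpblendd and 8 for the 256-bit form.
  static unsigned char commuteBlendImm(unsigned char Imm, unsigned NumElts) {
    unsigned Mask = (1u << NumElts) - 1;
    return (unsigned char)((Imm ^ Mask) & Mask);
  }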
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
@@ -8187,12 +8037,23 @@ defm VPBROADCASTD : avx2_broadcast<0x58, "vpbroadcastd", i32mem, loadi32,
defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
v2i64, v4i64, NoVLX>;
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+let Predicates = [HasAVX2, NoVLX] in {
// 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
(VPBROADCASTQrm addr:$src)>;
def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
(VPBROADCASTQYrm addr:$src)>;
+
+ def : Pat<(v4i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VPBROADCASTDrm addr:$src)>;
+ def : Pat<(v8i32 (X86VBroadcast (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
+ (VPBROADCASTDYrm addr:$src)>;
+ def : Pat<(v2i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VPBROADCASTQrm addr:$src)>;
+ def : Pat<(v4i64 (X86VBroadcast (v2i64 (scalar_to_vector (loadi64 addr:$src))))),
+ (VPBROADCASTQYrm addr:$src)>;
+}
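The four new patterns above cover broadcasts whose scalar operand is itself a plain load, letting selection use the memory forms of vpbroadcastd/vpbroadcastq. In source terms this is the familiar "splat a loaded scalar" shape; a hedged C++ example (actual codegen depends on the surrounding code and flags):

  #include <immintrin.h>
  // With AVX2, this can lower to a single vpbroadcastd with a memory
  // operand rather than a separate load plus register broadcast.
  __m256i broadcast_loaded_i32(const int *p) {
    return _mm256_set1_epi32(*p);
  }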
+let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
// loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
// This means we'll encounter truncated i32 loads; match that here.
def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
@@ -8279,6 +8140,13 @@ let Predicates = [HasAVX, NoVLX] in {
// 128bit broadcasts:
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128))>;
+ def : Pat<(v2f64 (X86VBroadcast (loadf64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
+
+ def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
+ (VMOVDDUPrr VR128:$src)>;
+ def : Pat<(v2f64 (X86VBroadcast (loadv2f64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
}
let Predicates = [HasAVX1Only] in {
@@ -8306,12 +8174,24 @@ let Predicates = [HasAVX1Only] in {
def : Pat<(v2i64 (X86VBroadcast i64:$src)),
(VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44)>;
+ def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))),
+ (VMOVDDUPrm addr:$src)>;
}
//===----------------------------------------------------------------------===//
// VPERM - Permute instructions
//
+let Sched = WriteFShuffle256 in
+def AVX2_PERMV_F : OpndItins<
+ IIC_SSE_SHUFP, IIC_SSE_SHUFP
+>;
+
+let Sched = WriteShuffle256 in
+def AVX2_PERMV_I : OpndItins<
+ IIC_SSE_PSHUF_RI, IIC_SSE_PSHUF_MI
+>;
+
multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
ValueType OpVT, X86FoldableSchedWrite Sched,
X86MemOperand memOp> {
@@ -8385,24 +8265,10 @@ def VPERM2I128rm : AVX2AIi8<0x46, MRMSrcMem, (outs VR256:$dst),
(i8 imm:$src3)))]>,
Sched<[WriteShuffle256Ld, ReadAfterLd]>, VEX_4V, VEX_L;
-let Predicates = [HasAVX2] in {
-def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1, VR256:$src2, (i8 imm:$imm))),
- (VPERM2I128rr VR256:$src1, VR256:$src2, imm:$imm)>;
-
-def : Pat<(v32i8 (X86VPerm2x128 VR256:$src1, (bc_v32i8 (loadv4i64 addr:$src2)),
- (i8 imm:$imm))),
- (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
- (bc_v16i16 (loadv4i64 addr:$src2)), (i8 imm:$imm))),
- (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
-def : Pat<(v8i32 (X86VPerm2x128 VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)),
- (i8 imm:$imm))),
- (VPERM2I128rm VR256:$src1, addr:$src2, imm:$imm)>;
-}
+let Predicates = [HasAVX2] in
+def : Pat<(v4i64 (X86VPerm2x128 (loadv4i64 addr:$src2),
+ VR256:$src1, (i8 imm:$imm))),
+ (VPERM2I128rm VR256:$src1, addr:$src2, (Perm2XCommuteImm imm:$imm))>;
//===----------------------------------------------------------------------===//
@@ -8456,20 +8322,23 @@ multiclass avx2_pmovmask<string OpcodeStr,
def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>, VEX_4V;
+ [(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))],
+ IIC_SSE_MASKMOV>, VEX_4V, Sched<[WriteLoad]>;
def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V, VEX_L;
+ [(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))],
+ IIC_SSE_MASKMOV>, VEX_4V, VEX_L, Sched<[WriteLoad]>;
def mr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>, VEX_4V;
+ [(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)], IIC_SSE_MASKMOV>,
+ VEX_4V, Sched<[WriteStore]>;
def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V, VEX_L;
+ [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)], IIC_SSE_MASKMOV>,
+ VEX_4V, VEX_L, Sched<[WriteStore]>;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
@@ -8616,40 +8485,63 @@ let Predicates = [HasAVX2, NoVLX] in {
(VPSRAVDYrm VR256:$src1, addr:$src2)>;
}
-
-
//===----------------------------------------------------------------------===//
// VGATHER - GATHER Operations
-multiclass avx2_gather<bits<8> opc, string OpcodeStr, RegisterClass RC256,
- X86MemOperand memop128, X86MemOperand memop256> {
+
+// FIXME: Improve scheduling of gather instructions.
+multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
+ ValueType VTy, PatFrag GatherNode128,
+ PatFrag GatherNode256, RegisterClass RC256,
+ X86MemOperand memop128, X86MemOperand memop256,
+ ValueType MTx = VTx, ValueType MTy = VTy> {
def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
(ins VR128:$src1, memop128:$src2, VR128:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- []>, VEX;
+ [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
+ (GatherNode128 VR128:$src1, VR128:$mask,
+ vectoraddr:$src2))]>,
+ VEX, Sched<[WriteLoad]>;
def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
(ins RC256:$src1, memop256:$src2, RC256:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- []>, VEX, VEX_L;
-}
-
-let mayLoad = 1, hasSideEffects = 0, Constraints
- = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
- in {
- defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", VR256, vx128mem, vx256mem>, VEX_W;
- defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", VR256, vx128mem, vy256mem>, VEX_W;
- defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", VR256, vx128mem, vy256mem>;
- defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", VR128, vx64mem, vy128mem>;
-
- let ExeDomain = SSEPackedDouble in {
- defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", VR256, vx128mem, vx256mem>, VEX_W;
- defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", VR256, vx128mem, vy256mem>, VEX_W;
- }
-
- let ExeDomain = SSEPackedSingle in {
- defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", VR256, vx128mem, vy256mem>;
- defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", VR128, vx64mem, vy128mem>;
+ [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
+ (GatherNode256 RC256:$src1, RC256:$mask,
+ vectoraddr:$src2))]>,
+ VEX, VEX_L, Sched<[WriteLoad]>;
+}
+
+let Predicates = [UseAVX2] in {
+ let mayLoad = 1, hasSideEffects = 0, Constraints
+ = "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
+ in {
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
+ mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
+ mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
+ mgatherv8i32, VR256, vx128mem, vy256mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
+ mgatherv4i64, VR128, vx64mem, vy128mem>;
+
+ let ExeDomain = SSEPackedDouble in {
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
+ mgatherv4i32, VR256, vx128mem, vx256mem,
+ v2i64, v4i64>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
+ mgatherv4i64, VR256, vx128mem, vy256mem,
+ v2i64, v4i64>, VEX_W;
+ }
+
+ let ExeDomain = SSEPackedSingle in {
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
+ mgatherv8i32, VR256, vx128mem, vy256mem,
+ v4i32, v8i32>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
+ mgatherv4i64, VR128, vx64mem, vy128mem,
+ v4i32, v4i32>;
+ }
}
}
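Giving the gather instructions selection patterns over the generic masked-gather nodes (mgatherv4i32 and friends) means they are no longer reachable only through the x86 gather intrinsics. A simple C++ loop of the shape an auto-vectorizer may turn into such nodes, and hence into vgatherdps, is shown below; whether that actually happens depends on the cost model and target features:

  // Indexed loads: dst[i] = src[idx[i]]. When vectorized with AVX2 enabled,
  // the loads can become masked-gather nodes that the patterns above match.
  void gather_f32(float *dst, const float *src, const int *idx, int n) {
    for (int i = 0; i < n; ++i)
      dst[i] = src[idx[i]];
  }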
@@ -8708,3 +8600,82 @@ def : Pat<(xor FR128:$src1, FR128:$src2),
(COPY_TO_REGCLASS
(XORPSrr (COPY_TO_REGCLASS FR128:$src1, VR128),
(COPY_TO_REGCLASS FR128:$src2, VR128)), FR128)>;
+
+//===----------------------------------------------------------------------===//
+// GFNI instructions
+//===----------------------------------------------------------------------===//
+
+multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
+ RegisterClass RC, PatFrag MemOpFrag,
+ X86MemOperand X86MemOp, bit Is2Addr = 0> {
+ let ExeDomain = SSEPackedInt,
+ AsmString = !if(Is2Addr,
+ OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
+ OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
+ let isCommutable = 1 in
+ def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
+ [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))],
+ SSE_INTALU_ITINS_P.rr>,
+ Sched<[SSE_INTALU_ITINS_P.Sched]>, T8PD;
+
+ def rm : PDI<0xCF, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, X86MemOp:$src2), "",
+ [(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1,
+ (bitconvert (MemOpFrag addr:$src2)))))],
+ SSE_INTALU_ITINS_P.rm>,
+ Sched<[SSE_INTALU_ITINS_P.Sched.Folded, ReadAfterLd]>, T8PD;
+ }
+}
+
+multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
+ SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
+ X86MemOperand X86MemOp, bit Is2Addr = 0> {
+ let AsmString = !if(Is2Addr,
+ OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
+ def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
+ (ins RC:$src1, RC:$src2, u8imm:$src3), "",
+ [(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, imm:$src3)))],
+ SSE_INTALU_ITINS_P.rr, SSEPackedInt>,
+ Sched<[WriteVecALU]>;
+ def rmi : Ii8<Op, MRMSrcMem, (outs RC:$dst),
+ (ins RC:$src1, X86MemOp:$src2, u8imm:$src3), "",
+ [(set RC:$dst, (OpVT (OpNode RC:$src1,
+ (bitconvert (MemOpFrag addr:$src2)),
+ imm:$src3)))],
+ SSE_INTALU_ITINS_P.rm, SSEPackedInt>,
+ Sched<[WriteVecALU.Folded, ReadAfterLd]>;
+ }
+}
+
+multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
+ let Constraints = "$src1 = $dst",
+ Predicates = [HasGFNI, UseSSE2] in
+ defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
+ VR128, loadv2i64, i128mem, 1>;
+ let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
+ defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
+ loadv2i64, i128mem>, VEX_4V, VEX_W;
+ defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
+ loadv4i64, i256mem>, VEX_4V, VEX_L, VEX_W;
+ }
+}
+
+// GF2P8MULB
+let Constraints = "$src1 = $dst",
+ Predicates = [HasGFNI, UseSSE2] in
+defm GF2P8MULB : GF2P8MULB_rm<"gf2p8mulb", v16i8, VR128, memopv2i64,
+ i128mem, 1>;
+let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
+ defm VGF2P8MULB : GF2P8MULB_rm<"vgf2p8mulb", v16i8, VR128, loadv2i64,
+ i128mem>, VEX_4V;
+ defm VGF2P8MULBY : GF2P8MULB_rm<"vgf2p8mulb", v32i8, VR256, loadv4i64,
+ i256mem>, VEX_4V, VEX_L;
+}
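GF2P8MULB multiplies each byte pair in the finite field GF(2^8) reduced by the polynomial x^8 + x^4 + x^3 + x + 1 (the AES field). As a reference for what the X86GF2P8mulb node computes per byte, here is a scalar C++ sketch (not part of the patch, purely illustrative):

  // Carry-less multiply of two bytes with reduction modulo 0x11B.
  static unsigned char gf2p8mul_byte(unsigned char a, unsigned char b) {
    unsigned char r = 0;
    for (int i = 0; i < 8; ++i) {
      if (b & 1)
        r ^= a;
      unsigned char hi = a & 0x80;
      a <<= 1;
      if (hi)
        a ^= 0x1B;   // x^8 == x^4 + x^3 + x + 1 in this field
      b >>= 1;
    }
    return r;
  }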
+// GF2P8AFFINEINVQB, GF2P8AFFINEQB
+let isCommutable = 0 in {
+ defm GF2P8AFFINEINVQB : GF2P8AFFINE_common<0xCF, "gf2p8affineinvqb",
+ X86GF2P8affineinvqb>, TAPD;
+ defm GF2P8AFFINEQB : GF2P8AFFINE_common<0xCE, "gf2p8affineqb",
+ X86GF2P8affineqb>, TAPD;
+}
+
diff --git a/lib/Target/X86/X86InstrSVM.td b/lib/Target/X86/X86InstrSVM.td
index c847be7ec099..bdf478600279 100644
--- a/lib/Target/X86/X86InstrSVM.td
+++ b/lib/Target/X86/X86InstrSVM.td
@@ -15,48 +15,49 @@
//===----------------------------------------------------------------------===//
// SVM instructions
+let SchedRW = [WriteSystem] in {
// 0F 01 D9
-def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", []>, TB;
+def VMMCALL : I<0x01, MRM_D9, (outs), (ins), "vmmcall", [], IIC_SVM>, TB;
// 0F 01 DC
-def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", []>, TB;
+def STGI : I<0x01, MRM_DC, (outs), (ins), "stgi", [], IIC_STGI>, TB;
// 0F 01 DD
-def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", []>, TB;
+def CLGI : I<0x01, MRM_DD, (outs), (ins), "clgi", [], IIC_CLGI>, TB;
// 0F 01 DE
let Uses = [EAX] in
-def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", []>, TB;
+def SKINIT : I<0x01, MRM_DE, (outs), (ins), "skinit\t{%eax|eax}", [], IIC_SKINIT>, TB;
// 0F 01 D8
let Uses = [EAX] in
def VMRUN32 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+ "vmrun\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX] in
def VMRUN64 : I<0x01, MRM_D8, (outs), (ins),
- "vmrun\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+ "vmrun\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>;
// 0F 01 DA
let Uses = [EAX] in
def VMLOAD32 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+ "vmload\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX] in
def VMLOAD64 : I<0x01, MRM_DA, (outs), (ins),
- "vmload\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+ "vmload\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>;
// 0F 01 DB
let Uses = [EAX] in
def VMSAVE32 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%eax|eax}", []>, TB, Requires<[Not64BitMode]>;
+ "vmsave\t{%eax|eax}", [], IIC_SVM>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX] in
def VMSAVE64 : I<0x01, MRM_DB, (outs), (ins),
- "vmsave\t{%rax|rax}", []>, TB, Requires<[In64BitMode]>;
+ "vmsave\t{%rax|rax}", [], IIC_SVM>, TB, Requires<[In64BitMode]>;
// 0F 01 DF
let Uses = [EAX, ECX] in
def INVLPGA32 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %eax|eax, ecx}", []>, TB, Requires<[Not64BitMode]>;
+ "invlpga\t{%ecx, %eax|eax, ecx}", [], IIC_INVLPG>, TB, Requires<[Not64BitMode]>;
let Uses = [RAX, ECX] in
def INVLPGA64 : I<0x01, MRM_DF, (outs), (ins),
- "invlpga\t{%ecx, %rax|rax, ecx}", []>, TB, Requires<[In64BitMode]>;
-
+ "invlpga\t{%ecx, %rax|rax, ecx}", [], IIC_INVLPG>, TB, Requires<[In64BitMode]>;
+} // SchedRW
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index 0efb383e1c8d..43e1752f2df2 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -83,7 +83,8 @@ def SHL32mCL : I<0xD3, MRM4m, (outs), (ins i32mem:$dst),
OpSize32;
def SHL64mCL : RI<0xD3, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t{%cl, $dst|$dst, cl}",
- [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+ [(store (shl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ Requires<[In64BitMode]>;
}
def SHL8mi : Ii8<0xC0, MRM4m, (outs), (ins i8mem :$dst, u8imm:$src),
"shl{b}\t{$src, $dst|$dst, $src}",
@@ -100,7 +101,7 @@ def SHL32mi : Ii8<0xC1, MRM4m, (outs), (ins i32mem:$dst, u8imm:$src),
def SHL64mi : RIi8<0xC1, MRM4m, (outs), (ins i64mem:$dst, u8imm:$src),
"shl{q}\t{$src, $dst|$dst, $src}",
[(store (shl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
// Shift by 1
def SHL8m1 : I<0xD0, MRM4m, (outs), (ins i8mem :$dst),
@@ -118,7 +119,7 @@ def SHL32m1 : I<0xD1, MRM4m, (outs), (ins i32mem:$dst),
def SHL64m1 : RI<0xD1, MRM4m, (outs), (ins i64mem:$dst),
"shl{q}\t$dst",
[(store (shl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
} // SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
@@ -183,7 +184,8 @@ def SHR32mCL : I<0xD3, MRM5m, (outs), (ins i32mem:$dst),
OpSize32;
def SHR64mCL : RI<0xD3, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t{%cl, $dst|$dst, cl}",
- [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>;
+ [(store (srl (loadi64 addr:$dst), CL), addr:$dst)], IIC_SR>,
+ Requires<[In64BitMode]>;
}
def SHR8mi : Ii8<0xC0, MRM5m, (outs), (ins i8mem :$dst, u8imm:$src),
"shr{b}\t{$src, $dst|$dst, $src}",
@@ -200,7 +202,7 @@ def SHR32mi : Ii8<0xC1, MRM5m, (outs), (ins i32mem:$dst, u8imm:$src),
def SHR64mi : RIi8<0xC1, MRM5m, (outs), (ins i64mem:$dst, u8imm:$src),
"shr{q}\t{$src, $dst|$dst, $src}",
[(store (srl (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
// Shift by 1
def SHR8m1 : I<0xD0, MRM5m, (outs), (ins i8mem :$dst),
@@ -218,7 +220,7 @@ def SHR32m1 : I<0xD1, MRM5m, (outs), (ins i32mem:$dst),
def SHR64m1 : RI<0xD1, MRM5m, (outs), (ins i64mem:$dst),
"shr{q}\t$dst",
[(store (srl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
} // SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
@@ -296,7 +298,7 @@ def SAR32mCL : I<0xD3, MRM7m, (outs), (ins i32mem:$dst),
def SAR64mCL : RI<0xD3, MRM7m, (outs), (ins i64mem:$dst),
"sar{q}\t{%cl, $dst|$dst, cl}",
[(store (sra (loadi64 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
}
def SAR8mi : Ii8<0xC0, MRM7m, (outs), (ins i8mem :$dst, u8imm:$src),
"sar{b}\t{$src, $dst|$dst, $src}",
@@ -313,7 +315,7 @@ def SAR32mi : Ii8<0xC1, MRM7m, (outs), (ins i32mem:$dst, u8imm:$src),
def SAR64mi : RIi8<0xC1, MRM7m, (outs), (ins i64mem:$dst, u8imm:$src),
"sar{q}\t{$src, $dst|$dst, $src}",
[(store (sra (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
// Shift by 1
def SAR8m1 : I<0xD0, MRM7m, (outs), (ins i8mem :$dst),
@@ -331,7 +333,7 @@ def SAR32m1 : I<0xD1, MRM7m, (outs), (ins i32mem:$dst),
def SAR64m1 : RI<0xD1, MRM7m, (outs), (ins i64mem:$dst),
"sar{q}\t$dst",
[(store (sra (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -418,9 +420,10 @@ def RCL32m1 : I<0xD1, MRM2m, (outs), (ins i32mem:$dst),
def RCL32mi : Ii8<0xC1, MRM2m, (outs), (ins i32mem:$dst, u8imm:$cnt),
"rcl{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
def RCL64m1 : RI<0xD1, MRM2m, (outs), (ins i64mem:$dst),
- "rcl{q}\t$dst", [], IIC_SR>;
+ "rcl{q}\t$dst", [], IIC_SR>, Requires<[In64BitMode]>;
def RCL64mi : RIi8<0xC1, MRM2m, (outs), (ins i64mem:$dst, u8imm:$cnt),
- "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcl{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>,
+ Requires<[In64BitMode]>;
def RCR8m1 : I<0xD0, MRM3m, (outs), (ins i8mem:$dst),
"rcr{b}\t$dst", [], IIC_SR>;
@@ -435,9 +438,10 @@ def RCR32m1 : I<0xD1, MRM3m, (outs), (ins i32mem:$dst),
def RCR32mi : Ii8<0xC1, MRM3m, (outs), (ins i32mem:$dst, u8imm:$cnt),
"rcr{l}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>, OpSize32;
def RCR64m1 : RI<0xD1, MRM3m, (outs), (ins i64mem:$dst),
- "rcr{q}\t$dst", [], IIC_SR>;
+ "rcr{q}\t$dst", [], IIC_SR>, Requires<[In64BitMode]>;
def RCR64mi : RIi8<0xC1, MRM3m, (outs), (ins i64mem:$dst, u8imm:$cnt),
- "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>;
+ "rcr{q}\t{$cnt, $dst|$dst, $cnt}", [], IIC_SR>,
+ Requires<[In64BitMode]>;
} // Uses = [EFLAGS]
let Uses = [CL, EFLAGS] in {
@@ -448,7 +452,8 @@ def RCL16mCL : I<0xD3, MRM2m, (outs), (ins i16mem:$dst),
def RCL32mCL : I<0xD3, MRM2m, (outs), (ins i32mem:$dst),
"rcl{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
def RCL64mCL : RI<0xD3, MRM2m, (outs), (ins i64mem:$dst),
- "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcl{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>,
+ Requires<[In64BitMode]>;
def RCR8mCL : I<0xD2, MRM3m, (outs), (ins i8mem:$dst),
"rcr{b}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
@@ -457,7 +462,8 @@ def RCR16mCL : I<0xD3, MRM3m, (outs), (ins i16mem:$dst),
def RCR32mCL : I<0xD3, MRM3m, (outs), (ins i32mem:$dst),
"rcr{l}\t{%cl, $dst|$dst, cl}", [], IIC_SR>, OpSize32;
def RCR64mCL : RI<0xD3, MRM3m, (outs), (ins i64mem:$dst),
- "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>;
+ "rcr{q}\t{%cl, $dst|$dst, cl}", [], IIC_SR>,
+ Requires<[In64BitMode]>;
} // Uses = [CL, EFLAGS]
} // SchedRW
} // hasSideEffects = 0
@@ -532,7 +538,7 @@ def ROL32mCL : I<0xD3, MRM0m, (outs), (ins i32mem:$dst),
def ROL64mCL : RI<0xD3, MRM0m, (outs), (ins i64mem:$dst),
"rol{q}\t{%cl, $dst|$dst, cl}",
[(store (rotl (loadi64 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
}
def ROL8mi : Ii8<0xC0, MRM0m, (outs), (ins i8mem :$dst, u8imm:$src1),
"rol{b}\t{$src1, $dst|$dst, $src1}",
@@ -549,7 +555,7 @@ def ROL32mi : Ii8<0xC1, MRM0m, (outs), (ins i32mem:$dst, u8imm:$src1),
def ROL64mi : RIi8<0xC1, MRM0m, (outs), (ins i64mem:$dst, u8imm:$src1),
"rol{q}\t{$src1, $dst|$dst, $src1}",
[(store (rotl (loadi64 addr:$dst), (i8 imm:$src1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
// Rotate by 1
def ROL8m1 : I<0xD0, MRM0m, (outs), (ins i8mem :$dst),
@@ -567,7 +573,7 @@ def ROL32m1 : I<0xD1, MRM0m, (outs), (ins i32mem:$dst),
def ROL64m1 : RI<0xD1, MRM0m, (outs), (ins i64mem:$dst),
"rol{q}\t$dst",
[(store (rotl (loadi64 addr:$dst), (i8 1)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
} // SchedRW
let Constraints = "$src1 = $dst", SchedRW = [WriteShift] in {
@@ -640,7 +646,7 @@ def ROR32mCL : I<0xD3, MRM1m, (outs), (ins i32mem:$dst),
def ROR64mCL : RI<0xD3, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t{%cl, $dst|$dst, cl}",
[(store (rotr (loadi64 addr:$dst), CL), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
}
def ROR8mi : Ii8<0xC0, MRM1m, (outs), (ins i8mem :$dst, u8imm:$src),
"ror{b}\t{$src, $dst|$dst, $src}",
@@ -657,7 +663,7 @@ def ROR32mi : Ii8<0xC1, MRM1m, (outs), (ins i32mem:$dst, u8imm:$src),
def ROR64mi : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
"ror{q}\t{$src, $dst|$dst, $src}",
[(store (rotr (loadi64 addr:$dst), (i8 imm:$src)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
// Rotate by 1
def ROR8m1 : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
@@ -675,7 +681,7 @@ def ROR32m1 : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
def ROR64m1 : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
"ror{q}\t$dst",
[(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)],
- IIC_SR>;
+ IIC_SR>, Requires<[In64BitMode]>;
} // SchedRW
@@ -961,16 +967,40 @@ let Predicates = [HasBMI2] in {
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
- // Patterns on SARXrm/SHRXrm/SHLXrm are explicitly omitted to favor
- //
+ // Artificially lower the complexity so that we'll favor
// mov (%ecx), %esi
// shl $imm, $esi
//
// over
//
- // movb $imm %al
+ // movb $imm, %al
// shlx %al, (%ecx), %esi
- //
- // As SARXrr/SHRXrr/SHLXrr is favored on variable shift, the peephole
- // optimization will fold them into SARXrm/SHRXrm/SHLXrm if possible.
+ let AddedComplexity = -20 in {
+ def : Pat<(sra (loadi32 addr:$src1), GR8:$src2),
+ (SARX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(sra (loadi64 addr:$src1), GR8:$src2),
+ (SARX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(srl (loadi32 addr:$src1), GR8:$src2),
+ (SHRX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(srl (loadi64 addr:$src1), GR8:$src2),
+ (SHRX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+
+ def : Pat<(shl (loadi32 addr:$src1), GR8:$src2),
+ (SHLX32rm addr:$src1,
+ (INSERT_SUBREG
+ (i32 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ def : Pat<(shl (loadi64 addr:$src1), GR8:$src2),
+ (SHLX64rm addr:$src1,
+ (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
+ }
}
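To make the trade-off described in the comment above concrete, the two shapes being weighed look like this in C++ (illustrative only; the chosen instructions are ultimately up to the selector):

  // Shift of a loaded value by an immediate: better served by a plain
  // load + shl (or the shl-from-memory form) than by materializing the
  // immediate into a register for shlx.
  unsigned shift_by_imm(const unsigned *p) { return *p << 5; }

  // Shift of a loaded value by a variable amount: the lowered-complexity
  // patterns above still allow folding the load into shlx here.
  unsigned shift_by_var(const unsigned *p, unsigned n) { return *p << n; }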
diff --git a/lib/Target/X86/X86InstrSystem.td b/lib/Target/X86/X86InstrSystem.td
index 2e5350ce979e..40d2dca4f9ec 100644
--- a/lib/Target/X86/X86InstrSystem.td
+++ b/lib/Target/X86/X86InstrSystem.td
@@ -19,7 +19,8 @@ let Defs = [RAX, RDX] in
TB;
let Defs = [RAX, RCX, RDX] in
- def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)]>, TB;
+ def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", [(X86rdtscp)],
+ IIC_RDTSCP>, TB;
// CPU flow control instructions
@@ -33,7 +34,7 @@ def RSM : I<0xAA, RawFrm, (outs), (ins), "rsm", [], IIC_RSM>, TB;
// Interrupt and SysCall Instructions.
let Uses = [EFLAGS] in
- def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>;
+ def INTO : I<0xce, RawFrm, (outs), (ins), "into", []>, Requires<[Not64BitMode]>;
def INT3 : I<0xcc, RawFrm, (outs), (ins), "int3",
[(int_x86_int (i8 3))], IIC_INT3>;
} // SchedRW
@@ -154,13 +155,14 @@ def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
//===----------------------------------------------------------------------===//
// Segment override instruction prefixes
-def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
-def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
-def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
-def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
-def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
-def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
-
+let SchedRW = [WriteNop] in {
+def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", [], IIC_NOP>;
+def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", [], IIC_NOP>;
+def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", [], IIC_NOP>;
+def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", [], IIC_NOP>;
+def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", [], IIC_NOP>;
+def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", [], IIC_NOP>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// Moves to and from segment registers.
@@ -175,11 +177,7 @@ def MOV64rs : RI<0x8C, MRMDestReg, (outs GR64:$dst), (ins SEGMENT_REG:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_REG_SR>;
let mayStore = 1 in {
def MOV16ms : I<0x8C, MRMDestMem, (outs), (ins i16mem:$dst, SEGMENT_REG:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize16;
-def MOV32ms : I<0x8C, MRMDestMem, (outs), (ins i32mem:$dst, SEGMENT_REG:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSize32;
-def MOV64ms : RI<0x8C, MRMDestMem, (outs), (ins i64mem:$dst, SEGMENT_REG:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_MEM_SR>, OpSizeIgnore;
}
def MOV16sr : I<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR16:$src),
"mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>, OpSize16;
@@ -189,11 +187,7 @@ def MOV64sr : RI<0x8E, MRMSrcReg, (outs SEGMENT_REG:$dst), (ins GR64:$src),
"mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_REG>;
let mayLoad = 1 in {
def MOV16sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i16mem:$src),
- "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize16;
-def MOV32sm : I<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i32mem:$src),
- "mov{l}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSize32;
-def MOV64sm : RI<0x8E, MRMSrcMem, (outs SEGMENT_REG:$dst), (ins i64mem:$src),
- "mov{q}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>;
+ "mov{w}\t{$src, $dst|$dst, $src}", [], IIC_MOV_SR_MEM>, OpSizeIgnore;
}
} // SchedRW
@@ -489,6 +483,60 @@ def WBINVD : I<0x09, RawFrm, (outs), (ins), "wbinvd", [], IIC_INVD>, TB;
} // SchedRW
//===----------------------------------------------------------------------===//
+// CET instructions
+let SchedRW = [WriteSystem], Predicates = [HasSHSTK] in {
+ let Uses = [SSP] in {
+ let Defs = [SSP] in {
+ def INCSSPD : I<0xAE, MRM5r, (outs), (ins GR32:$src), "incsspd\t$src",
+ [(int_x86_incsspd GR32:$src)]>, XS;
+ def INCSSPQ : RI<0xAE, MRM5r, (outs), (ins GR64:$src), "incsspq\t$src",
+ [(int_x86_incsspq GR64:$src)]>, XS;
+ } // Defs SSP
+
+ let Constraints = "$src = $dst" in {
+ def RDSSPD : I<0x1E, MRM1r, (outs GR32:$dst), (ins GR32:$src),
+ "rdsspd\t$dst",
+ [(set GR32:$dst, (int_x86_rdsspd GR32:$src))]>, XS;
+ def RDSSPQ : RI<0x1E, MRM1r, (outs GR64:$dst), (ins GR64:$src),
+ "rdsspq\t$dst",
+ [(set GR64:$dst, (int_x86_rdsspq GR64:$src))]>, XS;
+ }
+
+ let Defs = [SSP] in {
+ def SAVEPREVSSP : I<0x01, MRM_EA, (outs), (ins), "saveprevssp",
+ [(int_x86_saveprevssp)]>, XS;
+ def RSTORSSP : I<0x01, MRM5m, (outs), (ins i32mem:$src),
+ "rstorssp\t$src",
+ [(int_x86_rstorssp addr:$src)]>, XS;
+ } // Defs SSP
+ } // Uses SSP
+
+ def WRSSD : I<0xF6, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "wrssd\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrssd GR32:$src, addr:$dst)]>, T8PS;
+ def WRSSQ : RI<0xF6, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "wrssq\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrssq GR64:$src, addr:$dst)]>, T8PS;
+ def WRUSSD : I<0xF5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
+ "wrussd\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrussd GR32:$src, addr:$dst)]>, T8PD;
+ def WRUSSQ : RI<0xF5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
+ "wrussq\t{$src, $dst|$dst, $src}",
+ [(int_x86_wrussq GR64:$src, addr:$dst)]>, T8PD;
+
+ let Defs = [SSP] in {
+ let Uses = [SSP] in {
+ def SETSSBSY : I<0x01, MRM_E8, (outs), (ins), "setssbsy",
+ [(int_x86_setssbsy)]>, XS;
+ } // Uses SSP
+
+ def CLRSSBSY : I<0xAE, MRM6m, (outs), (ins i32mem:$src),
+ "clrssbsy\t$src",
+ [(int_x86_clrssbsy addr:$src)]>, XS;
+ } // Defs SSP
+} // SchedRW && HasSHSTK
+
+//===----------------------------------------------------------------------===//
// XSAVE instructions
let SchedRW = [WriteSystem] in {
let Predicates = [HasXSAVE] in {
@@ -496,67 +544,60 @@ let Defs = [EDX, EAX], Uses = [ECX] in
def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
let Uses = [EDX, EAX, ECX] in
- def XSETBV : I<0x01, MRM_D1, (outs), (ins),
- "xsetbv",
+ def XSETBV : I<0x01, MRM_D1, (outs), (ins),
+ "xsetbv",
[(int_x86_xsetbv ECX, EDX, EAX)]>, TB;
} // HasXSAVE
let Uses = [EDX, EAX] in {
-let Predicates = [HasXSAVE] in {
- def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
- "xsave\t$dst",
- [(int_x86_xsave addr:$dst, EDX, EAX)]>, TB;
- def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
- "xsave64\t$dst",
- [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
- def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
- "xrstor\t$dst",
- [(int_x86_xrstor addr:$dst, EDX, EAX)]>, TB;
- def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
- "xrstor64\t$dst",
- [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
-}
-let Predicates = [HasXSAVEOPT] in {
- def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
- "xsaveopt\t$dst",
- [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS;
- def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
- "xsaveopt64\t$dst",
- [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[In64BitMode]>;
-}
-let Predicates = [HasXSAVEC] in {
- def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
- "xsavec\t$dst",
- [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB;
- def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
- "xsavec64\t$dst",
- [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
-}
-let Predicates = [HasXSAVES] in {
- def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
- "xsaves\t$dst",
- [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB;
- def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
- "xsaves64\t$dst",
- [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
- def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
- "xrstors\t$dst",
- [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB;
- def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
- "xrstors64\t$dst",
- [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[In64BitMode]>;
-}
+def XSAVE : I<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsave\t$dst",
+ [(int_x86_xsave addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>;
+def XSAVE64 : RI<0xAE, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsave64\t$dst",
+ [(int_x86_xsave64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
+def XRSTOR : I<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstor\t$dst",
+ [(int_x86_xrstor addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE]>;
+def XRSTOR64 : RI<0xAE, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xrstor64\t$dst",
+ [(int_x86_xrstor64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
+def XSAVEOPT : I<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+ "xsaveopt\t$dst",
+ [(int_x86_xsaveopt addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT]>;
+def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaque512mem:$dst),
+ "xsaveopt64\t$dst",
+ [(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>;
+def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsavec\t$dst",
+ [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC]>;
+def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaque512mem:$dst),
+ "xsavec64\t$dst",
+ [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC, In64BitMode]>;
+def XSAVES : I<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xsaves\t$dst",
+ [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
+def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaque512mem:$dst),
+ "xsaves64\t$dst",
+ [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>;
+def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+ "xrstors\t$dst",
+ [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
+def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaque512mem:$dst),
+ "xrstors64\t$dst",
+ [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>;
} // Uses
} // SchedRW
//===----------------------------------------------------------------------===//
// VIA PadLock crypto instructions
-let Defs = [RAX, RDI], Uses = [RDX, RDI] in
+let Defs = [RAX, RDI], Uses = [RDX, RDI], SchedRW = [WriteSystem] in
def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB;
def : InstAlias<"xstorerng", (XSTORE)>;
+let SchedRW = [WriteSystem] in {
let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB;
def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB;
@@ -571,67 +612,110 @@ let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
}
let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB;
+} // SchedRW
+
//===----------------------------------------------------------------------===//
// PKU - enable protection key
-let usesCustomInserter = 1 in {
+let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in {
def WRPKRU : PseudoI<(outs), (ins GR32:$src),
[(int_x86_wrpkru GR32:$src)]>;
def RDPKRU : PseudoI<(outs GR32:$dst), (ins),
[(set GR32:$dst, (int_x86_rdpkru))]>;
}
+let SchedRW = [WriteSystem] in {
let Defs = [EAX, EDX], Uses = [ECX] in
- def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", []>, TB;
+ def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru", [], IIC_PKU>, TB;
let Uses = [EAX, ECX, EDX] in
- def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", []>, TB;
+ def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru", [], IIC_PKU>, TB;
+} // SchedRW
//===----------------------------------------------------------------------===//
// FS/GS Base Instructions
-let Predicates = [HasFSGSBase, In64BitMode] in {
+let Predicates = [HasFSGSBase, In64BitMode], SchedRW = [WriteSystem] in {
def RDFSBASE : I<0xAE, MRM0r, (outs GR32:$dst), (ins),
"rdfsbase{l}\t$dst",
- [(set GR32:$dst, (int_x86_rdfsbase_32))]>, XS;
+ [(set GR32:$dst, (int_x86_rdfsbase_32))],
+ IIC_SEGMENT_BASE_R>, XS;
def RDFSBASE64 : RI<0xAE, MRM0r, (outs GR64:$dst), (ins),
"rdfsbase{q}\t$dst",
- [(set GR64:$dst, (int_x86_rdfsbase_64))]>, XS;
+ [(set GR64:$dst, (int_x86_rdfsbase_64))],
+ IIC_SEGMENT_BASE_R>, XS;
def RDGSBASE : I<0xAE, MRM1r, (outs GR32:$dst), (ins),
"rdgsbase{l}\t$dst",
- [(set GR32:$dst, (int_x86_rdgsbase_32))]>, XS;
+ [(set GR32:$dst, (int_x86_rdgsbase_32))],
+ IIC_SEGMENT_BASE_R>, XS;
def RDGSBASE64 : RI<0xAE, MRM1r, (outs GR64:$dst), (ins),
"rdgsbase{q}\t$dst",
- [(set GR64:$dst, (int_x86_rdgsbase_64))]>, XS;
+ [(set GR64:$dst, (int_x86_rdgsbase_64))],
+ IIC_SEGMENT_BASE_R>, XS;
def WRFSBASE : I<0xAE, MRM2r, (outs), (ins GR32:$src),
"wrfsbase{l}\t$src",
- [(int_x86_wrfsbase_32 GR32:$src)]>, XS;
+ [(int_x86_wrfsbase_32 GR32:$src)],
+ IIC_SEGMENT_BASE_W>, XS;
def WRFSBASE64 : RI<0xAE, MRM2r, (outs), (ins GR64:$src),
"wrfsbase{q}\t$src",
- [(int_x86_wrfsbase_64 GR64:$src)]>, XS;
+ [(int_x86_wrfsbase_64 GR64:$src)],
+ IIC_SEGMENT_BASE_W>, XS;
def WRGSBASE : I<0xAE, MRM3r, (outs), (ins GR32:$src),
"wrgsbase{l}\t$src",
- [(int_x86_wrgsbase_32 GR32:$src)]>, XS;
+ [(int_x86_wrgsbase_32 GR32:$src)], IIC_SEGMENT_BASE_W>, XS;
def WRGSBASE64 : RI<0xAE, MRM3r, (outs), (ins GR64:$src),
"wrgsbase{q}\t$src",
- [(int_x86_wrgsbase_64 GR64:$src)]>, XS;
+ [(int_x86_wrgsbase_64 GR64:$src)],
+ IIC_SEGMENT_BASE_W>, XS;
}
//===----------------------------------------------------------------------===//
// INVPCID Instruction
+let SchedRW = [WriteSystem] in {
def INVPCID32 : I<0x82, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invpcid\t{$src2, $src1|$src1, $src2}", [], IIC_INVPCID>, T8PD,
Requires<[Not64BitMode]>;
def INVPCID64 : I<0x82, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invpcid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invpcid\t{$src2, $src1|$src1, $src2}", [], IIC_INVPCID>, T8PD,
Requires<[In64BitMode]>;
+} // SchedRW
//===----------------------------------------------------------------------===//
// SMAP Instruction
-let Defs = [EFLAGS] in {
- def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
- def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+let Defs = [EFLAGS], SchedRW = [WriteSystem] in {
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", [], IIC_SMAP>, TB;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", [], IIC_SMAP>, TB;
}
//===----------------------------------------------------------------------===//
// SMX Instruction
+let SchedRW = [WriteSystem] in {
let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
- def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
-}
+ def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", [], IIC_SMX>, TB;
+} // Uses, Defs
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// RDPID Instruction
+let SchedRW = [WriteSystem] in {
+def RDPID32 : I<0xC7, MRM7r, (outs GR32:$src), (ins),
+ "rdpid\t$src", [], IIC_RDPID>, XS,
+ Requires<[Not64BitMode]>;
+def RDPID64 : I<0xC7, MRM7r, (outs GR64:$src), (ins),
+ "rdpid\t$src", [], IIC_RDPID>, XS,
+ Requires<[In64BitMode]>;
+} // SchedRW
+
+//===----------------------------------------------------------------------===//
+// PTWRITE Instruction
+let SchedRW = [WriteSystem] in {
+
+def PTWRITEm: I<0xAE, MRM4m, (outs), (ins i32mem:$dst),
+ "ptwrite{l}\t$dst", [], IIC_PTWRITE>, XS;
+def PTWRITE64m : RI<0xAE, MRM4m, (outs), (ins i64mem:$dst),
+ "ptwrite{q}\t$dst", [], IIC_PTWRITE>, XS,
+ Requires<[In64BitMode]>;
+
+def PTWRITEr : I<0xAE, MRM4r, (outs), (ins GR32:$dst),
+ "ptwrite{l}\t$dst", [], IIC_PTWRITE>, XS;
+def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst),
+ "ptwrite{q}\t$dst", [], IIC_PTWRITE>, XS,
+ Requires<[In64BitMode]>;
+} // SchedRW
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 61aac58a491f..10c6eef78639 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -18,6 +18,8 @@
def X86xtest: SDNode<"X86ISD::XTEST", SDTypeProfile<1, 0, [SDTCisVT<0, i32>]>,
[SDNPHasChain, SDNPSideEffect]>;
+let SchedRW = [WriteSystem] in {
+
let usesCustomInserter = 1 in
def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
"# XBEGIN", [(set GR32:$dst, (int_x86_xbegin))]>,
@@ -45,11 +47,14 @@ def XTEST : I<0x01, MRM_D6, (outs), (ins),
def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
"xabort\t$imm",
[(int_x86_xabort imm:$imm)]>, Requires<[HasRTM]>;
+} // SchedRW
// HLE prefixes
+let SchedRW = [WriteSystem] in {
let isAsmParserOnly = 1 in {
def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>;
def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>;
}
+} // SchedRW
diff --git a/lib/Target/X86/X86InstrVMX.td b/lib/Target/X86/X86InstrVMX.td
index 315a69e6a2a2..4bb2c204b368 100644
--- a/lib/Target/X86/X86InstrVMX.td
+++ b/lib/Target/X86/X86InstrVMX.td
@@ -15,56 +15,66 @@
//===----------------------------------------------------------------------===//
// VMX instructions
+let SchedRW = [WriteSystem] in {
// 66 0F 38 80
def INVEPT32 : I<0x80, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invept\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
Requires<[Not64BitMode]>;
def INVEPT64 : I<0x80, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invept\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invept\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
Requires<[In64BitMode]>;
+
// 66 0F 38 81
def INVVPID32 : I<0x81, MRMSrcMem, (outs), (ins GR32:$src1, i128mem:$src2),
- "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invvpid\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
Requires<[Not64BitMode]>;
def INVVPID64 : I<0x81, MRMSrcMem, (outs), (ins GR64:$src1, i128mem:$src2),
- "invvpid\t{$src2, $src1|$src1, $src2}", []>, T8PD,
+ "invvpid\t{$src2, $src1|$src1, $src2}", [], IIC_VMX>, T8PD,
Requires<[In64BitMode]>;
+
// 0F 01 C1
-def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", []>, TB;
+def VMCALL : I<0x01, MRM_C1, (outs), (ins), "vmcall", [], IIC_VMX>, TB;
def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
"vmclear\t$vmcs", []>, PD;
+
+// 0F 01 D4
-def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", [], IIC_VMX>, TB;
+
// 0F 01 C2
-def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
+def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", [], IIC_VMX>, TB;
+
// 0F 01 C3
-def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", []>, TB;
+def VMRESUME : I<0x01, MRM_C3, (outs), (ins), "vmresume", [], IIC_VMX>, TB;
def VMPTRLDm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
- "vmptrld\t$vmcs", []>, PS;
+ "vmptrld\t$vmcs", [], IIC_VMX>, PS;
def VMPTRSTm : I<0xC7, MRM7m, (outs), (ins i64mem:$vmcs),
- "vmptrst\t$vmcs", []>, TB;
+ "vmptrst\t$vmcs", [], IIC_VMX>, PS;
def VMREAD64rr : I<0x78, MRMDestReg, (outs GR64:$dst), (ins GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+ "vmread{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
def VMREAD32rr : I<0x78, MRMDestReg, (outs GR32:$dst), (ins GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+ "vmread{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+
let mayStore = 1 in {
def VMREAD64mr : I<0x78, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
- "vmread{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+ "vmread{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
def VMREAD32mr : I<0x78, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
- "vmread{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
-}
+ "vmread{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+} // mayStore
+
def VMWRITE64rr : I<0x79, MRMSrcReg, (outs GR64:$dst), (ins GR64:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
def VMWRITE32rr : I<0x79, MRMSrcReg, (outs GR32:$dst), (ins GR32:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+
let mayLoad = 1 in {
def VMWRITE64rm : I<0x79, MRMSrcMem, (outs GR64:$dst), (ins i64mem:$src),
- "vmwrite{q}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[In64BitMode]>;
+ "vmwrite{q}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[In64BitMode]>;
def VMWRITE32rm : I<0x79, MRMSrcMem, (outs GR32:$dst), (ins i32mem:$src),
- "vmwrite{l}\t{$src, $dst|$dst, $src}", []>, PS, Requires<[Not64BitMode]>;
-}
+ "vmwrite{l}\t{$src, $dst|$dst, $src}", [], IIC_VMX>, PS, Requires<[Not64BitMode]>;
+} // mayLoad
+
// 0F 01 C4
def VMXOFF : I<0x01, MRM_C4, (outs), (ins), "vmxoff", []>, TB;
def VMXON : I<0xC7, MRM6m, (outs), (ins i64mem:$vmxon),
"vmxon\t$vmxon", []>, XS;
-
+} // SchedRW
diff --git a/lib/Target/X86/X86InstrVecCompiler.td b/lib/Target/X86/X86InstrVecCompiler.td
new file mode 100644
index 000000000000..c1cb4dcb16be
--- /dev/null
+++ b/lib/Target/X86/X86InstrVecCompiler.td
@@ -0,0 +1,586 @@
+//===- X86InstrVecCompiler.td - Vector Compiler Patterns ---*- tablegen -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the various vector pseudo instructions used by the
+// compiler, as well as Pat patterns used during instruction selection.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// No op bitconverts
+//===----------------------------------------------------------------------===//
+
+// Bitcasts between 128-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+def : Pat<(v2i64 (bitconvert (v4i32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8i16 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v16i8 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v2f64 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v4f32 VR128:$src))), (v2i64 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2i64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8i16 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v16i8 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v2f64 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v4f32 VR128:$src))), (v4i32 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2i64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4i32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v16i8 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v2f64 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v4f32 VR128:$src))), (v8i16 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2i64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4i32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8i16 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v2f64 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v4f32 VR128:$src))), (v16i8 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2i64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v4i32 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8i16 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v16i8 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v2f64 VR128:$src))), (v4f32 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v2i64 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4i32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8i16 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v16i8 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v4f32 VR128:$src))), (v2f64 VR128:$src)>;
+def : Pat<(f128 (bitconvert (i128 FR128:$src))), (f128 FR128:$src)>;
+def : Pat<(i128 (bitconvert (f128 FR128:$src))), (i128 FR128:$src)>;
+
+// Bitcasts between 256-bit vector types. Return the original type since
+// no instruction is needed for the conversion
+def : Pat<(v4i64 (bitconvert (v8i32 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v16i16 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v32i8 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v8f32 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v4i64 (bitconvert (v4f64 VR256:$src))), (v4i64 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v4i64 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v16i16 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v32i8 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v4f64 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v8i32 (bitconvert (v8f32 VR256:$src))), (v8i32 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v4i64 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v8i32 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v32i8 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v4f64 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v16i16 (bitconvert (v8f32 VR256:$src))), (v16i16 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v4i64 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v8i32 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v16i16 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v4f64 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v32i8 (bitconvert (v8f32 VR256:$src))), (v32i8 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v4i64 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v8i32 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v16i16 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v32i8 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v8f32 (bitconvert (v4f64 VR256:$src))), (v8f32 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v4i64 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v8i32 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v16i16 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v32i8 VR256:$src))), (v4f64 VR256:$src)>;
+def : Pat<(v4f64 (bitconvert (v8f32 VR256:$src))), (v4f64 VR256:$src)>;
+
+// Bitcasts between 512-bit vector types. Return the original type since
+// no instruction is needed for the conversion.
+def : Pat<(v8f64 (bitconvert (v8i64 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v16i32 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v32i16 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v64i8 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v8f64 (bitconvert (v16f32 VR512:$src))), (v8f64 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v8i64 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v16i32 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v32i16 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v64i8 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v16f32 (bitconvert (v8f64 VR512:$src))), (v16f32 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v16i32 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v32i16 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v64i8 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v8f64 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v8i64 (bitconvert (v16f32 VR512:$src))), (v8i64 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v8i64 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v16f32 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v32i16 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v64i8 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v16i32 (bitconvert (v8f64 VR512:$src))), (v16i32 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v8i64 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v16i32 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v64i8 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v8f64 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v32i16 (bitconvert (v16f32 VR512:$src))), (v32i16 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v8i64 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v16i32 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v32i16 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v8f64 VR512:$src))), (v64i8 VR512:$src)>;
+def : Pat<(v64i8 (bitconvert (v16f32 VR512:$src))), (v64i8 VR512:$src)>;
+
+
+//===----------------------------------------------------------------------===//
+// Non-instruction patterns
+//===----------------------------------------------------------------------===//
+
+// A vector extract of the first f32/f64 position is a subregister copy
+def : Pat<(f32 (extractelt (v4f32 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v4f32 VR128:$src), FR32)>;
+def : Pat<(f64 (extractelt (v2f64 VR128:$src), (iPTR 0))),
+ (COPY_TO_REGCLASS (v2f64 VR128:$src), FR64)>;
+
+// Implicitly promote a 32-bit scalar to a vector.
+def : Pat<(v4f32 (scalar_to_vector FR32:$src)),
+ (COPY_TO_REGCLASS FR32:$src, VR128)>;
+// Implicitly promote a 64-bit scalar to a vector.
+def : Pat<(v2f64 (scalar_to_vector FR64:$src)),
+ (COPY_TO_REGCLASS FR64:$src, VR128)>;
+
+
+//===----------------------------------------------------------------------===//
+// Subvector tricks
+//===----------------------------------------------------------------------===//
+
+// Patterns for insert_subvector/extract_subvector to/from index=0
+multiclass subvector_subreg_lowering<RegisterClass subRC, ValueType subVT,
+ RegisterClass RC, ValueType VT,
+ SubRegIndex subIdx> {
+ def : Pat<(subVT (extract_subvector (VT RC:$src), (iPTR 0))),
+ (subVT (EXTRACT_SUBREG RC:$src, subIdx))>;
+
+ let AddedComplexity = 25 in // to give priority over vinsertf128rm
+ def : Pat<(VT (insert_subvector undef, subRC:$src, (iPTR 0))),
+ (VT (INSERT_SUBREG (IMPLICIT_DEF), subRC:$src, subIdx))>;
+}
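+
+// Rough illustration (an expansion sketch, not an additional pattern): the
+// first instantiation below, subvector_subreg_lowering<VR128, v4i32, VR256,
+// v8i32, sub_xmm>, is equivalent to:
+//
+//   def : Pat<(v4i32 (extract_subvector (v8i32 VR256:$src), (iPTR 0))),
+//             (v4i32 (EXTRACT_SUBREG VR256:$src, sub_xmm))>;
+//   let AddedComplexity = 25 in
+//   def : Pat<(v8i32 (insert_subvector undef, VR128:$src, (iPTR 0))),
+//             (v8i32 (INSERT_SUBREG (IMPLICIT_DEF), VR128:$src, sub_xmm))>;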
+
+// A 128-bit subvector extract from the first 256-bit vector position is a
+// subregister copy that needs no instruction. Likewise, a 128-bit subvector
+// insert to the first 256-bit vector position is a subregister copy that needs
+// no instruction.
+defm : subvector_subreg_lowering<VR128, v4i32, VR256, v8i32, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v4f32, VR256, v8f32, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v2i64, VR256, v4i64, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v2f64, VR256, v4f64, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v8i16, VR256, v16i16, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v16i8, VR256, v32i8, sub_xmm>;
+
+// A 128-bit subvector extract from the first 512-bit vector position is a
+// subregister copy that needs no instruction. Likewise, a 128-bit subvector
+// insert to the first 512-bit vector position is a subregister copy that needs
+// no instruction.
+defm : subvector_subreg_lowering<VR128, v4i32, VR512, v16i32, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v4f32, VR512, v16f32, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v2i64, VR512, v8i64, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v2f64, VR512, v8f64, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v8i16, VR512, v32i16, sub_xmm>;
+defm : subvector_subreg_lowering<VR128, v16i8, VR512, v64i8, sub_xmm>;
+
+// A 256-bit subvector extract from the first 512-bit vector position is a
+// subregister copy that needs no instruction. Likewise, a 256-bit subvector
+// insert to the first 512-bit vector position is a subregister copy that needs
+// no instruction.
+defm : subvector_subreg_lowering<VR256, v8i32, VR512, v16i32, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v8f32, VR512, v16f32, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v4i64, VR512, v8i64, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v4f64, VR512, v8f64, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v16i16, VR512, v32i16, sub_ymm>;
+defm : subvector_subreg_lowering<VR256, v32i8, VR512, v64i8, sub_ymm>;
+
+
+multiclass subvector_store_lowering<string AlignedStr, string UnalignedStr,
+ RegisterClass RC, ValueType DstTy,
+ ValueType SrcTy, SubRegIndex SubIdx> {
+ def : Pat<(alignedstore (DstTy (extract_subvector
+ (SrcTy RC:$src), (iPTR 0))), addr:$dst),
+ (!cast<Instruction>("VMOV"#AlignedStr#"mr") addr:$dst,
+ (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
+
+ def : Pat<(store (DstTy (extract_subvector
+ (SrcTy RC:$src), (iPTR 0))), addr:$dst),
+ (!cast<Instruction>("VMOV"#UnalignedStr#"mr") addr:$dst,
+ (DstTy (EXTRACT_SUBREG RC:$src, SubIdx)))>;
+}
+
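+// Rough illustration (expansion sketch only): the "APS"/"UPS" instantiation
+// below, subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>,
+// is equivalent to:
+//
+//   def : Pat<(alignedstore (v4f32 (extract_subvector (v8f32 VR256X:$src),
+//                                                     (iPTR 0))), addr:$dst),
+//             (VMOVAPSmr addr:$dst,
+//              (v4f32 (EXTRACT_SUBREG VR256X:$src, sub_xmm)))>;
+//   def : Pat<(store (v4f32 (extract_subvector (v8f32 VR256X:$src),
+//                                              (iPTR 0))), addr:$dst),
+//             (VMOVUPSmr addr:$dst,
+//              (v4f32 (EXTRACT_SUBREG VR256X:$src, sub_xmm)))>;
+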
+let Predicates = [HasAVX, NoVLX] in {
+ defm : subvector_store_lowering<"APD", "UPD", VR256X, v2f64, v4f64, sub_xmm>;
+ defm : subvector_store_lowering<"APS", "UPS", VR256X, v4f32, v8f32, sub_xmm>;
+ defm : subvector_store_lowering<"DQA", "DQU", VR256X, v2i64, v4i64, sub_xmm>;
+ defm : subvector_store_lowering<"DQA", "DQU", VR256X, v4i32, v8i32, sub_xmm>;
+ defm : subvector_store_lowering<"DQA", "DQU", VR256X, v8i16, v16i16, sub_xmm>;
+ defm : subvector_store_lowering<"DQA", "DQU", VR256X, v16i8, v32i8, sub_xmm>;
+}
+
+let Predicates = [HasVLX] in {
+  // Special patterns for storing subvector extracts of the lower 128 bits.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
+ defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR256X, v2f64, v4f64,
+ sub_xmm>;
+ defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR256X, v4f32, v8f32,
+ sub_xmm>;
+ defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v2i64,
+ v4i64, sub_xmm>;
+ defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v4i32,
+ v8i32, sub_xmm>;
+ defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v8i16,
+ v16i16, sub_xmm>;
+ defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR256X, v16i8,
+ v32i8, sub_xmm>;
+
+  // Special patterns for storing subvector extracts of the lower 128 bits of 512.
+  // It's cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr.
+ defm : subvector_store_lowering<"APDZ128", "UPDZ128", VR512, v2f64, v8f64,
+ sub_xmm>;
+ defm : subvector_store_lowering<"APSZ128", "UPSZ128", VR512, v4f32, v16f32,
+ sub_xmm>;
+ defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v2i64,
+ v8i64, sub_xmm>;
+ defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v4i32,
+ v16i32, sub_xmm>;
+ defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v8i16,
+ v32i16, sub_xmm>;
+ defm : subvector_store_lowering<"DQA32Z128", "DQU32Z128", VR512, v16i8,
+ v64i8, sub_xmm>;
+
+  // Special patterns for storing subvector extracts of the lower 256 bits of 512.
+  // It's cheaper to just use VMOVAPS/VMOVUPS than an explicit vextract store.
+ defm : subvector_store_lowering<"APDZ256", "UPDZ256", VR512, v4f64, v8f64,
+ sub_ymm>;
+ defm : subvector_store_lowering<"APSZ256", "UPSZ256", VR512, v8f32, v16f32,
+ sub_ymm>;
+ defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v4i64,
+ v8i64, sub_ymm>;
+ defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v8i32,
+ v16i32, sub_ymm>;
+ defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v16i16,
+ v32i16, sub_ymm>;
+ defm : subvector_store_lowering<"DQA32Z256", "DQU32Z256", VR512, v32i8,
+ v64i8, sub_ymm>;
+}
+
+// If we're inserting into an all-zeros vector, just use a plain move, which
+// will zero the upper bits.
+// TODO: Is there a safe way to detect whether the producing instruction
+// already zeroed the upper bits?
+multiclass subvector_zero_lowering<string MoveStr, RegisterClass RC,
+ ValueType DstTy, ValueType SrcTy,
+ ValueType ZeroTy, PatFrag memop,
+ SubRegIndex SubIdx> {
+ def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
+ (SrcTy RC:$src), (iPTR 0))),
+ (SUBREG_TO_REG (i64 0),
+ (!cast<Instruction>("VMOV"#MoveStr#"rr") RC:$src), SubIdx)>;
+
+ def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
+ (SrcTy (bitconvert (memop addr:$src))),
+ (iPTR 0))),
+ (SUBREG_TO_REG (i64 0),
+ (!cast<Instruction>("VMOV"#MoveStr#"rm") addr:$src), SubIdx)>;
+}
+
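+// Rough illustration (expansion sketch only): the "APS" instantiation below,
+// subvector_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, loadv4f32,
+// sub_xmm>, turns an insert into a zero vector into a plain VMOVAPS, relying
+// on the VEX encoding to zero the upper lanes:
+//
+//   def : Pat<(v8f32 (insert_subvector (bitconvert (v8i32 immAllZerosV)),
+//                                      (v4f32 VR128:$src), (iPTR 0))),
+//             (SUBREG_TO_REG (i64 0), (VMOVAPSrr VR128:$src), sub_xmm)>;
+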
+let Predicates = [HasAVX, NoVLX] in {
+ defm : subvector_zero_lowering<"APD", VR128, v4f64, v2f64, v8i32, loadv2f64,
+ sub_xmm>;
+ defm : subvector_zero_lowering<"APS", VR128, v8f32, v4f32, v8i32, loadv4f32,
+ sub_xmm>;
+ defm : subvector_zero_lowering<"DQA", VR128, v4i64, v2i64, v8i32, loadv2i64,
+ sub_xmm>;
+ defm : subvector_zero_lowering<"DQA", VR128, v8i32, v4i32, v8i32, loadv2i64,
+ sub_xmm>;
+ defm : subvector_zero_lowering<"DQA", VR128, v16i16, v8i16, v8i32, loadv2i64,
+ sub_xmm>;
+ defm : subvector_zero_lowering<"DQA", VR128, v32i8, v16i8, v8i32, loadv2i64,
+ sub_xmm>;
+}
+
+let Predicates = [HasVLX] in {
+ defm : subvector_zero_lowering<"APDZ128", VR128X, v4f64, v2f64, v8i32,
+ loadv2f64, sub_xmm>;
+ defm : subvector_zero_lowering<"APSZ128", VR128X, v8f32, v4f32, v8i32,
+ loadv4f32, sub_xmm>;
+ defm : subvector_zero_lowering<"DQA64Z128", VR128X, v4i64, v2i64, v8i32,
+ loadv2i64, sub_xmm>;
+ defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i32, v4i32, v8i32,
+ loadv2i64, sub_xmm>;
+ defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i16, v8i16, v8i32,
+ loadv2i64, sub_xmm>;
+ defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i8, v16i8, v8i32,
+ loadv2i64, sub_xmm>;
+
+ defm : subvector_zero_lowering<"APDZ128", VR128X, v8f64, v2f64, v16i32,
+ loadv2f64, sub_xmm>;
+ defm : subvector_zero_lowering<"APSZ128", VR128X, v16f32, v4f32, v16i32,
+ loadv4f32, sub_xmm>;
+ defm : subvector_zero_lowering<"DQA64Z128", VR128X, v8i64, v2i64, v16i32,
+ loadv2i64, sub_xmm>;
+ defm : subvector_zero_lowering<"DQA64Z128", VR128X, v16i32, v4i32, v16i32,
+ loadv2i64, sub_xmm>;
+ defm : subvector_zero_lowering<"DQA64Z128", VR128X, v32i16, v8i16, v16i32,
+ loadv2i64, sub_xmm>;
+ defm : subvector_zero_lowering<"DQA64Z128", VR128X, v64i8, v16i8, v16i32,
+ loadv2i64, sub_xmm>;
+
+ defm : subvector_zero_lowering<"APDZ256", VR256X, v8f64, v4f64, v16i32,
+ loadv4f64, sub_ymm>;
+ defm : subvector_zero_lowering<"APSZ256", VR256X, v16f32, v8f32, v16i32,
+ loadv8f32, sub_ymm>;
+ defm : subvector_zero_lowering<"DQA64Z256", VR256X, v8i64, v4i64, v16i32,
+ loadv4i64, sub_ymm>;
+ defm : subvector_zero_lowering<"DQA64Z256", VR256X, v16i32, v8i32, v16i32,
+ loadv4i64, sub_ymm>;
+ defm : subvector_zero_lowering<"DQA64Z256", VR256X, v32i16, v16i16, v16i32,
+ loadv4i64, sub_ymm>;
+ defm : subvector_zero_lowering<"DQA64Z256", VR256X, v64i8, v32i8, v16i32,
+ loadv4i64, sub_ymm>;
+}
+
+let Predicates = [HasAVX512, NoVLX] in {
+ defm : subvector_zero_lowering<"APD", VR128, v8f64, v2f64, v16i32, loadv2f64,
+ sub_xmm>;
+ defm : subvector_zero_lowering<"APS", VR128, v16f32, v4f32, v16i32, loadv4f32,
+ sub_xmm>;
+ defm : subvector_zero_lowering<"DQA", VR128, v8i64, v2i64, v16i32, loadv2i64,
+ sub_xmm>;
+ defm : subvector_zero_lowering<"DQA", VR128, v16i32, v4i32, v16i32, loadv2i64,
+ sub_xmm>;
+ defm : subvector_zero_lowering<"DQA", VR128, v32i16, v8i16, v16i32, loadv2i64,
+ sub_xmm>;
+ defm : subvector_zero_lowering<"DQA", VR128, v64i8, v16i8, v16i32, loadv2i64,
+ sub_xmm>;
+
+ defm : subvector_zero_lowering<"APDY", VR256, v8f64, v4f64, v16i32,
+ loadv4f64, sub_ymm>;
+ defm : subvector_zero_lowering<"APSY", VR256, v16f32, v8f32, v16i32,
+ loadv8f32, sub_ymm>;
+ defm : subvector_zero_lowering<"DQAY", VR256, v8i64, v4i64, v16i32,
+ loadv4i64, sub_ymm>;
+ defm : subvector_zero_lowering<"DQAY", VR256, v16i32, v8i32, v16i32,
+ loadv4i64, sub_ymm>;
+ defm : subvector_zero_lowering<"DQAY", VR256, v32i16, v16i16, v16i32,
+ loadv4i64, sub_ymm>;
+ defm : subvector_zero_lowering<"DQAY", VR256, v64i8, v32i8, v16i32,
+ loadv4i64, sub_ymm>;
+}
+
+// List of opcodes that are guaranteed to zero the upper elements of vector
+// regs.
+// TODO: Ideally this would be a blacklist instead of a whitelist. But SHA
+// intrinsics and some MMX->XMM move instructions that aren't VEX encoded make
+// this difficult. So we start with a couple of opcodes used by reduction loops
+// where we explicitly insert zeros.
+class veczeroupper<ValueType vt, RegisterClass RC> :
+ PatLeaf<(vt RC:$src), [{
+ return N->getOpcode() == X86ISD::VPMADDWD ||
+ N->getOpcode() == X86ISD::PSADBW;
+ }]>;
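+
+// For example, zeroupperv4i32 below matches any v4i32 value in VR128 whose
+// defining node is X86ISD::VPMADDWD or X86ISD::PSADBW, the two opcodes
+// currently whitelisted as zeroing the upper vector elements.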
+
+def zeroupperv2f64 : veczeroupper<v2f64, VR128>;
+def zeroupperv4f32 : veczeroupper<v4f32, VR128>;
+def zeroupperv2i64 : veczeroupper<v2i64, VR128>;
+def zeroupperv4i32 : veczeroupper<v4i32, VR128>;
+def zeroupperv8i16 : veczeroupper<v8i16, VR128>;
+def zeroupperv16i8 : veczeroupper<v16i8, VR128>;
+
+def zeroupperv4f64 : veczeroupper<v4f64, VR256>;
+def zeroupperv8f32 : veczeroupper<v8f32, VR256>;
+def zeroupperv4i64 : veczeroupper<v4i64, VR256>;
+def zeroupperv8i32 : veczeroupper<v8i32, VR256>;
+def zeroupperv16i16 : veczeroupper<v16i16, VR256>;
+def zeroupperv32i8 : veczeroupper<v32i8, VR256>;
+
+
+// If we can guarantee that the upper elements have already been zeroed, we can
+// elide the explicit zeroing.
+multiclass subvector_zero_ellision<RegisterClass RC, ValueType DstTy,
+ ValueType SrcTy, ValueType ZeroTy,
+ SubRegIndex SubIdx, PatLeaf Zeroupper> {
+ def : Pat<(DstTy (insert_subvector (bitconvert (ZeroTy immAllZerosV)),
+ Zeroupper:$src, (iPTR 0))),
+ (SUBREG_TO_REG (i64 0), RC:$src, SubIdx)>;
+}
+
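+// Rough illustration (expansion sketch only): the 128->256 f32 instantiation
+// below, subvector_zero_ellision<VR128, v8f32, v4f32, v8i32, sub_xmm,
+// zeroupperv4f32>, is equivalent to:
+//
+//   def : Pat<(v8f32 (insert_subvector (bitconvert (v8i32 immAllZerosV)),
+//                                      zeroupperv4f32:$src, (iPTR 0))),
+//             (SUBREG_TO_REG (i64 0), VR128:$src, sub_xmm)>;
+//
+// i.e. when the producer is known to zero the upper lanes, no move is emitted
+// at all.
+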
+// 128->256
+defm: subvector_zero_ellision<VR128, v4f64, v2f64, v8i32, sub_xmm, zeroupperv2f64>;
+defm: subvector_zero_ellision<VR128, v8f32, v4f32, v8i32, sub_xmm, zeroupperv4f32>;
+defm: subvector_zero_ellision<VR128, v4i64, v2i64, v8i32, sub_xmm, zeroupperv2i64>;
+defm: subvector_zero_ellision<VR128, v8i32, v4i32, v8i32, sub_xmm, zeroupperv4i32>;
+defm: subvector_zero_ellision<VR128, v16i16, v8i16, v8i32, sub_xmm, zeroupperv8i16>;
+defm: subvector_zero_ellision<VR128, v32i8, v16i8, v8i32, sub_xmm, zeroupperv16i8>;
+
+// 128->512
+defm: subvector_zero_ellision<VR128, v8f64, v2f64, v16i32, sub_xmm, zeroupperv2f64>;
+defm: subvector_zero_ellision<VR128, v16f32, v4f32, v16i32, sub_xmm, zeroupperv4f32>;
+defm: subvector_zero_ellision<VR128, v8i64, v2i64, v16i32, sub_xmm, zeroupperv2i64>;
+defm: subvector_zero_ellision<VR128, v16i32, v4i32, v16i32, sub_xmm, zeroupperv4i32>;
+defm: subvector_zero_ellision<VR128, v32i16, v8i16, v16i32, sub_xmm, zeroupperv8i16>;
+defm: subvector_zero_ellision<VR128, v64i8, v16i8, v16i32, sub_xmm, zeroupperv16i8>;
+
+// 256->512
+defm: subvector_zero_ellision<VR256, v8f64, v4f64, v16i32, sub_ymm, zeroupperv4f64>;
+defm: subvector_zero_ellision<VR256, v16f32, v8f32, v16i32, sub_ymm, zeroupperv8f32>;
+defm: subvector_zero_ellision<VR256, v8i64, v4i64, v16i32, sub_ymm, zeroupperv4i64>;
+defm: subvector_zero_ellision<VR256, v16i32, v8i32, v16i32, sub_ymm, zeroupperv8i32>;
+defm: subvector_zero_ellision<VR256, v32i16, v16i16, v16i32, sub_ymm, zeroupperv16i16>;
+defm: subvector_zero_ellision<VR256, v64i8, v32i8, v16i32, sub_ymm, zeroupperv32i8>;
+
+
+class maskzeroupper<ValueType vt, RegisterClass RC> :
+ PatLeaf<(vt RC:$src), [{
+ return isMaskZeroExtended(N);
+ }]>;
+
+def maskzeroupperv2i1 : maskzeroupper<v2i1, VK2>;
+def maskzeroupperv4i1 : maskzeroupper<v4i1, VK4>;
+def maskzeroupperv8i1 : maskzeroupper<v8i1, VK8>;
+def maskzeroupperv16i1 : maskzeroupper<v16i1, VK16>;
+def maskzeroupperv32i1 : maskzeroupper<v32i1, VK32>;
+
+// These patterns determine whether we can rely on the upper bits of a mask
+// register having been zeroed by the previous operation, so that we can skip
+// the explicit zeroing.
+let Predicates = [HasBWI] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv8i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK8:$src, VK32)>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv16i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK16:$src, VK32)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv8i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK8:$src, VK64)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv16i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK16:$src, VK64)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv32i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK32:$src, VK64)>;
+}
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ maskzeroupperv8i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK8:$src, VK16)>;
+}
+
+let Predicates = [HasVLX, HasDQI] in {
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ maskzeroupperv2i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK2:$src, VK8)>;
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ maskzeroupperv4i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK4:$src, VK8)>;
+}
+
+let Predicates = [HasVLX] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ maskzeroupperv2i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK2:$src, VK16)>;
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ maskzeroupperv4i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK4:$src, VK16)>;
+}
+
+let Predicates = [HasBWI, HasVLX] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv2i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK2:$src, VK32)>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ maskzeroupperv4i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK4:$src, VK32)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv2i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK2:$src, VK64)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ maskzeroupperv4i1:$src, (iPTR 0))),
+ (COPY_TO_REGCLASS VK4:$src, VK64)>;
+}
+
+// If the upper bits are not known to be zero, we have to fall back to
+// explicitly zeroing them using shifts.
+let Predicates = [HasAVX512, NoDQI] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK8:$mask, VK16),
+ (i8 8)), (i8 8))>;
+}
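+
+// Worked example of the shift-based fallback above: widening a v8i1 mask to
+// v16i1 without DQI is emitted as, roughly,
+//   kshiftlw $8, %k1, %k1
+//   kshiftrw $8, %k1, %k1
+// which clears bits 15:8 while preserving the low 8 mask bits.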
+
+let Predicates = [HasDQI] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK16)>;
+}
+
+let Predicates = [HasVLX, HasDQI] in {
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ (v2i1 VK2:$mask), (iPTR 0))),
+ (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK2:$mask, VK8),
+ (i8 6)), (i8 6))>;
+ def : Pat<(v8i1 (insert_subvector (v8i1 immAllZerosV),
+ (v4i1 VK4:$mask), (iPTR 0))),
+ (KSHIFTRBri (KSHIFTLBri (COPY_TO_REGCLASS VK4:$mask, VK8),
+ (i8 4)), (i8 4))>;
+}
+
+let Predicates = [HasVLX] in {
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v2i1 VK2:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK2:$mask, VK16),
+ (i8 14)), (i8 14))>;
+ def : Pat<(v16i1 (insert_subvector (v16i1 immAllZerosV),
+ (v4i1 VK4:$mask), (iPTR 0))),
+ (KSHIFTRWri (KSHIFTLWri (COPY_TO_REGCLASS VK4:$mask, VK16),
+ (i8 12)), (i8 12))>;
+}
+
+let Predicates = [HasBWI] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v16i1 VK16:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK32)>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v16i1 VK16:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVWkk VK16:$mask), VK64)>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v32i1 VK32:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVDkk VK32:$mask), VK64)>;
+}
+
+let Predicates = [HasBWI, NoDQI] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK8:$mask, VK32),
+ (i8 24)), (i8 24))>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK8:$mask, VK64),
+ (i8 56)), (i8 56))>;
+}
+
+let Predicates = [HasBWI, HasDQI] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK32)>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v8i1 VK8:$mask), (iPTR 0))),
+ (COPY_TO_REGCLASS (KMOVBkk VK8:$mask), VK64)>;
+}
+
+let Predicates = [HasBWI, HasVLX] in {
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v2i1 VK2:$mask), (iPTR 0))),
+ (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK2:$mask, VK32),
+ (i8 30)), (i8 30))>;
+ def : Pat<(v32i1 (insert_subvector (v32i1 immAllZerosV),
+ (v4i1 VK4:$mask), (iPTR 0))),
+ (KSHIFTRDri (KSHIFTLDri (COPY_TO_REGCLASS VK4:$mask, VK32),
+ (i8 28)), (i8 28))>;
+
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v2i1 VK2:$mask), (iPTR 0))),
+ (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK2:$mask, VK64),
+ (i8 62)), (i8 62))>;
+ def : Pat<(v64i1 (insert_subvector (v64i1 immAllZerosV),
+ (v4i1 VK4:$mask), (iPTR 0))),
+ (KSHIFTRQri (KSHIFTLQri (COPY_TO_REGCLASS VK4:$mask, VK64),
+ (i8 60)), (i8 60))>;
+}
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 5dde2d07babe..c4b8e3e90d29 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -14,10 +14,11 @@
multiclass xop2op<bits<8> opc, string OpcodeStr, Intrinsic Int, PatFrag memop> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WritePHAdd]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
+ Sched<[WritePHAddLd, ReadAfterLd]>;
}
let ExeDomain = SSEPackedInt in {
@@ -43,30 +44,33 @@ multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
Operand memop, ComplexPattern mem_cpat> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WriteFAdd]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP;
+ [(set VR128:$dst, (Int (bitconvert mem_cpat:$src)))]>, XOP,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
}
multiclass xop2op128<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int VR128:$src))]>, XOP;
+ [(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[WriteFAdd]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP;
+ [(set VR128:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
}
multiclass xop2op256<bits<8> opc, string OpcodeStr, Intrinsic Int,
PatFrag memop> {
def rrY : IXOP<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L;
+ [(set VR256:$dst, (Int VR256:$src))]>, XOP, VEX_L, Sched<[WriteFAdd]>;
def rmY : IXOP<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L;
+ [(set VR256:$dst, (Int (bitconvert (memop addr:$src))))]>, XOP, VEX_L,
+ Sched<[WriteFAddLd, ReadAfterLd]>;
}
let ExeDomain = SSEPackedSingle in {
@@ -97,14 +101,14 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (bitconvert (loadv2i64 addr:$src2))))))]>,
- XOP_4V, VEX_W, Sched<[WriteVarVecShift, ReadAfterLd]>;
+ XOP_4V, VEX_W, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
def mr : IXOP<opc, MRMSrcMem4VOp3, (outs VR128:$dst),
(ins i128mem:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))),
(vt128 VR128:$src2))))]>,
- XOP, Sched<[WriteVarVecShift, ReadAfterLd]>;
+ XOP, Sched<[WriteVarVecShiftLd, ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : IXOP<opc, MRMSrcReg, (outs VR128:$dst),
@@ -115,10 +119,10 @@ multiclass xop3op<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
let ExeDomain = SSEPackedInt in {
- defm VPROTB : xop3op<0x90, "vprotb", X86vprot, v16i8>;
- defm VPROTD : xop3op<0x92, "vprotd", X86vprot, v4i32>;
- defm VPROTQ : xop3op<0x93, "vprotq", X86vprot, v2i64>;
- defm VPROTW : xop3op<0x91, "vprotw", X86vprot, v8i16>;
+ defm VPROTB : xop3op<0x90, "vprotb", rotl, v16i8>;
+ defm VPROTD : xop3op<0x92, "vprotd", rotl, v4i32>;
+ defm VPROTQ : xop3op<0x93, "vprotq", rotl, v2i64>;
+ defm VPROTW : xop3op<0x91, "vprotw", rotl, v8i16>;
defm VPSHAB : xop3op<0x98, "vpshab", X86vpsha, v16i8>;
defm VPSHAD : xop3op<0x9A, "vpshad", X86vpsha, v4i32>;
defm VPSHAQ : xop3op<0x9B, "vpshaq", X86vpsha, v2i64>;
@@ -135,19 +139,21 @@ multiclass xop3opimm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins VR128:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>, XOP;
+ (vt128 (OpNode (vt128 VR128:$src1), imm:$src2)))]>,
+ XOP, Sched<[WriteVecShift]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins i128mem:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
- (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>, XOP;
+ (vt128 (OpNode (vt128 (bitconvert (loadv2i64 addr:$src1))), imm:$src2)))]>,
+ XOP, Sched<[WriteVecShiftLd, ReadAfterLd]>;
}
let ExeDomain = SSEPackedInt in {
- defm VPROTB : xop3opimm<0xC0, "vprotb", X86vproti, v16i8>;
- defm VPROTD : xop3opimm<0xC2, "vprotd", X86vproti, v4i32>;
- defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vproti, v2i64>;
- defm VPROTW : xop3opimm<0xC1, "vprotw", X86vproti, v8i16>;
+ defm VPROTB : xop3opimm<0xC0, "vprotb", X86vrotli, v16i8>;
+ defm VPROTD : xop3opimm<0xC2, "vprotd", X86vrotli, v4i32>;
+ defm VPROTQ : xop3opimm<0xC3, "vprotq", X86vrotli, v2i64>;
+ defm VPROTW : xop3opimm<0xC1, "vprotw", X86vrotli, v8i16>;
}
// Instruction where second source can be memory, but third must be register
@@ -158,14 +164,15 @@ multiclass xop4opm2<bits<8> opc, string OpcodeStr, Intrinsic Int> {
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
- (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V;
+ (Int VR128:$src1, VR128:$src2, VR128:$src3))]>, XOP_4V,
+ Sched<[WriteVecIMul]>;
def rm : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set VR128:$dst,
(Int VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
- VR128:$src3))]>, XOP_4V;
+ VR128:$src3))]>, XOP_4V, Sched<[WriteVecIMulLd, ReadAfterLd]>;
}
let ExeDomain = SSEPackedInt in {
@@ -213,8 +220,8 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128>
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
- i8immZExt3:$cc)))]>,
- XOP_4V;
+ imm:$cc)))]>,
+ XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
def mi : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, XOPCC:$cc),
!strconcat("vpcom${cc}", Suffix,
@@ -222,20 +229,20 @@ multiclass xopvpcom<bits<8> opc, string Suffix, SDNode OpNode, ValueType vt128>
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1),
(vt128 (bitconvert (loadv2i64 addr:$src2))),
- i8immZExt3:$cc)))]>,
- XOP_4V;
+ imm:$cc)))]>,
+ XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
let isAsmParserOnly = 1, hasSideEffects = 0 in {
def ri_alt : IXOPi8<opc, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, u8imm:$src3),
!strconcat("vpcom", Suffix,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V;
+ []>, XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
let mayLoad = 1 in
def mi_alt : IXOPi8<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
!strconcat("vpcom", Suffix,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V;
+ []>, XOP_4V, Sched<[WriteVecALULd, ReadAfterLd]>;
}
}
@@ -259,7 +266,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 VR128:$src3))))]>,
- XOP_4V;
+ XOP_4V, Sched<[WriteShuffle]>;
def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, i128mem:$src3),
!strconcat(OpcodeStr,
@@ -267,7 +274,7 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
(vt128 (bitconvert (loadv2i64 addr:$src3))))))]>,
- XOP_4V, VEX_W;
+ XOP_4V, VEX_W, Sched<[WriteShuffleLd, ReadAfterLd]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -275,14 +282,14 @@ multiclass xop4op<bits<8> opc, string OpcodeStr, SDNode OpNode,
[(set VR128:$dst,
(v16i8 (OpNode (vt128 VR128:$src1), (vt128 (bitconvert (loadv2i64 addr:$src2))),
(vt128 VR128:$src3))))]>,
- XOP_4V;
+ XOP_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W, FoldGenData<NAME#rrr>;
+ []>, XOP_4V, VEX_W, Sched<[WriteShuffle]>, FoldGenData<NAME#rrr>;
}
let ExeDomain = SSEPackedInt in {
@@ -297,28 +304,29 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
- (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V;
+ (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V,
+ Sched<[WriteShuffle]>;
def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1),
(X86andnp (load addr:$src3), RC:$src2))))]>,
- XOP_4V, VEX_W;
+ XOP_4V, VEX_W, Sched<[WriteShuffleLd, ReadAfterLd]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
(X86andnp RC:$src3, (load addr:$src2)))))]>,
- XOP_4V;
+ XOP_4V, Sched<[WriteShuffleLd, ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- []>, XOP_4V, VEX_W, FoldGenData<NAME#rrr>;
+ []>, XOP_4V, VEX_W, Sched<[WriteShuffle]>, FoldGenData<NAME#rrr>;
}
let ExeDomain = SSEPackedInt in {
@@ -335,7 +343,8 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set RC:$dst,
- (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>;
+ (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>,
+ Sched<[WriteFShuffle]>;
def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
@@ -343,21 +352,23 @@ multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst,
(VT (X86vpermil2 RC:$src1, RC:$src2,
(bitconvert (IntLdFrag addr:$src3)),
- (i8 imm:$src4))))]>, VEX_W;
+ (i8 imm:$src4))))]>, VEX_W,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
[(set RC:$dst,
(VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2),
- RC:$src3, (i8 imm:$src4))))]>;
+ RC:$src3, (i8 imm:$src4))))]>,
+ Sched<[WriteFShuffleLd, ReadAfterLd]>;
// For disassembler
let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
!strconcat(OpcodeStr,
"\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
- []>, VEX_W, FoldGenData<NAME#rr>;
+ []>, VEX_W, Sched<[WriteFShuffle]>, FoldGenData<NAME#rr>;
}
let ExeDomain = SSEPackedDouble in {
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index 859d3288db89..44bbc3f1b3fa 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -1,4 +1,4 @@
-//===- X86InstructionSelector.cpp ----------------------------*- C++ -*-==//
+//===- X86InstructionSelector.cpp -----------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -12,6 +12,7 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86RegisterBankInfo.h"
@@ -19,27 +20,36 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/Type.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/Support/AtomicOrdering.h"
+#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <tuple>
#define DEBUG_TYPE "X86-isel"
-#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
-
using namespace llvm;
-#ifndef LLVM_BUILD_GLOBAL_ISEL
-#error "You shouldn't build this"
-#endif
-
namespace {
#define GET_GLOBALISEL_PREDICATE_BITSET
@@ -51,15 +61,16 @@ public:
X86InstructionSelector(const X86TargetMachine &TM, const X86Subtarget &STI,
const X86RegisterBankInfo &RBI);
- bool select(MachineInstr &I) const override;
+ bool select(MachineInstr &I, CodeGenCoverage &CoverageInfo) const override;
+ static const char *getName() { return DEBUG_TYPE; }
private:
/// tblgen-erated 'select' implementation, used as the initial selector for
/// the patterns that don't require complex C++.
- bool selectImpl(MachineInstr &I) const;
+ bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
// TODO: remove after supported by Tablegen-erated instruction selection.
- unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc,
+ unsigned getLoadStoreOp(const LLT &Ty, const RegisterBank &RB, unsigned Opc,
uint64_t Alignment) const;
bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -74,19 +85,28 @@ private:
MachineFunction &MF) const;
bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
+ bool selectAnyext(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectUadde(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectCopy(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF) const;
+ MachineFunction &MF,
+ CodeGenCoverage &CoverageInfo) const;
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF) const;
+ MachineFunction &MF,
+ CodeGenCoverage &CoverageInfo) const;
bool selectInsert(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectExtract(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
+ bool selectCondBranch(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool materializeFP(MachineInstr &I, MachineRegisterInfo &MRI,
+ MachineFunction &MF) const;
+ bool selectImplicitDefOrPHI(MachineInstr &I, MachineRegisterInfo &MRI) const;
// emit insert subreg instruction and insert it before MachineInstr &I
bool emitInsertSubreg(unsigned DstReg, unsigned SrcReg, MachineInstr &I,
@@ -171,21 +191,71 @@ X86InstructionSelector::getRegClass(LLT Ty, unsigned Reg,
return getRegClass(Ty, RegBank);
}
+static unsigned getSubRegIndex(const TargetRegisterClass *RC) {
+ unsigned SubIdx = X86::NoSubRegister;
+ if (RC == &X86::GR32RegClass) {
+ SubIdx = X86::sub_32bit;
+ } else if (RC == &X86::GR16RegClass) {
+ SubIdx = X86::sub_16bit;
+ } else if (RC == &X86::GR8RegClass) {
+ SubIdx = X86::sub_8bit;
+ }
+
+ return SubIdx;
+}
+
+static const TargetRegisterClass *getRegClassFromGRPhysReg(unsigned Reg) {
+ assert(TargetRegisterInfo::isPhysicalRegister(Reg));
+ if (X86::GR64RegClass.contains(Reg))
+ return &X86::GR64RegClass;
+ if (X86::GR32RegClass.contains(Reg))
+ return &X86::GR32RegClass;
+ if (X86::GR16RegClass.contains(Reg))
+ return &X86::GR16RegClass;
+ if (X86::GR8RegClass.contains(Reg))
+ return &X86::GR8RegClass;
+
+ llvm_unreachable("Unknown RegClass for PhysReg!");
+}
+
// Set X86 Opcode and constrain DestReg.
bool X86InstructionSelector::selectCopy(MachineInstr &I,
MachineRegisterInfo &MRI) const {
-
unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned DstSize = RBI.getSizeInBits(DstReg, MRI, TRI);
+ const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+
+ unsigned SrcReg = I.getOperand(1).getReg();
+ const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
+ const RegisterBank &SrcRegBank = *RBI.getRegBank(SrcReg, MRI, TRI);
+
if (TargetRegisterInfo::isPhysicalRegister(DstReg)) {
assert(I.isCopy() && "Generic operators do not allow physical registers");
+
+ if (DstSize > SrcSize && SrcRegBank.getID() == X86::GPRRegBankID &&
+ DstRegBank.getID() == X86::GPRRegBankID) {
+
+ const TargetRegisterClass *SrcRC =
+ getRegClass(MRI.getType(SrcReg), SrcRegBank);
+ const TargetRegisterClass *DstRC = getRegClassFromGRPhysReg(DstReg);
+
+ if (SrcRC != DstRC) {
+        // This case can be generated by ABI lowering; perform an anyext.
+ unsigned ExtSrc = MRI.createVirtualRegister(DstRC);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG))
+ .addDef(ExtSrc)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(getSubRegIndex(SrcRC));
+
+ I.getOperand(1).setReg(ExtSrc);
+ }
+ }
+
return true;
}
- const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
- const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
- unsigned SrcReg = I.getOperand(1).getReg();
- const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI);
-
assert((!TargetRegisterInfo::isPhysicalRegister(SrcReg) || I.isCopy()) &&
"No phys reg on generic operators");
assert((DstSize == SrcSize ||
@@ -195,38 +265,28 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
DstSize <= RBI.getSizeInBits(SrcReg, MRI, TRI))) &&
"Copy with different width?!");
- const TargetRegisterClass *RC = nullptr;
+ const TargetRegisterClass *DstRC =
+ getRegClass(MRI.getType(DstReg), DstRegBank);
- switch (RegBank.getID()) {
- case X86::GPRRegBankID:
- assert((DstSize <= 64) && "GPRs cannot get more than 64-bit width values.");
- RC = getRegClass(MRI.getType(DstReg), RegBank);
+ if (SrcRegBank.getID() == X86::GPRRegBankID &&
+ DstRegBank.getID() == X86::GPRRegBankID && SrcSize > DstSize &&
+ TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
+    // Change the physical register to perform the truncate.
- // Change the physical register
- if (SrcSize > DstSize && TargetRegisterInfo::isPhysicalRegister(SrcReg)) {
- if (RC == &X86::GR32RegClass)
- I.getOperand(1).setSubReg(X86::sub_32bit);
- else if (RC == &X86::GR16RegClass)
- I.getOperand(1).setSubReg(X86::sub_16bit);
- else if (RC == &X86::GR8RegClass)
- I.getOperand(1).setSubReg(X86::sub_8bit);
+ const TargetRegisterClass *SrcRC = getRegClassFromGRPhysReg(SrcReg);
+ if (DstRC != SrcRC) {
+ I.getOperand(1).setSubReg(getSubRegIndex(DstRC));
I.getOperand(1).substPhysReg(SrcReg, TRI);
}
- break;
- case X86::VECRRegBankID:
- RC = getRegClass(MRI.getType(DstReg), RegBank);
- break;
- default:
- llvm_unreachable("Unknown RegBank!");
}
// No need to constrain SrcReg. It will get constrained when
// we hit another of its use or its defs.
// Copies do not have constraints.
const TargetRegisterClass *OldRC = MRI.getRegClassOrNull(DstReg);
- if (!OldRC || !RC->hasSubClassEq(OldRC)) {
- if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+ if (!OldRC || !DstRC->hasSubClassEq(OldRC)) {
+ if (!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
<< " operand\n");
return false;
@@ -236,7 +296,8 @@ bool X86InstructionSelector::selectCopy(MachineInstr &I,
return true;
}
-bool X86InstructionSelector::select(MachineInstr &I) const {
+bool X86InstructionSelector::select(MachineInstr &I,
+ CodeGenCoverage &CoverageInfo) const {
assert(I.getParent() && "Instruction should be in a basic block!");
assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -248,51 +309,69 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
if (!isPreISelGenericOpcode(Opcode)) {
// Certain non-generic instructions also need some special handling.
+ if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
+ return false;
+
if (I.isCopy())
return selectCopy(I, MRI);
- // TODO: handle more cases - LOAD_STACK_GUARD, PHI
return true;
}
assert(I.getNumOperands() == I.getNumExplicitOperands() &&
"Generic instruction has unexpected implicit operands\n");
- if (selectImpl(I))
+ if (selectImpl(I, CoverageInfo))
return true;
DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
// TODO: This should be implemented by tblgen.
- if (selectLoadStoreOp(I, MRI, MF))
- return true;
- if (selectFrameIndexOrGep(I, MRI, MF))
- return true;
- if (selectGlobalValue(I, MRI, MF))
- return true;
- if (selectConstant(I, MRI, MF))
- return true;
- if (selectTrunc(I, MRI, MF))
- return true;
- if (selectZext(I, MRI, MF))
- return true;
- if (selectCmp(I, MRI, MF))
- return true;
- if (selectUadde(I, MRI, MF))
- return true;
- if (selectUnmergeValues(I, MRI, MF))
- return true;
- if (selectMergeValues(I, MRI, MF))
- return true;
- if (selectExtract(I, MRI, MF))
- return true;
- if (selectInsert(I, MRI, MF))
- return true;
+ switch (I.getOpcode()) {
+ default:
+ return false;
+ case TargetOpcode::G_STORE:
+ case TargetOpcode::G_LOAD:
+ return selectLoadStoreOp(I, MRI, MF);
+ case TargetOpcode::G_GEP:
+ case TargetOpcode::G_FRAME_INDEX:
+ return selectFrameIndexOrGep(I, MRI, MF);
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return selectGlobalValue(I, MRI, MF);
+ case TargetOpcode::G_CONSTANT:
+ return selectConstant(I, MRI, MF);
+ case TargetOpcode::G_FCONSTANT:
+ return materializeFP(I, MRI, MF);
+ case TargetOpcode::G_TRUNC:
+ return selectTrunc(I, MRI, MF);
+ case TargetOpcode::G_ZEXT:
+ return selectZext(I, MRI, MF);
+ case TargetOpcode::G_ANYEXT:
+ return selectAnyext(I, MRI, MF);
+ case TargetOpcode::G_ICMP:
+ return selectCmp(I, MRI, MF);
+ case TargetOpcode::G_UADDE:
+ return selectUadde(I, MRI, MF);
+ case TargetOpcode::G_UNMERGE_VALUES:
+ return selectUnmergeValues(I, MRI, MF, CoverageInfo);
+ case TargetOpcode::G_MERGE_VALUES:
+ return selectMergeValues(I, MRI, MF, CoverageInfo);
+ case TargetOpcode::G_EXTRACT:
+ return selectExtract(I, MRI, MF);
+ case TargetOpcode::G_INSERT:
+ return selectInsert(I, MRI, MF);
+ case TargetOpcode::G_BRCOND:
+ return selectCondBranch(I, MRI, MF);
+ case TargetOpcode::G_IMPLICIT_DEF:
+ case TargetOpcode::G_PHI:
+ return selectImplicitDefOrPHI(I, MRI);
+ }
return false;
}
-unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
+unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
+ const RegisterBank &RB,
unsigned Opc,
uint64_t Alignment) const {
bool Isload = (Opc == TargetOpcode::G_LOAD);
@@ -366,9 +445,9 @@ unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
}
// Fill in an address from the given instruction.
-void X86SelectAddress(const MachineInstr &I, const MachineRegisterInfo &MRI,
- X86AddressMode &AM) {
-
+static void X86SelectAddress(const MachineInstr &I,
+ const MachineRegisterInfo &MRI,
+ X86AddressMode &AM) {
assert(I.getOperand(0).isReg() && "unsupported opperand.");
assert(MRI.getType(I.getOperand(0).getReg()).isPointer() &&
"unsupported type.");
@@ -390,17 +469,15 @@ void X86SelectAddress(const MachineInstr &I, const MachineRegisterInfo &MRI,
// Default behavior.
AM.Base.Reg = I.getOperand(0).getReg();
- return;
}
bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
-
unsigned Opc = I.getOpcode();
- if (Opc != TargetOpcode::G_STORE && Opc != TargetOpcode::G_LOAD)
- return false;
+ assert((Opc == TargetOpcode::G_STORE || Opc == TargetOpcode::G_LOAD) &&
+ "unexpected instruction");
const unsigned DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
@@ -447,8 +524,8 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I,
MachineFunction &MF) const {
unsigned Opc = I.getOpcode();
- if (Opc != TargetOpcode::G_FRAME_INDEX && Opc != TargetOpcode::G_GEP)
- return false;
+ assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_GEP) &&
+ "unexpected instruction");
const unsigned DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
@@ -473,10 +550,8 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I,
bool X86InstructionSelector::selectGlobalValue(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
- unsigned Opc = I.getOpcode();
-
- if (Opc != TargetOpcode::G_GLOBAL_VALUE)
- return false;
+ assert((I.getOpcode() == TargetOpcode::G_GLOBAL_VALUE) &&
+ "unexpected instruction");
auto GV = I.getOperand(1).getGlobal();
if (GV->isThreadLocal()) {
@@ -485,7 +560,7 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I,
// Can't handle alternate code models yet.
if (TM.getCodeModel() != CodeModel::Small)
- return 0;
+ return false;
X86AddressMode AM;
AM.GV = GV;
@@ -521,8 +596,8 @@ bool X86InstructionSelector::selectGlobalValue(MachineInstr &I,
bool X86InstructionSelector::selectConstant(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
- if (I.getOpcode() != TargetOpcode::G_CONSTANT)
- return false;
+ assert((I.getOpcode() == TargetOpcode::G_CONSTANT) &&
+ "unexpected instruction");
const unsigned DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
@@ -550,14 +625,13 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I,
case 32:
NewOpc = X86::MOV32ri;
break;
- case 64: {
+ case 64:
// TODO: in case isUInt<32>(Val), X86::MOV32ri can be used
if (isInt<32>(Val))
NewOpc = X86::MOV64ri32;
else
NewOpc = X86::MOV64ri;
break;
- }
default:
llvm_unreachable("Can't select G_CONSTANT, unsupported type.");
}
@@ -569,8 +643,7 @@ bool X86InstructionSelector::selectConstant(MachineInstr &I,
bool X86InstructionSelector::selectTrunc(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
- if (I.getOpcode() != TargetOpcode::G_TRUNC)
- return false;
+ assert((I.getOpcode() == TargetOpcode::G_TRUNC) && "unexpected instruction");
const unsigned DstReg = I.getOperand(0).getReg();
const unsigned SrcReg = I.getOperand(1).getReg();
@@ -628,8 +701,7 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I,
bool X86InstructionSelector::selectZext(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
- if (I.getOpcode() != TargetOpcode::G_ZEXT)
- return false;
+ assert((I.getOpcode() == TargetOpcode::G_ZEXT) && "unexpected instruction");
const unsigned DstReg = I.getOperand(0).getReg();
const unsigned SrcReg = I.getOperand(1).getReg();
@@ -673,11 +745,59 @@ bool X86InstructionSelector::selectZext(MachineInstr &I,
return true;
}
+bool X86InstructionSelector::selectAnyext(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_ANYEXT) && "unexpected instruction");
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const unsigned SrcReg = I.getOperand(1).getReg();
+
+ const LLT DstTy = MRI.getType(DstReg);
+ const LLT SrcTy = MRI.getType(SrcReg);
+
+ const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
+ const RegisterBank &SrcRB = *RBI.getRegBank(SrcReg, MRI, TRI);
+
+ assert(DstRB.getID() == SrcRB.getID() &&
+ "G_ANYEXT input/output on different banks\n");
+
+ assert(DstTy.getSizeInBits() > SrcTy.getSizeInBits() &&
+ "G_ANYEXT incorrect operand size");
+
+ if (DstRB.getID() != X86::GPRRegBankID)
+ return false;
+
+ const TargetRegisterClass *DstRC = getRegClass(DstTy, DstRB);
+ const TargetRegisterClass *SrcRC = getRegClass(SrcTy, SrcRB);
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+
+ if (SrcRC == DstRC) {
+ I.setDesc(TII.get(X86::COPY));
+ return true;
+ }
+
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(TargetOpcode::SUBREG_TO_REG))
+ .addDef(DstReg)
+ .addImm(0)
+ .addReg(SrcReg)
+ .addImm(getSubRegIndex(SrcRC));
+
+ I.eraseFromParent();
+ return true;
+}
+
bool X86InstructionSelector::selectCmp(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
- if (I.getOpcode() != TargetOpcode::G_ICMP)
- return false;
+ assert((I.getOpcode() == TargetOpcode::G_ICMP) && "unexpected instruction");
X86::CondCode CC;
bool SwapArgs;
@@ -729,8 +849,7 @@ bool X86InstructionSelector::selectCmp(MachineInstr &I,
bool X86InstructionSelector::selectUadde(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
- if (I.getOpcode() != TargetOpcode::G_UADDE)
- return false;
+ assert((I.getOpcode() == TargetOpcode::G_UADDE) && "unexpected instruction");
const unsigned DstReg = I.getOperand(0).getReg();
const unsigned CarryOutReg = I.getOperand(1).getReg();
@@ -789,9 +908,8 @@ bool X86InstructionSelector::selectUadde(MachineInstr &I,
bool X86InstructionSelector::selectExtract(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
-
- if (I.getOpcode() != TargetOpcode::G_EXTRACT)
- return false;
+ assert((I.getOpcode() == TargetOpcode::G_EXTRACT) &&
+ "unexpected instruction");
const unsigned DstReg = I.getOperand(0).getReg();
const unsigned SrcReg = I.getOperand(1).getReg();
@@ -848,7 +966,6 @@ bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg,
MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
-
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
unsigned SubIdx = X86::NoSubRegister;
@@ -887,7 +1004,6 @@ bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg,
MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
-
const LLT DstTy = MRI.getType(DstReg);
const LLT SrcTy = MRI.getType(SrcReg);
unsigned SubIdx = X86::NoSubRegister;
@@ -925,9 +1041,7 @@ bool X86InstructionSelector::emitInsertSubreg(unsigned DstReg, unsigned SrcReg,
bool X86InstructionSelector::selectInsert(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
-
- if (I.getOpcode() != TargetOpcode::G_INSERT)
- return false;
+ assert((I.getOpcode() == TargetOpcode::G_INSERT) && "unexpected instruction");
const unsigned DstReg = I.getOperand(0).getReg();
const unsigned SrcReg = I.getOperand(1).getReg();
@@ -982,11 +1096,11 @@ bool X86InstructionSelector::selectInsert(MachineInstr &I,
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
-bool X86InstructionSelector::selectUnmergeValues(MachineInstr &I,
- MachineRegisterInfo &MRI,
- MachineFunction &MF) const {
- if (I.getOpcode() != TargetOpcode::G_UNMERGE_VALUES)
- return false;
+bool X86InstructionSelector::selectUnmergeValues(
+ MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF,
+ CodeGenCoverage &CoverageInfo) const {
+ assert((I.getOpcode() == TargetOpcode::G_UNMERGE_VALUES) &&
+ "unexpected instruction");
// Split to extracts.
unsigned NumDefs = I.getNumOperands() - 1;
@@ -994,14 +1108,13 @@ bool X86InstructionSelector::selectUnmergeValues(MachineInstr &I,
unsigned DefSize = MRI.getType(I.getOperand(0).getReg()).getSizeInBits();
for (unsigned Idx = 0; Idx < NumDefs; ++Idx) {
-
MachineInstr &ExtrInst =
*BuildMI(*I.getParent(), I, I.getDebugLoc(),
TII.get(TargetOpcode::G_EXTRACT), I.getOperand(Idx).getReg())
.addReg(SrcReg)
.addImm(Idx * DefSize);
- if (!select(ExtrInst))
+ if (!select(ExtrInst, CoverageInfo))
return false;
}
@@ -1009,11 +1122,11 @@ bool X86InstructionSelector::selectUnmergeValues(MachineInstr &I,
return true;
}
-bool X86InstructionSelector::selectMergeValues(MachineInstr &I,
- MachineRegisterInfo &MRI,
- MachineFunction &MF) const {
- if (I.getOpcode() != TargetOpcode::G_MERGE_VALUES)
- return false;
+bool X86InstructionSelector::selectMergeValues(
+ MachineInstr &I, MachineRegisterInfo &MRI, MachineFunction &MF,
+ CodeGenCoverage &CoverageInfo) const {
+ assert((I.getOpcode() == TargetOpcode::G_MERGE_VALUES) &&
+ "unexpected instruction");
// Split to inserts.
unsigned DstReg = I.getOperand(0).getReg();
@@ -1032,7 +1145,6 @@ bool X86InstructionSelector::selectMergeValues(MachineInstr &I,
return false;
for (unsigned Idx = 2; Idx < I.getNumOperands(); ++Idx) {
-
unsigned Tmp = MRI.createGenericVirtualRegister(DstTy);
MRI.setRegBank(Tmp, RegBank);
@@ -1044,7 +1156,7 @@ bool X86InstructionSelector::selectMergeValues(MachineInstr &I,
DefReg = Tmp;
- if (!select(InsertInst))
+ if (!select(InsertInst, CoverageInfo))
return false;
}
@@ -1052,12 +1164,127 @@ bool X86InstructionSelector::selectMergeValues(MachineInstr &I,
TII.get(TargetOpcode::COPY), DstReg)
.addReg(DefReg);
- if (!select(CopyInst))
+ if (!select(CopyInst, CoverageInfo))
return false;
I.eraseFromParent();
return true;
}
+
+bool X86InstructionSelector::selectCondBranch(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_BRCOND) && "unexpected instruction");
+
+ const unsigned CondReg = I.getOperand(0).getReg();
+ MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+
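+  // Lowered as a test of the low bit of the condition register followed by a
+  // conditional jump, i.e. roughly "test $1, %cond ; jne <dest-block>".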
+ MachineInstr &TestInst =
+ *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::TEST8ri))
+ .addReg(CondReg)
+ .addImm(1);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(X86::JNE_1))
+ .addMBB(DestMBB);
+
+ constrainSelectedInstRegOperands(TestInst, TII, TRI, RBI);
+
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::materializeFP(MachineInstr &I,
+ MachineRegisterInfo &MRI,
+ MachineFunction &MF) const {
+ assert((I.getOpcode() == TargetOpcode::G_FCONSTANT) &&
+ "unexpected instruction");
+
+ // Can't handle alternate code models yet.
+ CodeModel::Model CM = TM.getCodeModel();
+ if (CM != CodeModel::Small && CM != CodeModel::Large)
+ return false;
+
+ const unsigned DstReg = I.getOperand(0).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
+ unsigned Align = DstTy.getSizeInBits();
+ const DebugLoc &DbgLoc = I.getDebugLoc();
+
+ unsigned Opc = getLoadStoreOp(DstTy, RegBank, TargetOpcode::G_LOAD, Align);
+
+ // Create the load from the constant pool.
+ const ConstantFP *CFP = I.getOperand(1).getFPImm();
+ unsigned CPI = MF.getConstantPool()->getConstantPoolIndex(CFP, Align);
+ MachineInstr *LoadInst = nullptr;
+ unsigned char OpFlag = STI.classifyLocalReference(nullptr);
+
+ if (CM == CodeModel::Large && STI.is64Bit()) {
+ // Under X86-64 non-small code model, GV (and friends) are 64-bits, so
+ // they cannot be folded into immediate fields.
+
+ unsigned AddrReg = MRI.createVirtualRegister(&X86::GR64RegClass);
+ BuildMI(*I.getParent(), I, DbgLoc, TII.get(X86::MOV64ri), AddrReg)
+ .addConstantPoolIndex(CPI, 0, OpFlag);
+
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
+ MF.getDataLayout().getPointerSize(), Align);
+
+ LoadInst =
+ addDirectMem(BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg),
+ AddrReg)
+ .addMemOperand(MMO);
+
+ } else if (CM == CodeModel::Small || !STI.is64Bit()) {
+ // Handle the case when globals fit in our immediate field.
+ // This is true for X86-32 always and X86-64 when in -mcmodel=small mode.
+
+ // x86-32 PIC requires a PIC base register for constant pools.
+ unsigned PICBase = 0;
+ if (OpFlag == X86II::MO_PIC_BASE_OFFSET || OpFlag == X86II::MO_GOTOFF) {
+ // PICBase can be allocated by TII.getGlobalBaseReg(&MF).
+      // In DAG ISel, the code that initializes it is generated by the CGBR pass.
+      return false; // TODO: support this mode.
+ } else if (STI.is64Bit() && TM.getCodeModel() == CodeModel::Small)
+ PICBase = X86::RIP;
+
+ LoadInst = addConstantPoolReference(
+ BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg), CPI, PICBase,
+ OpFlag);
+ } else
+ return false;
+
+ constrainSelectedInstRegOperands(*LoadInst, TII, TRI, RBI);
+ I.eraseFromParent();
+ return true;
+}
+
+bool X86InstructionSelector::selectImplicitDefOrPHI(
+ MachineInstr &I, MachineRegisterInfo &MRI) const {
+ assert((I.getOpcode() == TargetOpcode::G_IMPLICIT_DEF ||
+ I.getOpcode() == TargetOpcode::G_PHI) &&
+ "unexpected instruction");
+
+ unsigned DstReg = I.getOperand(0).getReg();
+
+ if (!MRI.getRegClassOrNull(DstReg)) {
+ const LLT DstTy = MRI.getType(DstReg);
+ const TargetRegisterClass *RC = getRegClass(DstTy, DstReg, MRI);
+
+ if (!RBI.constrainGenericRegister(DstReg, *RC, MRI)) {
+ DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+ << " operand\n");
+ return false;
+ }
+ }
+
+ if (I.getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
+ I.setDesc(TII.get(X86::IMPLICIT_DEF));
+ else
+ I.setDesc(TII.get(X86::PHI));
+
+ return true;
+}
+
InstructionSelector *
llvm::createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &Subtarget,
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index f0ed4bc16e2f..cdb24b9d40a6 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -1,26 +1,44 @@
-//===--------- X86InterleavedAccess.cpp ----------------------------------===//
+//===- X86InterleavedAccess.cpp -------------------------------------------===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
-//===--------------------------------------------------------------------===//
-///
+//===----------------------------------------------------------------------===//
+//
/// \file
/// This file contains the X86 implementation of the interleaved accesses
/// optimization generating X86-specific instructions/intrinsics for
/// interleaved access groups.
-///
-//===--------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
-#include "X86TargetMachine.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
using namespace llvm;
namespace {
+
/// \brief This class holds necessary information to represent an interleaved
/// access group and supports utilities to lower the group into
/// X86-specific instructions/intrinsics.
@@ -69,7 +87,18 @@ class X86InterleavedAccessGroup {
/// Out-V2 = p3, q3, r3, s3
/// Out-V3 = P4, q4, r4, s4
void transpose_4x4(ArrayRef<Instruction *> InputVectors,
- SmallVectorImpl<Value *> &TrasposedVectors);
+ SmallVectorImpl<Value *> &TransposedMatrix);
+ void interleave8bitStride4(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned NumSubVecElems);
+ void interleave8bitStride4VF8(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedMatrix);
+ void interleave8bitStride3(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned NumSubVecElems);
+ void deinterleave8bitStride3(ArrayRef<Instruction *> InputVectors,
+ SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned NumSubVecElems);
public:
/// In order to form an interleaved access group X86InterleavedAccessGroup
@@ -94,38 +123,58 @@ public:
/// instructions/intrinsics.
bool lowerIntoOptimizedSequence();
};
+
} // end anonymous namespace
bool X86InterleavedAccessGroup::isSupported() const {
VectorType *ShuffleVecTy = Shuffles[0]->getType();
- uint64_t ShuffleVecSize = DL.getTypeSizeInBits(ShuffleVecTy);
Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
+ unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
+ unsigned WideInstSize;
+
+ // Currently, lowering is supported for the following vectors:
+ // Stride 4:
+ // 1. Store and load of 4-element vectors of 64 bits on AVX.
+ // 2. Store of 16/32-element vectors of 8 bits on AVX.
+ // Stride 3:
+ // 1. Load of 16/32-element vectors of 8 bits on AVX.
+ if (!Subtarget.hasAVX() || (Factor != 4 && Factor != 3))
+ return false;
- // Currently, lowering is supported for 4-element vectors of 64 bits on AVX.
- uint64_t ExpectedShuffleVecSize;
- if (isa<LoadInst>(Inst))
- ExpectedShuffleVecSize = 256;
- else
- ExpectedShuffleVecSize = 1024;
+ if (isa<LoadInst>(Inst)) {
+ WideInstSize = DL.getTypeSizeInBits(Inst->getType());
+ if (cast<LoadInst>(Inst)->getPointerAddressSpace())
+ return false;
+ } else
+ WideInstSize = DL.getTypeSizeInBits(Shuffles[0]->getType());
+
+  // We support shuffles that represent stride 4 for the byte type with a
+  // total size of WideInstSize.
+ if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
+ return true;
+
+ if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
+ (WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
+ WideInstSize == 2048))
+ return true;
- if (!Subtarget.hasAVX() || ShuffleVecSize != ExpectedShuffleVecSize ||
- DL.getTypeSizeInBits(ShuffleEltTy) != 64 || Factor != 4)
- return false;
+ if (ShuffleElemSize == 8 && Factor == 3 &&
+ (WideInstSize == 384 || WideInstSize == 768 || WideInstSize == 1536))
+ return true;
- return true;
+ return false;
}
void X86InterleavedAccessGroup::decompose(
Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
SmallVectorImpl<Instruction *> &DecomposedVectors) {
-
assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
"Expected Load or Shuffle");
- Type *VecTy = VecInst->getType();
- (void)VecTy;
- assert(VecTy->isVectorTy() &&
- DL.getTypeSizeInBits(VecTy) >=
+ Type *VecWidth = VecInst->getType();
+ (void)VecWidth;
+ assert(VecWidth->isVectorTy() &&
+ DL.getTypeSizeInBits(VecWidth) >=
DL.getTypeSizeInBits(SubVecTy) * NumSubVectors &&
"Invalid Inst-size!!!");
@@ -137,19 +186,30 @@ void X86InterleavedAccessGroup::decompose(
for (unsigned i = 0; i < NumSubVectors; ++i)
DecomposedVectors.push_back(
cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, Indices[i],
- SubVecTy->getVectorNumElements(), 0))));
+ Op0, Op1,
+ createSequentialMask(Builder, Indices[i],
+ SubVecTy->getVectorNumElements(), 0))));
return;
}
// Decompose the load instruction.
LoadInst *LI = cast<LoadInst>(VecInst);
Type *VecBasePtrTy = SubVecTy->getPointerTo(LI->getPointerAddressSpace());
- Value *VecBasePtr =
- Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
-
+ Value *VecBasePtr;
+ unsigned int NumLoads = NumSubVectors;
+  // In the case of stride 3 with a vector of 32 elements, load the
+  // information in the following way:
+ // [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
+ unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
+ if (VecLength == 768 || VecLength == 1536) {
+ Type *VecTran =
+ VectorType::get(Type::getInt8Ty(LI->getContext()), 16)->getPointerTo();
+ VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecTran);
+ NumLoads = NumSubVectors * (VecLength / 384);
+ } else
+ VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
// Generate N loads of T type.
- for (unsigned i = 0; i < NumSubVectors; i++) {
+ for (unsigned i = 0; i < NumLoads; i++) {
// TODO: Support inbounds GEP.
Value *NewBasePtr = Builder.CreateGEP(VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =
@@ -158,6 +218,470 @@ void X86InterleavedAccessGroup::decompose(
}
}
+// Change the scale of the vector type by halving the number of elements and
+// doubling the scalar size.
+static MVT scaleVectorType(MVT VT) {
+ unsigned ScalarSize = VT.getVectorElementType().getScalarSizeInBits() * 2;
+ return MVT::getVectorVT(MVT::getIntegerVT(ScalarSize),
+ VT.getVectorNumElements() / 2);
+}
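+// For example, under this scaling MVT::v32i8 becomes MVT::v16i16.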
+
+static uint32_t Concat[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 };
+
+// genShuffleBland - Creates a shuffle mask over two vectors. This function
+// only works on instructions whose lanes are inside 256-bit registers.
+// From the mask 'Mask' it creates a new mask 'Out' by applying an offset to
+// each element. The offset amount depends on the two integers 'LowOffset'
+// and 'HighOffset', where 'LowOffset' refers to the first vector and
+// 'HighOffset' refers to the second vector.
+// |a0....a5,b0....b4,c0....c4|a16..a21,b16..b20,c16..c20|
+// |c5...c10,a5....a9,b5....b9|c21..c26,a22..a26,b21..b25|
+// |b10..b15,c11..c15,a10..a15|b26..b31,c27..c31,a27..a31|
+// For the sequence to work as a mirror of the load, we must consider the
+// element order as above.
+// In this function we combine two types of shuffles: the first one is a
+// vpshuf-style shuffle and the second is a "blend"-style shuffle.
+// By computing the shuffle on a sequence of 16 elements (one lane) and
+// adding the correct offset, we create a vpshuf + blend sequence between
+// two shuffles.
+static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
+ SmallVectorImpl<uint32_t> &Out, int LowOffset,
+ int HighOffset) {
+ assert(VT.getSizeInBits() >= 256 &&
+         "This function doesn't accept width smaller than 256");
+ unsigned NumOfElm = VT.getVectorNumElements();
+ for (unsigned i = 0; i < Mask.size(); i++)
+ Out.push_back(Mask[i] + LowOffset);
+ for (unsigned i = 0; i < Mask.size(); i++)
+ Out.push_back(Mask[i] + HighOffset + NumOfElm);
+}
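+// For illustration (assumed values, not taken from a specific lowering):
+// with Mask = {0, 3, 6}, LowOffset = 0, HighOffset = 16 and VT = MVT::v32i8
+// (so NumOfElm = 32), the mask built above is
+// Out = {0, 3, 6, 0+16+32, 3+16+32, 6+16+32} = {0, 3, 6, 48, 51, 54}.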
+
+// reorderSubVector returns the data to its original state and is de facto
+// the opposite of the function concatSubVector.
+
+// For VecElems = 16
+// Invec[0] - |0| TransposedMatrix[0] - |0|
+// Invec[1] - |1| => TransposedMatrix[1] - |1|
+// Invec[2] - |2| TransposedMatrix[2] - |2|
+
+// For VecElems = 32
+// Invec[0] - |0|3| TransposedMatrix[0] - |0|1|
+// Invec[1] - |1|4| => TransposedMatrix[1] - |2|3|
+// Invec[2] - |2|5| TransposedMatrix[2] - |4|5|
+
+// For VecElems = 64
+// Invec[0] - |0|3|6|9 | TransposedMatrix[0] - |0|1|2 |3 |
+// Invec[1] - |1|4|7|10| => TransposedMatrix[1] - |4|5|6 |7 |
+// Invec[2] - |2|5|8|11| TransposedMatrix[2] - |8|9|10|11|
+
+static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
+ ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
+ unsigned VecElems, unsigned Stride,
+ IRBuilder<> Builder) {
+
+ if (VecElems == 16) {
+ for (unsigned i = 0; i < Stride; i++)
+ TransposedMatrix[i] = Builder.CreateShuffleVector(
+ Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
+ return;
+ }
+
+ SmallVector<uint32_t, 32> OptimizeShuf;
+ Value *Temp[8];
+
+ for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
+ genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
+ (i + 1) / Stride * 16);
+ Temp[i / 2] = Builder.CreateShuffleVector(
+ Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
+ OptimizeShuf.clear();
+ }
+
+ if (VecElems == 32) {
+ std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
+ return;
+  } else
+ for (unsigned i = 0; i < Stride; i++)
+ TransposedMatrix[i] =
+ Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
+}
+
+void X86InterleavedAccessGroup::interleave8bitStride4VF8(
+ ArrayRef<Instruction *> Matrix,
+ SmallVectorImpl<Value *> &TransposedMatrix) {
+ // Assuming we start from the following vectors:
+ // Matrix[0]= c0 c1 c2 c3 c4 ... c7
+ // Matrix[1]= m0 m1 m2 m3 m4 ... m7
+ // Matrix[2]= y0 y1 y2 y3 y4 ... y7
+ // Matrix[3]= k0 k1 k2 k3 k4 ... k7
+
+ MVT VT = MVT::v8i16;
+ TransposedMatrix.resize(2);
+ SmallVector<uint32_t, 16> MaskLow;
+ SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
+ SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;
+
+ for (unsigned i = 0; i < 8; ++i) {
+ MaskLow.push_back(i);
+ MaskLow.push_back(i + 8);
+ }
+
+ createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
+ createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
+ scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
+ scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
+ // IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
+ // IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
+ Value *IntrVec1Low =
+ Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
+ Value *IntrVec2Low =
+ Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
+
+ // TransposedMatrix[0] = c0 m0 y0 k0 c1 m1 y1 k1 c2 m2 y2 k2 c3 m3 y3 k3
+ // TransposedMatrix[1] = c4 m4 y4 k4 c5 m5 y5 k5 c6 m6 y6 k6 c7 m7 y7 k7
+
+ TransposedMatrix[0] =
+ Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskLowWord);
+ TransposedMatrix[1] =
+ Builder.CreateShuffleVector(IntrVec1Low, IntrVec2Low, MaskHighWord);
+}
+
+void X86InterleavedAccessGroup::interleave8bitStride4(
+ ArrayRef<Instruction *> Matrix, SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned NumOfElm) {
+ // Example: Assuming we start from the following vectors:
+ // Matrix[0]= c0 c1 c2 c3 c4 ... c31
+ // Matrix[1]= m0 m1 m2 m3 m4 ... m31
+ // Matrix[2]= y0 y1 y2 y3 y4 ... y31
+ // Matrix[3]= k0 k1 k2 k3 k4 ... k31
+
+ MVT VT = MVT::getVectorVT(MVT::i8, NumOfElm);
+ MVT HalfVT = scaleVectorType(VT);
+
+ TransposedMatrix.resize(4);
+ SmallVector<uint32_t, 32> MaskHigh;
+ SmallVector<uint32_t, 32> MaskLow;
+ SmallVector<uint32_t, 32> LowHighMask[2];
+ SmallVector<uint32_t, 32> MaskHighTemp;
+ SmallVector<uint32_t, 32> MaskLowTemp;
+
+  // MaskLow and MaskHigh are built following the vpunpcklbw and vpunpckhbw
+  // X86 shuffle pattern.
+
+ createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
+ createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);
+
+  // MaskLowTemp and MaskHighTemp are built following the vpunpckldw and
+  // vpunpckhdw X86 shuffle pattern.
+
+ createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
+ createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
+ scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
+ scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);
+
+ // IntrVec1Low = c0 m0 c1 m1 ... c7 m7 | c16 m16 c17 m17 ... c23 m23
+ // IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
+ // IntrVec2Low = y0 k0 y1 k1 ... y7 k7 | y16 k16 y17 k17 ... y23 k23
+ // IntrVec2High = y8 k8 y9 k9 ... y15 k15 | y24 k24 y25 k25 ... y31 k31
+ Value *IntrVec[4];
+
+ IntrVec[0] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskLow);
+ IntrVec[1] = Builder.CreateShuffleVector(Matrix[0], Matrix[1], MaskHigh);
+ IntrVec[2] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskLow);
+ IntrVec[3] = Builder.CreateShuffleVector(Matrix[2], Matrix[3], MaskHigh);
+
+ // cmyk4 cmyk5 cmyk6 cmyk7 | cmyk20 cmyk21 cmyk22 cmyk23
+ // cmyk12 cmyk13 cmyk14 cmyk15 | cmyk28 cmyk29 cmyk30 cmyk31
+ // cmyk0 cmyk1 cmyk2 cmyk3 | cmyk16 cmyk17 cmyk18 cmyk19
+ // cmyk8 cmyk9 cmyk10 cmyk11 | cmyk24 cmyk25 cmyk26 cmyk27
+
+ Value *VecOut[4];
+ for (int i = 0; i < 4; i++)
+ VecOut[i] = Builder.CreateShuffleVector(IntrVec[i / 2], IntrVec[i / 2 + 2],
+ LowHighMask[i % 2]);
+
+ // cmyk0 cmyk1 cmyk2 cmyk3 | cmyk4 cmyk5 cmyk6 cmyk7
+ // cmyk8 cmyk9 cmyk10 cmyk11 | cmyk12 cmyk13 cmyk14 cmyk15
+ // cmyk16 cmyk17 cmyk18 cmyk19 | cmyk20 cmyk21 cmyk22 cmyk23
+ // cmyk24 cmyk25 cmyk26 cmyk27 | cmyk28 cmyk29 cmyk30 cmyk31
+
+ if (VT == MVT::v16i8) {
+ std::copy(VecOut, VecOut + 4, TransposedMatrix.begin());
+ return;
+ }
+
+ reorderSubVector(VT, TransposedMatrix, VecOut, makeArrayRef(Concat, 16),
+ NumOfElm, 4, Builder);
+}
+
+// createShuffleStride returns shuffle mask of size N.
+// The shuffle pattern is as follows:
+// {0, Stride%(VF/Lane), (2*Stride%(VF/Lane))...(VF*Stride/Lane)%(VF/Lane),
+// (VF/ Lane) ,(VF / Lane)+Stride%(VF/Lane),...,
+// (VF / Lane)+(VF*Stride/Lane)%(VF/Lane)}
+// Where Lane is the # of lanes in a register:
+// VectorSize = 128 => Lane = 1
+// VectorSize = 256 => Lane = 2
+// For example, the shuffle pattern for VF 16 with register size 256 (-> lanes = 2) is:
+// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
+static void createShuffleStride(MVT VT, int Stride,
+ SmallVectorImpl<uint32_t> &Mask) {
+ int VectorSize = VT.getSizeInBits();
+ int VF = VT.getVectorNumElements();
+ int LaneCount = std::max(VectorSize / 128, 1);
+ for (int Lane = 0; Lane < LaneCount; Lane++)
+ for (int i = 0, LaneSize = VF / LaneCount; i != LaneSize; ++i)
+ Mask.push_back((i * Stride) % LaneSize + LaneSize * Lane);
+}
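+// For illustration of the computation above: with Stride = 3, VF = 16 and a
+// 256-bit register, LaneCount = 2 and LaneSize = 8, so lane 0 produces
+// (i*3)%8 = {0,3,6,1,4,7,2,5} and lane 1 produces the same pattern offset by
+// 8: {8,11,14,9,12,15,10,13}.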
+
+// setGroupSize sets 'SizeInfo' to the size (number of elements) of each
+// group inside a shuffle mask. A mask contains exactly 3 groups, where
+// each group is a monotonically increasing sequence with stride 3.
+// For example, shuffle mask {0,3,6,1,4,7,2,5} => {3,3,2}.
+static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
+ int VectorSize = VT.getSizeInBits();
+ int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
+ for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
+ int GroupSize = std::ceil((VF - FirstGroupElement) / 3.0);
+ SizeInfo.push_back(GroupSize);
+ FirstGroupElement = ((GroupSize)*3 + FirstGroupElement) % VF;
+ }
+}
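+// For illustration (assumed VF = 8, i.e. a 64-bit vector of 8 bytes):
+// GroupSize[0] = ceil((8-0)/3) = 3, FirstGroupElement = (3*3+0)%8 = 1;
+// GroupSize[1] = ceil((8-1)/3) = 3, FirstGroupElement = (3*3+1)%8 = 2;
+// GroupSize[2] = ceil((8-2)/3) = 2, matching the {3,3,2} example above.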
+
+// DecodePALIGNRMask returns the shuffle mask of the vpalignr instruction.
+// vpalignr works on a per-lane basis,
+// Where Lane is the # of lanes in a register:
+// VectorWide = 128 => Lane = 1
+// VectorWide = 256 => Lane = 2
+// For Lane = 1 shuffle pattern is: {DiffToJump,...,DiffToJump+VF-1}.
+// For Lane = 2 shuffle pattern is:
+// {DiffToJump,...,VF/2-1,VF,...,DiffToJump+VF-1}.
+// The Imm variable sets the offset amount. The result of the
+// function is stored in the ShuffleMask vector and is built as described at
+// the beginning of this comment. AlignDirection is a boolean that indicates
+// the direction of the alignment (false - align to the "right" side, true -
+// align to the "left" side).
+static void DecodePALIGNRMask(MVT VT, unsigned Imm,
+ SmallVectorImpl<uint32_t> &ShuffleMask,
+ bool AlignDirection = true, bool Unary = false) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
+ unsigned NumLaneElts = NumElts / NumLanes;
+
+ Imm = AlignDirection ? Imm : (NumLaneElts - Imm);
+ unsigned Offset = Imm * (VT.getScalarSizeInBits() / 8);
+
+ for (unsigned l = 0; l != NumElts; l += NumLaneElts) {
+ for (unsigned i = 0; i != NumLaneElts; ++i) {
+ unsigned Base = i + Offset;
+ // if i+offset is out of this lane then we actually need the other source
+ // If Unary the other source is the first source.
+ if (Base >= NumLaneElts)
+ Base = Unary ? Base % NumLaneElts : Base + NumElts - NumLaneElts;
+ ShuffleMask.push_back(Base + l);
+ }
+ }
+}
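+// For illustration (assumed values): with VT = MVT::v16i8 (one 128-bit
+// lane), Imm = 3, AlignDirection = true and Unary = false, Offset = 3 and
+// the resulting ShuffleMask is {3,4,...,15,16,17,18}, where indices >= 16
+// select elements from the second source vector.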
+
+// concatSubVector - The function rebuilds the data into the correct expected
+// order. An assumption (the shape of the matrix) is made so that the
+// deinterleaving works with per-lane instructions like 'vpalignr' or
+// 'vpshufb'. This function ensures that the data is laid out correctly for
+// the lane instructions. Each lane inside the vector is 128 bits wide.
+//
+// The 'InVec' argument contains the data in increasing order. In InVec[0] you
+// can find the first 128 bits of data. The number of different lanes inside a
+// vector depends on 'VecElems'. In general, the formula is
+// VecElems * type / 128. The size of the array 'InVec' depends on and is
+// equal to 'VecElems'.
+
+// For VecElems = 16
+// Invec[0] - |0| Vec[0] - |0|
+// Invec[1] - |1| => Vec[1] - |1|
+// Invec[2] - |2| Vec[2] - |2|
+
+// For VecElems = 32
+// Invec[0] - |0|1| Vec[0] - |0|3|
+// Invec[1] - |2|3| => Vec[1] - |1|4|
+// Invec[2] - |4|5| Vec[2] - |2|5|
+
+// For VecElems = 64
+// Invec[0] - |0|1|2 |3 | Vec[0] - |0|3|6|9 |
+// Invec[1] - |4|5|6 |7 | => Vec[1] - |1|4|7|10|
+// Invec[2] - |8|9|10|11| Vec[2] - |2|5|8|11|
+
+static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
+ unsigned VecElems, IRBuilder<> Builder) {
+ if (VecElems == 16) {
+ for (int i = 0; i < 3; i++)
+ Vec[i] = InVec[i];
+ return;
+ }
+
+ for (unsigned j = 0; j < VecElems / 32; j++)
+ for (int i = 0; i < 3; i++)
+ Vec[i + j * 3] = Builder.CreateShuffleVector(
+ InVec[j * 6 + i], InVec[j * 6 + i + 3], makeArrayRef(Concat, 32));
+
+ if (VecElems == 32)
+ return;
+
+ for (int i = 0; i < 3; i++)
+ Vec[i] = Builder.CreateShuffleVector(Vec[i], Vec[i + 3], Concat);
+}
+
+void X86InterleavedAccessGroup::deinterleave8bitStride3(
+ ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned VecElems) {
+ // Example: Assuming we start from the following vectors:
+ // Matrix[0]= a0 b0 c0 a1 b1 c1 a2 b2
+ // Matrix[1]= c2 a3 b3 c3 a4 b4 c4 a5
+ // Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
+
+ TransposedMatrix.resize(3);
+ SmallVector<uint32_t, 32> VPShuf;
+ SmallVector<uint32_t, 32> VPAlign[2];
+ SmallVector<uint32_t, 32> VPAlign2;
+ SmallVector<uint32_t, 32> VPAlign3;
+ SmallVector<uint32_t, 3> GroupSize;
+ Value *Vec[6], *TempVector[3];
+
+ MVT VT = MVT::getVT(Shuffles[0]->getType());
+
+ createShuffleStride(VT, 3, VPShuf);
+ setGroupSize(VT, GroupSize);
+
+ for (int i = 0; i < 2; i++)
+ DecodePALIGNRMask(VT, GroupSize[2 - i], VPAlign[i], false);
+
+ DecodePALIGNRMask(VT, GroupSize[2] + GroupSize[1], VPAlign2, true, true);
+ DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, true, true);
+
+ concatSubVector(Vec, InVec, VecElems, Builder);
+ // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
+ // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
+ // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
+
+ for (int i = 0; i < 3; i++)
+ Vec[i] = Builder.CreateShuffleVector(
+ Vec[i], UndefValue::get(Vec[0]->getType()), VPShuf);
+
+ // TempVector[0]= a6 a7 a0 a1 a2 b0 b1 b2
+ // TempVector[1]= c0 c1 c2 c3 c4 a3 a4 a5
+ // TempVector[2]= b3 b4 b5 b6 b7 c5 c6 c7
+
+ for (int i = 0; i < 3; i++)
+ TempVector[i] =
+ Builder.CreateShuffleVector(Vec[(i + 2) % 3], Vec[i], VPAlign[0]);
+
+ // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
+ // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
+ // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
+
+ for (int i = 0; i < 3; i++)
+ Vec[i] = Builder.CreateShuffleVector(TempVector[(i + 1) % 3], TempVector[i],
+ VPAlign[1]);
+
+ // TransposedMatrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
+ // TransposedMatrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
+ // TransposedMatrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
+
+ Value *TempVec = Builder.CreateShuffleVector(
+ Vec[1], UndefValue::get(Vec[1]->getType()), VPAlign3);
+ TransposedMatrix[0] = Builder.CreateShuffleVector(
+ Vec[0], UndefValue::get(Vec[1]->getType()), VPAlign2);
+ TransposedMatrix[1] = VecElems == 8 ? Vec[2] : TempVec;
+ TransposedMatrix[2] = VecElems == 8 ? TempVec : Vec[2];
+}
+
+// group2Shuffle reorders the shuffle stride back into contiguous order.
+// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
+// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
+static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
+ SmallVectorImpl<uint32_t> &Output) {
+ int IndexGroup[3] = {0, 0, 0};
+ int Index = 0;
+ int VectorWidth = VT.getSizeInBits();
+ int VF = VT.getVectorNumElements();
+ // Find the index of the different groups.
+ int Lane = (VectorWidth / 128 > 0) ? VectorWidth / 128 : 1;
+ for (int i = 0; i < 3; i++) {
+ IndexGroup[(Index * 3) % (VF / Lane)] = Index;
+ Index += Mask[i];
+ }
+ // According to the index compute the convert mask.
+ for (int i = 0; i < VF / Lane; i++) {
+ Output.push_back(IndexGroup[i % 3]);
+ IndexGroup[i % 3]++;
+ }
+}
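+// For illustration (assumed values): with VF/Lane = 8 and group sizes
+// Mask = {3,3,2}, the per-group starting values become IndexGroup = {0,3,6}
+// and the per-lane output mask is {0,3,6,1,4,7,2,5}.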
+
+void X86InterleavedAccessGroup::interleave8bitStride3(
+ ArrayRef<Instruction *> InVec, SmallVectorImpl<Value *> &TransposedMatrix,
+ unsigned VecElems) {
+ // Example: Assuming we start from the following vectors:
+ // Matrix[0]= a0 a1 a2 a3 a4 a5 a6 a7
+ // Matrix[1]= b0 b1 b2 b3 b4 b5 b6 b7
+  // Matrix[2]= c0 c1 c2 c3 c4 c5 c6 c7
+
+ TransposedMatrix.resize(3);
+ SmallVector<uint32_t, 3> GroupSize;
+ SmallVector<uint32_t, 32> VPShuf;
+ SmallVector<uint32_t, 32> VPAlign[3];
+ SmallVector<uint32_t, 32> VPAlign2;
+ SmallVector<uint32_t, 32> VPAlign3;
+
+ Value *Vec[3], *TempVector[3];
+ MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
+
+ setGroupSize(VT, GroupSize);
+
+ for (int i = 0; i < 3; i++)
+ DecodePALIGNRMask(VT, GroupSize[i], VPAlign[i]);
+
+ DecodePALIGNRMask(VT, GroupSize[1] + GroupSize[2], VPAlign2, false, true);
+ DecodePALIGNRMask(VT, GroupSize[1], VPAlign3, false, true);
+
+ // Vec[0]= a3 a4 a5 a6 a7 a0 a1 a2
+ // Vec[1]= c5 c6 c7 c0 c1 c2 c3 c4
+ // Vec[2]= b0 b1 b2 b3 b4 b5 b6 b7
+
+ Vec[0] = Builder.CreateShuffleVector(
+ InVec[0], UndefValue::get(InVec[0]->getType()), VPAlign2);
+ Vec[1] = Builder.CreateShuffleVector(
+ InVec[1], UndefValue::get(InVec[1]->getType()), VPAlign3);
+ Vec[2] = InVec[2];
+
+ // Vec[0]= a6 a7 a0 a1 a2 b0 b1 b2
+ // Vec[1]= c0 c1 c2 c3 c4 a3 a4 a5
+ // Vec[2]= b3 b4 b5 b6 b7 c5 c6 c7
+
+ for (int i = 0; i < 3; i++)
+ TempVector[i] =
+ Builder.CreateShuffleVector(Vec[i], Vec[(i + 2) % 3], VPAlign[1]);
+
+ // Vec[0]= a0 a1 a2 b0 b1 b2 c0 c1
+ // Vec[1]= c2 c3 c4 a3 a4 a5 b3 b4
+ // Vec[2]= b5 b6 b7 c5 c6 c7 a6 a7
+
+ for (int i = 0; i < 3; i++)
+ Vec[i] = Builder.CreateShuffleVector(TempVector[i], TempVector[(i + 1) % 3],
+ VPAlign[2]);
+
+ // TransposedMatrix[0] = a0 b0 c0 a1 b1 c1 a2 b2
+ // TransposedMatrix[1] = c2 a3 b3 c3 a4 b4 c4 a5
+ // TransposedMatrix[2] = b5 c5 a6 b6 c6 a7 b7 c7
+
+ unsigned NumOfElm = VT.getVectorNumElements();
+ group2Shuffle(VT, GroupSize, VPShuf);
+  reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
+}
+
void X86InterleavedAccessGroup::transpose_4x4(
ArrayRef<Instruction *> Matrix,
SmallVectorImpl<Value *> &TransposedMatrix) {
@@ -200,10 +724,26 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// Try to generate target-sized register(/instruction).
decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
+ Type *ShuffleEltTy = Inst->getType();
+ unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
// Perform matrix-transposition in order to compute interleaved
// results by generating some sort of (optimized) target-specific
// instructions.
- transpose_4x4(DecomposedVectors, TransposedVectors);
+
+ switch (NumSubVecElems) {
+ default:
+ return false;
+ case 4:
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+ break;
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ deinterleave8bitStride3(DecomposedVectors, TransposedVectors,
+ NumSubVecElems);
+ break;
+ }
// Now replace the unoptimized-interleaved-vectors with the
// transposed-interleaved vectors.
@@ -219,12 +759,31 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// Lower the interleaved stores:
// 1. Decompose the interleaved wide shuffle into individual shuffle
// vectors.
- decompose(Shuffles[0], Factor,
- VectorType::get(ShuffleEltTy, NumSubVecElems), DecomposedVectors);
+ decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
+ DecomposedVectors);
// 2. Transpose the interleaved-vectors into vectors of contiguous
// elements.
- transpose_4x4(DecomposedVectors, TransposedVectors);
+ switch (NumSubVecElems) {
+ case 4:
+ transpose_4x4(DecomposedVectors, TransposedVectors);
+ break;
+ case 8:
+ interleave8bitStride4VF8(DecomposedVectors, TransposedVectors);
+ break;
+ case 16:
+ case 32:
+ case 64:
+ if (Factor == 4)
+ interleave8bitStride4(DecomposedVectors, TransposedVectors,
+ NumSubVecElems);
+ if (Factor == 3)
+ interleave8bitStride3(DecomposedVectors, TransposedVectors,
+ NumSubVecElems);
+ break;
+ default:
+ return false;
+ }
// 3. Concatenate the contiguous-vectors back into a wide vector.
Value *WideVec = concatenateVectors(Builder, TransposedVectors);
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 6b1add8ff8ed..0782d5598746 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -30,13 +30,15 @@ enum IntrinsicType : uint16_t {
INTR_TYPE_3OP_MASK, INTR_TYPE_3OP_MASK_RM, INTR_TYPE_3OP_IMM8_MASK,
FMA_OP_MASK, FMA_OP_MASKZ, FMA_OP_MASK3,
FMA_OP_SCALAR_MASK, FMA_OP_SCALAR_MASKZ, FMA_OP_SCALAR_MASK3,
+ IFMA_OP_MASK, IFMA_OP_MASKZ,
VPERM_2OP_MASK, VPERM_3OP_MASK, VPERM_3OP_MASKZ, INTR_TYPE_SCALAR_MASK,
- INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK_RM,
- COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM, BRCST_SUBVEC_TO_VEC, BRCST32x2_TO_VEC,
+ INTR_TYPE_SCALAR_MASK_RM, INTR_TYPE_3OP_SCALAR_MASK,
+ COMPRESS_EXPAND_IN_REG, COMPRESS_TO_MEM,
TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
EXPAND_FROM_MEM,
- TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
+ TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
+ ROUNDP, ROUNDS
};
struct IntrinsicData {
@@ -118,6 +120,12 @@ static const IntrinsicData IntrinsicsWithChain[] = {
X86_INTRINSIC_DATA(avx512_gatherpf_qps_512, PREFETCH,
X86::VGATHERPF0QPSm, X86::VGATHERPF1QPSm),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_b_128,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_b_256,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_b_512,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_d_128,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_d_256,
@@ -142,6 +150,18 @@ static const IntrinsicData IntrinsicsWithChain[] = {
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_store_q_512,
COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_w_128,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_w_256,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_store_w_512,
+ COMPRESS_TO_MEM, X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_b_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_b_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_b_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_load_d_128,
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_load_d_256,
@@ -166,6 +186,12 @@ static const IntrinsicData IntrinsicsWithChain[] = {
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_load_q_512,
EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_w_128,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_w_256,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_load_w_512,
+ EXPAND_FROM_MEM, X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_128, TRUNCATE_TO_MEM_VI8,
X86ISD::VTRUNC, 0),
X86_INTRINSIC_DATA(avx512_mask_pmov_db_mem_256, TRUNCATE_TO_MEM_VI8,
@@ -342,6 +368,8 @@ static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
* the alphabetical order.
*/
static const IntrinsicData IntrinsicsWithoutChain[] = {
+ X86_INTRINSIC_DATA(avx_addsub_pd_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+ X86_INTRINSIC_DATA(avx_addsub_ps_256, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
X86_INTRINSIC_DATA(avx_cmp_pd_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(avx_cmp_ps_256, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
@@ -360,19 +388,15 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx_movmsk_pd_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(avx_movmsk_ps_256, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(avx_rcp_ps_256, INTR_TYPE_1OP, X86ISD::FRCP, 0),
+ X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
- X86_INTRINSIC_DATA(avx_vperm2f128_pd_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
- X86_INTRINSIC_DATA(avx_vperm2f128_ps_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
- X86_INTRINSIC_DATA(avx_vperm2f128_si_256, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, ISD::ABS, 0),
X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
@@ -381,8 +405,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
- X86_INTRINSIC_DATA(avx2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(avx2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phadd_w, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(avx2_phsub_d, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -427,13 +449,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
X86_INTRINSIC_DATA(avx2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
X86_INTRINSIC_DATA(avx2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
- X86_INTRINSIC_DATA(avx2_vperm2i128, INTR_TYPE_3OP, X86ISD::VPERM2X128, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmb_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmb_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmb_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmw_128, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmw_256, BROADCASTM, X86ISD::VBROADCASTM, 0),
- X86_INTRINSIC_DATA(avx512_broadcastmw_512, BROADCASTM, X86ISD::VBROADCASTM, 0),
X86_INTRINSIC_DATA(avx512_cvtb2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtb2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
X86_INTRINSIC_DATA(avx512_cvtb2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
@@ -464,9 +479,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
- X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
- X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
- X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
X86ISD::FADD_RND),
@@ -476,40 +488,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FADDS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FADDS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_256, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcastf32x4_512, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcastf32x8_512, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_256, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcastf64x2_512, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcastf64x4_512, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_128, BRCST32x2_TO_VEC,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_256, BRCST32x2_TO_VEC,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti32x2_512, BRCST32x2_TO_VEC,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_256, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti32x4_512, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti32x8_512, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_256, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti64x2_512, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_broadcasti64x4_512, BRCST_SUBVEC_TO_VEC,
- X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_256, CMP_MASK_CC, X86ISD::CMPM, 0),
X86_INTRINSIC_DATA(avx512_mask_cmp_pd_512, CMP_MASK_CC, X86ISD::CMPM,
@@ -522,6 +500,13 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
X86_INTRINSIC_DATA(avx512_mask_cmp_ss, CMP_MASK_SCALAR_CC,
X86ISD::FSETCCM, X86ISD::FSETCCM_RND),
+
+ X86_INTRINSIC_DATA(avx512_mask_compress_b_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_b_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_b_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_128, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_d_256, COMPRESS_EXPAND_IN_REG,
@@ -546,6 +531,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_compress_q_512, COMPRESS_EXPAND_IN_REG,
X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_w_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_w_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_compress_w_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::COMPRESS, 0),
X86_INTRINSIC_DATA(avx512_mask_conflict_d_128, INTR_TYPE_1OP_MASK,
X86ISD::CONFLICT, 0),
X86_INTRINSIC_DATA(avx512_mask_conflict_d_256, INTR_TYPE_1OP_MASK,
@@ -720,6 +711,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FDIVS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FDIVS_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_b_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_b_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_b_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_d_128, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_d_256, COMPRESS_EXPAND_IN_REG,
@@ -744,6 +741,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_expand_q_512, COMPRESS_EXPAND_IN_REG,
X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_w_128, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_w_256, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_expand_w_512, COMPRESS_EXPAND_IN_REG,
+ X86ISD::EXPAND, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_128, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_256, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_mask_fixupimm_pd_512, FIXUPIMM, X86ISD::VFIXUPIMM, 0),
@@ -776,22 +779,22 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FGETEXPS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_getexp_ss, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FGETEXPS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK_RM,
- X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_128, INTR_TYPE_2OP_MASK,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_256, INTR_TYPE_2OP_MASK,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_getmant_pd_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VGETMANT, X86ISD::VGETMANT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_128, INTR_TYPE_2OP_MASK,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_256, INTR_TYPE_2OP_MASK,
X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK_RM,
- X86ISD::VGETMANT, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK_RM,
- X86ISD::VGETMANTS, 0),
- X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
- X86ISD::VGETMANTS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ps_512, INTR_TYPE_2OP_MASK,
+ X86ISD::VGETMANT, X86ISD::VGETMANT_RND),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_sd, INTR_TYPE_3OP_SCALAR_MASK,
+ X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
+ X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK,
+ X86ISD::VGETMANTS, X86ISD::VGETMANTS_RND),
X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
X86ISD::FMAX_RND),
X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
@@ -816,18 +819,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FMULS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FMULS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
@@ -840,36 +831,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_paddus_w_128, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_256, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(avx512_mask_paddus_w_512, INTR_TYPE_2OP_MASK, X86ISD::ADDUS, 0),
- X86_INTRINSIC_DATA(avx512_mask_pavg_b_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pavg_b_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pavg_b_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pavg_w_128, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pavg_w_256, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pavg_w_512, INTR_TYPE_2OP_MASK, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_128, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_256, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_b_gpr_512, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_128, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_256, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_d_gpr_512, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_128, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_256, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_q_gpr_512, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_128, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_256, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
- X86_INTRINSIC_DATA(avx512_mask_pbroadcast_w_gpr_512, INTR_TYPE_1OP_MASK,
- X86ISD::VBROADCAST, 0),
X86_INTRINSIC_DATA(avx512_mask_permvar_df_256, VPERM_2OP_MASK,
X86ISD::VPERMV, 0),
X86_INTRINSIC_DATA(avx512_mask_permvar_df_512, VPERM_2OP_MASK,
@@ -1081,32 +1042,32 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_mask_pternlog_q_512, TERLOG_OP_MASK,
X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK_RM, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VRANGE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VREDUCE, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0),
- X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::VREDUCES, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK_RM, X86ISD::VRNDSCALE, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::VRNDSCALES, 0),
- X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK_RM,
- X86ISD::VRNDSCALES, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_pd_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_128, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_256, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_range_ps_512, INTR_TYPE_3OP_MASK, X86ISD::VRANGE, X86ISD::VRANGE_RND),
+ X86_INTRINSIC_DATA(avx512_mask_range_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND),
+ X86_INTRINSIC_DATA(avx512_mask_range_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VRANGES, X86ISD::VRANGES_RND),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VREDUCE, X86ISD::VREDUCE_RND),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_sd, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND),
+ X86_INTRINSIC_DATA(avx512_mask_reduce_ss, INTR_TYPE_SCALAR_MASK, X86ISD::VREDUCES, X86ISD::VREDUCES_RND),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_pd_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_128, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_256, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ps_512, INTR_TYPE_2OP_MASK, X86ISD::VRNDSCALE, X86ISD::VRNDSCALE_RND),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_sd, INTR_TYPE_SCALAR_MASK,
+ X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND),
+ X86_INTRINSIC_DATA(avx512_mask_rndscale_ss, INTR_TYPE_SCALAR_MASK,
+ X86ISD::VRNDSCALES, X86ISD::VRNDSCALES_RND),
X86_INTRINSIC_DATA(avx512_mask_scalef_pd_128, INTR_TYPE_2OP_MASK_RM,
X86ISD::SCALEF, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_pd_256, INTR_TYPE_2OP_MASK_RM,
@@ -1123,22 +1084,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::SCALEFS, 0),
X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::SCALEFS, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_f32x4_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_f64x2_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_i32x4_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUF128, 0),
- X86_INTRINSIC_DATA(avx512_mask_shuf_i64x2_256, INTR_TYPE_3OP_IMM8_MASK,
- X86ISD::SHUF128, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
@@ -1159,29 +1104,29 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FSUBS_RND, 0),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
X86ISD::FSUBS_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK_RM,
- X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK,
X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_RM,
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK,
X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK,
+ X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_RND),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, INTR_TYPE_2OP_MASK,
X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_256, INTR_TYPE_2OP_MASK,
X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_512, INTR_TYPE_2OP_MASK,
X86ISD::CVTPS2PH, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, X86ISD::FMADD,
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_128, FMA_OP_MASK, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_256, FMA_OP_MASK, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_pd_512, FMA_OP_MASK, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, X86ISD::FMADD,
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_128, FMA_OP_MASK, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_256, FMA_OP_MASK, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ps_512, FMA_OP_MASK, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_sd, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
+ X86_INTRINSIC_DATA(avx512_mask_vfmadd_ss, FMA_OP_SCALAR_MASK, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_128, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_256, FMA_OP_MASK, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask_vfmaddsub_pd_512, FMA_OP_MASK, X86ISD::FMADDSUB,
@@ -1209,7 +1154,20 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask_vfnmsub_ps_512, FMA_OP_MASK, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_128, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_256, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpbusd_512, FMA_OP_MASK, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_128, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_256, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpbusds_512, FMA_OP_MASK, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_128, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_256, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpwssd_512, FMA_OP_MASK, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_128, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_256, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpdpwssds_512, FMA_OP_MASK, X86ISD::VPDPWSSDS, 0),
+
+ X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_128, VPERM_3OP_MASK,
X86ISD::VPERMIV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermi2var_d_256, VPERM_3OP_MASK,
X86ISD::VPERMIV3, 0),
@@ -1281,29 +1239,74 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_mask_vpermt2var_qi_512, VPERM_3OP_MASK,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_128 , IFMA_OP_MASK,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_256 , IFMA_OP_MASK,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52h_uq_512 , IFMA_OP_MASK,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_128 , IFMA_OP_MASK,
X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_256 , IFMA_OP_MASK,
X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , FMA_OP_MASK,
+ X86_INTRINSIC_DATA(avx512_mask_vpmadd52l_uq_512 , IFMA_OP_MASK,
X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, X86ISD::FMADD,
+
+ X86_INTRINSIC_DATA(avx512_mask_vpshld_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshld_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshld_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshld_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshld_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshld_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshld_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshld_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshld_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHLD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshldv_d_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshldv_q_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshldv_q_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshldv_q_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_128, FMA_OP_MASK, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_256, FMA_OP_MASK, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshldv_w_512, FMA_OP_MASK, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrd_d_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrd_q_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_128, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_256, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrd_w_512, INTR_TYPE_3OP_IMM8_MASK, X86ISD::VSHRD, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_128, FMA_OP_MASK, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_256, FMA_OP_MASK, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrdv_d_512, FMA_OP_MASK, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrdv_q_128, FMA_OP_MASK, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrdv_q_256, FMA_OP_MASK, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrdv_q_512, FMA_OP_MASK, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrdv_w_128, FMA_OP_MASK, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrdv_w_256, FMA_OP_MASK, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshrdv_w_512, FMA_OP_MASK, X86ISD::VSHRDV, 0),
+
+ X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_128, CMP_MASK,
+ X86ISD::VPSHUFBITQMB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_256, CMP_MASK,
+ X86ISD::VPSHUFBITQMB, 0),
+ X86_INTRINSIC_DATA(avx512_mask_vpshufbitqmb_512, CMP_MASK,
+ X86ISD::VPSHUFBITQMB, 0),
+
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_128, FMA_OP_MASK3, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_256, FMA_OP_MASK3, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_pd_512, FMA_OP_MASK3, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, X86ISD::FMADD,
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_128, FMA_OP_MASK3, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_256, FMA_OP_MASK3, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ps_512, FMA_OP_MASK3, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmadd_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMADDS3, X86ISD::FMADDS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_128, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_256, FMA_OP_MASK3, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmaddsub_pd_512, FMA_OP_MASK3, X86ISD::FMADDSUB,
@@ -1321,8 +1324,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_256, FMA_OP_MASK3, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ps_512, FMA_OP_MASK3, X86ISD::FMSUB,
X86ISD::FMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FMSUBS3, X86ISD::FMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_128, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfmsubadd_pd_256, FMA_OP_MASK3, X86ISD::FMSUBADD, 0),
@@ -1341,8 +1344,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_256, FMA_OP_MASK3, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ps_512, FMA_OP_MASK3, X86ISD::FNMSUB,
X86ISD::FNMSUB_RND),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
- X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3_RND, 0),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_sd, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
+ X86_INTRINSIC_DATA(avx512_mask3_vfnmsub_ss, FMA_OP_SCALAR_MASK3, X86ISD::FNMSUBS3, X86ISD::FNMSUBS3_RND),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_128, FIXUPIMM_MASKZ,
X86ISD::VFIXUPIMM, 0),
X86_INTRINSIC_DATA(avx512_maskz_fixupimm_pd_256, FIXUPIMM_MASKZ,
@@ -1371,17 +1374,17 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPTERNLOG, 0),
X86_INTRINSIC_DATA(avx512_maskz_pternlog_q_512, TERLOG_OP_MASKZ,
X86ISD::VPTERNLOG, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, X86ISD::FMADD,
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_128, FMA_OP_MASKZ, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_256, FMA_OP_MASKZ, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_pd_512, FMA_OP_MASKZ, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, X86ISD::FMADD,
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_128, FMA_OP_MASKZ, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_256, FMA_OP_MASKZ, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ps_512, FMA_OP_MASKZ, ISD::FMA,
X86ISD::FMADD_RND),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1_RND, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_sd, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
+ X86_INTRINSIC_DATA(avx512_maskz_vfmadd_ss, FMA_OP_SCALAR_MASKZ, X86ISD::FMADDS1, X86ISD::FMADDS1_RND),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_128, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_256, FMA_OP_MASKZ, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_pd_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
@@ -1391,6 +1394,19 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_maskz_vfmaddsub_ps_512, FMA_OP_MASKZ, X86ISD::FMADDSUB,
X86ISD::FMADDSUB_RND),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_128, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_256, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpbusd_512, FMA_OP_MASKZ, X86ISD::VPDPBUSD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_128, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_256, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpbusds_512, FMA_OP_MASKZ, X86ISD::VPDPBUSDS, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_128, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_256, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpwssd_512, FMA_OP_MASKZ, X86ISD::VPDPWSSD, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_128, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_256, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpdpwssds_512, FMA_OP_MASKZ, X86ISD::VPDPWSSDS, 0),
+
X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_128, VPERM_3OP_MASKZ,
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_d_256, VPERM_3OP_MASKZ,
@@ -1427,18 +1443,38 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VPERMV3, 0),
X86_INTRINSIC_DATA(avx512_maskz_vpermt2var_qi_512, VPERM_3OP_MASKZ,
X86ISD::VPERMV3, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_128, IFMA_OP_MASKZ,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_256, IFMA_OP_MASKZ,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52h_uq_512, IFMA_OP_MASKZ,
X86ISD::VPMADD52H, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_128, IFMA_OP_MASKZ,
X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_256, IFMA_OP_MASKZ,
X86ISD::VPMADD52L, 0),
- X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
+ X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, IFMA_OP_MASKZ,
X86ISD::VPMADD52L, 0),
+
+ X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshldv_d_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshldv_q_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshldv_q_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshldv_q_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshldv_w_128, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshldv_w_256, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshldv_w_512, FMA_OP_MASKZ, X86ISD::VSHLDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_d_128, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_d_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_d_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_q_128, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_q_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_q_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_128, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_256, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+ X86_INTRINSIC_DATA(avx512_maskz_vpshrdv_w_512, FMA_OP_MASKZ, X86ISD::VSHRDV, 0),
+
X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
@@ -1486,50 +1522,26 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_b_128, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_b_256, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_b_512, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_d_128, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_d_256, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_d_512, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_q_128, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_q_256, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_q_512, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_w_128, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_w_256, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestm_w_512, CMP_MASK, X86ISD::TESTM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_b_128, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_b_256, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_b_512, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_d_128, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_d_256, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_d_512, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_q_128, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_q_256, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_q_512, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_w_128, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_w_256, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_ptestnm_w_512, CMP_MASK, X86ISD::TESTNM, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRCP, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
- X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRCPS, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RCP14, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
+ X86_INTRINSIC_DATA(avx512_rcp14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RCP14S, 0),
X86_INTRINSIC_DATA(avx512_rcp28_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rcp28_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::RCP28, 0),
X86_INTRINSIC_DATA(avx512_rcp28_sd, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
X86_INTRINSIC_DATA(avx512_rcp28_ss, INTR_TYPE_SCALAR_MASK_RM, X86ISD::RCP28S, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::FRSQRT, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
- X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::FRSQRTS, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_pd_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_128, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_256, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ps_512, INTR_TYPE_1OP_MASK, X86ISD::RSQRT14, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_sd, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
+ X86_INTRINSIC_DATA(avx512_rsqrt14_ss, INTR_TYPE_SCALAR_MASK, X86ISD::RSQRT14S, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_pd, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
@@ -1546,10 +1558,12 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(avx512_vcvtss2usi64, INTR_TYPE_2OP, X86ISD::CVTS2UI_RND, 0),
X86_INTRINSIC_DATA(avx512_vpermilvar_pd_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
X86_INTRINSIC_DATA(avx512_vpermilvar_ps_512, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, X86ISD::FMADD, 0),
- X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, X86ISD::FMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_pd, INTR_TYPE_3OP, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_pd_256, INTR_TYPE_3OP, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_ps, INTR_TYPE_3OP, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_ps_256, INTR_TYPE_3OP, ISD::FMA, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
+ X86_INTRINSIC_DATA(fma_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADDS1, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
@@ -1558,6 +1572,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfmsub_pd_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(fma_vfmsub_ps, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
X86_INTRINSIC_DATA(fma_vfmsub_ps_256, INTR_TYPE_3OP, X86ISD::FMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_sd, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
+ X86_INTRINSIC_DATA(fma_vfmsub_ss, INTR_TYPE_3OP, X86ISD::FMSUBS1, 0),
X86_INTRINSIC_DATA(fma_vfmsubadd_pd, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(fma_vfmsubadd_pd_256, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
X86_INTRINSIC_DATA(fma_vfmsubadd_ps, INTR_TYPE_3OP, X86ISD::FMSUBADD, 0),
@@ -1566,10 +1582,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(fma_vfnmadd_pd_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
X86_INTRINSIC_DATA(fma_vfnmadd_ps, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
X86_INTRINSIC_DATA(fma_vfnmadd_ps_256, INTR_TYPE_3OP, X86ISD::FNMADD, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_sd, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
+ X86_INTRINSIC_DATA(fma_vfnmadd_ss, INTR_TYPE_3OP, X86ISD::FNMADDS1, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_pd, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_pd_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
X86_INTRINSIC_DATA(fma_vfnmsub_ps_256, INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_sd, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
+ X86_INTRINSIC_DATA(fma_vfnmsub_ss, INTR_TYPE_3OP, X86ISD::FNMSUBS1, 0),
+ X86_INTRINSIC_DATA(fma4_vfmadd_sd, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
+ X86_INTRINSIC_DATA(fma4_vfmadd_ss, INTR_TYPE_3OP, X86ISD::FMADD4S, 0),
X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
@@ -1615,8 +1637,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, X86ISD::ADDS, 0),
X86_INTRINSIC_DATA(sse2_paddus_b, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
X86_INTRINSIC_DATA(sse2_paddus_w, INTR_TYPE_2OP, X86ISD::ADDUS, 0),
- X86_INTRINSIC_DATA(sse2_pavg_b, INTR_TYPE_2OP, X86ISD::AVG, 0),
- X86_INTRINSIC_DATA(sse2_pavg_w, INTR_TYPE_2OP, X86ISD::AVG, 0),
X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
@@ -1650,18 +1670,22 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse2_ucomile_sd, COMI, X86ISD::UCOMI, ISD::SETLE),
X86_INTRINSIC_DATA(sse2_ucomilt_sd, COMI, X86ISD::UCOMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse2_ucomineq_sd, COMI, X86ISD::UCOMI, ISD::SETNE),
+ X86_INTRINSIC_DATA(sse3_addsub_pd, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
+ X86_INTRINSIC_DATA(sse3_addsub_ps, INTR_TYPE_2OP, X86ISD::ADDSUB, 0),
X86_INTRINSIC_DATA(sse3_hadd_pd, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(sse3_hadd_ps, INTR_TYPE_2OP, X86ISD::FHADD, 0),
X86_INTRINSIC_DATA(sse3_hsub_pd, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(sse3_hsub_ps, INTR_TYPE_2OP, X86ISD::FHSUB, 0),
X86_INTRINSIC_DATA(sse41_insertps, INTR_TYPE_3OP, X86ISD::INSERTPS, 0),
X86_INTRINSIC_DATA(sse41_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+ X86_INTRINSIC_DATA(sse41_phminposuw, INTR_TYPE_1OP, X86ISD::PHMINPOS, 0),
X86_INTRINSIC_DATA(sse41_pmuldq, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
+ X86_INTRINSIC_DATA(sse41_round_pd, ROUNDP, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(sse41_round_ps, ROUNDP, X86ISD::VRNDSCALE, 0),
+ X86_INTRINSIC_DATA(sse41_round_sd, ROUNDS, X86ISD::VRNDSCALES, 0),
+ X86_INTRINSIC_DATA(sse41_round_ss, ROUNDS, X86ISD::VRNDSCALES, 0),
X86_INTRINSIC_DATA(sse4a_extrqi, INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
X86_INTRINSIC_DATA(sse4a_insertqi, INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
- X86_INTRINSIC_DATA(ssse3_pabs_b_128, INTR_TYPE_1OP, ISD::ABS, 0),
- X86_INTRINSIC_DATA(ssse3_pabs_d_128, INTR_TYPE_1OP, ISD::ABS, 0),
- X86_INTRINSIC_DATA(ssse3_pabs_w_128, INTR_TYPE_1OP, ISD::ABS, 0),
X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
@@ -1669,6 +1693,30 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(ssse3_pmadd_ub_sw_128, INTR_TYPE_2OP, X86ISD::VPMADDUBSW, 0),
X86_INTRINSIC_DATA(ssse3_pmul_hr_sw_128, INTR_TYPE_2OP, X86ISD::MULHRS, 0),
X86_INTRINSIC_DATA(ssse3_pshuf_b_128, INTR_TYPE_2OP, X86ISD::PSHUFB, 0),
+ X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
+ X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
+ X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
+
+ X86_INTRINSIC_DATA(vgf2p8affineinvqb_128, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEINVQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8affineinvqb_256, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEINVQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8affineinvqb_512, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEINVQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8affineqb_128, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8affineqb_256, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8affineqb_512, INTR_TYPE_3OP,
+ X86ISD::GF2P8AFFINEQB, 0),
+ X86_INTRINSIC_DATA(vgf2p8mulb_128, INTR_TYPE_2OP,
+ X86ISD::GF2P8MULB, 0),
+ X86_INTRINSIC_DATA(vgf2p8mulb_256, INTR_TYPE_2OP,
+ X86ISD::GF2P8MULB, 0),
+ X86_INTRINSIC_DATA(vgf2p8mulb_512, INTR_TYPE_2OP,
+ X86ISD::GF2P8MULB, 0),
+
X86_INTRINSIC_DATA(xop_vpcomb, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpcomd, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
X86_INTRINSIC_DATA(xop_vpcomq, INTR_TYPE_3OP, X86ISD::VPCOM, 0),
@@ -1682,14 +1730,14 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(xop_vpermil2ps, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
X86_INTRINSIC_DATA(xop_vpermil2ps_256, INTR_TYPE_4OP, X86ISD::VPERMIL2, 0),
X86_INTRINSIC_DATA(xop_vpperm, INTR_TYPE_3OP, X86ISD::VPPERM, 0),
- X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, X86ISD::VPROT, 0),
- X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
- X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, X86ISD::VPROT, 0),
- X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
- X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, X86ISD::VPROT, 0),
- X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
- X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, X86ISD::VPROT, 0),
- X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VPROTI, 0),
+ X86_INTRINSIC_DATA(xop_vprotb, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(xop_vprotbi, INTR_TYPE_2OP, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(xop_vprotd, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(xop_vprotdi, INTR_TYPE_2OP, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(xop_vprotq, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(xop_vprotqi, INTR_TYPE_2OP, X86ISD::VROTLI, 0),
+ X86_INTRINSIC_DATA(xop_vprotw, INTR_TYPE_2OP, ISD::ROTL, 0),
+ X86_INTRINSIC_DATA(xop_vprotwi, INTR_TYPE_2OP, X86ISD::VROTLI, 0),
X86_INTRINSIC_DATA(xop_vpshab, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
X86_INTRINSIC_DATA(xop_vpshad, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
X86_INTRINSIC_DATA(xop_vpshaq, INTR_TYPE_2OP, X86ISD::VPSHA, 0),
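Note on the intrinsic-table changes above: each X86_INTRINSIC_DATA entry pairs an IR intrinsic with a dispatch kind (FMA_OP_MASK, INTR_TYPE_2OP, ...) and up to two DAG opcodes, the second typically being the rounding-mode variant, and the table is kept sorted so lowering can find an entry with a binary search. The sketch below only illustrates that lookup pattern; the struct layout and the name lookupIntrinsic are assumptions, not the in-tree code.

#include <algorithm>
#include <vector>

struct IntrinsicData {
  unsigned Id;    // IR intrinsic number (assumed unique; table sorted by it)
  unsigned Type;  // dispatch kind, e.g. FMA_OP_MASK or INTR_TYPE_2OP
  unsigned Opc0;  // primary DAG opcode to lower to
  unsigned Opc1;  // secondary opcode (e.g. the _RND form), or 0 if none
};

// Binary search over a table kept sorted by Id; returns nullptr if absent.
static const IntrinsicData *lookupIntrinsic(const std::vector<IntrinsicData> &Table,
                                            unsigned Id) {
  auto It = std::lower_bound(Table.begin(), Table.end(), Id,
                             [](const IntrinsicData &D, unsigned I) { return D.Id < I; });
  return (It != Table.end() && It->Id == Id) ? &*It : nullptr;
}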
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
index 744ba21011af..4108a58fa7a5 100644
--- a/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -14,17 +14,45 @@
#include "X86LegalizerInfo.h"
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
-#include "llvm/Target/TargetOpcodes.h"
using namespace llvm;
using namespace TargetOpcode;
-#ifndef LLVM_BUILD_GLOBAL_ISEL
-#error "You shouldn't build this"
-#endif
+/// FIXME: The following static functions are SizeChangeStrategy functions
+/// that are meant to temporarily mimic the behaviour of the old legalization
+/// based on doubling/halving non-legal types as closely as possible. This is
+/// not entirely possible, as legalizing only the types that are exactly a power
+/// of 2 times the size of the legal types would require specifying all those
+/// sizes explicitly.
+/// In practice, not specifying those isn't a problem, and the below functions
+/// should disappear quickly as we add support for legalizing non-power-of-2
+/// sized types further.
+static void
+addAndInterleaveWithUnsupported(LegalizerInfo::SizeAndActionsVec &result,
+ const LegalizerInfo::SizeAndActionsVec &v) {
+ for (unsigned i = 0; i < v.size(); ++i) {
+ result.push_back(v[i]);
+ if (i + 1 < v[i].first && i + 1 < v.size() &&
+ v[i + 1].first != v[i].first + 1)
+ result.push_back({v[i].first + 1, LegalizerInfo::Unsupported});
+ }
+}
+
+static LegalizerInfo::SizeAndActionsVec
+widen_1(const LegalizerInfo::SizeAndActionsVec &v) {
+ assert(v.size() >= 1);
+ assert(v[0].first > 1);
+ LegalizerInfo::SizeAndActionsVec result = {{1, LegalizerInfo::WidenScalar},
+ {2, LegalizerInfo::Unsupported}};
+ addAndInterleaveWithUnsupported(result, v);
+ auto Largest = result.back().first;
+ result.push_back({Largest + 1, LegalizerInfo::Unsupported});
+ return result;
+}
X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
const X86TargetMachine &TM)
@@ -41,21 +69,35 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
setLegalizerInfoAVX512DQ();
setLegalizerInfoAVX512BW();
+ setLegalizeScalarToDifferentSizeStrategy(G_PHI, 0, widen_1);
+ for (unsigned BinOp : {G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ setLegalizeScalarToDifferentSizeStrategy(BinOp, 0, widen_1);
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
+ narrowToSmallerAndWidenToSmallest);
+ setLegalizeScalarToDifferentSizeStrategy(
+ G_GEP, 1, widenToLargerTypesUnsupportedOtherwise);
+ setLegalizeScalarToDifferentSizeStrategy(
+ G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
+
computeTables();
}
void X86LegalizerInfo::setLegalizerInfo32bit() {
- if (Subtarget.is64Bit())
- return;
-
- const LLT p0 = LLT::pointer(0, 32);
+ const LLT p0 = LLT::pointer(0, TM.getPointerSize() * 8);
const LLT s1 = LLT::scalar(1);
const LLT s8 = LLT::scalar(8);
const LLT s16 = LLT::scalar(16);
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
+ for (auto Ty : {p0, s1, s8, s16, s32})
+ setAction({G_IMPLICIT_DEF, Ty}, Legal);
+
+ for (auto Ty : {s8, s16, s32, p0})
+ setAction({G_PHI, Ty}, Legal);
+
for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
for (auto Ty : {s8, s16, s32})
setAction({BinOp, Ty}, Legal);
@@ -69,7 +111,6 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
for (auto Ty : {s8, s16, s32, p0})
setAction({MemOp, Ty}, Legal);
- setAction({MemOp, s1}, WidenScalar);
// And everything's fine in addrspace 0.
setAction({MemOp, 1, p0}, Legal);
}
@@ -81,25 +122,18 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_GEP, p0}, Legal);
setAction({G_GEP, 1, s32}, Legal);
- for (auto Ty : {s1, s8, s16})
- setAction({G_GEP, 1, Ty}, WidenScalar);
+ // Control-flow
+ setAction({G_BRCOND, s1}, Legal);
// Constants
for (auto Ty : {s8, s16, s32, p0})
setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
- setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
- setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar);
-
// Extensions
for (auto Ty : {s8, s16, s32}) {
setAction({G_ZEXT, Ty}, Legal);
setAction({G_SEXT, Ty}, Legal);
- }
-
- for (auto Ty : {s1, s8, s16}) {
- setAction({G_ZEXT, 1, Ty}, Legal);
- setAction({G_SEXT, 1, Ty}, Legal);
+ setAction({G_ANYEXT, Ty}, Legal);
}
// Comparison
@@ -107,6 +141,16 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
for (auto Ty : {s8, s16, s32, p0})
setAction({G_ICMP, 1, Ty}, Legal);
+
+ // Merge/Unmerge
+ for (const auto &Ty : {s16, s32, s64}) {
+ setAction({G_MERGE_VALUES, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ }
+ for (const auto &Ty : {s8, s16, s32}) {
+ setAction({G_MERGE_VALUES, 1, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ }
}
void X86LegalizerInfo::setLegalizerInfo64bit() {
@@ -114,59 +158,38 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
if (!Subtarget.is64Bit())
return;
- const LLT p0 = LLT::pointer(0, TM.getPointerSize() * 8);
- const LLT s1 = LLT::scalar(1);
- const LLT s8 = LLT::scalar(8);
- const LLT s16 = LLT::scalar(16);
- const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
+ const LLT s128 = LLT::scalar(128);
- for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
- for (auto Ty : {s8, s16, s32, s64})
- setAction({BinOp, Ty}, Legal);
+ setAction({G_IMPLICIT_DEF, s64}, Legal);
- for (unsigned MemOp : {G_LOAD, G_STORE}) {
- for (auto Ty : {s8, s16, s32, s64, p0})
- setAction({MemOp, Ty}, Legal);
+ setAction({G_PHI, s64}, Legal);
- setAction({MemOp, s1}, WidenScalar);
- // And everything's fine in addrspace 0.
- setAction({MemOp, 1, p0}, Legal);
- }
+ for (unsigned BinOp : {G_ADD, G_SUB, G_MUL, G_AND, G_OR, G_XOR})
+ setAction({BinOp, s64}, Legal);
- // Pointer-handling
- setAction({G_FRAME_INDEX, p0}, Legal);
- setAction({G_GLOBAL_VALUE, p0}, Legal);
+ for (unsigned MemOp : {G_LOAD, G_STORE})
+ setAction({MemOp, s64}, Legal);
- setAction({G_GEP, p0}, Legal);
- setAction({G_GEP, 1, s32}, Legal);
+ // Pointer-handling
setAction({G_GEP, 1, s64}, Legal);
- for (auto Ty : {s1, s8, s16})
- setAction({G_GEP, 1, Ty}, WidenScalar);
-
// Constants
- for (auto Ty : {s8, s16, s32, s64, p0})
- setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
-
- setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
+ setAction({TargetOpcode::G_CONSTANT, s64}, Legal);
// Extensions
- for (auto Ty : {s8, s16, s32, s64}) {
- setAction({G_ZEXT, Ty}, Legal);
- setAction({G_SEXT, Ty}, Legal);
- }
-
- for (auto Ty : {s1, s8, s16, s32}) {
- setAction({G_ZEXT, 1, Ty}, Legal);
- setAction({G_SEXT, 1, Ty}, Legal);
+ for (unsigned extOp : {G_ZEXT, G_SEXT, G_ANYEXT}) {
+ setAction({extOp, s64}, Legal);
}
// Comparison
- setAction({G_ICMP, s1}, Legal);
+ setAction({G_ICMP, 1, s64}, Legal);
- for (auto Ty : {s8, s16, s32, s64, p0})
- setAction({G_ICMP, 1, Ty}, Legal);
+ // Merge/Unmerge
+ setAction({G_MERGE_VALUES, s128}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, s128}, Legal);
+ setAction({G_MERGE_VALUES, 1, s128}, Legal);
+ setAction({G_UNMERGE_VALUES, s128}, Legal);
}
void X86LegalizerInfo::setLegalizerInfoSSE1() {
@@ -174,6 +197,7 @@ void X86LegalizerInfo::setLegalizerInfoSSE1() {
return;
const LLT s32 = LLT::scalar(32);
+ const LLT s64 = LLT::scalar(64);
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
@@ -184,18 +208,35 @@ void X86LegalizerInfo::setLegalizerInfoSSE1() {
for (unsigned MemOp : {G_LOAD, G_STORE})
for (auto Ty : {v4s32, v2s64})
setAction({MemOp, Ty}, Legal);
+
+ // Constants
+ setAction({TargetOpcode::G_FCONSTANT, s32}, Legal);
+
+ // Merge/Unmerge
+ for (const auto &Ty : {v4s32, v2s64}) {
+ setAction({G_MERGE_VALUES, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ }
+ setAction({G_MERGE_VALUES, 1, s64}, Legal);
+ setAction({G_UNMERGE_VALUES, s64}, Legal);
}
void X86LegalizerInfo::setLegalizerInfoSSE2() {
if (!Subtarget.hasSSE2())
return;
+ const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
const LLT v16s8 = LLT::vector(16, 8);
const LLT v8s16 = LLT::vector(8, 16);
const LLT v4s32 = LLT::vector(4, 32);
const LLT v2s64 = LLT::vector(2, 64);
+ const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v4s64 = LLT::vector(4, 64);
+
for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
for (auto Ty : {s64, v2s64})
setAction({BinOp, Ty}, Legal);
@@ -205,6 +246,23 @@ void X86LegalizerInfo::setLegalizerInfoSSE2() {
setAction({BinOp, Ty}, Legal);
setAction({G_MUL, v8s16}, Legal);
+
+ setAction({G_FPEXT, s64}, Legal);
+ setAction({G_FPEXT, 1, s32}, Legal);
+
+ // Constants
+ setAction({TargetOpcode::G_FCONSTANT, s64}, Legal);
+
+ // Merge/Unmerge
+ for (const auto &Ty :
+ {v16s8, v32s8, v8s16, v16s16, v4s32, v8s32, v2s64, v4s64}) {
+ setAction({G_MERGE_VALUES, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ }
+ for (const auto &Ty : {v16s8, v8s16, v4s32, v2s64}) {
+ setAction({G_MERGE_VALUES, 1, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ }
}
void X86LegalizerInfo::setLegalizerInfoSSE41() {
@@ -226,9 +284,13 @@ void X86LegalizerInfo::setLegalizerInfoAVX() {
const LLT v2s64 = LLT::vector(2, 64);
const LLT v32s8 = LLT::vector(32, 8);
+ const LLT v64s8 = LLT::vector(64, 8);
const LLT v16s16 = LLT::vector(16, 16);
+ const LLT v32s16 = LLT::vector(32, 16);
const LLT v8s32 = LLT::vector(8, 32);
+ const LLT v16s32 = LLT::vector(16, 32);
const LLT v4s64 = LLT::vector(4, 64);
+ const LLT v8s64 = LLT::vector(8, 64);
for (unsigned MemOp : {G_LOAD, G_STORE})
for (auto Ty : {v8s32, v4s64})
@@ -242,6 +304,17 @@ void X86LegalizerInfo::setLegalizerInfoAVX() {
setAction({G_INSERT, 1, Ty}, Legal);
setAction({G_EXTRACT, Ty}, Legal);
}
+ // Merge/Unmerge
+ for (const auto &Ty :
+ {v32s8, v64s8, v16s16, v32s16, v8s32, v16s32, v4s64, v8s64}) {
+ setAction({G_MERGE_VALUES, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ }
+ for (const auto &Ty :
+ {v16s8, v32s8, v8s16, v16s16, v4s32, v8s32, v2s64, v4s64}) {
+ setAction({G_MERGE_VALUES, 1, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ }
}
void X86LegalizerInfo::setLegalizerInfoAVX2() {
@@ -253,12 +326,27 @@ void X86LegalizerInfo::setLegalizerInfoAVX2() {
const LLT v8s32 = LLT::vector(8, 32);
const LLT v4s64 = LLT::vector(4, 64);
+ const LLT v64s8 = LLT::vector(64, 8);
+ const LLT v32s16 = LLT::vector(32, 16);
+ const LLT v16s32 = LLT::vector(16, 32);
+ const LLT v8s64 = LLT::vector(8, 64);
+
for (unsigned BinOp : {G_ADD, G_SUB})
for (auto Ty : {v32s8, v16s16, v8s32, v4s64})
setAction({BinOp, Ty}, Legal);
for (auto Ty : {v16s16, v8s32})
setAction({G_MUL, Ty}, Legal);
+
+ // Merge/Unmerge
+ for (const auto &Ty : {v64s8, v32s16, v16s32, v8s64}) {
+ setAction({G_MERGE_VALUES, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, 1, Ty}, Legal);
+ }
+ for (const auto &Ty : {v32s8, v16s16, v8s32, v4s64}) {
+ setAction({G_MERGE_VALUES, 1, Ty}, Legal);
+ setAction({G_UNMERGE_VALUES, Ty}, Legal);
+ }
}
void X86LegalizerInfo::setLegalizerInfoAVX512() {
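Aside on the legalizer changes in this file: widen_1 and the other SizeChangeStrategy helpers build a SizeAndActionsVec, a list of (bit width, action) pairs sorted by width, where (roughly) a query takes the action of the last entry whose width does not exceed the queried width. The sketch below is a stand-alone illustration of that reading using made-up types, not the LLVM LegalizerInfo implementation.

#include <cstdint>
#include <utility>
#include <vector>

enum Action { Legal, WidenScalar, NarrowScalar, Unsupported };
using SizeAndActionsVec = std::vector<std::pair<uint32_t, Action>>;

// The action for BitWidth comes from the last entry at or below BitWidth.
static Action actionFor(const SizeAndActionsVec &V, uint32_t BitWidth) {
  Action Result = Unsupported;
  for (const auto &Entry : V) {
    if (Entry.first > BitWidth)
      break;
    Result = Entry.second;
  }
  return Result;
}

// With a shape like widen_1 produces, {1:WidenScalar, 2:Unsupported, 32:Legal,
// 33:Unsupported}, a 1-bit value widens, an 8-bit value is unsupported, and a
// 32-bit value is legal.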
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index fd2837b79103..8a7179e48a0b 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -15,6 +15,7 @@
#include "InstPrinter/X86ATTInstPrinter.h"
#include "InstPrinter/X86InstComments.h"
#include "MCTargetDesc/X86BaseInfo.h"
+#include "MCTargetDesc/X86TargetStreamer.h"
#include "Utils/X86ShuffleDecode.h"
#include "X86AsmPrinter.h"
#include "X86RegisterInfo.h"
@@ -22,12 +23,12 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/iterator_range.h"
-#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/Mangler.h"
@@ -40,12 +41,9 @@
#include "llvm/MC/MCInstBuilder.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSectionELF.h"
-#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
@@ -102,7 +100,9 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
}
void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
- OutStreamer->EmitInstruction(Inst, getSubtargetInfo(), EnablePrintSchedInfo);
+ OutStreamer->EmitInstruction(Inst, getSubtargetInfo(),
+ EnablePrintSchedInfo &&
+ !(Inst.getFlags() & X86::NO_SCHED_INFO));
SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
}
@@ -960,7 +960,7 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
// This is an optimization that lets us get away without emitting a nop in
// many cases.
//
- // NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %R9) takes two
+ // NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %r9) takes two
// bytes too, so the check on MinSize is important.
MCI.setOpcode(X86::PUSH64rmr);
} else {
@@ -1047,20 +1047,20 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// We want to emit the following pattern, which follows the x86 calling
// convention to prepare for the trampoline call to be patched in.
//
- // <args placement according SysV64 calling convention>
// .p2align 1, ...
// .Lxray_event_sled_N:
- // jmp +N // jump across the call instruction
- // callq __xray_CustomEvent // force relocation to symbol
- // <args cleanup, jump to here>
- //
- // The relative jump needs to jump forward 24 bytes:
- // 10 (args) + 5 (nops) + 9 (cleanup)
+ // jmp +N // jump across the instrumentation sled
+ // ... // set up arguments in register
+ // callq __xray_CustomEvent@plt // force dependency to symbol
+ // ...
+ // <jump here>
//
// After patching, it would look something like:
//
// nopw (2-byte nop)
+ // ...
// callq __xrayCustomEvent // already lowered
+ // ...
//
// ---
// First we emit the label and the jump.
@@ -1072,49 +1072,57 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way to force the relative jump.
- OutStreamer->EmitBytes("\xeb\x14");
+ OutStreamer->EmitBinaryData("\xeb\x0f");
// The default C calling convention will place two arguments into %rdi and
// %rsi -- so we only work with those.
- unsigned UsedRegs[] = {X86::RDI, X86::RSI, X86::RAX};
-
- // Because we will use %rax, we preserve that across the call.
- EmitAndCountInstruction(MCInstBuilder(X86::PUSH64r).addReg(X86::RAX));
-
- // Then we put the operands in the %rdi and %rsi registers.
+ unsigned UsedRegs[] = {X86::RDI, X86::RSI};
+ bool UsedMask[] = {false, false};
+
+ // Then we put the operands in the %rdi and %rsi registers. We spill the
+ // values in the registers before we clobber them, and mark them as used in
+ // UsedMask. In case the arguments are already in the correct register, we
+ // emit nops appropriately sized to keep the sled the same size in every
+ // situation.
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
- if (Op->isImm())
- EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri)
+ assert(Op->isReg() && "Only support arguments in registers");
+ if (Op->getReg() != UsedRegs[I]) {
+ UsedMask[I] = true;
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::PUSH64r).addReg(UsedRegs[I]));
+ EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
.addReg(UsedRegs[I])
- .addImm(Op->getImm()));
- else if (Op->isReg()) {
- if (Op->getReg() != UsedRegs[I])
- EmitAndCountInstruction(MCInstBuilder(X86::MOV64rr)
- .addReg(UsedRegs[I])
- .addReg(Op->getReg()));
- else
- EmitNops(*OutStreamer, 3, Subtarget->is64Bit(), getSubtargetInfo());
+ .addReg(Op->getReg()));
+ } else {
+ EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
}
}
// We emit a hard dependency on the __xray_CustomEvent symbol, which is the
- // name of the trampoline to be implemented by the XRay runtime. We put this
- // explicitly in the %rax register.
+ // name of the trampoline to be implemented by the XRay runtime.
auto TSym = OutContext.getOrCreateSymbol("__xray_CustomEvent");
MachineOperand TOp = MachineOperand::CreateMCSymbol(TSym);
- EmitAndCountInstruction(MCInstBuilder(X86::MOV64ri)
- .addReg(X86::RAX)
- .addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));
+ if (isPositionIndependent())
+ TOp.setTargetFlags(X86II::MO_PLT);
// Emit the call instruction.
- EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(X86::RAX));
+ EmitAndCountInstruction(MCInstBuilder(X86::CALL64pcrel32)
+ .addOperand(MCIL.LowerSymbolOperand(TOp, TSym)));
// Restore caller-saved and used registers.
+ for (unsigned I = sizeof UsedMask; I-- > 0;)
+ if (UsedMask[I])
+ EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(UsedRegs[I]));
+ else
+ EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
+
OutStreamer->AddComment("xray custom event end.");
- EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(X86::RAX));
- recordSled(CurSled, MI, SledKind::CUSTOM_EVENT);
+ // Record the sled version. Older versions of this sled were spelled
+ // differently, so we let the runtime handle the different offsets we're
+ // using.
+ recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 1);
}
void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
@@ -1125,7 +1133,6 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
// .Lxray_sled_N:
// jmp .tmpN
// # 9 bytes worth of noops
- // .tmpN
//
// We need the 9 bytes because at runtime, we'd be patching over the full 11
// bytes with the following pattern:
@@ -1136,14 +1143,12 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
OutStreamer->EmitCodeAlignment(2);
OutStreamer->EmitLabel(CurSled);
- auto Target = OutContext.createTempSymbol();
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
// FIXME: Find another less hacky way to force the relative jump.
OutStreamer->EmitBytes("\xeb\x09");
EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
- OutStreamer->EmitLabel(Target);
recordSled(CurSled, MI, SledKind::FUNCTION_ENTER);
}
@@ -1358,6 +1363,82 @@ static void printConstant(const Constant *COp, raw_ostream &CS) {
}
}
+void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) {
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ assert(getSubtarget().isOSWindows() && "SEH_ instruction Windows only");
+ const X86RegisterInfo *RI =
+ MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+
+ // Use the .cv_fpo directives if we're emitting CodeView on 32-bit x86.
+ if (EmitFPOData) {
+ X86TargetStreamer *XTS =
+ static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer());
+ switch (MI->getOpcode()) {
+ case X86::SEH_PushReg:
+ XTS->emitFPOPushReg(MI->getOperand(0).getImm());
+ break;
+ case X86::SEH_StackAlloc:
+ XTS->emitFPOStackAlloc(MI->getOperand(0).getImm());
+ break;
+ case X86::SEH_SetFrame:
+ assert(MI->getOperand(1).getImm() == 0 &&
+ ".cv_fpo_setframe takes no offset");
+ XTS->emitFPOSetFrame(MI->getOperand(0).getImm());
+ break;
+ case X86::SEH_EndPrologue:
+ XTS->emitFPOEndPrologue();
+ break;
+ case X86::SEH_SaveReg:
+ case X86::SEH_SaveXMM:
+ case X86::SEH_PushFrame:
+ llvm_unreachable("SEH_ directive incompatible with FPO");
+ break;
+ default:
+ llvm_unreachable("expected SEH_ instruction");
+ }
+ return;
+ }
+
+ // Otherwise, use the .seh_ directives for all other Windows platforms.
+ switch (MI->getOpcode()) {
+ case X86::SEH_PushReg:
+ OutStreamer->EmitWinCFIPushReg(
+ RI->getSEHRegNum(MI->getOperand(0).getImm()));
+ break;
+
+ case X86::SEH_SaveReg:
+ OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ break;
+
+ case X86::SEH_SaveXMM:
+ OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ break;
+
+ case X86::SEH_StackAlloc:
+ OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
+ break;
+
+ case X86::SEH_SetFrame:
+ OutStreamer->EmitWinCFISetFrame(
+ RI->getSEHRegNum(MI->getOperand(0).getImm()),
+ MI->getOperand(1).getImm());
+ break;
+
+ case X86::SEH_PushFrame:
+ OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
+ break;
+
+ case X86::SEH_EndPrologue:
+ OutStreamer->EmitWinCFIEndProlog();
+ break;
+
+ default:
+ llvm_unreachable("expected SEH_ instruction");
+ }
+}
+
void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
X86MCInstLower MCInstLowering(*MF, *this);
const X86RegisterInfo *RI = MF->getSubtarget<X86Subtarget>().getRegisterInfo();
@@ -1535,41 +1616,13 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
case X86::SEH_PushReg:
- assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- OutStreamer->EmitWinCFIPushReg(RI->getSEHRegNum(MI->getOperand(0).getImm()));
- return;
-
case X86::SEH_SaveReg:
- assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- OutStreamer->EmitWinCFISaveReg(RI->getSEHRegNum(MI->getOperand(0).getImm()),
- MI->getOperand(1).getImm());
- return;
-
case X86::SEH_SaveXMM:
- assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- OutStreamer->EmitWinCFISaveXMM(RI->getSEHRegNum(MI->getOperand(0).getImm()),
- MI->getOperand(1).getImm());
- return;
-
case X86::SEH_StackAlloc:
- assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- OutStreamer->EmitWinCFIAllocStack(MI->getOperand(0).getImm());
- return;
-
case X86::SEH_SetFrame:
- assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- OutStreamer->EmitWinCFISetFrame(RI->getSEHRegNum(MI->getOperand(0).getImm()),
- MI->getOperand(1).getImm());
- return;
-
case X86::SEH_PushFrame:
- assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- OutStreamer->EmitWinCFIPushFrame(MI->getOperand(0).getImm());
- return;
-
case X86::SEH_EndPrologue:
- assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- OutStreamer->EmitWinCFIEndProlog();
+ EmitSEHInstruction(MI);
return;
case X86::SEH_Epilogue: {
@@ -1949,6 +2002,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
+ if (MI->getAsmPrinterFlag(MachineInstr::NoSchedComment))
+ TmpInst.setFlags(TmpInst.getFlags() | X86::NO_SCHED_INFO);
// Stackmap shadows cannot include branch targets, so we can count the bytes
// in a call towards the shadow, but must ensure that no thread returns
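One detail worth spelling out from the XRay sled changes in this file: the sled opens with a two-byte short JMP, opcode 0xEB followed by a signed 8-bit displacement counted from the end of the JMP itself, so "\xeb\x0f" skips the next 15 bytes of instrumentation, and patching later overwrites the JMP so the sled body runs. A tiny stand-alone check of that arithmetic (addresses and names here are illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  const unsigned char Sled[] = {0xEB, 0x0F};     // jmp +15 (short jump)
  const uint64_t SledStart = 0x1000;             // assumed load address
  uint64_t AfterJmp = SledStart + sizeof(Sled);  // rel8 is relative to this
  uint64_t Resume = AfterJmp + static_cast<int8_t>(Sled[1]);
  std::printf("bytes skipped: %llu\n",
              static_cast<unsigned long long>(Resume - AfterJmp)); // prints 15
  return 0;
}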
diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp
index 3fcb642424ad..5433033671f3 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -10,7 +10,7 @@
#include "X86MachineFunctionInfo.h"
#include "X86RegisterInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
using namespace llvm;
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
index 8fdf10617059..67d95c2233de 100644
--- a/lib/Target/X86/X86MacroFusion.cpp
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -14,8 +14,8 @@
#include "X86MacroFusion.h"
#include "X86Subtarget.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
@@ -27,10 +27,8 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
const MachineInstr *FirstMI,
const MachineInstr &SecondMI) {
const X86Subtarget &ST = static_cast<const X86Subtarget&>(TSI);
- // Check if this processor supports macro-fusion. Since this is a minor
- // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
- // proxy for SandyBridge+.
- if (!ST.hasAVX())
+ // Check if this processor supports macro-fusion.
+ if (!ST.hasMacroFusion())
return false;
enum {
@@ -84,10 +82,10 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
case X86::TEST32i32:
case X86::TEST64i32:
case X86::TEST64ri32:
- case X86::TEST8rm:
- case X86::TEST16rm:
- case X86::TEST32rm:
- case X86::TEST64rm:
+ case X86::TEST8mr:
+ case X86::TEST16mr:
+ case X86::TEST32mr:
+ case X86::TEST64mr:
case X86::TEST8ri_NOREX:
case X86::AND16i16:
case X86::AND16ri:
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index e6756b975c10..1fc6f07b79fa 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -1,4 +1,4 @@
-//===-- X86OptimizeLEAs.cpp - optimize usage of LEA instructions ----------===//
+//===- X86OptimizeLEAs.cpp - optimize usage of LEA instructions -----------===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,22 +17,36 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/LiveVariables.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/IR/DIBuilder.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
using namespace llvm;
@@ -60,6 +74,7 @@ static bool isSimilarDispOp(const MachineOperand &MO1,
static inline bool isLEA(const MachineInstr &MI);
namespace {
+
/// A key based on instruction's memory operands.
class MemOpKey {
public:
@@ -92,12 +107,14 @@ public:
// Address' displacement operand.
const MachineOperand *Disp;
};
+
} // end anonymous namespace
/// Provide DenseMapInfo for MemOpKey.
namespace llvm {
+
template <> struct DenseMapInfo<MemOpKey> {
- typedef DenseMapInfo<const MachineOperand *> PtrInfo;
+ using PtrInfo = DenseMapInfo<const MachineOperand *>;
static inline MemOpKey getEmptyKey() {
return MemOpKey(PtrInfo::getEmptyKey(), PtrInfo::getEmptyKey(),
@@ -164,7 +181,8 @@ template <> struct DenseMapInfo<MemOpKey> {
return LHS == RHS;
}
};
-}
+
+} // end namespace llvm
/// \brief Returns a hash table key based on memory operands of \p MI. The
/// number of the first memory operand of \p MI is specified through \p N.
@@ -217,6 +235,7 @@ static inline bool isLEA(const MachineInstr &MI) {
}
namespace {
+
class OptimizeLEAPass : public MachineFunctionPass {
public:
OptimizeLEAPass() : MachineFunctionPass(ID) {}
@@ -229,7 +248,7 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
private:
- typedef DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>> MemOpMap;
+ using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
/// \brief Returns a distance between two instructions inside one basic block.
/// Negative result means, that instructions occur in reverse order.
@@ -281,8 +300,10 @@ private:
static char ID;
};
+
+} // end anonymous namespace
+
char OptimizeLEAPass::ID = 0;
-}
FunctionPass *llvm::createX86OptimizeLEAs() { return new OptimizeLEAPass(); }
@@ -547,16 +568,18 @@ MachineInstr *OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
if (AddrDispShift != 0)
Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, AddrDispShift,
+ DIExpression::NoDeref,
DIExpression::WithStackValue);
// Replace DBG_VALUE instruction with modified version.
MachineBasicBlock *MBB = MI.getParent();
DebugLoc DL = MI.getDebugLoc();
bool IsIndirect = MI.isIndirectDebugValue();
- int64_t Offset = IsIndirect ? MI.getOperand(1).getImm() : 0;
const MDNode *Var = MI.getDebugVariable();
+ if (IsIndirect)
+ assert(MI.getOperand(1).getImm() == 0 && "DBG_VALUE with nonzero offset");
return BuildMI(*MBB, MBB->erase(&MI), DL, TII->get(TargetOpcode::DBG_VALUE),
- IsIndirect, VReg, Offset, Var, Expr);
+ IsIndirect, VReg, Var, Expr);
}
// Try to find similar LEAs in the list and replace one with another.
@@ -649,7 +672,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
- if (DisableX86LEAOpt || skipFunction(*MF.getFunction()))
+ if (DisableX86LEAOpt || skipFunction(MF.getFunction()))
return false;
MRI = &MF.getRegInfo();
@@ -673,7 +696,7 @@ bool OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
// Remove redundant address calculations. Do it only for -Os/-Oz since only
// a code size gain is expected from this part of the pass.
- if (MF.getFunction()->optForSize())
+ if (MF.getFunction().optForSize())
Changed |= removeRedundantAddrCalc(LEAs);
}
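Regarding the DenseMapInfo<MemOpKey> specialization touched above: any key type used with llvm::DenseMap needs a specialization that supplies two reserved sentinel keys plus hashing and equality. Below is a minimal stand-alone sketch of that contract with a made-up key type; it is not the MemOpKey code and the names are placeholders.

#include <cstdint>
#include <functional>

struct Key {
  std::uintptr_t A = 0;
  std::uintptr_t B = 0;
};

struct KeyInfo {
  // Two distinct values that real keys never take.
  static Key getEmptyKey() {
    Key K; K.A = ~std::uintptr_t(0); return K;
  }
  static Key getTombstoneKey() {
    Key K; K.A = ~std::uintptr_t(0) - 1; return K;
  }
  static unsigned getHashValue(const Key &K) {
    return static_cast<unsigned>(std::hash<std::uintptr_t>()(K.A) * 37u + K.B);
  }
  static bool isEqual(const Key &LHS, const Key &RHS) {
    return LHS.A == RHS.A && LHS.B == RHS.B;
  }
};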
diff --git a/lib/Target/X86/X86PadShortFunction.cpp b/lib/Target/X86/X86PadShortFunction.cpp
index 3069d1fd3497..1da0fad8b6cf 100644
--- a/lib/Target/X86/X86PadShortFunction.cpp
+++ b/lib/Target/X86/X86PadShortFunction.cpp
@@ -13,7 +13,6 @@
//
//===----------------------------------------------------------------------===//
-#include <algorithm>
#include "X86.h"
#include "X86InstrInfo.h"
@@ -21,12 +20,11 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
@@ -98,10 +96,10 @@ FunctionPass *llvm::createX86PadShortFunctions() {
/// runOnMachineFunction - Loop over all of the basic blocks, inserting
/// NOOP instructions before early exits.
bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
- if (skipFunction(*MF.getFunction()))
+ if (skipFunction(MF.getFunction()))
return false;
- if (MF.getFunction()->optForSize()) {
+ if (MF.getFunction().optForSize()) {
return false;
}
diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp
index efd3df26dd42..aa0e3743c948 100644
--- a/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -16,7 +16,7 @@
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#define GET_TARGET_REGBANK_IMPL
#include "X86GenRegisterBank.inc"
@@ -26,10 +26,6 @@ using namespace llvm;
#define GET_TARGET_REGBANK_INFO_IMPL
#include "X86GenRegisterBankInfo.def"
-#ifndef LLVM_BUILD_GLOBAL_ISEL
-#error "You shouldn't build this"
-#endif
-
X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI)
: X86GenRegisterBankInfo() {
@@ -164,7 +160,7 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Try the default logic for non-generic instructions that are either copies
// or already have some operands assigned to banks.
- if (!isPreISelGenericOpcode(Opc)) {
+ if (!isPreISelGenericOpcode(Opc) || Opc == TargetOpcode::G_PHI) {
const InstructionMapping &Mapping = getInstrMappingImpl(MI);
if (Mapping.isValid())
return Mapping;
@@ -186,10 +182,19 @@ X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
unsigned NumOperands = MI.getNumOperands();
-
- // Track the bank of each register, use NotFP mapping (all scalars in GPRs)
SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
- getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx);
+
+ switch (Opc) {
+ case TargetOpcode::G_FPEXT:
+ case TargetOpcode::G_FCONSTANT:
+ // Instructions with only floating-point operands (all scalars in VECRReg)
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ true, OpRegBankIdx);
+ break;
+ default:
+ // Track the bank of each register, use NotFP mapping (all scalars in GPRs)
+ getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx);
+ break;
+ }
// Finally construct the computed mapping.
SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
@@ -215,7 +220,8 @@ X86RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case TargetOpcode::G_LOAD:
- case TargetOpcode::G_STORE: {
+ case TargetOpcode::G_STORE:
+ case TargetOpcode::G_IMPLICIT_DEF: {
// We are going to try to map 32/64-bit to PMI_FP32/PMI_FP64
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
if (Size != 32 && Size != 64)
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 343da2573b55..bc31e95aa6b5 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -15,26 +15,21 @@
#include "X86RegisterInfo.h"
#include "X86FrameLowering.h"
-#include "X86InstrBuilder.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
-#include "X86TargetMachine.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Target/TargetFrameLowering.h"
-#include "llvm/Target/TargetInstrInfo.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -223,13 +218,13 @@ X86RegisterInfo::getPointerRegClass(const MachineFunction &MF,
const TargetRegisterClass *
X86RegisterInfo::getGPRsForTailCall(const MachineFunction &MF) const {
- const Function *F = MF.getFunction();
- if (IsWin64 || (F && F->getCallingConv() == CallingConv::Win64))
+ const Function &F = MF.getFunction();
+ if (IsWin64 || (F.getCallingConv() == CallingConv::Win64))
return &X86::GR64_TCW64RegClass;
else if (Is64Bit)
return &X86::GR64_TCRegClass;
- bool hasHipeCC = (F ? F->getCallingConv() == CallingConv::HiPE : false);
+ bool hasHipeCC = (F.getCallingConv() == CallingConv::HiPE);
if (hasHipeCC)
return &X86::GR32RegClass;
return &X86::GR32_TCRegClass;
@@ -271,16 +266,17 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "MachineFunction required");
const X86Subtarget &Subtarget = MF->getSubtarget<X86Subtarget>();
+ const Function &F = MF->getFunction();
bool HasSSE = Subtarget.hasSSE1();
bool HasAVX = Subtarget.hasAVX();
bool HasAVX512 = Subtarget.hasAVX512();
bool CallsEHReturn = MF->callsEHReturn();
- CallingConv::ID CC = MF->getFunction()->getCallingConv();
+ CallingConv::ID CC = F.getCallingConv();
// If attribute NoCallerSavedRegisters exists then we set X86_INTR calling
// convention because it has the CSR list.
- if (MF->getFunction()->hasFnAttribute("no_caller_saved_registers"))
+ if (MF->getFunction().hasFnAttribute("no_caller_saved_registers"))
CC = CallingConv::X86_INTR;
switch (CC) {
@@ -365,28 +361,26 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
}
if (Is64Bit) {
- if (IsWin64) {
- if (!HasSSE)
- return CSR_Win64_NoSSE_SaveList;
- return CSR_Win64_SaveList;
- }
+ bool IsSwiftCC = Subtarget.getTargetLowering()->supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError);
+ if (IsSwiftCC)
+ return IsWin64 ? CSR_Win64_SwiftError_SaveList
+ : CSR_64_SwiftError_SaveList;
+
+ if (IsWin64)
+ return HasSSE ? CSR_Win64_SaveList : CSR_Win64_NoSSE_SaveList;
if (CallsEHReturn)
return CSR_64EHRet_SaveList;
- if (Subtarget.getTargetLowering()->supportSwiftError() &&
- MF->getFunction()->getAttributes().hasAttrSomewhere(
- Attribute::SwiftError))
- return CSR_64_SwiftError_SaveList;
return CSR_64_SaveList;
}
- if (CallsEHReturn)
- return CSR_32EHRet_SaveList;
- return CSR_32_SaveList;
+
+ return CallsEHReturn ? CSR_32EHRet_SaveList : CSR_32_SaveList;
}
const MCPhysReg *X86RegisterInfo::getCalleeSavedRegsViaCopy(
const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
- if (MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
+ if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getInfo<X86MachineFunctionInfo>()->isSplitCSR())
return CSR_64_CXX_TLS_Darwin_ViaCopy_SaveList;
return nullptr;
@@ -479,14 +473,14 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
// Unlike getCalleeSavedRegs(), we don't have MMI so we can't check
// callsEHReturn().
if (Is64Bit) {
- if (IsWin64)
- return CSR_Win64_RegMask;
- if (Subtarget.getTargetLowering()->supportSwiftError() &&
- MF.getFunction()->getAttributes().hasAttrSomewhere(
- Attribute::SwiftError))
- return CSR_64_SwiftError_RegMask;
- return CSR_64_RegMask;
+ const Function &F = MF.getFunction();
+ bool IsSwiftCC = Subtarget.getTargetLowering()->supportSwiftError() &&
+ F.getAttributes().hasAttrSomewhere(Attribute::SwiftError);
+ if (IsSwiftCC)
+ return IsWin64 ? CSR_Win64_SwiftError_RegMask : CSR_64_SwiftError_RegMask;
+ return IsWin64 ? CSR_Win64_RegMask : CSR_64_RegMask;
}
+
return CSR_32_RegMask;
}
@@ -508,6 +502,9 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
++I)
Reserved.set(*I);
+ // Set the Shadow Stack Pointer as reserved.
+ Reserved.set(X86::SSP);
+
// Set the instruction pointer register and its aliases as reserved.
for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
++I)
@@ -522,7 +519,7 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Set the base-pointer register and its aliases as reserved if needed.
if (hasBasePointer(MF)) {
- CallingConv::ID CC = MF.getFunction()->getCallingConv();
+ CallingConv::ID CC = MF.getFunction().getCallingConv();
const uint32_t *RegMask = getCallPreservedMask(MF, CC);
if (MachineOperand::clobbersPhysReg(RegMask, getBaseRegister()))
report_fatal_error(
diff --git a/lib/Target/X86/X86RegisterInfo.h b/lib/Target/X86/X86RegisterInfo.h
index 25958f0c3106..29401dadead0 100644
--- a/lib/Target/X86/X86RegisterInfo.h
+++ b/lib/Target/X86/X86RegisterInfo.h
@@ -14,7 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
#define LLVM_LIB_TARGET_X86_X86REGISTERINFO_H
-#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#define GET_REGINFO_HEADER
#include "X86GenRegisterInfo.inc"
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 3a61a7247c72..2341e1fb0fac 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -308,6 +308,9 @@ def BND1 : X86Reg<"bnd1", 1>;
def BND2 : X86Reg<"bnd2", 2>;
def BND3 : X86Reg<"bnd3", 3>;
+// CET registers - Shadow Stack Pointer
+def SSP : X86Reg<"ssp", 0>;
+
//===----------------------------------------------------------------------===//
// Register Class Definitions... now that we have all of the pieces, define the
// top-level register classes. The order specified in the register list is
@@ -357,7 +360,7 @@ def GR64 : RegisterClass<"X86", [i64], 64,
def SEGMENT_REG : RegisterClass<"X86", [i16], 16, (add CS, DS, SS, ES, FS, GS)>;
// Debug registers.
-def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 7)>;
+def DEBUG_REG : RegisterClass<"X86", [i32], 32, (sequence "DR%u", 0, 15)>;
// Control registers.
def CONTROL_REG : RegisterClass<"X86", [i64], 64, (sequence "CR%u", 0, 15)>;
diff --git a/lib/Target/X86/X86SchedBroadwell.td b/lib/Target/X86/X86SchedBroadwell.td
new file mode 100755
index 000000000000..e4e0ed435103
--- /dev/null
+++ b/lib/Target/X86/X86SchedBroadwell.td
@@ -0,0 +1,3869 @@
+//=- X86SchedBroadwell.td - X86 Broadwell Scheduling ---------*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Broadwell to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+def BroadwellModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and HW can decode 4
+ // instructions per cycle.
+ let IssueWidth = 4;
+ let MicroOpBufferSize = 192; // Based on the reorder buffer.
+ let LoadLatency = 5;
+ let MispredictPenalty = 16;
+
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
+ // This flag is set to allow the scheduler to assign a default model to
+ // unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = BroadwellModel in {
+
+// Broadwell can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def BWPort0 : ProcResource<1>;
+def BWPort1 : ProcResource<1>;
+def BWPort2 : ProcResource<1>;
+def BWPort3 : ProcResource<1>;
+def BWPort4 : ProcResource<1>;
+def BWPort5 : ProcResource<1>;
+def BWPort6 : ProcResource<1>;
+def BWPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def BWPort01 : ProcResGroup<[BWPort0, BWPort1]>;
+def BWPort23 : ProcResGroup<[BWPort2, BWPort3]>;
+def BWPort237 : ProcResGroup<[BWPort2, BWPort3, BWPort7]>;
+def BWPort04 : ProcResGroup<[BWPort0, BWPort4]>;
+def BWPort05 : ProcResGroup<[BWPort0, BWPort5]>;
+def BWPort06 : ProcResGroup<[BWPort0, BWPort6]>;
+def BWPort15 : ProcResGroup<[BWPort1, BWPort5]>;
+def BWPort16 : ProcResGroup<[BWPort1, BWPort6]>;
+def BWPort56 : ProcResGroup<[BWPort5, BWPort6]>;
+def BWPort015 : ProcResGroup<[BWPort0, BWPort1, BWPort5]>;
+def BWPort056 : ProcResGroup<[BWPort0, BWPort5, BWPort6]>;
+def BWPort0156 : ProcResGroup<[BWPort0, BWPort1, BWPort5, BWPort6]>;
+
+// 60 Entry Unified Scheduler
+def BWPortAny : ProcResGroup<[BWPort0, BWPort1, BWPort2, BWPort3, BWPort4,
+ BWPort5, BWPort6, BWPort7]> {
+ let BufferSize = 60;
+}
+
+// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 5>;
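The read advance only matters for operands that are actually tagged ReadAfterLd in an instruction's scheduling info. A minimal sketch of that pairing, assuming the InstRW conventions used elsewhere in this file (the def below is illustrative only and not taken from this patch):

  // Sketch only (assumed example): pairing the folded-load write with
  // ReadAfterLd means the register source of a reg-mem ALU op is not needed
  // until 5 cycles after issue, hiding the 5-cycle load latency modeled above.
  def : InstRW<[WriteALULd, ReadAfterLd], (instrs ADD64rm)>;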
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass BWWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // The register variant uses a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [BWPort23, ExePort]> {
+ let Latency = !add(Lat, 5);
+ }
+}
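To make the expansion concrete, a single pair from the arithmetic section below, such as

  defm : BWWriteResPair<WriteFAdd, BWPort1, 3>;

unfolds, per the multiclass body above, into roughly the following two records (assuming WriteFAdd.Folded resolves to WriteFAddLd, as set up in the generic X86 scheduling definitions):

  // Register form: one cycle on port 1, 3-cycle latency.
  def : WriteRes<WriteFAdd, [BWPort1]> { let Latency = 3; }
  // Folded-load form: an extra cycle on a load port (2/3) and 5 more cycles of latency.
  def : WriteRes<WriteFAddLd, [BWPort23, BWPort1]> { let Latency = 8; }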
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [BWPort4]>;
+
+// Arithmetic.
+defm : BWWriteResPair<WriteALU, BWPort0156, 1>; // Simple integer ALU op.
+defm : BWWriteResPair<WriteIMul, BWPort1, 3>; // Integer multiplication.
+def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
+def BWDivider : ProcResource<1>; // Integer division issued on port 0.
+def : WriteRes<WriteIDiv, [BWPort0, BWDivider]> { // Integer division.
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [BWPort23, BWPort0, BWDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
+
+def : WriteRes<WriteLEA, [BWPort15]>; // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm : BWWriteResPair<WriteShift, BWPort06, 1>;
+
+// Loads, stores, and moves, not folded with other operations.
+def : WriteRes<WriteLoad, [BWPort23]> { let Latency = 5; }
+def : WriteRes<WriteStore, [BWPort237, BWPort4]>;
+def : WriteRes<WriteMove, [BWPort0156]>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def : WriteRes<WriteZero, []>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm : BWWriteResPair<WriteJump, BWPort06, 1>;
+
+// Floating point. This covers both scalar and vector operations.
+defm : BWWriteResPair<WriteFAdd, BWPort1, 3>; // Floating point add/sub/compare.
+defm : BWWriteResPair<WriteFMul, BWPort0, 5>; // Floating point multiplication.
+defm : BWWriteResPair<WriteFDiv, BWPort0, 12>; // Floating point division (10-14 cycles).
+defm : BWWriteResPair<WriteFSqrt, BWPort0, 15>; // Floating point square root.
+defm : BWWriteResPair<WriteFRcp, BWPort0, 5>; // Floating point reciprocal estimate.
+defm : BWWriteResPair<WriteFRsqrt, BWPort0, 5>; // Floating point reciprocal square root estimate.
+defm : BWWriteResPair<WriteFMA, BWPort01, 5>; // Fused Multiply Add.
+defm : BWWriteResPair<WriteFShuffle, BWPort5, 1>; // Floating point vector shuffles.
+defm : BWWriteResPair<WriteFBlend, BWPort015, 1>; // Floating point vector blends.
+def : WriteRes<WriteFVarBlend, [BWPort5]> { // Fp vector variable blends.
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [BWPort5, BWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+// FMA Scheduling helper class.
+// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm : BWWriteResPair<WriteVecALU, BWPort15, 1>; // Vector integer ALU op, no logicals.
+defm : BWWriteResPair<WriteVecShift, BWPort0, 1>; // Vector integer shifts.
+defm : BWWriteResPair<WriteVecIMul, BWPort0, 5>; // Vector integer multiply.
+defm : BWWriteResPair<WriteShuffle, BWPort5, 1>; // Vector shuffles.
+defm : BWWriteResPair<WriteBlend, BWPort15, 1>; // Vector blends.
+
+def : WriteRes<WriteVarBlend, [BWPort5]> { // Vector variable blends.
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [BWPort5, BWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteMPSAD, [BWPort0, BWPort5]> { // Vector MPSAD.
+ let Latency = 6;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteMPSADLd, [BWPort23, BWPort0, BWPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 2];
+}
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm : BWWriteResPair<WriteVecLogic, BWPort015, 1>; // Vector and/or/xor.
+
+// Conversion between integer and float.
+defm : BWWriteResPair<WriteCvtF2I, BWPort1, 3>; // Float -> Integer.
+defm : BWWriteResPair<WriteCvtI2F, BWPort1, 4>; // Integer -> Float.
+defm : BWWriteResPair<WriteCvtF2F, BWPort1, 3>; // Float -> Float size conversion.
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [BWPort0]> {
+ let Latency = 10;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [BWPort0, BWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 1];
+}
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [BWPort0, BWPort16, BWPort5]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 2, 4];
+}
+def : WriteRes<WritePCmpEStrMLd, [BWPort05, BWPort16, BWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [6, 2, 1];
+}
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [BWPort0]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [BWPort0, BWPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 1];
+}
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [BWPort05, BWPort16]> {
+ let Latency = 11;
+ let ResourceCycles = [6, 2];
+}
+def : WriteRes<WritePCmpEStrILd, [BWPort0, BWPort16, BWPort5, BWPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 2, 2, 1];
+}
+
+// AES instructions.
+def : WriteRes<WriteAESDecEnc, [BWPort5]> { // Decryption, encryption.
+ let Latency = 7;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [BWPort5, BWPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteAESIMC, [BWPort5]> { // InvMixColumn.
+ let Latency = 14;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [BWPort5, BWPort23]> {
+ let Latency = 14;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteAESKeyGen, [BWPort0, BWPort5]> { // Key Generation.
+ let Latency = 10;
+ let ResourceCycles = [2, 8];
+}
+def : WriteRes<WriteAESKeyGenLd, [BWPort0, BWPort5, BWPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 7, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [BWPort0, BWPort5]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteCLMulLd, [BWPort0, BWPort5, BWPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1, 1];
+}
+
+// Catch-all for expensive system instructions.
+def : WriteRes<WriteSystem, [BWPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
+
+// AVX2.
+defm : BWWriteResPair<WriteFShuffle256, BWPort5, 3>; // Fp 256-bit width vector shuffles.
+defm : BWWriteResPair<WriteShuffle256, BWPort5, 3>; // 256-bit width vector shuffles.
+def : WriteRes<WriteVarVecShift, [BWPort0, BWPort5]> { // Variable vector shifts.
+ let Latency = 2;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteVarVecShiftLd, [BWPort0, BWPort5, BWPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1, 1];
+}
+
+// Old microcoded instructions that nobody uses.
+def : WriteRes<WriteMicrocoded, [BWPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def : WriteRes<WriteFence, [BWPort23, BWPort4]>;
+
+// Nop, not very useful except that it provides a model for nops!
+def : WriteRes<WriteNop, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [BWPort1]> {
+ let Latency = 3;
+}
+
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [BWPort1, BWPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [BWPort15]>;
+
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [BWPort15, BWPort23]> {
+ let Latency = 5;
+ let ResourceCycles = [1, 1];
+}
+
+// Remaining instrs.
+
+def BWWriteResGroup1 : SchedWriteRes<[BWPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_MOVD64grr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PMOVMSKBrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLDri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLDrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLQri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLQrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSLLWrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRADri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRADrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRAWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRAWrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLDri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLDrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLQri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLQrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MMX_PSRLWrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "MOVPQIto64rr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSLLDri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSLLQri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSLLWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSRADri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSRAWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSRLDri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSRLQri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "PSRLWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VMOVPQIto64rr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLDYri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLDri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLQYri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLQri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQYrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLVQrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLWYri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSLLWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRADYri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRADri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRAWYri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRAWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRLDYri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRLDri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRLQYri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRLQri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRLVQYrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRLVQrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRLWYri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VPSRLWri")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VTESTPDYrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VTESTPDrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VTESTPSYrr")>;
+def: InstRW<[BWWriteResGroup1], (instregex "VTESTPSrr")>;
+
+def BWWriteResGroup2 : SchedWriteRes<[BWPort1]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup2], (instregex "COMP_FST0r")>;
+def: InstRW<[BWWriteResGroup2], (instregex "COM_FST0r")>;
+def: InstRW<[BWWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[BWWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[BWWriteResGroup2], (instregex "UCOM_FPr")>;
+def: InstRW<[BWWriteResGroup2], (instregex "UCOM_Fr")>;
+def: InstRW<[BWWriteResGroup2], (instregex "VMASKMOVDQU")>;
+
+def BWWriteResGroup3 : SchedWriteRes<[BWPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup3], (instregex "ANDNPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "ANDNPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "ANDPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "ANDPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "INSERTPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVD64rr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVD64to64rr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_PALIGNR64irr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_PSHUFBrr64")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_PSHUFWri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOV64toPQIrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVAPDrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVAPSrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVDDUPrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVDI2PDIrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVHLPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVLHPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVSHDUPrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVSLDUPrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVSSrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "ORPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "ORPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PACKSSDWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PACKSSWBrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PACKUSDWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PACKUSWBrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PALIGNRrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PBLENDWrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXBWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXWDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVSXWQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXBWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXWDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PMOVZXWQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PSHUFBrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PSHUFDri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PSHUFHWri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PSHUFLWri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PSLLDQri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PSRLDQri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHBWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHQDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKHWDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLBWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLQDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "PUNPCKLWDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "SHUFPDrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "SHUFPSrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "UNPCKHPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "UNPCKHPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "UNPCKLPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "UNPCKLPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VANDNPDYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VANDNPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VANDNPSYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VANDNPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VANDPDYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VANDPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VANDPSYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VANDPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VBROADCASTSSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VINSERTPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPDrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVAPSrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVDDUPYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVDDUPrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVHLPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVLHPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVSHDUPrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVSLDUPrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVSSrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VORPDYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VORPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VORPSYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VORPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSDWYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSDWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSWBYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPACKSSWBrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSDWYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSDWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSWBYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPACKUSWBrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPALIGNRYrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPALIGNRrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPBLENDWYrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPBLENDWrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPBROADCASTDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPBROADCASTQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDYri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSYri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPERMILPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXBWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXWDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVSXWQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXBWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXWDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPMOVZXWQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFBYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFBrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFDYri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFDri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFHWYri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFHWri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFLWYri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSHUFLWri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSLLDQYri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSLLDQri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSRLDQYri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPSRLDQri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHBWYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHBWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHDQYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHQDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHWDYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKHWDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLBWYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLBWrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLDQYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLQDQrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLWDYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VPUNPCKLWDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPDYrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPDrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPSYrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VSHUFPSrri")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPDYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPSYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKHPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPDYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPSYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VXORPDYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VXORPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VXORPSYrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "VXORPSrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "XORPDrr")>;
+def: InstRW<[BWWriteResGroup3], (instregex "XORPSrr")>;
+
+def BWWriteResGroup4 : SchedWriteRes<[BWPort6]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup4], (instregex "JMP(16|32|64)r")>;
+
+def BWWriteResGroup5 : SchedWriteRes<[BWPort01]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup5], (instregex "FINCSTP")>;
+def: InstRW<[BWWriteResGroup5], (instregex "FNOP")>;
+
+def BWWriteResGroup6 : SchedWriteRes<[BWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup6], (instregex "ADC(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup6], (instregex "ADC8rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup6], (instregex "ADCX(32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "ADOX(32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)ri8")>;
+def: InstRW<[BWWriteResGroup6], (instregex "BT(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "BTC(16|32|64)ri8")>;
+def: InstRW<[BWWriteResGroup6], (instregex "BTC(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "BTR(16|32|64)ri8")>;
+def: InstRW<[BWWriteResGroup6], (instregex "BTR(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "BTS(16|32|64)ri8")>;
+def: InstRW<[BWWriteResGroup6], (instregex "BTS(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CDQ")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVAE(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVB(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVE(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVG(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVGE(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVL(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVLE(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVNE(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVNO(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVNP(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVNS(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVO(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVP(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CMOVS(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "CQO")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JAE_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JAE_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JA_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JA_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JBE_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JBE_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JB_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JB_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JE_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JE_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JGE_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JGE_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JG_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JG_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JLE_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JLE_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JL_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JL_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JMP_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JMP_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JNE_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JNE_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JNO_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JNO_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JNP_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JNP_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JNS_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JNS_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JO_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JO_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JP_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JP_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JS_1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "JS_4")>;
+def: InstRW<[BWWriteResGroup6], (instregex "RORX(32|64)ri")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SAR(16|32|64)r1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SAR(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SAR8r1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SAR8ri")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SARX(32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SBB(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SBB8rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETAEr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETBr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETEr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETGEr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETGr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETLEr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETLr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETNEr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETNOr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETNPr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETNSr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETOr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETPr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SETSr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SHL(16|32|64)r1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SHL(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SHL8r1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SHL8ri")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SHLX(32|64)rr")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SHR(16|32|64)r1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SHR(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SHR8r1")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SHR8ri")>;
+def: InstRW<[BWWriteResGroup6], (instregex "SHRX(32|64)rr")>;
+
+def BWWriteResGroup7 : SchedWriteRes<[BWPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup7], (instregex "ANDN(32|64)rr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "BLSI(32|64)rr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "BLSMSK(32|64)rr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "BLSR(32|64)rr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "BZHI(32|64)rr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "LEA(16|32|64)(_32)?r")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSBrr64")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSDrr64")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PABSWrr64")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDDirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDQirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDSBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDSWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDUSBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDUSWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PADDWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PAVGBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PAVGWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQDirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPEQWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTDirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PCMPGTWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMAXSWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMAXUBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMINSWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PMINUBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNBrr64")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNDrr64")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSIGNWrr64")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBDirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBQirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBSBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBSWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBUSBirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBUSWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "MMX_PSUBWirr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PABSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PABSDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PABSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PADDBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PADDDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PADDQrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PADDSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PADDSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PADDUSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PADDUSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PADDWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PAVGBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PAVGWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQQrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PCMPEQWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PCMPGTWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMAXSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMAXSDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMAXSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMAXUBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMAXUDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMAXUWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMINSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMINSDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMINSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMINUBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMINUDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PMINUWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSIGNBrr128")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSIGNDrr128")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSIGNWrr128")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSUBBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSUBDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSUBQrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSUBSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSUBSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSUBUSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSUBUSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "PSUBWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPABSBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPABSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPABSDYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPABSDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPABSWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPABSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDDYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDQYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDQrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDSBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDSWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDUSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPADDWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPAVGBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPAVGBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPAVGWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPAVGWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQDYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQQYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQQrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPEQWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTDYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPCMPGTWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSDYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUDYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMAXUWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINSBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINSDYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINSDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINSWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINUBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINUBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINUDYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINUDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINUWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPMINUWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNBYrr256")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNBrr128")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNDYrr256")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNDrr128")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNWYrr256")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSIGNWrr128")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBDYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBDrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBQYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBQrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSBYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSBrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBUSWrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBWYrr")>;
+def: InstRW<[BWWriteResGroup7], (instregex "VPSUBWrr")>;
+
+def BWWriteResGroup8 : SchedWriteRes<[BWPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup8], (instregex "BLENDPDrri")>;
+def: InstRW<[BWWriteResGroup8], (instregex "BLENDPSrri")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MMX_MOVQ64rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MMX_PANDNirr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MMX_PANDirr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MMX_PORirr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MMX_PXORirr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MOVDQArr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MOVDQUrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "MOVPQI2QIrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "PANDNrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "PANDrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "PORrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "PXORrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPDYrri")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPDrri")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPSYrri")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VBLENDPSrri")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQAYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQArr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUYrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VMOVDQUrr(_REV)?")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VMOVPQI2QIrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VMOVZPQILo2PQIrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPANDNYrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPANDNrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPANDYrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPANDrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDDYrri")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPBLENDDrri")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPORYrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPORrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPXORYrr")>;
+def: InstRW<[BWWriteResGroup8], (instregex "VPXORrr")>;
+
+def BWWriteResGroup9 : SchedWriteRes<[BWPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "ADD(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "ADD8i8")>;
+def: InstRW<[BWWriteResGroup9], (instregex "ADD8ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "ADD8rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "AND(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "AND8i8")>;
+def: InstRW<[BWWriteResGroup9], (instregex "AND8ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "AND8rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CBW")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CLC")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CMC")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CMP(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CMP8i8")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CMP8ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CMP8rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "CWDE")>;
+def: InstRW<[BWWriteResGroup9], (instregex "DEC(16|32|64)r")>;
+def: InstRW<[BWWriteResGroup9], (instregex "DEC8r")>;
+def: InstRW<[BWWriteResGroup9], (instregex "INC(16|32|64)r")>;
+def: InstRW<[BWWriteResGroup9], (instregex "INC8r")>;
+def: InstRW<[BWWriteResGroup9], (instregex "LAHF")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOV(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOV8ri(_alt)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOV8rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr16")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr32")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOVSX(16|32|64)rr8")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOVZX(16|32|64)rr16")>;
+def: InstRW<[BWWriteResGroup9], (instregex "MOVZX(16|32|64)rr8")>;
+def: InstRW<[BWWriteResGroup9], (instregex "NEG(16|32|64)r")>;
+def: InstRW<[BWWriteResGroup9], (instregex "NEG8r")>;
+def: InstRW<[BWWriteResGroup9], (instregex "NOOP")>;
+def: InstRW<[BWWriteResGroup9], (instregex "NOT(16|32|64)r")>;
+def: InstRW<[BWWriteResGroup9], (instregex "NOT8r")>;
+def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "OR(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "OR8i8")>;
+def: InstRW<[BWWriteResGroup9], (instregex "OR8ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "OR8rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SAHF")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SGDT64m")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SIDT64m")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SLDT64m")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SMSW16m")>;
+def: InstRW<[BWWriteResGroup9], (instregex "STC")>;
+def: InstRW<[BWWriteResGroup9], (instregex "STRm")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SUB(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SUB8i8")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SUB8ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SUB8rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "SYSCALL")>;
+def: InstRW<[BWWriteResGroup9], (instregex "TEST(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup9], (instregex "TEST8i8")>;
+def: InstRW<[BWWriteResGroup9], (instregex "TEST8ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "TEST8rr")>;
+def: InstRW<[BWWriteResGroup9], (instregex "XCHG(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "XOR(16|32|64)rr(_REV)?")>;
+def: InstRW<[BWWriteResGroup9], (instregex "XOR8i8")>;
+def: InstRW<[BWWriteResGroup9], (instregex "XOR8ri")>;
+def: InstRW<[BWWriteResGroup9], (instregex "XOR8rr(_REV)?")>;
+
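+// Plain stores: one store-data uop (port 4) plus one store-address uop
+// (port 2, 3 or 7).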
+def BWWriteResGroup10 : SchedWriteRes<[BWPort4,BWPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup10], (instregex "FBSTPm")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVD64mr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVNTQmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MMX_MOVQ64mr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOV(16|32|64)mr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOV8mi")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOV8mr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVAPDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVAPSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVDQAmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVDQUmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVHPDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVHPSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVLPDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVLPSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVNTDQmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVNTI_64mr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVNTImr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVNTPDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVNTPSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVPDI2DImr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVPQI2QImr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVPQIto64mr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVSDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVSSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVUPDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "MOVUPSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "ST_FP32m")>;
+def: InstRW<[BWWriteResGroup10], (instregex "ST_FP64m")>;
+def: InstRW<[BWWriteResGroup10], (instregex "ST_FP80m")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VEXTRACTI128mr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPDYmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPSYmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVAPSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQAYmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQAmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQUYmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVDQUmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVHPDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVHPSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVLPDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVLPSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTDQYmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTDQmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPDYmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPSYmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVNTPSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVPDI2DImr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVPQI2QImr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVPQIto64mr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVSDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVSSmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPDYmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPDmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPSYmr")>;
+def: InstRW<[BWWriteResGroup10], (instregex "VMOVUPSmr")>;
+
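+// Two uops on the shuffle port (port 5), 2-cycle latency: variable blends
+// and PINSR* insertions.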
+def BWWriteResGroup11 : SchedWriteRes<[BWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup11], (instregex "BLENDVPDrr0")>;
+def: InstRW<[BWWriteResGroup11], (instregex "BLENDVPSrr0")>;
+def: InstRW<[BWWriteResGroup11], (instregex "MMX_PINSRWirri")>;
+def: InstRW<[BWWriteResGroup11], (instregex "PBLENDVBrr0")>;
+def: InstRW<[BWWriteResGroup11], (instregex "PINSRBrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "PINSRDrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "PINSRQrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "PINSRWrri")>;
+def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPDYrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPDrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPSYrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "VBLENDVPSrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "VPBLENDVBYrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "VPBLENDVBrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "VPINSRBrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "VPINSRDrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "VPINSRQrr")>;
+def: InstRW<[BWWriteResGroup11], (instregex "VPINSRWrri")>;
+
+def BWWriteResGroup12 : SchedWriteRes<[BWPort01]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup12], (instregex "FDECSTP")>;
+
+def BWWriteResGroup13 : SchedWriteRes<[BWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup13], (instregex "ROL(16|32|64)r1")>;
+def: InstRW<[BWWriteResGroup13], (instregex "ROL(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup13], (instregex "ROL8r1")>;
+def: InstRW<[BWWriteResGroup13], (instregex "ROL8ri")>;
+def: InstRW<[BWWriteResGroup13], (instregex "ROR(16|32|64)r1")>;
+def: InstRW<[BWWriteResGroup13], (instregex "ROR(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup13], (instregex "ROR8r1")>;
+def: InstRW<[BWWriteResGroup13], (instregex "ROR8ri")>;
+
+def BWWriteResGroup14 : SchedWriteRes<[BWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup14], (instregex "LFENCE")>;
+def: InstRW<[BWWriteResGroup14], (instregex "MFENCE")>;
+def: InstRW<[BWWriteResGroup14], (instregex "WAIT")>;
+def: InstRW<[BWWriteResGroup14], (instregex "XGETBV")>;
+
+def BWWriteResGroup15 : SchedWriteRes<[BWPort0,BWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup15], (instregex "CVTPS2PDrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "CVTSS2SDrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "EXTRACTPSrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "MMX_PEXTRWirri")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PEXTRBrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PEXTRDrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PEXTRQrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PEXTRWri")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PEXTRWrr_REV")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PSLLDrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PSLLQrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PSLLWrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PSRADrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PSRAWrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PSRLDrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PSRLQrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PSRLWrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "PTESTrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PSYrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VCVTPH2PSrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VCVTPS2PDrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VCVTSS2SDrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VEXTRACTPSrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRBrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRDrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRQrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRWri")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPEXTRWrr_REV")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPSLLDrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPSLLQrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPSLLWrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPSRADrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPSRAWrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPSRLDrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPSRLQrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPSRLWrr")>;
+def: InstRW<[BWWriteResGroup15], (instregex "VPTESTrr")>;
+
+def BWWriteResGroup16 : SchedWriteRes<[BWPort6,BWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup16], (instregex "CLFLUSH")>;
+
+def BWWriteResGroup17 : SchedWriteRes<[BWPort01,BWPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup17], (instregex "MMX_MOVDQ2Qrr")>;
+
+def BWWriteResGroup18 : SchedWriteRes<[BWPort237,BWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup18], (instregex "SFENCE")>;
+
+def BWWriteResGroup19 : SchedWriteRes<[BWPort06,BWPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup19], (instregex "BEXTR(32|64)rr")>;
+def: InstRW<[BWWriteResGroup19], (instregex "BSWAP(16|32|64)r")>;
+
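+// Two-uop, 2-cycle ops split between a port 0/6 uop and an any-ALU-port uop:
+// ADC/SBB imm8, CMOVA/CMOVBE, SETA/SETBE, CWD, JRCXZ.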
+def BWWriteResGroup20 : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup20], (instregex "ADC8i8")>;
+def: InstRW<[BWWriteResGroup20], (instregex "ADC8ri")>;
+def: InstRW<[BWWriteResGroup20], (instregex "CMOVA(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup20], (instregex "CMOVBE(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup20], (instregex "CWD")>;
+def: InstRW<[BWWriteResGroup20], (instregex "JRCXZ")>;
+def: InstRW<[BWWriteResGroup20], (instregex "SBB8i8")>;
+def: InstRW<[BWWriteResGroup20], (instregex "SBB8ri")>;
+def: InstRW<[BWWriteResGroup20], (instregex "SETAr")>;
+def: InstRW<[BWWriteResGroup20], (instregex "SETBEr")>;
+
+def BWWriteResGroup21 : SchedWriteRes<[BWPort4,BWPort5,BWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup21], (instregex "EXTRACTPSmr")>;
+def: InstRW<[BWWriteResGroup21], (instregex "PEXTRBmr")>;
+def: InstRW<[BWWriteResGroup21], (instregex "PEXTRDmr")>;
+def: InstRW<[BWWriteResGroup21], (instregex "PEXTRQmr")>;
+def: InstRW<[BWWriteResGroup21], (instregex "PEXTRWmr")>;
+def: InstRW<[BWWriteResGroup21], (instregex "STMXCSR")>;
+def: InstRW<[BWWriteResGroup21], (instregex "VEXTRACTPSmr")>;
+def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRBmr")>;
+def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRDmr")>;
+def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRQmr")>;
+def: InstRW<[BWWriteResGroup21], (instregex "VPEXTRWmr")>;
+def: InstRW<[BWWriteResGroup21], (instregex "VSTMXCSR")>;
+
+def BWWriteResGroup22 : SchedWriteRes<[BWPort4,BWPort6,BWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup22], (instregex "FNSTCW16m")>;
+
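+// SETcc to memory: a port 0/6 flag uop plus the store-data/store-address pair.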
+def BWWriteResGroup23 : SchedWriteRes<[BWPort4,BWPort237,BWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup23], (instregex "SETAEm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETBm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETEm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETGEm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETGm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETLEm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETLm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETNEm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETNOm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETNPm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETNSm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETOm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETPm")>;
+def: InstRW<[BWWriteResGroup23], (instregex "SETSm")>;
+
+def BWWriteResGroup24 : SchedWriteRes<[BWPort4,BWPort237,BWPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup24], (instregex "MOVBE(16|32|64)mr")>;
+
+def BWWriteResGroup25 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup25], (instregex "PUSH(16|32|64)r(mr)?")>;
+def: InstRW<[BWWriteResGroup25], (instregex "PUSH64i8")>;
+def: InstRW<[BWWriteResGroup25], (instregex "STOSB")>;
+def: InstRW<[BWWriteResGroup25], (instregex "STOSL")>;
+def: InstRW<[BWWriteResGroup25], (instregex "STOSQ")>;
+def: InstRW<[BWWriteResGroup25], (instregex "STOSW")>;
+
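+// MOVMSK/PMOVMSKB mask extractions: a single port 0 uop, 3-cycle latency.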
+def BWWriteResGroup26 : SchedWriteRes<[BWPort0]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup26], (instregex "MOVMSKPDrr")>;
+def: InstRW<[BWWriteResGroup26], (instregex "MOVMSKPSrr")>;
+def: InstRW<[BWWriteResGroup26], (instregex "PMOVMSKBrr")>;
+def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPDYrr")>;
+def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPDrr")>;
+def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPSYrr")>;
+def: InstRW<[BWWriteResGroup26], (instregex "VMOVMSKPSrr")>;
+def: InstRW<[BWWriteResGroup26], (instregex "VPMOVMSKBYrr")>;
+def: InstRW<[BWWriteResGroup26], (instregex "VPMOVMSKBrr")>;
+
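+// 3-cycle single-uop port 1 operations: FP add/sub/compare/convert, integer
+// multiply, bit scans, POPCNT/LZCNT/TZCNT and PDEP/PEXT.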
+def BWWriteResGroup27 : SchedWriteRes<[BWPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup27], (instregex "ADDPDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "ADDPSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "ADDSDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "ADDSSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "ADDSUBPDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "ADDSUBPSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "ADD_FPrST0")>;
+def: InstRW<[BWWriteResGroup27], (instregex "ADD_FST0r")>;
+def: InstRW<[BWWriteResGroup27], (instregex "ADD_FrST0")>;
+def: InstRW<[BWWriteResGroup27], (instregex "BSF(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "BSR(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "CMPPDrri")>;
+def: InstRW<[BWWriteResGroup27], (instregex "CMPPSrri")>;
+def: InstRW<[BWWriteResGroup27], (instregex "CMPSDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "CMPSSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "COMISDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "COMISSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "CVTDQ2PSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "CVTPS2DQrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "CVTTPS2DQrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "IMUL(32|64)rr(i8)?")>;
+def: InstRW<[BWWriteResGroup27], (instregex "IMUL8r")>;
+def: InstRW<[BWWriteResGroup27], (instregex "LZCNT(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)PDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)PSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)SDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MAX(C?)SSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)PDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)PSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)SDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MIN(C?)SSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MMX_CVTPI2PSirr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "MUL8r")>;
+def: InstRW<[BWWriteResGroup27], (instregex "PDEP(32|64)rr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "PEXT(32|64)rr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "POPCNT(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SHLD(16|32|64)rri8")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SHRD(16|32|64)rri8")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SUBPDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SUBPSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FPrST0")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FST0r")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SUBR_FrST0")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SUBSDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SUBSSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SUB_FPrST0")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SUB_FST0r")>;
+def: InstRW<[BWWriteResGroup27], (instregex "SUB_FrST0")>;
+def: InstRW<[BWWriteResGroup27], (instregex "TZCNT(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "UCOMISDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "UCOMISSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VADDPDYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VADDPDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VADDPSYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VADDPSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VADDSDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VADDSSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPDYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPSYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VADDSUBPSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCMPPDYrri")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCMPPDrri")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCMPPSYrri")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCMPPSrri")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCMPSDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCMPSSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCOMISDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCOMISSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCVTDQ2PSYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCVTDQ2PSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCVTPS2DQYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCVTPS2DQrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCVTTPS2DQYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VCVTTPS2DQrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PDYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PSYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)PSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)SDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMAX(C?)SSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PDYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PSYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)PSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)SDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VMIN(C?)SSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VSUBPDYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VSUBPDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VSUBPSYrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VSUBPSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VSUBSDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VSUBSSrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VUCOMISDrr")>;
+def: InstRW<[BWWriteResGroup27], (instregex "VUCOMISSrr")>;
+
+def BWWriteResGroup27_16 : SchedWriteRes<[BWPort1, BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup27_16], (instregex "IMUL16rr(i8)?")>;
+
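+// 3-cycle port 5 shuffles: 128-bit lane insert/extract, broadcasts,
+// cross-lane permutes and YMM PMOVSX/PMOVZX.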
+def BWWriteResGroup28 : SchedWriteRes<[BWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup28], (instregex "VBROADCASTSDYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VBROADCASTSSYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VEXTRACTF128rr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VEXTRACTI128rr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VINSERTF128rr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VINSERTI128rr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTBrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTDYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTQYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTWYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPBROADCASTWrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPERM2F128rr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPERM2I128rr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPERMDYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPERMPDYri")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPERMPSYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPERMQYri")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBDYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBQYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXBWYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXDQYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXWDYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVSXWQYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBDYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBQYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXBWYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXDQYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXWDYrr")>;
+def: InstRW<[BWWriteResGroup28], (instregex "VPMOVZXWQYrr")>;
+
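+// Scalar and packed FP multiplies: a single uop on port 0 or 1, 3-cycle latency.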
+def BWWriteResGroup29 : SchedWriteRes<[BWPort01]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup29], (instregex "MULPDrr")>;
+def: InstRW<[BWWriteResGroup29], (instregex "MULPSrr")>;
+def: InstRW<[BWWriteResGroup29], (instregex "MULSDrr")>;
+def: InstRW<[BWWriteResGroup29], (instregex "MULSSrr")>;
+def: InstRW<[BWWriteResGroup29], (instregex "VMULPDYrr")>;
+def: InstRW<[BWWriteResGroup29], (instregex "VMULPDrr")>;
+def: InstRW<[BWWriteResGroup29], (instregex "VMULPSYrr")>;
+def: InstRW<[BWWriteResGroup29], (instregex "VMULPSrr")>;
+def: InstRW<[BWWriteResGroup29], (instregex "VMULSDrr")>;
+def: InstRW<[BWWriteResGroup29], (instregex "VMULSSrr")>;
+
+def BWWriteResGroup30 : SchedWriteRes<[BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[BWWriteResGroup30], (instregex "XADD(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup30], (instregex "XADD8rr")>;
+def: InstRW<[BWWriteResGroup30], (instregex "XCHG8rr")>;
+
+def BWWriteResGroup31 : SchedWriteRes<[BWPort0,BWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup31], (instregex "VPSLLVDYrr")>;
+def: InstRW<[BWWriteResGroup31], (instregex "VPSLLVDrr")>;
+def: InstRW<[BWWriteResGroup31], (instregex "VPSRAVDYrr")>;
+def: InstRW<[BWWriteResGroup31], (instregex "VPSRAVDrr")>;
+def: InstRW<[BWWriteResGroup31], (instregex "VPSRLVDYrr")>;
+def: InstRW<[BWWriteResGroup31], (instregex "VPSRLVDrr")>;
+
+def BWWriteResGroup32 : SchedWriteRes<[BWPort5,BWPort15]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDSWrr64")>;
+def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDWrr64")>;
+def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHADDrr64")>;
+def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBDrr64")>;
+def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[BWWriteResGroup32], (instregex "MMX_PHSUBWrr64")>;
+def: InstRW<[BWWriteResGroup32], (instregex "PHADDDrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "PHADDSWrr128")>;
+def: InstRW<[BWWriteResGroup32], (instregex "PHADDWrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "PHSUBDrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "PHSUBSWrr128")>;
+def: InstRW<[BWWriteResGroup32], (instregex "PHSUBWrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHADDDYrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHADDDrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHADDSWrr128")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHADDSWrr256")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHADDWYrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHADDWrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBDYrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBDrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBSWrr128")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBSWrr256")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBWYrr")>;
+def: InstRW<[BWWriteResGroup32], (instregex "VPHSUBWrr")>;
+
+def BWWriteResGroup33 : SchedWriteRes<[BWPort5,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSDWirr")>;
+def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKSSWBirr")>;
+def: InstRW<[BWWriteResGroup33], (instregex "MMX_PACKUSWBirr")>;
+
+def BWWriteResGroup34 : SchedWriteRes<[BWPort6,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[BWWriteResGroup34], (instregex "CLD")>;
+
+def BWWriteResGroup35 : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[BWWriteResGroup35], (instregex "RCL(16|32|64)r1")>;
+def: InstRW<[BWWriteResGroup35], (instregex "RCL(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup35], (instregex "RCL8r1")>;
+def: InstRW<[BWWriteResGroup35], (instregex "RCL8ri")>;
+def: InstRW<[BWWriteResGroup35], (instregex "RCR(16|32|64)r1")>;
+def: InstRW<[BWWriteResGroup35], (instregex "RCR(16|32|64)ri")>;
+def: InstRW<[BWWriteResGroup35], (instregex "RCR8r1")>;
+def: InstRW<[BWWriteResGroup35], (instregex "RCR8ri")>;
+
+def BWWriteResGroup36 : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup36], (instregex "ROL(16|32|64)rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "ROL8rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "ROR(16|32|64)rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "ROR8rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "SAR(16|32|64)rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "SAR8rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "SHL(16|32|64)rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "SHL8rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "SHR(16|32|64)rCL")>;
+def: InstRW<[BWWriteResGroup36], (instregex "SHR8rCL")>;
+
+def BWWriteResGroup37 : SchedWriteRes<[BWPort4,BWPort6,BWPort237,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup37], (instregex "CALL(16|32|64)r")>;
+
+def BWWriteResGroup38 : SchedWriteRes<[BWPort4,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup38], (instregex "CALL64pcrel32")>;
+def: InstRW<[BWWriteResGroup38], (instregex "SETAm")>;
+def: InstRW<[BWWriteResGroup38], (instregex "SETBEm")>;
+
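+// Scalar FP-to-integer conversions: one port 0 uop plus one port 1 uop,
+// 4-cycle latency.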
+def BWWriteResGroup39 : SchedWriteRes<[BWPort0,BWPort1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup39], (instregex "CVTSD2SI64rr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "CVTSD2SIrr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "CVTSS2SI64rr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "CVTSS2SIrr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "CVTTSD2SI64rr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "CVTTSD2SIrr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "CVTTSS2SI64rr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "CVTTSS2SIrr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "VCVTSD2SIrr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "VCVTSS2SI64rr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "VCVTSS2SIrr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSD2SIrr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSS2SI64rr")>;
+def: InstRW<[BWWriteResGroup39], (instregex "VCVTTSS2SIrr")>;
+
+def BWWriteResGroup40 : SchedWriteRes<[BWPort0,BWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup40], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[BWWriteResGroup40], (instregex "VPSLLDYrr")>;
+def: InstRW<[BWWriteResGroup40], (instregex "VPSLLQYrr")>;
+def: InstRW<[BWWriteResGroup40], (instregex "VPSLLWYrr")>;
+def: InstRW<[BWWriteResGroup40], (instregex "VPSRADYrr")>;
+def: InstRW<[BWWriteResGroup40], (instregex "VPSRAWYrr")>;
+def: InstRW<[BWWriteResGroup40], (instregex "VPSRLDYrr")>;
+def: InstRW<[BWWriteResGroup40], (instregex "VPSRLQYrr")>;
+def: InstRW<[BWWriteResGroup40], (instregex "VPSRLWYrr")>;
+def: InstRW<[BWWriteResGroup40], (instregex "VPTESTYrr")>;
+
+def BWWriteResGroup41 : SchedWriteRes<[BWPort0,BWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup41], (instregex "FNSTSW16r")>;
+
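+// 4-cycle port 1 + port 5 pairs: double-precision and int<->FP converts,
+// VCVTPS2PH, and the widening MUL/IMUL/MULX forms.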
+def BWWriteResGroup42 : SchedWriteRes<[BWPort1,BWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup42], (instregex "CVTDQ2PDrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "CVTPD2DQrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "CVTPD2PSrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "CVTSD2SSrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "CVTSI642SDrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "CVTSI2SDrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "CVTSI2SSrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "CVTTPD2DQrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "IMUL(32|64)r")>;
+def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPD2PIirr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPI2PDirr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTPS2PIirr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTTPD2PIirr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "MMX_CVTTPS2PIirr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "MUL(32|64)r")>;
+def: InstRW<[BWWriteResGroup42], (instregex "MULX64rr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "VCVTPD2DQrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "VCVTPD2PSrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "VCVTPS2PHrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "VCVTSD2SSrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI642SDrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI2SDrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "VCVTSI2SSrr")>;
+def: InstRW<[BWWriteResGroup42], (instregex "VCVTTPD2DQrr")>;
+
+def BWWriteResGroup42_16 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def: InstRW<[BWWriteResGroup42_16], (instregex "IMUL16r")>;
+def: InstRW<[BWWriteResGroup42_16], (instregex "MUL16r")>;
+
+def BWWriteResGroup43 : SchedWriteRes<[BWPort0,BWPort4,BWPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup43], (instregex "FNSTSWm")>;
+
+def BWWriteResGroup44 : SchedWriteRes<[BWPort1,BWPort4,BWPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP16m")>;
+def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP32m")>;
+def: InstRW<[BWWriteResGroup44], (instregex "ISTT_FP64m")>;
+def: InstRW<[BWWriteResGroup44], (instregex "IST_F16m")>;
+def: InstRW<[BWWriteResGroup44], (instregex "IST_F32m")>;
+def: InstRW<[BWWriteResGroup44], (instregex "IST_FP16m")>;
+def: InstRW<[BWWriteResGroup44], (instregex "IST_FP32m")>;
+def: InstRW<[BWWriteResGroup44], (instregex "IST_FP64m")>;
+def: InstRW<[BWWriteResGroup44], (instregex "VCVTPS2PHYmr")>;
+def: InstRW<[BWWriteResGroup44], (instregex "VCVTPS2PHmr")>;
+
+def BWWriteResGroup45 : SchedWriteRes<[BWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def: InstRW<[BWWriteResGroup45], (instregex "FNCLEX")>;
+
+def BWWriteResGroup46 : SchedWriteRes<[BWPort015,BWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[BWWriteResGroup46], (instregex "VZEROUPPER")>;
+
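+// 5-cycle single-uop port 0 operations: vector integer multiplies,
+// PMADD/PSADBW, PCLMULQDQ, PCMPGTQ, x87 multiply and the RCP/RSQRT estimates.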
+def BWWriteResGroup47 : SchedWriteRes<[BWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMADDUBSWrr64")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMADDWDirr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHRSWrr64")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHUWirr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULHWirr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULLWirr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MMX_PMULUDQirr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MMX_PSADBWirr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MUL_FPrST0")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MUL_FST0r")>;
+def: InstRW<[BWWriteResGroup47], (instregex "MUL_FrST0")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PCLMULQDQrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PCMPGTQrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PHMINPOSUWrr128")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PMADDUBSWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PMADDWDrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PMULDQrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PMULHRSWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PMULHUWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PMULHWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PMULLWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PMULUDQrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "PSADBWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "RCPPSr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "RCPSSr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "RSQRTPSr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "RSQRTSSr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPCLMULQDQrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPCMPGTQYrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPCMPGTQrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPHMINPOSUWrr128")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMADDUBSWYrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMADDUBSWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMADDWDYrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMADDWDrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULDQYrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULDQrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULHRSWYrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULHRSWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULHUWYrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULHUWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULHWYrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULHWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULLWYrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULLWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULUDQYrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPMULUDQrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPSADBWYrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VPSADBWrr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VRCPPSr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VRCPSSr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VRSQRTPSr")>;
+def: InstRW<[BWWriteResGroup47], (instregex "VRSQRTSSr")>;
+
+def BWWriteResGroup48 : SchedWriteRes<[BWPort01]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup48],
+ (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r",
+ "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
+
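+// Simple GPR/MMX/XMM loads and prefetches: one load uop on port 2 or 3,
+// 5-cycle load-to-use latency.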
+def BWWriteResGroup49 : SchedWriteRes<[BWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup49], (instregex "LDDQUrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64rm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVD64to64rm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MMX_MOVQ64rm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOV(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOV64toPQIrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOV8rm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVAPDrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVAPSrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVDDUPrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVDI2PDIrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVDQArm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVDQUrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVNTDQArm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVQI2PQIrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVSDrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVSHDUPrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVSLDUPrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVSSrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm16")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm32")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVSX(16|32|64)rm8")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVUPDrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVUPSrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVZX(16|32|64)rm16")>;
+def: InstRW<[BWWriteResGroup49], (instregex "MOVZX(16|32|64)rm8")>;
+def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHNTA")>;
+def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT0")>;
+def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT1")>;
+def: InstRW<[BWWriteResGroup49], (instregex "PREFETCHT2")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VLDDQUrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOV64toPQIrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVAPDrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVAPSrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVDDUPrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVDI2PDIrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVDQArm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVDQUrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVNTDQArm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVQI2PQIrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVSDrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVSHDUPrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVSLDUPrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVSSrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVUPDrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VMOVUPSrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VPBROADCASTDrm")>;
+def: InstRW<[BWWriteResGroup49], (instregex "VPBROADCASTQrm")>;
+
+def BWWriteResGroup50 : SchedWriteRes<[BWPort1,BWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[BWWriteResGroup50], (instregex "CVTSI642SSrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "HADDPDrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "HADDPSrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "HSUBPDrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "HSUBPSrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "VCVTSI642SSrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "VHADDPDYrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "VHADDPDrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "VHADDPSYrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "VHADDPSrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPDYrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPDrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPSYrr")>;
+def: InstRW<[BWWriteResGroup50], (instregex "VHSUBPSrr")>;
+
+def BWWriteResGroup51 : SchedWriteRes<[BWPort1,BWPort6,BWPort06]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup51], (instregex "STR(16|32|64)r")>;
+
+def BWWriteResGroup52 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup52], (instregex "MULX32rr")>;
+
+def BWWriteResGroup53 : SchedWriteRes<[BWPort0,BWPort4,BWPort237,BWPort15]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPDYmr")>;
+def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPSYmr")>;
+def: InstRW<[BWWriteResGroup53], (instregex "VMASKMOVPSmr")>;
+def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVDYmr")>;
+def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVDmr")>;
+def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVQYmr")>;
+def: InstRW<[BWWriteResGroup53], (instregex "VPMASKMOVQmr")>;
+
+def BWWriteResGroup54 : SchedWriteRes<[BWPort6,BWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[BWWriteResGroup54], (instregex "PAUSE")>;
+
+def BWWriteResGroup55 : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[BWWriteResGroup55], (instregex "XSETBV")>;
+
+def BWWriteResGroup56 : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,3];
+}
+def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG(16|32|64)rr")>;
+def: InstRW<[BWWriteResGroup56], (instregex "CMPXCHG8rr")>;
+
+def BWWriteResGroup57 : SchedWriteRes<[BWPort4,BWPort237,BWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,4];
+}
+def: InstRW<[BWWriteResGroup57], (instregex "PUSHF16")>;
+def: InstRW<[BWWriteResGroup57], (instregex "PUSHF64")>;
+
+def BWWriteResGroup58 : SchedWriteRes<[BWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup58], (instregex "LD_F32m")>;
+def: InstRW<[BWWriteResGroup58], (instregex "LD_F64m")>;
+def: InstRW<[BWWriteResGroup58], (instregex "LD_F80m")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTF128")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTI128")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTSDYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VBROADCASTSSYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VLDDQUYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VMOVAPDYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VMOVAPSYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VMOVDDUPYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VMOVDQAYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VMOVDQUYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VMOVNTDQAYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VMOVSHDUPYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VMOVSLDUPYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VMOVUPDYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VMOVUPSYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VPBROADCASTDYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VPBROADCASTQYrm")>;
+def: InstRW<[BWWriteResGroup58], (instregex "ROUNDPDr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "ROUNDPSr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "ROUNDSDr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "ROUNDSSr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VROUNDPDr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VROUNDPSr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VROUNDSDr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VROUNDSSr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VROUNDYPDr")>;
+def: InstRW<[BWWriteResGroup58], (instregex "VROUNDYPSr")>;
+
+def BWWriteResGroup59 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup59], (instregex "CVTPS2PDrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "CVTSS2SDrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLDrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLQrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSLLWrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRADrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRAWrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLDrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLQrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "MMX_PSRLWrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "VCVTPH2PSYrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "VCVTPH2PSrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "VCVTPS2PDrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "VCVTSS2SDrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "VPSLLVQrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "VPSRLVQrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "VTESTPDrm")>;
+def: InstRW<[BWWriteResGroup59], (instregex "VTESTPSrm")>;
+
+def BWWriteResGroup60 : SchedWriteRes<[BWPort1,BWPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup60], (instregex "VCVTDQ2PDYrr")>;
+def: InstRW<[BWWriteResGroup60], (instregex "VCVTPD2DQYrr")>;
+def: InstRW<[BWWriteResGroup60], (instregex "VCVTPD2PSYrr")>;
+def: InstRW<[BWWriteResGroup60], (instregex "VCVTPS2PHYrr")>;
+def: InstRW<[BWWriteResGroup60], (instregex "VCVTTPD2DQYrr")>;
+
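+// Port 5 operations (shuffles, packs/unpacks, FP logic, insertions) with a
+// folded memory operand: one load uop plus one port 5 uop, 6-cycle latency.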
+def BWWriteResGroup61 : SchedWriteRes<[BWPort5,BWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup61], (instregex "ANDNPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "ANDNPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "ANDPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "ANDPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "INSERTPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MMX_PALIGNR64irm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MMX_PINSRWirmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MMX_PSHUFBrm64")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MMX_PSHUFWmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHBWirm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHDQirm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKHWDirm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLBWirm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLDQirm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MMX_PUNPCKLWDirm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MOVHPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MOVHPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MOVLPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "MOVLPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "ORPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "ORPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PACKSSDWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PACKSSWBrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PACKUSDWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PACKUSWBrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PALIGNRrmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PBLENDWrmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PINSRBrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PINSRDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PINSRQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PINSRWrmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXBWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXWDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVSXWQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXBWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXWDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PMOVZXWQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PSHUFBrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PSHUFDmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PSHUFHWmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PSHUFLWmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHBWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHQDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKHWDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLBWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLQDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "PUNPCKLWDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "SHUFPDrmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "SHUFPSrmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "UNPCKHPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "UNPCKHPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "UNPCKLPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "UNPCKLPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VANDNPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VANDNPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VANDPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VANDPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VINSERTPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VMOVHPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VMOVHPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VMOVLPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VMOVLPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VORPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VORPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPACKSSDWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPACKSSWBrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPACKUSDWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPACKUSWBrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPALIGNRrmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPBLENDWrmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPDmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPSmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPERMILPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPINSRBrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPINSRDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPINSRQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPINSRWrmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXBWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXWDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVSXWQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXBWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXWDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFBrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFDmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFHWmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPSHUFLWmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHBWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHQDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKHWDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLBWrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLQDQrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VPUNPCKLWDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VSHUFPDrmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VSHUFPSrmi")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKHPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKHPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKLPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VUNPCKLPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VXORPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "VXORPSrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "XORPDrm")>;
+def: InstRW<[BWWriteResGroup61], (instregex "XORPSrm")>;
+
+def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup62], (instregex "FARJMP64")>;
+def: InstRW<[BWWriteResGroup62], (instregex "JMP(16|32|64)m")>;
+
+def BWWriteResGroup63 : SchedWriteRes<[BWPort23,BWPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup63], (instregex "ADC(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "ADC8rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "ADCX(32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "ADOX(32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "BT(16|32|64)mi8")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVAE(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVB(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVE(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVG(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVGE(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVL(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVLE(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVNE(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVNO(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVNP(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVNS(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVO(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVP(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "CMOVS(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "RORX(32|64)mi")>;
+def: InstRW<[BWWriteResGroup63], (instregex "SARX(32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "SBB(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "SBB8rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "SHLX(32|64)rm")>;
+def: InstRW<[BWWriteResGroup63], (instregex "SHRX(32|64)rm")>;
+
+def BWWriteResGroup64 : SchedWriteRes<[BWPort23,BWPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup64], (instregex "ANDN(32|64)rm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "BLSI(32|64)rm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "BLSMSK(32|64)rm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "BLSR(32|64)rm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "BZHI(32|64)rm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSBrm64")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSDrm64")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PABSWrm64")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDDirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDQirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDSBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDSWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDUSBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDUSWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PADDWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PAVGBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PAVGWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQDirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPEQWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTDirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PCMPGTWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMAXSWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMAXUBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMINSWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PMINUBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNBrm64")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNDrm64")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSIGNWrm64")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBDirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBQirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBSBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBSWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBUSBirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBUSWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MMX_PSUBWirm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "MOVBE(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PABSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PABSDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PABSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PADDBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PADDDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PADDQrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PADDSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PADDSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PADDUSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PADDUSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PADDWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PAVGBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PAVGWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQQrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PCMPEQWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PCMPGTWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMAXSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMAXSDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMAXSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMAXUBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMAXUDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMAXUWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMINSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMINSDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMINSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMINUBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMINUDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PMINUWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSIGNBrm128")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSIGNDrm128")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSIGNWrm128")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSUBBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSUBDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSUBQrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSUBSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSUBSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSUBUSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSUBUSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "PSUBWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPABSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPABSDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPABSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPADDBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPADDDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPADDQrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPADDSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPADDSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPADDUSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPADDUSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPADDWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPAVGBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPAVGWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQQrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPCMPEQWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPCMPGTWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMAXSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMAXUWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMINSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMINSDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMINSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMINUBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMINUDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPMINUWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNBrm128")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNDrm128")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSIGNWrm128")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSUBBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSUBDrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSUBQrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSUBSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSUBSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSUBUSBrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSUBUSWrm")>;
+def: InstRW<[BWWriteResGroup64], (instregex "VPSUBWrm")>;
+
+def BWWriteResGroup65 : SchedWriteRes<[BWPort23,BWPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup65], (instregex "BLENDPDrmi")>;
+def: InstRW<[BWWriteResGroup65], (instregex "BLENDPSrmi")>;
+def: InstRW<[BWWriteResGroup65], (instregex "MMX_PANDNirm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "MMX_PANDirm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "MMX_PORirm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "MMX_PXORirm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "PANDNrm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "PANDrm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "PORrm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "PXORrm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VBLENDPDrmi")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VBLENDPSrmi")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VINSERTF128rm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VINSERTI128rm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VPANDNrm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VPANDrm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VPBLENDDrmi")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VPORrm")>;
+def: InstRW<[BWWriteResGroup65], (instregex "VPXORrm")>;
+
+def BWWriteResGroup66 : SchedWriteRes<[BWPort23,BWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup66], (instregex "ADD(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "ADD8rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "AND(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "AND8rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)mr")>;
+def: InstRW<[BWWriteResGroup66], (instregex "CMP(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "CMP8mi")>;
+def: InstRW<[BWWriteResGroup66], (instregex "CMP8mr")>;
+def: InstRW<[BWWriteResGroup66], (instregex "CMP8rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "OR(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "OR8rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "POP(16|32|64)r(mr)?")>;
+def: InstRW<[BWWriteResGroup66], (instregex "SUB(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "SUB8rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "TEST(16|32|64)mr")>;
+def: InstRW<[BWWriteResGroup66], (instregex "TEST8mi")>;
+def: InstRW<[BWWriteResGroup66], (instregex "TEST8mr")>;
+def: InstRW<[BWWriteResGroup66], (instregex "XOR(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup66], (instregex "XOR8rm")>;
+
+def BWWriteResGroup67 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[BWWriteResGroup67], (instregex "SHLD(16|32|64)rrCL")>;
+def: InstRW<[BWWriteResGroup67], (instregex "SHRD(16|32|64)rrCL")>;
+
+def BWWriteResGroup68 : SchedWriteRes<[BWPort1,BWPort6,BWPort06,BWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup68], (instregex "SLDT(16|32|64)r")>;
+
+def BWWriteResGroup69 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup69], (instregex "BTC(16|32|64)mi8")>;
+def: InstRW<[BWWriteResGroup69], (instregex "BTR(16|32|64)mi8")>;
+def: InstRW<[BWWriteResGroup69], (instregex "BTS(16|32|64)mi8")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SAR(16|32|64)m1")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SAR(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SAR8m1")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SAR8mi")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SHL(16|32|64)m1")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SHL(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SHL8m1")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SHL8mi")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SHR(16|32|64)m1")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SHR(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SHR8m1")>;
+def: InstRW<[BWWriteResGroup69], (instregex "SHR8mi")>;
+
+def BWWriteResGroup70 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup70], (instregex "ADD(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup70], (instregex "ADD(16|32|64)mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "ADD8mi")>;
+def: InstRW<[BWWriteResGroup70], (instregex "ADD8mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "AND(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup70], (instregex "AND(16|32|64)mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "AND8mi")>;
+def: InstRW<[BWWriteResGroup70], (instregex "AND8mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "DEC(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup70], (instregex "DEC8m")>;
+def: InstRW<[BWWriteResGroup70], (instregex "INC(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup70], (instregex "INC8m")>;
+def: InstRW<[BWWriteResGroup70], (instregex "NEG(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup70], (instregex "NEG8m")>;
+def: InstRW<[BWWriteResGroup70], (instregex "NOT(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup70], (instregex "NOT8m")>;
+def: InstRW<[BWWriteResGroup70], (instregex "OR(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup70], (instregex "OR(16|32|64)mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "OR8mi")>;
+def: InstRW<[BWWriteResGroup70], (instregex "OR8mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "POP(16|32|64)rmm")>;
+def: InstRW<[BWWriteResGroup70], (instregex "PUSH(16|32|64)rmm")>;
+def: InstRW<[BWWriteResGroup70], (instregex "SUB(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup70], (instregex "SUB(16|32|64)mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "SUB8mi")>;
+def: InstRW<[BWWriteResGroup70], (instregex "SUB8mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "XOR(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup70], (instregex "XOR(16|32|64)mr")>;
+def: InstRW<[BWWriteResGroup70], (instregex "XOR8mi")>;
+def: InstRW<[BWWriteResGroup70], (instregex "XOR8mr")>;
+
+def BWWriteResGroup71 : SchedWriteRes<[BWPort6,BWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,5];
+}
+def: InstRW<[BWWriteResGroup71], (instregex "STD")>;
+
+def BWWriteResGroup72 : SchedWriteRes<[BWPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup72], (instregex "AESDECLASTrr")>;
+def: InstRW<[BWWriteResGroup72], (instregex "AESDECrr")>;
+def: InstRW<[BWWriteResGroup72], (instregex "AESENCLASTrr")>;
+def: InstRW<[BWWriteResGroup72], (instregex "AESENCrr")>;
+def: InstRW<[BWWriteResGroup72], (instregex "VAESDECLASTrr")>;
+def: InstRW<[BWWriteResGroup72], (instregex "VAESDECrr")>;
+def: InstRW<[BWWriteResGroup72], (instregex "VAESENCLASTrr")>;
+def: InstRW<[BWWriteResGroup72], (instregex "VAESENCrr")>;
+
+def BWWriteResGroup73 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup73], (instregex "VPSLLDYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSLLQYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSLLVQYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSLLWYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSRADYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSRAWYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSRLDYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSRLQYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSRLVQYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VPSRLWYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VTESTPDYrm")>;
+def: InstRW<[BWWriteResGroup73], (instregex "VTESTPSYrm")>;
+
+def BWWriteResGroup74 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup74], (instregex "FCOM32m")>;
+def: InstRW<[BWWriteResGroup74], (instregex "FCOM64m")>;
+def: InstRW<[BWWriteResGroup74], (instregex "FCOMP32m")>;
+def: InstRW<[BWWriteResGroup74], (instregex "FCOMP64m")>;
+
+def BWWriteResGroup75 : SchedWriteRes<[BWPort5,BWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup75], (instregex "VANDNPDYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VANDNPSYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VANDPDYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VANDPSYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VORPDYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VORPSYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPACKSSDWYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPACKSSWBYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPACKUSDWYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPACKUSWBYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPALIGNRYrmi")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPBLENDWYrmi")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPDYmi")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPDYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPSYmi")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPERMILPSYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFBYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFDYmi")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFHWYmi")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPSHUFLWYmi")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHBWYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHDQYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHQDQYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKHWDYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLBWYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLDQYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLQDQYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VPUNPCKLWDYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VSHUFPDYrmi")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VSHUFPSYrmi")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKHPDYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKHPSYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKLPDYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VUNPCKLPSYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VXORPDYrm")>;
+def: InstRW<[BWWriteResGroup75], (instregex "VXORPSYrm")>;
+
+def BWWriteResGroup76 : SchedWriteRes<[BWPort23,BWPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup76], (instregex "VPABSBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPABSDYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPABSWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPADDBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPADDDYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPADDQYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPADDSBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPADDSWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPADDUSBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPADDUSWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPADDWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPAVGBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPAVGWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQDYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQQYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPCMPEQWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTDYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPCMPGTWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSDYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMAXSWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUDYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMAXUWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMINSBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMINSDYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMINSWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMINUBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMINUDYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPMINUWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNBYrm256")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNDYrm256")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSIGNWYrm256")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSUBBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSUBDYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSUBQYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSUBSBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSUBSWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSUBUSBYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSUBUSWYrm")>;
+def: InstRW<[BWWriteResGroup76], (instregex "VPSUBWYrm")>;
+
+def BWWriteResGroup77 : SchedWriteRes<[BWPort23,BWPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup77], (instregex "VBLENDPDYrmi")>;
+def: InstRW<[BWWriteResGroup77], (instregex "VBLENDPSYrmi")>;
+def: InstRW<[BWWriteResGroup77], (instregex "VPANDNYrm")>;
+def: InstRW<[BWWriteResGroup77], (instregex "VPANDYrm")>;
+def: InstRW<[BWWriteResGroup77], (instregex "VPBLENDDYrmi")>;
+def: InstRW<[BWWriteResGroup77], (instregex "VPORYrm")>;
+def: InstRW<[BWWriteResGroup77], (instregex "VPXORYrm")>;
+
+def BWWriteResGroup78 : SchedWriteRes<[BWPort0,BWPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[BWWriteResGroup78], (instregex "MPSADBWrri")>;
+def: InstRW<[BWWriteResGroup78], (instregex "VMPSADBWYrri")>;
+def: InstRW<[BWWriteResGroup78], (instregex "VMPSADBWrri")>;
+
+def BWWriteResGroup79 : SchedWriteRes<[BWPort5,BWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup79], (instregex "BLENDVPDrm0")>;
+def: InstRW<[BWWriteResGroup79], (instregex "BLENDVPSrm0")>;
+def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSDWirm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKSSWBirm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "MMX_PACKUSWBirm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "PBLENDVBrm0")>;
+def: InstRW<[BWWriteResGroup79], (instregex "VBLENDVPDrm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "VBLENDVPSrm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "VMASKMOVPDrm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "VMASKMOVPSrm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "VPBLENDVBrm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "VPMASKMOVDrm")>;
+def: InstRW<[BWWriteResGroup79], (instregex "VPMASKMOVQrm")>;
+
+def BWWriteResGroup80 : SchedWriteRes<[BWPort23,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[BWWriteResGroup80], (instregex "LEAVE64")>;
+def: InstRW<[BWWriteResGroup80], (instregex "SCASB")>;
+def: InstRW<[BWWriteResGroup80], (instregex "SCASL")>;
+def: InstRW<[BWWriteResGroup80], (instregex "SCASQ")>;
+def: InstRW<[BWWriteResGroup80], (instregex "SCASW")>;
+
+def BWWriteResGroup81 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup81], (instregex "PSLLDrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "PSLLQrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "PSLLWrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "PSRADrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "PSRAWrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "PSRLDrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "PSRLQrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "PSRLWrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "PTESTrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "VPSLLDrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "VPSLLQrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "VPSLLWrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "VPSRADrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "VPSRAWrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "VPSRLDrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "VPSRLQrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "VPSRLWrm")>;
+def: InstRW<[BWWriteResGroup81], (instregex "VPTESTrm")>;
+
+def BWWriteResGroup82 : SchedWriteRes<[BWPort0,BWPort01,BWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup82], (instregex "FLDCW16m")>;
+
+def BWWriteResGroup83 : SchedWriteRes<[BWPort0,BWPort23,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup83], (instregex "LDMXCSR")>;
+def: InstRW<[BWWriteResGroup83], (instregex "VLDMXCSR")>;
+
+def BWWriteResGroup84 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup84], (instregex "LRETQ")>;
+def: InstRW<[BWWriteResGroup84], (instregex "RETQ")>;
+
+def BWWriteResGroup85 : SchedWriteRes<[BWPort23,BWPort06,BWPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup85], (instregex "BEXTR(32|64)rm")>;
+
+def BWWriteResGroup86 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup86], (instregex "CMOVA(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup86], (instregex "CMOVBE(16|32|64)rm")>;
+
+def BWWriteResGroup87 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[BWWriteResGroup87], (instregex "ROL(16|32|64)m1")>;
+def: InstRW<[BWWriteResGroup87], (instregex "ROL(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup87], (instregex "ROL8m1")>;
+def: InstRW<[BWWriteResGroup87], (instregex "ROL8mi")>;
+def: InstRW<[BWWriteResGroup87], (instregex "ROR(16|32|64)m1")>;
+def: InstRW<[BWWriteResGroup87], (instregex "ROR(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup87], (instregex "ROR8m1")>;
+def: InstRW<[BWWriteResGroup87], (instregex "ROR8mi")>;
+
+def BWWriteResGroup88 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[BWWriteResGroup88], (instregex "XADD(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup88], (instregex "XADD8rm")>;
+
+def BWWriteResGroup89 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup89], (instregex "FARCALL64")>;
+
+def BWWriteResGroup90 : SchedWriteRes<[BWPort6,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,2,1,2];
+}
+def: InstRW<[BWWriteResGroup90], (instregex "LOOP")>;
+
+def BWWriteResGroup91 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup91], (instregex "ADDPDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "ADDPSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "ADDSDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "ADDSSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "ADDSUBPDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "ADDSUBPSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "BSF(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "BSR(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "CMPPDrmi")>;
+def: InstRW<[BWWriteResGroup91], (instregex "CMPPSrmi")>;
+def: InstRW<[BWWriteResGroup91], (instregex "CMPSDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "CMPSSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "COMISDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "COMISSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "CVTDQ2PSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "CVTPS2DQrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "CVTTPS2DQrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "IMUL64m")>;
+def: InstRW<[BWWriteResGroup91], (instregex "IMUL(32|64)rm(i8)?")>;
+def: InstRW<[BWWriteResGroup91], (instregex "IMUL8m")>;
+def: InstRW<[BWWriteResGroup91], (instregex "LZCNT(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)PDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)PSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)SDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MAX(C?)SSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)PDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)PSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)SDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MIN(C?)SSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPI2PSirm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTPS2PIirm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MMX_CVTTPS2PIirm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MUL64m")>;
+def: InstRW<[BWWriteResGroup91], (instregex "MUL8m")>;
+def: InstRW<[BWWriteResGroup91], (instregex "PDEP(32|64)rm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "PEXT(32|64)rm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "POPCNT(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "SUBPDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "SUBPSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "SUBSDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "SUBSSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "TZCNT(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "UCOMISDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "UCOMISSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VADDPDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VADDPSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VADDSDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VADDSSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VADDSUBPDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VADDSUBPSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VCMPPDrmi")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VCMPPSrmi")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VCMPSDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VCMPSSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VCOMISDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VCOMISSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VCVTDQ2PSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VCVTPS2DQrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VCVTTPS2DQrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)PDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)PSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)SDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VMAX(C?)SSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)PDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)PSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)SDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VMIN(C?)SSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VSUBPDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VSUBPSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VSUBSDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VSUBSSrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VUCOMISDrm")>;
+def: InstRW<[BWWriteResGroup91], (instregex "VUCOMISSrm")>;
+
+def BWWriteResGroup91_16 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup91_16], (instregex "IMUL16rm(i8)?")>;
+
+def BWWriteResGroup91_16_2 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+def: InstRW<[BWWriteResGroup91_16_2], (instregex "IMUL16m")>;
+def: InstRW<[BWWriteResGroup91_16_2], (instregex "MUL16m")>;
+
+def BWWriteResGroup91_32 : SchedWriteRes<[BWPort1, BWPort0156, BWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup91_32], (instregex "IMUL32m")>;
+def: InstRW<[BWWriteResGroup91_32], (instregex "MUL32m")>;
+
+def BWWriteResGroup92 : SchedWriteRes<[BWPort5,BWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBDYrm")>;
+def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBQYrm")>;
+def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXBWYrm")>;
+def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXDQYrm")>;
+def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXWDYrm")>;
+def: InstRW<[BWWriteResGroup92], (instregex "VPMOVSXWQYrm")>;
+def: InstRW<[BWWriteResGroup92], (instregex "VPMOVZXWDYrm")>;
+
+def BWWriteResGroup93 : SchedWriteRes<[BWPort01,BWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup93], (instregex "MULPDrm")>;
+def: InstRW<[BWWriteResGroup93], (instregex "MULPSrm")>;
+def: InstRW<[BWWriteResGroup93], (instregex "MULSDrm")>;
+def: InstRW<[BWWriteResGroup93], (instregex "MULSSrm")>;
+def: InstRW<[BWWriteResGroup93], (instregex "VMULPDrm")>;
+def: InstRW<[BWWriteResGroup93], (instregex "VMULPSrm")>;
+def: InstRW<[BWWriteResGroup93], (instregex "VMULSDrm")>;
+def: InstRW<[BWWriteResGroup93], (instregex "VMULSSrm")>;
+
+def BWWriteResGroup94 : SchedWriteRes<[BWPort5,BWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup94], (instregex "VBLENDVPDYrm")>;
+def: InstRW<[BWWriteResGroup94], (instregex "VBLENDVPSYrm")>;
+def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPDYrm")>;
+def: InstRW<[BWWriteResGroup94], (instregex "VMASKMOVPSYrm")>;
+def: InstRW<[BWWriteResGroup94], (instregex "VPBLENDVBYrm")>;
+def: InstRW<[BWWriteResGroup94], (instregex "VPMASKMOVDYrm")>;
+def: InstRW<[BWWriteResGroup94], (instregex "VPMASKMOVQYrm")>;
+
+def BWWriteResGroup95 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[BWWriteResGroup95], (instregex "VPSLLVDrm")>;
+def: InstRW<[BWWriteResGroup95], (instregex "VPSRAVDrm")>;
+def: InstRW<[BWWriteResGroup95], (instregex "VPSRLVDrm")>;
+
+def BWWriteResGroup96 : SchedWriteRes<[BWPort5,BWPort23,BWPort15]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDSWrm64")>;
+def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDWrm64")>;
+def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHADDrm64")>;
+def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBDrm64")>;
+def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBSWrm64")>;
+def: InstRW<[BWWriteResGroup96], (instregex "MMX_PHSUBWrm64")>;
+def: InstRW<[BWWriteResGroup96], (instregex "PHADDDrm")>;
+def: InstRW<[BWWriteResGroup96], (instregex "PHADDSWrm128")>;
+def: InstRW<[BWWriteResGroup96], (instregex "PHADDWrm")>;
+def: InstRW<[BWWriteResGroup96], (instregex "PHSUBDrm")>;
+def: InstRW<[BWWriteResGroup96], (instregex "PHSUBSWrm128")>;
+def: InstRW<[BWWriteResGroup96], (instregex "PHSUBWrm")>;
+def: InstRW<[BWWriteResGroup96], (instregex "VPHADDDrm")>;
+def: InstRW<[BWWriteResGroup96], (instregex "VPHADDSWrm128")>;
+def: InstRW<[BWWriteResGroup96], (instregex "VPHADDWrm")>;
+def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBDrm")>;
+def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBSWrm128")>;
+def: InstRW<[BWWriteResGroup96], (instregex "VPHSUBWrm")>;
+
+def BWWriteResGroup97 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[BWWriteResGroup97], (instregex "RCL(16|32|64)m1")>;
+def: InstRW<[BWWriteResGroup97], (instregex "RCL(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup97], (instregex "RCL8m1")>;
+def: InstRW<[BWWriteResGroup97], (instregex "RCL8mi")>;
+def: InstRW<[BWWriteResGroup97], (instregex "RCR(16|32|64)m1")>;
+def: InstRW<[BWWriteResGroup97], (instregex "RCR(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup97], (instregex "RCR8m1")>;
+def: InstRW<[BWWriteResGroup97], (instregex "RCR8mi")>;
+
+def BWWriteResGroup98 : SchedWriteRes<[BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
+}
+def: InstRW<[BWWriteResGroup98], (instregex "ROR(16|32|64)mCL")>;
+def: InstRW<[BWWriteResGroup98], (instregex "ROR8mCL")>;
+
+def BWWriteResGroup99 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[BWWriteResGroup99], (instregex "ADC(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup99], (instregex "ADC8mi")>;
+def: InstRW<[BWWriteResGroup99], (instregex "ADD8mi")>;
+def: InstRW<[BWWriteResGroup99], (instregex "AND8mi")>;
+def: InstRW<[BWWriteResGroup99], (instregex "OR8mi")>;
+def: InstRW<[BWWriteResGroup99], (instregex "SUB8mi")>;
+def: InstRW<[BWWriteResGroup99], (instregex "XCHG(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup99], (instregex "XCHG8rm")>;
+def: InstRW<[BWWriteResGroup99], (instregex "XOR8mi")>;
+
+def BWWriteResGroup100 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: InstRW<[BWWriteResGroup100], (instregex "ADC(16|32|64)mr")>;
+def: InstRW<[BWWriteResGroup100], (instregex "ADC8mr")>;
+def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup100], (instregex "CMPXCHG8rm")>;
+def: InstRW<[BWWriteResGroup100], (instregex "ROL(16|32|64)mCL")>;
+def: InstRW<[BWWriteResGroup100], (instregex "ROL8mCL")>;
+def: InstRW<[BWWriteResGroup100], (instregex "SAR(16|32|64)mCL")>;
+def: InstRW<[BWWriteResGroup100], (instregex "SAR8mCL")>;
+def: InstRW<[BWWriteResGroup100], (instregex "SBB(16|32|64)mi")>;
+def: InstRW<[BWWriteResGroup100], (instregex "SBB(16|32|64)mr")>;
+def: InstRW<[BWWriteResGroup100], (instregex "SBB8mi")>;
+def: InstRW<[BWWriteResGroup100], (instregex "SBB8mr")>;
+def: InstRW<[BWWriteResGroup100], (instregex "SHL(16|32|64)mCL")>;
+def: InstRW<[BWWriteResGroup100], (instregex "SHL8mCL")>;
+def: InstRW<[BWWriteResGroup100], (instregex "SHR(16|32|64)mCL")>;
+def: InstRW<[BWWriteResGroup100], (instregex "SHR8mCL")>;
+
+def BWWriteResGroup101 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup101], (instregex "ADD_F32m")>;
+def: InstRW<[BWWriteResGroup101], (instregex "ADD_F64m")>;
+def: InstRW<[BWWriteResGroup101], (instregex "ILD_F16m")>;
+def: InstRW<[BWWriteResGroup101], (instregex "ILD_F32m")>;
+def: InstRW<[BWWriteResGroup101], (instregex "ILD_F64m")>;
+def: InstRW<[BWWriteResGroup101], (instregex "SUBR_F32m")>;
+def: InstRW<[BWWriteResGroup101], (instregex "SUBR_F64m")>;
+def: InstRW<[BWWriteResGroup101], (instregex "SUB_F32m")>;
+def: InstRW<[BWWriteResGroup101], (instregex "SUB_F64m")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VADDPDYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VADDPSYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VADDSUBPDYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VADDSUBPSYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VCMPPDYrmi")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VCMPPSYrmi")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VCVTDQ2PSYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VCVTPS2DQYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VCVTTPS2DQYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VMAX(C?)PDYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VMAX(C?)PSYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VMIN(C?)PDYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VMIN(C?)PSYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VSUBPDYrm")>;
+def: InstRW<[BWWriteResGroup101], (instregex "VSUBPSYrm")>;
+
+def BWWriteResGroup102 : SchedWriteRes<[BWPort5,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup102], (instregex "VPERM2F128rm")>;
+def: InstRW<[BWWriteResGroup102], (instregex "VPERM2I128rm")>;
+def: InstRW<[BWWriteResGroup102], (instregex "VPERMDYrm")>;
+def: InstRW<[BWWriteResGroup102], (instregex "VPERMPDYmi")>;
+def: InstRW<[BWWriteResGroup102], (instregex "VPERMPSYrm")>;
+def: InstRW<[BWWriteResGroup102], (instregex "VPERMQYmi")>;
+def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBDYrm")>;
+def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBQYrm")>;
+def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXBWYrm")>;
+def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXDQYrm")>;
+def: InstRW<[BWWriteResGroup102], (instregex "VPMOVZXWQYrm")>;
+
+def BWWriteResGroup103 : SchedWriteRes<[BWPort01,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup103], (instregex "VMULPDYrm")>;
+def: InstRW<[BWWriteResGroup103], (instregex "VMULPSYrm")>;
+
+def BWWriteResGroup104 : SchedWriteRes<[BWPort0,BWPort1,BWPort5]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup104], (instregex "DPPDrri")>;
+def: InstRW<[BWWriteResGroup104], (instregex "VDPPDrri")>;
+
+def BWWriteResGroup105 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup105], (instregex "CVTSD2SI64rm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "CVTSD2SIrm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "CVTSS2SI64rm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "CVTSS2SIrm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "CVTTSD2SI64rm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "CVTTSD2SIrm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "CVTTSS2SIrm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "VCVTSD2SI64rm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "VCVTSD2SIrm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "VCVTSS2SI64rm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "VCVTSS2SIrm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSD2SI64rm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSD2SIrm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSS2SI64rm")>;
+def: InstRW<[BWWriteResGroup105], (instregex "VCVTTSS2SIrm")>;
+
+def BWWriteResGroup106 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup106], (instregex "VCVTPS2PDYrm")>;
+
+def BWWriteResGroup107 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup107], (instregex "CVTDQ2PDrm")>;
+def: InstRW<[BWWriteResGroup107], (instregex "CVTPD2DQrm")>;
+def: InstRW<[BWWriteResGroup107], (instregex "CVTPD2PSrm")>;
+def: InstRW<[BWWriteResGroup107], (instregex "CVTSD2SSrm")>;
+def: InstRW<[BWWriteResGroup107], (instregex "CVTTPD2DQrm")>;
+def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTPD2PIirm")>;
+def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTPI2PDirm")>;
+def: InstRW<[BWWriteResGroup107], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[BWWriteResGroup107], (instregex "MULX64rm")>;
+def: InstRW<[BWWriteResGroup107], (instregex "VCVTDQ2PDrm")>;
+def: InstRW<[BWWriteResGroup107], (instregex "VCVTSD2SSrm")>;
+
+def BWWriteResGroup108 : SchedWriteRes<[BWPort5,BWPort23,BWPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTBYrm")>;
+def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTBrm")>;
+def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTWYrm")>;
+def: InstRW<[BWWriteResGroup108], (instregex "VPBROADCASTWrm")>;
+
+def BWWriteResGroup109 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[BWWriteResGroup109], (instregex "VPSLLVDYrm")>;
+def: InstRW<[BWWriteResGroup109], (instregex "VPSRAVDYrm")>;
+def: InstRW<[BWWriteResGroup109], (instregex "VPSRLVDYrm")>;
+
+def BWWriteResGroup110 : SchedWriteRes<[BWPort5,BWPort23,BWPort15]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[BWWriteResGroup110], (instregex "VPHADDDYrm")>;
+def: InstRW<[BWWriteResGroup110], (instregex "VPHADDSWrm256")>;
+def: InstRW<[BWWriteResGroup110], (instregex "VPHADDWYrm")>;
+def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBDYrm")>;
+def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBSWrm256")>;
+def: InstRW<[BWWriteResGroup110], (instregex "VPHSUBWYrm")>;
+
+def BWWriteResGroup111 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup111], (instregex "SHLD(16|32|64)mri8")>;
+def: InstRW<[BWWriteResGroup111], (instregex "SHRD(16|32|64)mri8")>;
+
+def BWWriteResGroup112 : SchedWriteRes<[BWPort23,BWPort06,BWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,3];
+}
+def: InstRW<[BWWriteResGroup112], (instregex "RDRAND(16|32|64)r")>;
+
+def BWWriteResGroup113 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[BWWriteResGroup113], (instregex "LAR(16|32|64)rm")>;
+def: InstRW<[BWWriteResGroup113], (instregex "LSL(16|32|64)rm")>;
+
+def BWWriteResGroup114 : SchedWriteRes<[BWPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup114], (instregex "PMULLDrr")>;
+def: InstRW<[BWWriteResGroup114], (instregex "VPMULLDYrr")>;
+def: InstRW<[BWWriteResGroup114], (instregex "VPMULLDrr")>;
+
+def BWWriteResGroup115 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMADDUBSWrm64")>;
+def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMADDWDirm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHRSWrm64")>;
+def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHUWirm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULHWirm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULLWirm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "MMX_PMULUDQirm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "MMX_PSADBWirm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PCLMULQDQrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PCMPGTQrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PHMINPOSUWrm128")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PMADDUBSWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PMADDWDrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PMULDQrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PMULHRSWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PMULHUWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PMULHWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PMULLWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PMULUDQrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "PSADBWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "RCPPSm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "RCPSSm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "RSQRTPSm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "RSQRTSSm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPCLMULQDQrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPCMPGTQrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPHMINPOSUWrm128")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPMADDUBSWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPMADDWDrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPMULDQrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPMULHRSWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPMULHUWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPMULHWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPMULLWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPMULUDQrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VPSADBWrm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VRCPPSm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VRCPSSm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VRSQRTPSm")>;
+def: InstRW<[BWWriteResGroup115], (instregex "VRSQRTSSm")>;
+
+def BWWriteResGroup116 : SchedWriteRes<[BWPort01,BWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup116],
+ (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m",
+ "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
+
+def BWWriteResGroup117 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup117], (instregex "FICOM16m")>;
+def: InstRW<[BWWriteResGroup117], (instregex "FICOM32m")>;
+def: InstRW<[BWWriteResGroup117], (instregex "FICOMP16m")>;
+def: InstRW<[BWWriteResGroup117], (instregex "FICOMP32m")>;
+
+def BWWriteResGroup118 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup118], (instregex "VPTESTYrm")>;
+
+def BWWriteResGroup119 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[BWWriteResGroup119], (instregex "HADDPDrm")>;
+def: InstRW<[BWWriteResGroup119], (instregex "HADDPSrm")>;
+def: InstRW<[BWWriteResGroup119], (instregex "HSUBPDrm")>;
+def: InstRW<[BWWriteResGroup119], (instregex "HSUBPSrm")>;
+def: InstRW<[BWWriteResGroup119], (instregex "VHADDPDrm")>;
+def: InstRW<[BWWriteResGroup119], (instregex "VHADDPSrm")>;
+def: InstRW<[BWWriteResGroup119], (instregex "VHSUBPDrm")>;
+def: InstRW<[BWWriteResGroup119], (instregex "VHSUBPSrm")>;
+
+def BWWriteResGroup120 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup120], (instregex "CVTTSS2SI64rm")>;
+
+def BWWriteResGroup121 : SchedWriteRes<[BWPort1,BWPort23,BWPort06,BWPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup121], (instregex "MULX32rm")>;
+
+def BWWriteResGroup122 : SchedWriteRes<[BWPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup122], (instregex "DIVPSrr")>;
+def: InstRW<[BWWriteResGroup122], (instregex "DIVSSrr")>;
+def: InstRW<[BWWriteResGroup122], (instregex "VDIVPSrr")>;
+def: InstRW<[BWWriteResGroup122], (instregex "VDIVSSrr")>;
+
+def BWWriteResGroup123 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup123], (instregex "MUL_F32m")>;
+def: InstRW<[BWWriteResGroup123], (instregex "MUL_F64m")>;
+def: InstRW<[BWWriteResGroup123], (instregex "VPCMPGTQYrm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "VPMADDUBSWYrm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "VPMADDWDYrm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "VPMULDQYrm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "VPMULHRSWYrm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "VPMULHUWYrm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "VPMULHWYrm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "VPMULLWYrm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "VPMULUDQYrm")>;
+def: InstRW<[BWWriteResGroup123], (instregex "VPSADBWYrm")>;
+
+def BWWriteResGroup124 : SchedWriteRes<[BWPort01,BWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup124],
+ (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>;
+
+def BWWriteResGroup125 : SchedWriteRes<[BWPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[BWWriteResGroup125], (instregex "PCMPISTRIrr")>;
+def: InstRW<[BWWriteResGroup125], (instregex "PCMPISTRM128rr")>;
+def: InstRW<[BWWriteResGroup125], (instregex "VPCMPISTRIrr")>;
+def: InstRW<[BWWriteResGroup125], (instregex "VPCMPISTRM128rr")>;
+
+def BWWriteResGroup126 : SchedWriteRes<[BWPort0,BWPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup126], (instregex "VRCPPSYr")>;
+def: InstRW<[BWWriteResGroup126], (instregex "VRSQRTPSYr")>;
+
+def BWWriteResGroup127 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup127], (instregex "ROUNDPDm")>;
+def: InstRW<[BWWriteResGroup127], (instregex "ROUNDPSm")>;
+def: InstRW<[BWWriteResGroup127], (instregex "ROUNDSDm")>;
+def: InstRW<[BWWriteResGroup127], (instregex "ROUNDSSm")>;
+def: InstRW<[BWWriteResGroup127], (instregex "VROUNDPDm")>;
+def: InstRW<[BWWriteResGroup127], (instregex "VROUNDPSm")>;
+def: InstRW<[BWWriteResGroup127], (instregex "VROUNDSDm")>;
+def: InstRW<[BWWriteResGroup127], (instregex "VROUNDSSm")>;
+
+def BWWriteResGroup128 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup128], (instregex "VCVTDQ2PDYrm")>;
+
+def BWWriteResGroup129 : SchedWriteRes<[BWPort1,BWPort5,BWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[BWWriteResGroup129], (instregex "VHADDPDYrm")>;
+def: InstRW<[BWWriteResGroup129], (instregex "VHADDPSYrm")>;
+def: InstRW<[BWWriteResGroup129], (instregex "VHSUBPDYrm")>;
+def: InstRW<[BWWriteResGroup129], (instregex "VHSUBPSYrm")>;
+
+def BWWriteResGroup130 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,1,2];
+}
+def: InstRW<[BWWriteResGroup130], (instregex "SHLD(16|32|64)mrCL")>;
+def: InstRW<[BWWriteResGroup130], (instregex "SHRD(16|32|64)mrCL")>;
+
+def BWWriteResGroup131 : SchedWriteRes<[BWPort1,BWPort06,BWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,2,3];
+}
+def: InstRW<[BWWriteResGroup131], (instregex "RCL(16|32|64)rCL")>;
+def: InstRW<[BWWriteResGroup131], (instregex "RCR(16|32|64)rCL")>;
+
+def BWWriteResGroup132 : SchedWriteRes<[BWPort1,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,4,1,3];
+}
+def: InstRW<[BWWriteResGroup132], (instregex "RCL8rCL")>;
+
+def BWWriteResGroup133 : SchedWriteRes<[BWPort06,BWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,9];
+}
+def: InstRW<[BWWriteResGroup133], (instregex "LOOPE")>;
+def: InstRW<[BWWriteResGroup133], (instregex "LOOPNE")>;
+
+def BWWriteResGroup134 : SchedWriteRes<[BWPort5,BWPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup134], (instregex "AESDECLASTrm")>;
+def: InstRW<[BWWriteResGroup134], (instregex "AESDECrm")>;
+def: InstRW<[BWWriteResGroup134], (instregex "AESENCLASTrm")>;
+def: InstRW<[BWWriteResGroup134], (instregex "AESENCrm")>;
+def: InstRW<[BWWriteResGroup134], (instregex "VAESDECLASTrm")>;
+def: InstRW<[BWWriteResGroup134], (instregex "VAESDECrm")>;
+def: InstRW<[BWWriteResGroup134], (instregex "VAESENCLASTrm")>;
+def: InstRW<[BWWriteResGroup134], (instregex "VAESENCrm")>;
+
+def BWWriteResGroup135 : SchedWriteRes<[BWPort1,BWPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup135], (instregex "ADD_FI16m")>;
+def: InstRW<[BWWriteResGroup135], (instregex "ADD_FI32m")>;
+def: InstRW<[BWWriteResGroup135], (instregex "SUBR_FI16m")>;
+def: InstRW<[BWWriteResGroup135], (instregex "SUBR_FI32m")>;
+def: InstRW<[BWWriteResGroup135], (instregex "SUB_FI16m")>;
+def: InstRW<[BWWriteResGroup135], (instregex "SUB_FI32m")>;
+def: InstRW<[BWWriteResGroup135], (instregex "VROUNDYPDm")>;
+def: InstRW<[BWWriteResGroup135], (instregex "VROUNDYPSm")>;
+
+def BWWriteResGroup136 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[BWWriteResGroup136], (instregex "MPSADBWrmi")>;
+def: InstRW<[BWWriteResGroup136], (instregex "VMPSADBWrmi")>;
+
+def BWWriteResGroup137 : SchedWriteRes<[BWPort0]> {
+ let Latency = 13;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup137], (instregex "SQRTPSr")>;
+def: InstRW<[BWWriteResGroup137], (instregex "SQRTSSr")>;
+
+def BWWriteResGroup138 : SchedWriteRes<[BWPort0,BWPort5,BWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[BWWriteResGroup138], (instregex "VMPSADBWYrmi")>;
+
+def BWWriteResGroup139 : SchedWriteRes<[BWPort0]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup139], (instregex "DIVPDrr")>;
+def: InstRW<[BWWriteResGroup139], (instregex "DIVSDrr")>;
+def: InstRW<[BWWriteResGroup139], (instregex "VDIVPDrr")>;
+def: InstRW<[BWWriteResGroup139], (instregex "VDIVSDrr")>;
+def: InstRW<[BWWriteResGroup139], (instregex "VSQRTPSr")>;
+def: InstRW<[BWWriteResGroup139], (instregex "VSQRTSSr")>;
+
+def BWWriteResGroup140 : SchedWriteRes<[BWPort5]> {
+ let Latency = 14;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[BWWriteResGroup140], (instregex "AESIMCrr")>;
+def: InstRW<[BWWriteResGroup140], (instregex "VAESIMCrr")>;
+
+def BWWriteResGroup141 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI16m")>;
+def: InstRW<[BWWriteResGroup141], (instregex "MUL_FI32m")>;
+
+def BWWriteResGroup142 : SchedWriteRes<[BWPort0,BWPort1,BWPort5]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[BWWriteResGroup142], (instregex "DPPSrri")>;
+def: InstRW<[BWWriteResGroup142], (instregex "VDPPSYrri")>;
+def: InstRW<[BWWriteResGroup142], (instregex "VDPPSrri")>;
+
+def BWWriteResGroup143 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[BWWriteResGroup143], (instregex "DPPDrmi")>;
+def: InstRW<[BWWriteResGroup143], (instregex "VDPPDrmi")>;
+
+def BWWriteResGroup144 : SchedWriteRes<[BWPort1,BWPort6,BWPort23,BWPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,2,1,3];
+}
+def: InstRW<[BWWriteResGroup144], (instregex "LAR(16|32|64)rr")>;
+
+def BWWriteResGroup145 : SchedWriteRes<[BWPort1,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [2,3,1,4];
+}
+def: InstRW<[BWWriteResGroup145], (instregex "RCR8rCL")>;
+
+def BWWriteResGroup146 : SchedWriteRes<[BWPort0,BWPort1,BWPort6,BWPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 12;
+ let ResourceCycles = [2,1,4,5];
+}
+def: InstRW<[BWWriteResGroup146], (instregex "XCH_F")>;
+
+def BWWriteResGroup147 : SchedWriteRes<[BWPort0]> {
+ let Latency = 15;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FPrST0")>;
+def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FST0r")>;
+def: InstRW<[BWWriteResGroup147], (instregex "DIVR_FrST0")>;
+
+def BWWriteResGroup148 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup148], (instregex "PMULLDrm")>;
+def: InstRW<[BWWriteResGroup148], (instregex "VPMULLDrm")>;
+
+def BWWriteResGroup149 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 15;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,1,1,4,1,2];
+}
+def: InstRW<[BWWriteResGroup149], (instregex "RCL(16|32|64)mCL")>;
+def: InstRW<[BWWriteResGroup149], (instregex "RCL8mCL")>;
+
+def BWWriteResGroup150 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup150], (instregex "DIVPSrm")>;
+def: InstRW<[BWWriteResGroup150], (instregex "DIVSSrm")>;
+def: InstRW<[BWWriteResGroup150], (instregex "VDIVPSrm")>;
+def: InstRW<[BWWriteResGroup150], (instregex "VDIVSSrm")>;
+
+def BWWriteResGroup151 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup151], (instregex "VPMULLDYrm")>;
+
+def BWWriteResGroup152 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+def: InstRW<[BWWriteResGroup152], (instregex "PCMPISTRIrm")>;
+def: InstRW<[BWWriteResGroup152], (instregex "PCMPISTRM128rm")>;
+def: InstRW<[BWWriteResGroup152], (instregex "VPCMPISTRIrm")>;
+def: InstRW<[BWWriteResGroup152], (instregex "VPCMPISTRM128rm")>;
+
+def BWWriteResGroup153 : SchedWriteRes<[BWPort4,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,1,1,4,2,5];
+}
+def: InstRW<[BWWriteResGroup153], (instregex "CMPXCHG8B")>;
+
+def BWWriteResGroup154 : SchedWriteRes<[BWPort5]> {
+ let Latency = 16;
+ let NumMicroOps = 16;
+ let ResourceCycles = [16];
+}
+def: InstRW<[BWWriteResGroup154], (instregex "VZEROALL")>;
+
+def BWWriteResGroup155 : SchedWriteRes<[BWPort0,BWPort015]> {
+ let Latency = 17;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup155], (instregex "VDIVPSYrr")>;
+
+def BWWriteResGroup156 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[BWWriteResGroup156], (instregex "VRCPPSYm")>;
+def: InstRW<[BWWriteResGroup156], (instregex "VRSQRTPSYm")>;
+
+def BWWriteResGroup157 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 18;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup157], (instregex "SQRTPSm")>;
+def: InstRW<[BWWriteResGroup157], (instregex "SQRTSSm")>;
+
+def BWWriteResGroup158 : SchedWriteRes<[BWPort0,BWPort5,BWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
+}
+def: InstRW<[BWWriteResGroup158], (instregex "PCMPESTRIrr")>;
+def: InstRW<[BWWriteResGroup158], (instregex "VPCMPESTRIrr")>;
+
+def BWWriteResGroup159 : SchedWriteRes<[BWPort5,BWPort6,BWPort06,BWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,5];
+}
+def: InstRW<[BWWriteResGroup159], (instregex "CPUID")>;
+def: InstRW<[BWWriteResGroup159], (instregex "RDTSC")>;
+
+def BWWriteResGroup160 : SchedWriteRes<[BWPort1,BWPort23,BWPort237,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,1,1,3,1,3];
+}
+def: InstRW<[BWWriteResGroup160], (instregex "RCR(16|32|64)mCL")>;
+def: InstRW<[BWWriteResGroup160], (instregex "RCR8mCL")>;
+
+def BWWriteResGroup161 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 19;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup161], (instregex "DIVPDrm")>;
+def: InstRW<[BWWriteResGroup161], (instregex "DIVSDrm")>;
+def: InstRW<[BWWriteResGroup161], (instregex "VDIVPDrm")>;
+def: InstRW<[BWWriteResGroup161], (instregex "VDIVSDrm")>;
+def: InstRW<[BWWriteResGroup161], (instregex "VSQRTPSm")>;
+def: InstRW<[BWWriteResGroup161], (instregex "VSQRTSSm")>;
+
+def BWWriteResGroup162 : SchedWriteRes<[BWPort5,BWPort23]> {
+ let Latency = 19;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup162], (instregex "AESIMCrm")>;
+def: InstRW<[BWWriteResGroup162], (instregex "VAESIMCrm")>;
+
+def BWWriteResGroup163 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
+ let Latency = 19;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,1,1,1];
+}
+def: InstRW<[BWWriteResGroup163], (instregex "DPPSrmi")>;
+def: InstRW<[BWWriteResGroup163], (instregex "VDPPSrmi")>;
+
+def BWWriteResGroup164 : SchedWriteRes<[BWPort0,BWPort5,BWPort015,BWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def: InstRW<[BWWriteResGroup164], (instregex "PCMPESTRM128rr")>;
+def: InstRW<[BWWriteResGroup164], (instregex "VPCMPESTRM128rr")>;
+
+def BWWriteResGroup165 : SchedWriteRes<[BWPort0]> {
+ let Latency = 20;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup165], (instregex "DIV_FPrST0")>;
+def: InstRW<[BWWriteResGroup165], (instregex "DIV_FST0r")>;
+def: InstRW<[BWWriteResGroup165], (instregex "DIV_FrST0")>;
+def: InstRW<[BWWriteResGroup165], (instregex "SQRTPDr")>;
+def: InstRW<[BWWriteResGroup165], (instregex "SQRTSDr")>;
+
+def BWWriteResGroup166 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23]> {
+ let Latency = 20;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,1,1,1];
+}
+def: InstRW<[BWWriteResGroup166], (instregex "VDPPSYrmi")>;
+
+def BWWriteResGroup167 : SchedWriteRes<[BWPort4,BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[BWWriteResGroup167], (instregex "INSB")>;
+def: InstRW<[BWWriteResGroup167], (instregex "INSL")>;
+def: InstRW<[BWWriteResGroup167], (instregex "INSW")>;
+
+def BWWriteResGroup168 : SchedWriteRes<[BWPort0]> {
+ let Latency = 21;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[BWWriteResGroup168], (instregex "VSQRTPDr")>;
+def: InstRW<[BWWriteResGroup168], (instregex "VSQRTSDr")>;
+
+def BWWriteResGroup169 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup169], (instregex "DIV_F32m")>;
+def: InstRW<[BWWriteResGroup169], (instregex "DIV_F64m")>;
+
+def BWWriteResGroup170 : SchedWriteRes<[BWPort0,BWPort015]> {
+ let Latency = 21;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup170], (instregex "VSQRTPSYr")>;
+
+def BWWriteResGroup171 : SchedWriteRes<[BWPort0,BWPort4,BWPort5,BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 21;
+ let NumMicroOps = 19;
+ let ResourceCycles = [2,1,4,1,1,4,6];
+}
+def: InstRW<[BWWriteResGroup171], (instregex "CMPXCHG16B")>;
+
+def BWWriteResGroup172 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
+ let Latency = 22;
+ let NumMicroOps = 18;
+ let ResourceCycles = [1,1,16];
+}
+def: InstRW<[BWWriteResGroup172], (instregex "POPF64")>;
+
+def BWWriteResGroup173 : SchedWriteRes<[BWPort0,BWPort015]> {
+ let Latency = 23;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup173], (instregex "VDIVPDYrr")>;
+
+def BWWriteResGroup174 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
+ let Latency = 23;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[BWWriteResGroup174], (instregex "VDIVPSYrm")>;
+
+def BWWriteResGroup175 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def: InstRW<[BWWriteResGroup175], (instregex "PCMPESTRIrm")>;
+def: InstRW<[BWWriteResGroup175], (instregex "VPCMPESTRIrm")>;
+
+def BWWriteResGroup176 : SchedWriteRes<[BWPort6,BWPort23,BWPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 19;
+ let ResourceCycles = [3,1,15];
+}
+def: InstRW<[BWWriteResGroup176], (instregex "XRSTOR(64)?")>;
+
+def BWWriteResGroup177 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
+ let Latency = 24;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI16m")>;
+def: InstRW<[BWWriteResGroup177], (instregex "DIV_FI32m")>;
+
+def BWWriteResGroup178 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort015,BWPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+def: InstRW<[BWWriteResGroup178], (instregex "PCMPESTRM128rm")>;
+def: InstRW<[BWWriteResGroup178], (instregex "VPCMPESTRM128rm")>;
+
+def BWWriteResGroup179 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup179], (instregex "SQRTPDm")>;
+def: InstRW<[BWWriteResGroup179], (instregex "SQRTSDm")>;
+
+def BWWriteResGroup180 : SchedWriteRes<[BWPort0,BWPort23]> {
+ let Latency = 26;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F32m")>;
+def: InstRW<[BWWriteResGroup180], (instregex "DIVR_F64m")>;
+def: InstRW<[BWWriteResGroup180], (instregex "VSQRTPDm")>;
+def: InstRW<[BWWriteResGroup180], (instregex "VSQRTSDm")>;
+
+def BWWriteResGroup181 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
+ let Latency = 27;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[BWWriteResGroup181], (instregex "VSQRTPSYm")>;
+
+def BWWriteResGroup182 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
+ let Latency = 29;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI16m")>;
+def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI32m")>;
+
+def BWWriteResGroup183 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
+ let Latency = 29;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[BWWriteResGroup183], (instregex "VDIVPDYrm")>;
+
+def BWWriteResGroup183_1 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 22;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,3,2,1];
+}
+def: InstRW<[BWWriteResGroup183_1], (instrs VGATHERQPDrm)>;
+
+def BWWriteResGroup183_2 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,3,4,1];
+}
+def: InstRW<[BWWriteResGroup183_2], (instrs VGATHERQPDYrm)>;
+
+def BWWriteResGroup183_3 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,5,2,1];
+}
+def: InstRW<[BWWriteResGroup183_3], (instrs VGATHERQPSYrm)>;
+
+def BWWriteResGroup183_4 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,3,2,1];
+}
+def: InstRW<[BWWriteResGroup183_4], (instrs VGATHERDPDrm,
+ VGATHERDPSrm)>;
+
+def BWWriteResGroup183_5 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 26;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,5,2,1];
+}
+def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPDYrm)>;
+
+def BWWriteResGroup183_6 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 26;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,4,8,1];
+}
+def: InstRW<[BWWriteResGroup183_6], (instrs VGATHERDPSYrm)>;
+
+def BWWriteResGroup183_7 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
+ let Latency = 27;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,5,2,1];
+}
+def: InstRW<[BWWriteResGroup183_7], (instrs VGATHERQPSrm)>;
+
+def BWWriteResGroup184 : SchedWriteRes<[BWPort0,BWPort5,BWPort015]> {
+ let Latency = 29;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,2];
+}
+def: InstRW<[BWWriteResGroup184], (instregex "AESKEYGENASSIST128rr")>;
+def: InstRW<[BWWriteResGroup184], (instregex "VAESKEYGENASSIST128rr")>;
+
+def BWWriteResGroup185 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 29;
+ let NumMicroOps = 27;
+ let ResourceCycles = [1,5,1,1,19];
+}
+def: InstRW<[BWWriteResGroup185], (instregex "XSAVE64")>;
+
+def BWWriteResGroup186 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
+ let Latency = 30;
+ let NumMicroOps = 28;
+ let ResourceCycles = [1,6,1,1,19];
+}
+def: InstRW<[BWWriteResGroup186], (instregex "XSAVE(OPT)?")>;
+
+def BWWriteResGroup187 : SchedWriteRes<[BWPort01,BWPort15,BWPort015,BWPort0156]> {
+ let Latency = 31;
+ let NumMicroOps = 31;
+ let ResourceCycles = [8,1,21,1];
+}
+def: InstRW<[BWWriteResGroup187], (instregex "MMX_EMMS")>;
+
+def BWWriteResGroup188 : SchedWriteRes<[BWPort0,BWPort5,BWPort23,BWPort015]> {
+ let Latency = 33;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,1,1];
+}
+def: InstRW<[BWWriteResGroup188], (instregex "AESKEYGENASSIST128rm")>;
+def: InstRW<[BWWriteResGroup188], (instregex "VAESKEYGENASSIST128rm")>;
+
+def BWWriteResGroup189 : SchedWriteRes<[BWPort0,BWPort015]> {
+ let Latency = 34;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[BWWriteResGroup189], (instregex "VSQRTPDYr")>;
+
+def BWWriteResGroup190 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156]> {
+ let Latency = 34;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,2,2,1,1];
+}
+def: InstRW<[BWWriteResGroup190], (instregex "DIV(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup190], (instregex "DIV8m")>;
+
+def BWWriteResGroup191 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort06,BWPort0156]> {
+ let Latency = 34;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,3,4,10];
+}
+def: InstRW<[BWWriteResGroup191], (instregex "IN(16|32)ri")>;
+def: InstRW<[BWWriteResGroup191], (instregex "IN(16|32)rr")>;
+def: InstRW<[BWWriteResGroup191], (instregex "IN8ri")>;
+def: InstRW<[BWWriteResGroup191], (instregex "IN8rr")>;
+
+def BWWriteResGroup193 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort23,BWPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,2,2,1,1];
+}
+def: InstRW<[BWWriteResGroup193], (instregex "IDIV(16|32|64)m")>;
+def: InstRW<[BWWriteResGroup193], (instregex "IDIV8m")>;
+
+def BWWriteResGroup194 : SchedWriteRes<[BWPort5,BWPort6,BWPort23,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,2,1,4,10];
+}
+def: InstRW<[BWWriteResGroup194], (instregex "OUT(16|32)ir")>;
+def: InstRW<[BWWriteResGroup194], (instregex "OUT(16|32)rr")>;
+def: InstRW<[BWWriteResGroup194], (instregex "OUT8ir")>;
+def: InstRW<[BWWriteResGroup194], (instregex "OUT8rr")>;
+
+def BWWriteResGroup195 : SchedWriteRes<[BWPort0,BWPort23,BWPort015]> {
+ let Latency = 40;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[BWWriteResGroup195], (instregex "VSQRTPDYm")>;
+
+def BWWriteResGroup196 : SchedWriteRes<[BWPort5,BWPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 22;
+ let ResourceCycles = [2,20];
+}
+def: InstRW<[BWWriteResGroup196], (instregex "RDTSCP")>;
+
+def BWWriteResGroup197 : SchedWriteRes<[BWPort0,BWPort01,BWPort23,BWPort05,BWPort06,BWPort015,BWPort0156]> {
+ let Latency = 60;
+ let NumMicroOps = 64;
+ let ResourceCycles = [2,2,8,1,10,2,39];
+}
+def: InstRW<[BWWriteResGroup197], (instregex "FLDENVm")>;
+def: InstRW<[BWWriteResGroup197], (instregex "FLDENVm")>;
+
+def BWWriteResGroup198 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 88;
+ let ResourceCycles = [4,4,31,1,2,1,45];
+}
+def: InstRW<[BWWriteResGroup198], (instregex "FXRSTOR64")>;
+
+def BWWriteResGroup199 : SchedWriteRes<[BWPort0,BWPort6,BWPort23,BWPort05,BWPort06,BWPort15,BWPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 90;
+ let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[BWWriteResGroup199], (instregex "FXRSTOR")>;
+
+def BWWriteResGroup200 : SchedWriteRes<[BWPort5,BWPort01,BWPort0156]> {
+ let Latency = 75;
+ let NumMicroOps = 15;
+ let ResourceCycles = [6,3,6];
+}
+def: InstRW<[BWWriteResGroup200], (instregex "FNINIT")>;
+
+def BWWriteResGroup201 : SchedWriteRes<[BWPort0,BWPort1,BWPort5,BWPort6,BWPort01,BWPort0156]> {
+ let Latency = 80;
+ let NumMicroOps = 32;
+ let ResourceCycles = [7,7,3,3,1,11];
+}
+def: InstRW<[BWWriteResGroup201], (instregex "DIV(16|32|64)r")>;
+
+def BWWriteResGroup202 : SchedWriteRes<[BWPort0,BWPort1,BWPort4,BWPort5,BWPort6,BWPort237,BWPort06,BWPort0156]> {
+ let Latency = 115;
+ let NumMicroOps = 100;
+ let ResourceCycles = [9,9,11,8,1,11,21,30];
+}
+def: InstRW<[BWWriteResGroup202], (instregex "FSTENVm")>;
+def: InstRW<[BWWriteResGroup202], (instregex "FSTENVm")>;
+
+} // SchedModel
+
diff --git a/lib/Target/X86/X86SchedHaswell.td b/lib/Target/X86/X86SchedHaswell.td
index 03c8ccb53afe..46612554b1fa 100644
--- a/lib/Target/X86/X86SchedHaswell.td
+++ b/lib/Target/X86/X86SchedHaswell.td
@@ -17,14 +17,14 @@ def HaswellModel : SchedMachineModel {
// instructions per cycle.
let IssueWidth = 4;
let MicroOpBufferSize = 192; // Based on the reorder buffer.
- let LoadLatency = 4;
+ let LoadLatency = 5;
let MispredictPenalty = 16;
// Based on the LSD (loop-stream detector) queue size and benchmarking data.
let LoopMicroOpBufferSize = 50;
- // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
- // the scheduler to assign a default model to unrecognized opcodes.
+ // This flag is set to allow the scheduler to assign a default model to
+ // unrecognized opcodes.
let CompleteModel = 0;
}
@@ -70,9 +70,9 @@ def HWPortAny : ProcResGroup<[HWPort0, HWPort1, HWPort2, HWPort3, HWPort4,
// Integer division issued on port 0.
def HWDivider : ProcResource<1>;
-// Loads are 4 cycles, so ReadAfterLd registers needn't be available until 4
+// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
// cycles after the memory operand.
-def : ReadAdvance<ReadAfterLd, 4>;
+def : ReadAdvance<ReadAfterLd, 5>;
// Many SchedWrites are defined in pairs with and without a folded load.
// Instructions with folded loads are usually micro-fused, so they only appear
@@ -85,10 +85,10 @@ multiclass HWWriteResPair<X86FoldableSchedWrite SchedRW,
// Register variant is using a single cycle on ExePort.
def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
- // Memory variant also uses a cycle on port 2/3 and adds 4 cycles to the
+ // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
// latency.
def : WriteRes<SchedRW.Folded, [HWPort23, ExePort]> {
- let Latency = !add(Lat, 4);
+ let Latency = !add(Lat, 5);
}
}
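As a rough illustration of the pair, a single instantiation such as the one
below (a minimal sketch, assuming the WriteFAdd/WriteFAddLd write pair defined
in X86Schedule.td) expands into a register form with the given latency and a
folded-load form that adds one HWPort23 uop plus the 5-cycle load latency:

  defm : HWWriteResPair<WriteFAdd, HWPort1, 3>;

  // Roughly what the multiclass emits (sketch; WriteFAdd/WriteFAddLd assumed
  // from X86Schedule.td, actual records are generated by TableGen):
  def : WriteRes<WriteFAdd,   [HWPort1]>           { let Latency = 3; }
  def : WriteRes<WriteFAddLd, [HWPort23, HWPort1]> { let Latency = 8; } // 3 + 5

On the consumer side, the ReadAdvance<ReadAfterLd, 5> above means an operand
marked ReadAfterLd sees an effective producer latency of roughly
max(0, L - 5), so results ready within the 5-cycle load window add no stall to
the micro-fused load-op form.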
@@ -99,7 +99,7 @@ def : WriteRes<WriteRMW, [HWPort4]>;
// Store_addr on 237.
// Store_data on 4.
def : WriteRes<WriteStore, [HWPort237, HWPort4]>;
-def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 4; }
+def : WriteRes<WriteLoad, [HWPort23]> { let Latency = 5; }
def : WriteRes<WriteMove, [HWPort0156]>;
def : WriteRes<WriteZero, []>;
@@ -134,6 +134,7 @@ defm : HWWriteResPair<WriteFSqrt, HWPort0, 15>;
defm : HWWriteResPair<WriteCvtF2I, HWPort1, 3>;
defm : HWWriteResPair<WriteCvtI2F, HWPort1, 4>;
defm : HWWriteResPair<WriteCvtF2F, HWPort1, 3>;
+defm : HWWriteResPair<WriteFMA, HWPort01, 5>;
defm : HWWriteResPair<WriteFShuffle, HWPort5, 1>;
defm : HWWriteResPair<WriteFBlend, HWPort015, 1>;
defm : HWWriteResPair<WriteFShuffle256, HWPort5, 3>;
@@ -434,31 +435,7 @@ def : InstRW<[WriteALULd], (instregex "MOV16rm")>;
// MOVSX, MOVZX.
// r,m.
-def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
-
-// CMOVcc.
-// r,r.
-def : InstRW<[Write2P0156_Lat2],
- (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd],
- (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>;
-
-// XCHG.
-// r,r.
-def WriteXCHG : SchedWriteRes<[HWPort0156]> {
- let Latency = 2;
- let ResourceCycles = [3];
-}
-
-def : InstRW<[WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
-
-// r,m.
-def WriteXCHGrm : SchedWriteRes<[]> {
- let Latency = 21;
- let NumMicroOps = 8;
-}
-def : InstRW<[WriteXCHGrm], (instregex "XCHG(8|16|32|64)rm")>;
+def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm8")>;
// XLAT.
def WriteXLAT : SchedWriteRes<[]> {
@@ -471,12 +448,6 @@ def : InstRW<[WriteXLAT], (instregex "XLAT")>;
// m.
def : InstRW<[Write2P237_P4], (instregex "PUSH(16|32)rmm")>;
-// PUSHF.
-def WritePushF : SchedWriteRes<[HWPort1, HWPort4, HWPort237, HWPort06]> {
- let NumMicroOps = 4;
-}
-def : InstRW<[WritePushF], (instregex "PUSHF(16|32)")>;
-
// PUSHA.
def WritePushA : SchedWriteRes<[]> {
let NumMicroOps = 19;
@@ -487,178 +458,14 @@ def : InstRW<[WritePushA], (instregex "PUSHA(16|32)")>;
// m.
def : InstRW<[Write2P237_P4], (instregex "POP(16|32)rmm")>;
-// POPF.
-def WritePopF : SchedWriteRes<[]> {
- let NumMicroOps = 9;
-}
-def : InstRW<[WritePopF], (instregex "POPF(16|32)")>;
-
// POPA.
def WritePopA : SchedWriteRes<[]> {
let NumMicroOps = 18;
}
def : InstRW<[WritePopA], (instregex "POPA(16|32)")>;
-// LAHF SAHF.
-def : InstRW<[WriteP06], (instregex "(S|L)AHF")>;
-
-// BSWAP.
-// r32.
-def WriteBSwap32 : SchedWriteRes<[HWPort15]>;
-def : InstRW<[WriteBSwap32], (instregex "BSWAP32r")>;
-
-// r64.
-def WriteBSwap64 : SchedWriteRes<[HWPort06, HWPort15]> {
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteBSwap64], (instregex "BSWAP64r")>;
-
-// MOVBE.
-// r16,m16 / r64,m64.
-def : InstRW<[Write2P0156_Lat2Ld], (instregex "MOVBE(16|64)rm")>;
-
-// r32, m32.
-def WriteMoveBE32rm : SchedWriteRes<[HWPort15, HWPort23]> {
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteMoveBE32rm], (instregex "MOVBE32rm")>;
-
-// m16,r16.
-def WriteMoveBE16mr : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMoveBE16mr], (instregex "MOVBE16mr")>;
-
-// m32,r32.
-def WriteMoveBE32mr : SchedWriteRes<[HWPort15, HWPort237, HWPort4]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMoveBE32mr], (instregex "MOVBE32mr")>;
-
-// m64,r64.
-def WriteMoveBE64mr : SchedWriteRes<[HWPort06, HWPort15, HWPort237, HWPort4]> {
- let NumMicroOps = 4;
-}
-def : InstRW<[WriteMoveBE64mr], (instregex "MOVBE64mr")>;
-
//-- Arithmetic instructions --//
-// ADD SUB.
-// m,r/i.
-def : InstRW<[Write2P0156_2P237_P4],
- (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
- "(ADD|SUB)(8|16|32|64)mi8", "(ADD|SUB)64mi32")>;
-
-// ADC SBB.
-// r,r/i.
-def : InstRW<[Write2P0156_Lat2], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)",
- "(ADC|SBB)(16|32|64)ri8",
- "(ADC|SBB)64ri32",
- "(ADC|SBB)(8|16|32|64)rr_REV")>;
-
-// r,m.
-def : InstRW<[Write2P0156_Lat2Ld, ReadAfterLd], (instregex "(ADC|SBB)(8|16|32|64)rm")>;
-
-// m,r/i.
-def : InstRW<[Write3P0156_2P237_P4],
- (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
- "(ADC|SBB)(16|32|64)mi8",
- "(ADC|SBB)64mi32")>;
-
-// INC DEC NOT NEG.
-// m.
-def : InstRW<[WriteP0156_2P237_P4],
- (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m",
- "(INC|DEC)64(16|32)m")>;
-
-// MUL IMUL.
-// r16.
-def WriteMul16 : SchedWriteRes<[HWPort1, HWPort0156]> {
- let Latency = 4;
- let NumMicroOps = 4;
-}
-def : InstRW<[WriteMul16], (instregex "IMUL16r", "MUL16r")>;
-
-// m16.
-def WriteMul16Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 5;
-}
-def : InstRW<[WriteMul16Ld], (instregex "IMUL16m", "MUL16m")>;
-
-// r32.
-def WriteMul32 : SchedWriteRes<[HWPort1, HWPort0156]> {
- let Latency = 4;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMul32], (instregex "IMUL32r", "MUL32r")>;
-
-// m32.
-def WriteMul32Ld : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 4;
-}
-def : InstRW<[WriteMul32Ld], (instregex "IMUL32m", "MUL32m")>;
-
-// r64.
-def WriteMul64 : SchedWriteRes<[HWPort1, HWPort6]> {
- let Latency = 3;
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteMul64], (instregex "IMUL64r", "MUL64r")>;
-
-// m64.
-def WriteMul64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
- let Latency = 7;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMul64Ld], (instregex "IMUL64m", "MUL64m")>;
-
-// r16,r16.
-def WriteMul16rri : SchedWriteRes<[HWPort1, HWPort0156]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>;
-
-// r16,m16.
-def WriteMul16rmi : SchedWriteRes<[HWPort1, HWPort0156, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMul16rmi], (instregex "IMUL16rmi", "IMUL16rmi8")>;
-
-// MULX.
-// r32,r32,r32.
-def WriteMulX32 : SchedWriteRes<[HWPort1, HWPort056]> {
- let Latency = 4;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
-}
-def : InstRW<[WriteMulX32], (instregex "MULX32rr")>;
-
-// r32,r32,m32.
-def WriteMulX32Ld : SchedWriteRes<[HWPort1, HWPort056, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 2, 1];
-}
-def : InstRW<[WriteMulX32Ld], (instregex "MULX32rm")>;
-
-// r64,r64,r64.
-def WriteMulX64 : SchedWriteRes<[HWPort1, HWPort6]> {
- let Latency = 4;
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteMulX64], (instregex "MULX64rr")>;
-
-// r64,r64,m64.
-def WriteMulX64Ld : SchedWriteRes<[HWPort1, HWPort6, HWPort23]> {
- let Latency = 8;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteMulX64Ld], (instregex "MULX64rm")>;
-
// DIV.
// r8.
def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
@@ -667,27 +474,6 @@ def WriteDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
}
def : InstRW<[WriteDiv8], (instregex "DIV8r")>;
-// r16.
-def WriteDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 23;
- let NumMicroOps = 10;
-}
-def : InstRW<[WriteDiv16], (instregex "DIV16r")>;
-
-// r32.
-def WriteDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 22;
- let NumMicroOps = 10;
-}
-def : InstRW<[WriteDiv32], (instregex "DIV32r")>;
-
-// r64.
-def WriteDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 32;
- let NumMicroOps = 36;
-}
-def : InstRW<[WriteDiv64], (instregex "DIV64r")>;
-
// IDIV.
// r8.
def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
@@ -696,259 +482,23 @@ def WriteIDiv8 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
}
def : InstRW<[WriteIDiv8], (instregex "IDIV8r")>;
-// r16.
-def WriteIDiv16 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 23;
- let NumMicroOps = 10;
-}
-def : InstRW<[WriteIDiv16], (instregex "IDIV16r")>;
-
-// r32.
-def WriteIDiv32 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 22;
- let NumMicroOps = 9;
-}
-def : InstRW<[WriteIDiv32], (instregex "IDIV32r")>;
-
-// r64.
-def WriteIDiv64 : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort6]> {
- let Latency = 39;
- let NumMicroOps = 59;
-}
-def : InstRW<[WriteIDiv64], (instregex "IDIV64r")>;
-
-//-- Logic instructions --//
-
-// AND OR XOR.
-// m,r/i.
-def : InstRW<[Write2P0156_2P237_P4],
- (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
- "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
-
-// SHR SHL SAR.
-// m,i.
-def WriteShiftRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
-}
-def : InstRW<[WriteShiftRMW], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
-
-// r,cl.
-def : InstRW<[Write3P06_Lat2], (instregex "S(A|H)(R|L)(8|16|32|64)rCL")>;
-
-// m,cl.
-def WriteShiftClLdRMW : SchedWriteRes<[HWPort06, HWPort23, HWPort4]> {
- let NumMicroOps = 6;
- let ResourceCycles = [3, 2, 1];
-}
-def : InstRW<[WriteShiftClLdRMW], (instregex "S(A|H)(R|L)(8|16|32|64)mCL")>;
-
-// ROR ROL.
-// r,1.
-def : InstRW<[Write2P06], (instregex "RO(R|L)(8|16|32|64)r1")>;
-
-// m,i.
-def WriteRotateRMW : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
- let NumMicroOps = 5;
- let ResourceCycles = [2, 2, 1];
-}
-def : InstRW<[WriteRotateRMW], (instregex "RO(R|L)(8|16|32|64)mi")>;
-
-// r,cl.
-def : InstRW<[Write3P06_Lat2], (instregex "RO(R|L)(8|16|32|64)rCL")>;
-
-// m,cl.
-def WriteRotateRMWCL : SchedWriteRes<[]> {
- let NumMicroOps = 6;
-}
-def : InstRW<[WriteRotateRMWCL], (instregex "RO(R|L)(8|16|32|64)mCL")>;
-
-// RCR RCL.
-// r,1.
-def WriteRCr1 : SchedWriteRes<[HWPort06, HWPort0156]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
-}
-def : InstRW<[WriteRCr1], (instregex "RC(R|L)(8|16|32|64)r1")>;
-
-// m,1.
-def WriteRCm1 : SchedWriteRes<[]> {
- let NumMicroOps = 6;
-}
-def : InstRW<[WriteRCm1], (instregex "RC(R|L)(8|16|32|64)m1")>;
-
-// r,i.
-def WriteRCri : SchedWriteRes<[HWPort0156]> {
- let Latency = 6;
- let NumMicroOps = 8;
-}
-def : InstRW<[WriteRCri], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>;
-
-// m,i.
-def WriteRCmi : SchedWriteRes<[]> {
- let NumMicroOps = 11;
-}
-def : InstRW<[WriteRCmi], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>;
-
-// SHRD SHLD.
-// r,r,i.
-def WriteShDrr : SchedWriteRes<[HWPort1]> {
- let Latency = 3;
-}
-def : InstRW<[WriteShDrr], (instregex "SH(R|L)D(16|32|64)rri8")>;
-
-// m,r,i.
-def WriteShDmr : SchedWriteRes<[]> {
- let NumMicroOps = 5;
-}
-def : InstRW<[WriteShDmr], (instregex "SH(R|L)D(16|32|64)mri8")>;
-
-// r,r,cl.
-def WriteShlDCL : SchedWriteRes<[HWPort0156]> {
- let Latency = 3;
- let NumMicroOps = 4;
-}
-def : InstRW<[WriteShlDCL], (instregex "SHLD(16|32|64)rrCL")>;
-
-// r,r,cl.
-def WriteShrDCL : SchedWriteRes<[HWPort0156]> {
- let Latency = 4;
- let NumMicroOps = 4;
-}
-def : InstRW<[WriteShrDCL], (instregex "SHRD(16|32|64)rrCL")>;
-
-// m,r,cl.
-def WriteShDmrCL : SchedWriteRes<[]> {
- let NumMicroOps = 7;
-}
-def : InstRW<[WriteShDmrCL], (instregex "SH(R|L)D(16|32|64)mrCL")>;
-
// BT.
-// r,r/i.
-def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>;
-
// m,r.
def WriteBTmr : SchedWriteRes<[]> {
let NumMicroOps = 10;
}
def : InstRW<[WriteBTmr], (instregex "BT(16|32|64)mr")>;
-// m,i.
-def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
-
// BTR BTS BTC.
-// r,r,i.
-def : InstRW<[WriteShift], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
-
// m,r.
def WriteBTRSCmr : SchedWriteRes<[]> {
let NumMicroOps = 11;
}
def : InstRW<[WriteBTRSCmr], (instregex "BT(R|S|C)(16|32|64)mr")>;
-// m,i.
-def : InstRW<[WriteShiftLd], (instregex "BT(R|S|C)(16|32|64)mi8")>;
-
-// BSF BSR.
-// r,r.
-def : InstRW<[WriteP1_Lat3], (instregex "BS(R|F)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP1_Lat3Ld], (instregex "BS(R|F)(16|32|64)rm")>;
-
-// SETcc.
-// r.
-def : InstRW<[WriteShift],
- (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>;
-// m.
-def WriteSetCCm : SchedWriteRes<[HWPort06, HWPort237, HWPort4]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteSetCCm],
- (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>;
-
-// CLD STD.
-def WriteCldStd : SchedWriteRes<[HWPort15, HWPort6]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteCldStd], (instregex "STD", "CLD")>;
-
-// LZCNT TZCNT.
-// r,r.
-def : InstRW<[WriteP1_Lat3], (instregex "(L|TZCNT)(16|32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP1_Lat3Ld], (instregex "(L|TZCNT)(16|32|64)rm")>;
-
-// ANDN.
-// r,r.
-def : InstRW<[WriteP15], (instregex "ANDN(32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP15Ld], (instregex "ANDN(32|64)rm")>;
-
-// BLSI BLSMSK BLSR.
-// r,r.
-def : InstRW<[WriteP15], (instregex "BLS(I|MSK|R)(32|64)rr")>;
-// r,m.
-def : InstRW<[WriteP15Ld], (instregex "BLS(I|MSK|R)(32|64)rm")>;
-
-// BEXTR.
-// r,r,r.
-def : InstRW<[Write2P0156_Lat2], (instregex "BEXTR(32|64)rr")>;
-// r,m,r.
-def : InstRW<[Write2P0156_Lat2Ld], (instregex "BEXTR(32|64)rm")>;
-
-// BZHI.
-// r,r,r.
-def : InstRW<[WriteP15], (instregex "BZHI(32|64)rr")>;
-// r,m,r.
-def : InstRW<[WriteP15Ld], (instregex "BZHI(32|64)rm")>;
-
-// PDEP PEXT.
-// r,r,r.
-def : InstRW<[WriteP1_Lat3], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
-// r,m,r.
-def : InstRW<[WriteP1_Lat3Ld], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
-
//-- Control transfer instructions --//
-// J(E|R)CXZ.
-def WriteJCXZ : SchedWriteRes<[HWPort0156, HWPort6]> {
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>;
-
-// LOOP.
-def WriteLOOP : SchedWriteRes<[]> {
- let NumMicroOps = 7;
-}
-def : InstRW<[WriteLOOP], (instregex "LOOP")>;
-
-// LOOP(N)E
-def WriteLOOPE : SchedWriteRes<[]> {
- let NumMicroOps = 11;
-}
-def : InstRW<[WriteLOOPE], (instregex "LOOPE", "LOOPNE")>;
-
// CALL.
-// r.
-def WriteCALLr : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteCALLr], (instregex "CALL(16|32)r")>;
-
-// m.
-def WriteCALLm : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
-}
-def : InstRW<[WriteCALLm], (instregex "CALL(16|32)m")>;
-
-// RET.
-def WriteRET : SchedWriteRes<[HWPort237, HWPort6]> {
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)")>;
-
// i.
def WriteRETI : SchedWriteRes<[HWPort23, HWPort6, HWPort015]> {
let NumMicroOps = 4;
@@ -977,12 +527,6 @@ def : InstRW<[Write2P0156_P23], (instregex "LODS(B|W)")>;
// LODSD/Q.
def : InstRW<[WriteP0156_P23], (instregex "LODS(L|Q)")>;
-// STOS.
-def WriteSTOS : SchedWriteRes<[HWPort23, HWPort0156, HWPort4]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteSTOS], (instregex "STOS(B|L|Q|W)")>;
-
// MOVS.
def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
let Latency = 4;
@@ -991,9 +535,6 @@ def WriteMOVS : SchedWriteRes<[HWPort23, HWPort4, HWPort0156]> {
}
def : InstRW<[WriteMOVS], (instregex "MOVS(B|L|Q|W)")>;
-// SCAS.
-def : InstRW<[Write2P0156_P23], (instregex "SCAS(B|W|L|Q)")>;
-
// CMPS.
def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
let Latency = 4;
@@ -1002,57 +543,9 @@ def WriteCMPS : SchedWriteRes<[HWPort23, HWPort0156]> {
}
def : InstRW<[WriteCMPS], (instregex "CMPS(B|L|Q|W)")>;
-//-- Synchronization instructions --//
-
-// XADD.
-def WriteXADD : SchedWriteRes<[]> {
- let NumMicroOps = 5;
-}
-def : InstRW<[WriteXADD], (instregex "XADD(8|16|32|64)rm")>;
-
-// CMPXCHG.
-def WriteCMPXCHG : SchedWriteRes<[]> {
- let NumMicroOps = 6;
-}
-def : InstRW<[WriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
-
-// CMPXCHG8B.
-def WriteCMPXCHG8B : SchedWriteRes<[]> {
- let NumMicroOps = 15;
-}
-def : InstRW<[WriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
-
-// CMPXCHG16B.
-def WriteCMPXCHG16B : SchedWriteRes<[]> {
- let NumMicroOps = 22;
-}
-def : InstRW<[WriteCMPXCHG16B], (instregex "CMPXCHG16B")>;
-
//-- Other --//
-// PAUSE.
-def WritePAUSE : SchedWriteRes<[HWPort05, HWPort6]> {
- let NumMicroOps = 5;
- let ResourceCycles = [1, 3];
-}
-def : InstRW<[WritePAUSE], (instregex "PAUSE")>;
-
-// LEAVE.
-def : InstRW<[Write2P0156_P23], (instregex "LEAVE")>;
-
-// XGETBV.
-def WriteXGETBV : SchedWriteRes<[]> {
- let NumMicroOps = 8;
-}
-def : InstRW<[WriteXGETBV], (instregex "XGETBV")>;
-
-// RDTSC.
-def WriteRDTSC : SchedWriteRes<[]> {
- let NumMicroOps = 15;
-}
-def : InstRW<[WriteRDTSC], (instregex "RDTSC")>;
-
-// RDPMC.
+// RDPMC.
def WriteRDPMC : SchedWriteRes<[]> {
let NumMicroOps = 34;
}
@@ -1072,13 +565,6 @@ def : InstRW<[WriteRDRAND], (instregex "RDRAND(16|32|64)r")>;
// m80.
def : InstRW<[WriteP01], (instregex "LD_Frr")>;
-def WriteLD_F80m : SchedWriteRes<[HWPort01, HWPort23]> {
- let Latency = 4;
- let NumMicroOps = 4;
- let ResourceCycles = [2, 2];
-}
-def : InstRW<[WriteLD_F80m], (instregex "LD_F80m")>;
-
// FBLD.
// m80.
def WriteFBLD : SchedWriteRes<[]> {
@@ -1091,84 +577,12 @@ def : InstRW<[WriteFBLD], (instregex "FBLDm")>;
// r.
def : InstRW<[WriteP01], (instregex "ST_(F|FP)rr")>;
-// m80.
-def WriteST_FP80m : SchedWriteRes<[HWPort0156, HWPort23, HWPort4]> {
- let NumMicroOps = 7;
- let ResourceCycles = [3, 2, 2];
-}
-def : InstRW<[WriteST_FP80m], (instregex "ST_FP80m")>;
-
-// FBSTP.
-// m80.
-def WriteFBSTP : SchedWriteRes<[]> {
- let NumMicroOps = 226;
-}
-def : InstRW<[WriteFBSTP], (instregex "FBSTPm")>;
-
-// FXCHG.
-def : InstRW<[WriteNop], (instregex "XCH_F")>;
-
-// FILD.
-def WriteFILD : SchedWriteRes<[HWPort01, HWPort23]> {
- let Latency = 6;
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteFILD], (instregex "ILD_F(16|32|64)m")>;
-
-// FIST(P) FISTTP.
-def WriteFIST : SchedWriteRes<[HWPort1, HWPort23, HWPort4]> {
- let Latency = 7;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteFIST], (instregex "IST_(F|FP)(16|32)m")>;
-
// FLDZ.
def : InstRW<[WriteP01], (instregex "LD_F0")>;
-// FLD1.
-def : InstRW<[Write2P01], (instregex "LD_F1")>;
-
// FLDPI FLDL2E etc.
def : InstRW<[Write2P01], (instregex "FLDPI", "FLDL2(T|E)" "FLDL(G|N)2")>;
-// FCMOVcc.
-def WriteFCMOVcc : SchedWriteRes<[HWPort0, HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
-}
-def : InstRW<[WriteFCMOVcc], (instregex "CMOV(B|BE|P|NB|NBE|NE|NP)_F")>;
-
-// FNSTSW.
-// AX.
-def WriteFNSTSW : SchedWriteRes<[HWPort0, HWPort0156]> {
- let NumMicroOps = 2;
-}
-def : InstRW<[WriteFNSTSW], (instregex "FNSTSW16r")>;
-
-// m16.
-def WriteFNSTSWm : SchedWriteRes<[HWPort0, HWPort4, HWPort237]> {
- let Latency = 6;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteFNSTSWm], (instregex "FNSTSWm")>;
-
-// FLDCW.
-def WriteFLDCW : SchedWriteRes<[HWPort01, HWPort23, HWPort6]> {
- let Latency = 7;
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteFLDCW], (instregex "FLDCW16m")>;
-
-// FNSTCW.
-def WriteFNSTCW : SchedWriteRes<[HWPort237, HWPort4, HWPort6]> {
- let NumMicroOps = 3;
-}
-def : InstRW<[WriteFNSTCW], (instregex "FNSTCW16m")>;
-
-// FINCSTP FDECSTP.
-def : InstRW<[WriteP01], (instregex "FINCSTP", "FDECSTP")>;
-
// FFREE.
def : InstRW<[WriteP01], (instregex "FFREE")>;
@@ -1192,13 +606,6 @@ def : InstRW<[WriteP0], (instregex "ABS_F")>;
// FCHS.
def : InstRW<[WriteP0], (instregex "CHS_F")>;
-// FCOM(P) FUCOM(P).
-// r.
-def : InstRW<[WriteP1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr",
- "UCOM_FPr")>;
-// m.
-def : InstRW<[WriteP1_P23], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>;
-
// FCOMPP FUCOMPP.
// r.
def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
@@ -1208,9 +615,6 @@ def : InstRW<[Write2P01], (instregex "FCOMPP", "UCOM_FPPr")>;
def : InstRW<[Write3P01], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
"UCOM_FIPr")>;
-// FICOM(P).
-def : InstRW<[Write2P1_P23], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>;
-
// FTST.
def : InstRW<[WriteP1], (instregex "TST_F")>;
@@ -1271,910 +675,3693 @@ def WriteFNINIT : SchedWriteRes<[]> {
}
def : InstRW<[WriteFNINIT], (instregex "FNINIT")>;
-//=== Integer MMX and XMM Instructions ===//
-//-- Move instructions --//
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
-// MOVD.
-// r32/64 <- (x)mm.
-def : InstRW<[WriteP0], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr",
- "VMOVPDI2DIrr", "MOVPDI2DIrr")>;
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2];
+}
-// (x)mm <- r32/64.
-def : InstRW<[WriteP5], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr",
- "VMOVDI2PDIrr", "MOVDI2PDIrr")>;
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1, 2, 1];
+}
-// MOVQ.
-// r64 <- (x)mm.
-def : InstRW<[WriteP0], (instregex "VMOVPQIto64rr")>;
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2];
+}
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1, 2, 1];
+}
-// (x)mm <- r64.
-def : InstRW<[WriteP5], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>;
+//=== Floating Point XMM and YMM Instructions ===//
-// (x)mm <- (x)mm.
-def : InstRW<[WriteP015], (instregex "MMX_MOVQ64rr")>;
+// Remaining instrs.
-// (V)MOVDQA/U.
-// x <- x.
-def : InstRW<[WriteP015], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr",
- "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV",
- "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>;
+def HWWriteResGroup0 : SchedWriteRes<[HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup0], (instregex "LDDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVAPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVAPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVNTDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSHDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVSLDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVUPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "MOVUPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VLDDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVAPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVDQUrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVNTDQArm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSHDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVSLDUPrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VMOVUPSrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTDrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VPBROADCASTQrm")>;
+def: InstRW<[HWWriteResGroup0], (instregex "ROUNDPDr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "ROUNDPSr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "ROUNDSDr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "ROUNDSSr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VROUNDPDr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VROUNDPSr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VROUNDSDr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VROUNDSSr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VROUNDYPDr")>;
+def: InstRW<[HWWriteResGroup0], (instregex "VROUNDYPSr")>;
+
+def HWWriteResGroup0_1 : SchedWriteRes<[HWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F32m")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F64m")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "LD_F80m")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTF128")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTI128")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTSDYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VBROADCASTSSYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VLDDQUYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVAPDYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVAPSYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDDUPYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDQAYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVDQUYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVNTDQAYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVSHDUPYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVSLDUPYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVUPDYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VMOVUPSYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VPBROADCASTDYrm")>;
+def: InstRW<[HWWriteResGroup0_1], (instregex "VPBROADCASTQYrm")>;
+
+def HWWriteResGroup0_2 : SchedWriteRes<[HWPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64rm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVD64to64rm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MMX_MOVQ64rm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOV(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOV64toPQIrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOV8rm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVDDUPrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVDI2PDIrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVQI2PQIrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSDrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSSrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm16")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm32")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVSX(16|32|64)rm8")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVZX(16|32|64)rm16")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "MOVZX(16|32|64)rm8")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHNTA")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT0")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT1")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "PREFETCHT2")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "VMOV64toPQIrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVDDUPrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVDI2PDIrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVQI2PQIrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVSDrm")>;
+def: InstRW<[HWWriteResGroup0_2], (instregex "VMOVSSrm")>;
+
+def HWWriteResGroup1 : SchedWriteRes<[HWPort4,HWPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup1], (instregex "FBSTPm")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVD64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVNTQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MMX_MOVQ64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV8mi")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOV8mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVAPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVAPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVDQAmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVDQUmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVHPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVHPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVLPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVLPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTDQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTI_64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVNTPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPDI2DImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPQI2QImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVPQIto64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVSDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVSSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVUPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "MOVUPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "ST_FP32m")>;
+def: InstRW<[HWWriteResGroup1], (instregex "ST_FP64m")>;
+def: InstRW<[HWWriteResGroup1], (instregex "ST_FP80m")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VEXTRACTI128mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVAPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQAmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVDQUmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVHPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVLPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTDQmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVNTPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPDI2DImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQI2QImr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVPQIto64mr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVSDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVSSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPDmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSYmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMOVUPSmr")>;
+def: InstRW<[HWWriteResGroup1], (instregex "VMPTRSTm")>;
+
+def HWWriteResGroup2 : SchedWriteRes<[HWPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_MOVD64grr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PMOVMSKBrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSLLWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRADrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRAWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MMX_PSRLWrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "MOVPQIto64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "PSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VMOVPQIto64rr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLVQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSLLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRADYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRADri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRAWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLDri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLQri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLVQrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWYri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VPSRLWri")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPDrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSYrr")>;
+def: InstRW<[HWWriteResGroup2], (instregex "VTESTPSrr")>;
+
+def HWWriteResGroup3 : SchedWriteRes<[HWPort1]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup3], (instregex "COMP_FST0r")>;
+def: InstRW<[HWWriteResGroup3], (instregex "COM_FST0r")>;
+def: InstRW<[HWWriteResGroup3], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[HWWriteResGroup3], (instregex "UCOM_FPr")>;
+def: InstRW<[HWWriteResGroup3], (instregex "UCOM_Fr")>;
+def: InstRW<[HWWriteResGroup3], (instregex "VMASKMOVDQU")>;
+
+def HWWriteResGroup4 : SchedWriteRes<[HWPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup4], (instregex "ANDNPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDNPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ANDPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "INSERTPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64rr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVD64to64rr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PALIGNR64irr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFBrr64")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PSHUFWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHBWirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHDQirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKHWDirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLBWirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLDQirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MMX_PUNPCKLWDirr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOV64toPQIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVAPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVAPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVDDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVDI2PDIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVHLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVLHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSHDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSLDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVSSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVUPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "MOVUPSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "ORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKSSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKSSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKUSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PACKUSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PALIGNRrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PBLENDWrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVSXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PMOVZXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFHWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSHUFLWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSLLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PSRLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKHWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "PUNPCKLWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "SHUFPDrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "SHUFPSrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "UNPCKLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDNPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VANDPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VBROADCASTSSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VINSERTPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVAPSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVDI2PDIrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVHLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVLHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSHDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSLDUPrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVSSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPDrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VMOVUPSrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKSSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSDWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPACKUSWBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPALIGNRrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBLENDWrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPBROADCASTQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPERMILPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVSXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPMOVZXWQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFBrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFDri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFHWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSHUFLWri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSLLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQYri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPSRLDQri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKHWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLBWrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLQDQrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VPUNPCKLWDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPDrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSYrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VSHUFPSrri")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKHPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPDYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPSYrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "VXORPSrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "XORPDrr")>;
+def: InstRW<[HWWriteResGroup4], (instregex "XORPSrr")>;
+
+def HWWriteResGroup5 : SchedWriteRes<[HWPort6]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup5], (instregex "JMP(16|32|64)r")>;
-// MOVDQ2Q.
-def : InstRW<[WriteP01_P5], (instregex "MMX_MOVDQ2Qrr")>;
+def HWWriteResGroup6 : SchedWriteRes<[HWPort01]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup6], (instregex "FINCSTP")>;
+def: InstRW<[HWWriteResGroup6], (instregex "FNOP")>;
-// MOVQ2DQ.
-def : InstRW<[WriteP015], (instregex "MMX_MOVQ2DQrr")>;
+def HWWriteResGroup7 : SchedWriteRes<[HWPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BT(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
+def: InstRW<[HWWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "CDQ")>;
+def: InstRW<[HWWriteResGroup7], (instregex "CQO")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JAE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JAE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JA_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JA_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JBE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JBE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JB_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JB_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JGE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JGE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JG_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JG_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JLE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JLE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JL_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JL_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JMP_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JMP_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNE_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNE_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNO_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNO_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNP_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNP_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNS_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JNS_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JO_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JO_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JP_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JP_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JS_1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "JS_4")>;
+def: InstRW<[HWWriteResGroup7], (instregex "RORX(32|64)ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SAR8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SARX(32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETAEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETBr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETGEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETGr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETLEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETLr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNEr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNOr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNPr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETNSr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETOr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETPr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SETSr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHL8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHLX(32|64)rr")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR8r1")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHR8ri")>;
+def: InstRW<[HWWriteResGroup7], (instregex "SHRX(32|64)rr")>;
+
+def HWWriteResGroup8 : SchedWriteRes<[HWPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup8], (instregex "ANDN(32|64)rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSI(32|64)rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSMSK(32|64)rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BLSR(32|64)rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "BZHI(32|64)rr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSBrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSDrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PABSWrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDQirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDUSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PADDWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PAVGWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPEQWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PCMPGTWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMAXUBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PMINUBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNBrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNDrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSIGNWrr64")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBDirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBQirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSBirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBUSWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "MMX_PSUBWirr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PABSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PABSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PABSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PADDWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PAVGBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PAVGWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPEQWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PCMPGTWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMAXUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PMINUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSIGNBrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSIGNDrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSIGNWrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "PSUBWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPABSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDQYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPADDWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPAVGWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPEQWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPCMPGTWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMAXUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPMINUWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBYrr256")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNBrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDYrr256")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNDrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWYrr256")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSIGNWrr128")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBDrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBQrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSBrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBUSWrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWYrr")>;
+def: InstRW<[HWWriteResGroup8], (instregex "VPSUBWrr")>;
+
+def HWWriteResGroup9 : SchedWriteRes<[HWPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup9], (instregex "BLENDPDrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "BLENDPSrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_MOVQ64rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDNirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PANDirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PORirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MMX_PXORirr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "MOVPQI2QIrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PANDNrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PANDrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "PXORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDYrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPDrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSYrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VBLENDPSrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDNYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDNrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPANDrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDYrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPBLENDDrri")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPORYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPORrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPXORYrr")>;
+def: InstRW<[HWWriteResGroup9], (instregex "VPXORrr")>;
+
+def HWWriteResGroup10 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "AND8rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CBW")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CLC")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMC")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "CWDE")>;
+def: InstRW<[HWWriteResGroup10], (instregex "DEC(16|32|64)r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "DEC8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "INC(16|32|64)r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "INC8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "LAHF")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>;
+def: InstRW<[HWWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NEG(16|32|64)r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NEG8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NOOP")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NOT(16|32|64)r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "NOT8r")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "OR8rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SAHF")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SGDT64m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SIDT64m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SLDT64m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SMSW16m")>;
+def: InstRW<[HWWriteResGroup10], (instregex "STC")>;
+def: InstRW<[HWWriteResGroup10], (instregex "STRm")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup10], (instregex "SYSCALL")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "TEST8rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR8i8")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR8ri")>;
+def: InstRW<[HWWriteResGroup10], (instregex "XOR8rr")>;
+
+def HWWriteResGroup11 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup11], (instregex "CVTPS2PDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLQrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSLLWrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRADrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRAWrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLDrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLQrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "MMX_PSRLWrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VCVTPH2PSrm")>;
+def: InstRW<[HWWriteResGroup11], (instregex "VCVTPS2PDrm")>;
+
+def HWWriteResGroup11_1 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup11_1], (instregex "CVTSS2SDrm")>;
+def: InstRW<[HWWriteResGroup11_1], (instregex "VCVTPH2PSYrm")>;
+def: InstRW<[HWWriteResGroup11_1], (instregex "VCVTSS2SDrm")>;
+def: InstRW<[HWWriteResGroup11_1], (instregex "VPSLLVQrm")>;
+def: InstRW<[HWWriteResGroup11_1], (instregex "VPSRLVQrm")>;
+def: InstRW<[HWWriteResGroup11_1], (instregex "VTESTPDrm")>;
+def: InstRW<[HWWriteResGroup11_1], (instregex "VTESTPSrm")>;
+
+def HWWriteResGroup11_2 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLDYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLQYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLVQYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSLLWYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRADYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRAWYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLDYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLQYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLVQYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VPSRLWYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VTESTPDYrm")>;
+def: InstRW<[HWWriteResGroup11_2], (instregex "VTESTPSYrm")>;
+
+def HWWriteResGroup12 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup12], (instregex "ADDSDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "ADDSSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "BSF(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "BSR(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "CMPSDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "CMPSSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "COMISDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "COMISSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "FCOM32m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "FCOM64m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "FCOMP32m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "FCOMP64m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "IMUL(16|32|64)rm(i8)?")>;
+def: InstRW<[HWWriteResGroup12], (instregex "IMUL8m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "LZCNT(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MAX(C?)SDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MAX(C?)SSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MIN(C?)SDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MIN(C?)SSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPI2PSirm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTPS2PIirm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MMX_CVTTPS2PIirm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MUL(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "MUL8m")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PDEP(32|64)rm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "PEXT(32|64)rm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "POPCNT(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "SUBSDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "SUBSSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "TZCNT(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "UCOMISDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "UCOMISSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VADDSDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VADDSSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VCMPSDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VCMPSSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VCOMISDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VCOMISSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VMAX(C?)SDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VMAX(C?)SSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VMIN(C?)SDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VMIN(C?)SSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VSUBSDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VSUBSSrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VUCOMISDrm")>;
+def: InstRW<[HWWriteResGroup12], (instregex "VUCOMISSrm")>;
+
+def HWWriteResGroup13 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup13], (instregex "ANDNPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "ANDNPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "ANDPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "ANDPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "INSERTPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "ORPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "ORPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PACKSSDWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PACKSSWBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PACKUSDWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PACKUSWBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PALIGNRrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PBLENDWrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PSHUFBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PSHUFDmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PSHUFHWmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PSHUFLWmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHQDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKHWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLQDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "PUNPCKLWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "SHUFPDrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "SHUFPSrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "UNPCKHPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "UNPCKLPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDNPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDNPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VANDPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VINSERTPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VORPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VORPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSDWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKSSWBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSDWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPACKUSWBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPALIGNRrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPBLENDWrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPERMILPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFBrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFDmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFHWmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPSHUFLWmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHQDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKHWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLBWrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLQDQrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VPUNPCKLWDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPDrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VSHUFPSrmi")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKHPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VUNPCKLPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VXORPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "VXORPSrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "XORPDrm")>;
+def: InstRW<[HWWriteResGroup13], (instregex "XORPSrm")>;
+
+def HWWriteResGroup13_1 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup13_1], (instregex "VANDNPDYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VANDNPSYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VANDPDYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VANDPSYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VORPDYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VORPSYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKSSDWYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKSSWBYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKUSDWYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPACKUSWBYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPALIGNRYrmi")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPBLENDWYrmi")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPDYmi")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPDYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPSYmi")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPERMILPSYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBDYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXBQYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPMOVSXWQYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFBYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFDYmi")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFHWYmi")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPSHUFLWYmi")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHBWYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHDQYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHQDQYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKHWDYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLBWYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLDQYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLQDQYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VPUNPCKLWDYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VSHUFPDYrmi")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VSHUFPSYrmi")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKHPDYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKHPSYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKLPDYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VUNPCKLPSYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VXORPDYrm")>;
+def: InstRW<[HWWriteResGroup13_1], (instregex "VXORPSYrm")>;
+
+def HWWriteResGroup13_2 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PALIGNR64irm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PINSRWirmi")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFBrm64")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PSHUFWmi")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHBWirm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHDQirm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKHWDirm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLBWirm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLDQirm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MMX_PUNPCKLWDirm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MOVHPDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MOVHPSrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MOVLPDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "MOVLPSrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRBrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PINSRWrmi")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXBWrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXDQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXWDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVSXWQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXBWrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXDQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXWDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "PMOVZXWQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVHPDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVHPSrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVLPDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VMOVLPSrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRBrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPINSRWrmi")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXBWrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXDQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXWDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVSXWQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXBWrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXDQrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXWDrm")>;
+def: InstRW<[HWWriteResGroup13_2], (instregex "VPMOVZXWQrm")>;
+
+def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup14], (instregex "FARJMP64")>;
+def: InstRW<[HWWriteResGroup14], (instregex "JMP(16|32|64)m")>;
-// PACKSSWB/DW.
-// mm <- mm.
-def WriteMMXPACKSSrr : SchedWriteRes<[HWPort5]> {
+def HWWriteResGroup15 : SchedWriteRes<[HWPort23,HWPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup15], (instregex "BT(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup15], (instregex "RORX32mi")>;
+def: InstRW<[HWWriteResGroup15], (instregex "RORX64mi")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SARX32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SARX64rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SHLX32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SHLX64rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SHRX32rm")>;
+def: InstRW<[HWWriteResGroup15], (instregex "SHRX64rm")>;
+
+def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup16], (instregex "ANDN(32|64)rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BLSI(32|64)rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BLSMSK(32|64)rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BLSR(32|64)rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "BZHI(32|64)rm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSBrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSDrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PABSWrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDDirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDQirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDUSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PADDWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PAVGWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQDirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPEQWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTDirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PCMPGTWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMAXUBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PMINUBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNBrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNDrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSIGNWrm64")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBDirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBQirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSBirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBUSWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MMX_PSUBWirm")>;
+def: InstRW<[HWWriteResGroup16], (instregex "MOVBE(16|32|64)rm")>;
+
+def HWWriteResGroup16_1 : SchedWriteRes<[HWPort23,HWPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup16_1], (instregex "PABSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PABSDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PABSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PADDBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PADDDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PADDQrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PADDSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PADDSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PADDUSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PADDUSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PADDWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PAVGBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PAVGWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQQrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPEQWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PCMPGTWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMAXUWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMINSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PMINUWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNBrm128")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNDrm128")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSIGNWrm128")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBQrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBUSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBUSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "PSUBWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPABSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDQrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDUSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDUSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPADDWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPAVGBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPAVGWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQQrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPEQWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPCMPGTWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMAXUWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPMINUWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNBrm128")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNDrm128")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSIGNWrm128")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBDrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBQrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBUSBrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBUSWrm")>;
+def: InstRW<[HWWriteResGroup16_1], (instregex "VPSUBWrm")>;
+
+def HWWriteResGroup16_2 : SchedWriteRes<[HWPort23,HWPort15]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSDYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPABSWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDDYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDQYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDSBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDSWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDUSBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDUSWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPADDWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPAVGBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPAVGWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQDYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQQYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPEQWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTDYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPCMPGTWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSDYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXSWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUDYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMAXUWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSDYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINSWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUDYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPMINUWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNBYrm256")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNDYrm256")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSIGNWYrm256")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBDYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBQYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBSBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBSWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBUSBYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBUSWYrm")>;
+def: InstRW<[HWWriteResGroup16_2], (instregex "VPSUBWYrm")>;
+
+def HWWriteResGroup17 : SchedWriteRes<[HWPort23,HWPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup17], (instregex "BLENDPDrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "BLENDPSrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "PANDNrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "PANDrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "PORrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "PXORrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPDrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VBLENDPSrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VINSERTF128rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VINSERTI128rm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPANDNrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPANDrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPBLENDDrmi")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPORrm")>;
+def: InstRW<[HWWriteResGroup17], (instregex "VPXORrm")>;
+
+def HWWriteResGroup17_1 : SchedWriteRes<[HWPort23,HWPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PANDNirm")>;
+def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PANDirm")>;
+def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PORirm")>;
+def: InstRW<[HWWriteResGroup17_1], (instregex "MMX_PXORirm")>;
+
+def HWWriteResGroup17_2 : SchedWriteRes<[HWPort23,HWPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup17_2], (instregex "VBLENDPDYrmi")>;
+def: InstRW<[HWWriteResGroup17_2], (instregex "VBLENDPSYrmi")>;
+def: InstRW<[HWWriteResGroup17_2], (instregex "VPANDNYrm")>;
+def: InstRW<[HWWriteResGroup17_2], (instregex "VPANDYrm")>;
+def: InstRW<[HWWriteResGroup17_2], (instregex "VPBLENDDYrmi")>;
+def: InstRW<[HWWriteResGroup17_2], (instregex "VPORYrm")>;
+def: InstRW<[HWWriteResGroup17_2], (instregex "VPXORYrm")>;
+
+def HWWriteResGroup18 : SchedWriteRes<[HWPort23,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup18], (instregex "ADD(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "ADD8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "AND(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "AND8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP8mi")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP8mr")>;
+def: InstRW<[HWWriteResGroup18], (instregex "CMP8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "OR(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "OR8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "POP(16|32|64)r(mr)?")>;
+def: InstRW<[HWWriteResGroup18], (instregex "SUB(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "SUB8rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "TEST(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup18], (instregex "TEST8mi")>;
+def: InstRW<[HWWriteResGroup18], (instregex "TEST8mr")>;
+def: InstRW<[HWWriteResGroup18], (instregex "XOR(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup18], (instregex "XOR8rm")>;
+
+def HWWriteResGroup19 : SchedWriteRes<[HWPort237,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup19], (instregex "SFENCE")>;
+
+def HWWriteResGroup20 : SchedWriteRes<[HWPort4,HWPort5,HWPort237]> {
let Latency = 2;
let NumMicroOps = 3;
- let ResourceCycles = [3];
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup20], (instregex "EXTRACTPSmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "PEXTRBmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "PEXTRDmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "PEXTRQmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "PEXTRWmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "STMXCSR")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VEXTRACTPSmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRBmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRDmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRQmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VPEXTRWmr")>;
+def: InstRW<[HWWriteResGroup20], (instregex "VSTMXCSR")>;
+
+def HWWriteResGroup21 : SchedWriteRes<[HWPort4,HWPort6,HWPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteMMXPACKSSrr], (instregex "MMX_PACKSSDWirr",
- "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>;
+def: InstRW<[HWWriteResGroup21], (instregex "FNSTCW16m")>;
-// mm <- m64.
-def WriteMMXPACKSSrm : SchedWriteRes<[HWPort23, HWPort5]> {
- let Latency = 4;
+def HWWriteResGroup22 : SchedWriteRes<[HWPort4,HWPort237,HWPort06]> {
+ let Latency = 2;
let NumMicroOps = 3;
- let ResourceCycles = [1, 3];
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup22], (instregex "SETAEm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETBm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETEm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETGEm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETGm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETLEm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETLm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETNEm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETNOm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETNPm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETNSm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETOm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETPm")>;
+def: InstRW<[HWWriteResGroup22], (instregex "SETSm")>;
+
+def HWWriteResGroup23 : SchedWriteRes<[HWPort4,HWPort237,HWPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteMMXPACKSSrm], (instregex "MMX_PACKSSDWirm",
- "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
+def: InstRW<[HWWriteResGroup23], (instregex "MOVBE(32|64)mr")>;
-// VPMOVSX/ZX BW BD BQ DW DQ.
-// y <- x.
-def WriteVPMOVSX : SchedWriteRes<[HWPort5]> {
- let Latency = 3;
- let NumMicroOps = 1;
+def HWWriteResGroup23_16 : SchedWriteRes<[HWPort06,HWPort237,HWPort4]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVPMOVSX], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>;
+def: InstRW<[HWWriteResGroup23_16], (instregex "MOVBE16mr")>;
-// PBLENDW.
-// x,x,i / v,v,v,i
-def WritePBLENDWr : SchedWriteRes<[HWPort5]>;
-def : InstRW<[WritePBLENDWr], (instregex "(V?)PBLENDW(Y?)rri")>;
+def HWWriteResGroup24 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup24], (instregex "PUSH(16|32|64)r(mr)?")>;
+def: InstRW<[HWWriteResGroup24], (instregex "PUSH64i8")>;
+def: InstRW<[HWWriteResGroup24], (instregex "STOSB")>;
+def: InstRW<[HWWriteResGroup24], (instregex "STOSL")>;
+def: InstRW<[HWWriteResGroup24], (instregex "STOSQ")>;
+def: InstRW<[HWWriteResGroup24], (instregex "STOSW")>;
-// x,m,i / v,v,m,i
-def WritePBLENDWm : SchedWriteRes<[HWPort5, HWPort23]> {
+def HWWriteResGroup25 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup25], (instregex "BTC(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "BTR(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "BTS(16|32|64)mi8")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SAR(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SAR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SAR8m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SAR8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHL(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHL(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHL8m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHL8mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHR(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHR8m1")>;
+def: InstRW<[HWWriteResGroup25], (instregex "SHR8mi")>;
+
+def HWWriteResGroup26 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup26], (instregex "ADD(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "ADD(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "ADD8mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "ADD8mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "AND(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "AND(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "AND8mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "AND8mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "DEC(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "DEC8m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "INC(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "INC8m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "NEG(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "NEG8m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "NOT(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "NOT8m")>;
+def: InstRW<[HWWriteResGroup26], (instregex "OR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "OR(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "OR8mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "OR8mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "POP(16|32|64)rmm")>;
+def: InstRW<[HWWriteResGroup26], (instregex "PUSH(16|32|64)rmm")>;
+def: InstRW<[HWWriteResGroup26], (instregex "SUB(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "SUB(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "SUB8mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "SUB8mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "XOR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "XOR(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup26], (instregex "XOR8mi")>;
+def: InstRW<[HWWriteResGroup26], (instregex "XOR8mr")>;
+
+def HWWriteResGroup27 : SchedWriteRes<[HWPort5]> {
+ let Latency = 2;
let NumMicroOps = 2;
- let Latency = 4;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [2];
}
-def : InstRW<[WritePBLENDWm, ReadAfterLd], (instregex "(V?)PBLENDW(Y?)rmi")>;
-
-// VPBLENDD.
-// v,v,v,i.
-def WriteVPBLENDDr : SchedWriteRes<[HWPort015]>;
-def : InstRW<[WriteVPBLENDDr], (instregex "VPBLENDD(Y?)rri")>;
-
-// v,v,m,i
-def WriteVPBLENDDm : SchedWriteRes<[HWPort015, HWPort23]> {
+def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPDrr0")>;
+def: InstRW<[HWWriteResGroup27], (instregex "BLENDVPSrr0")>;
+def: InstRW<[HWWriteResGroup27], (instregex "MMX_PINSRWirri")>;
+def: InstRW<[HWWriteResGroup27], (instregex "PBLENDVBrr0")>;
+def: InstRW<[HWWriteResGroup27], (instregex "PINSRBrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "PINSRDrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "PINSRQrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "PINSRWrri")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPDYrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPDrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPSYrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VBLENDVPSrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPBLENDVBYrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPBLENDVBrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPINSRBrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPINSRDrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPINSRQrr")>;
+def: InstRW<[HWWriteResGroup27], (instregex "VPINSRWrri")>;
+
+def HWWriteResGroup28 : SchedWriteRes<[HWPort01]> {
+ let Latency = 2;
let NumMicroOps = 2;
- let Latency = 4;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [2];
}
-def : InstRW<[WriteVPBLENDDm, ReadAfterLd], (instregex "VPBLENDD(Y?)rmi")>;
+def: InstRW<[HWWriteResGroup28], (instregex "FDECSTP")>;
-// MASKMOVQ.
-def WriteMASKMOVQ : SchedWriteRes<[HWPort0, HWPort4, HWPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 2];
+def HWWriteResGroup29 : SchedWriteRes<[HWPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
}
-def : InstRW<[WriteMASKMOVQ], (instregex "MMX_MASKMOVQ(64)?")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROL(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROL(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROL8r1")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROL8ri")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROR8r1")>;
+def: InstRW<[HWWriteResGroup29], (instregex "ROR8ri")>;
-// MASKMOVDQU.
-def WriteMASKMOVDQU : SchedWriteRes<[HWPort04, HWPort56, HWPort23]> {
- let Latency = 14;
- let NumMicroOps = 10;
- let ResourceCycles = [4, 2, 4];
+def HWWriteResGroup30 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
}
-def : InstRW<[WriteMASKMOVDQU], (instregex "(V?)MASKMOVDQU(64)?")>;
+def: InstRW<[HWWriteResGroup30], (instregex "LFENCE")>;
+def: InstRW<[HWWriteResGroup30], (instregex "MFENCE")>;
+def: InstRW<[HWWriteResGroup30], (instregex "WAIT")>;
+def: InstRW<[HWWriteResGroup30], (instregex "XGETBV")>;
-// VPMASKMOV D/Q.
-// v,v,m.
-def WriteVPMASKMOVr : SchedWriteRes<[HWPort5, HWPort23]> {
- let Latency = 4;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+def HWWriteResGroup31 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup31], (instregex "CVTPS2PDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "CVTSS2SDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "EXTRACTPSrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "MMX_PEXTRWirri")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PEXTRBrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PEXTRDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PEXTRQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PEXTRWri")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PEXTRWrr_REV")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSLLDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSLLQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSLLWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSRADrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSRAWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSRLDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSRLQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PSRLWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "PTESTrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSYrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VCVTPH2PSrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VCVTPS2PDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VCVTSS2SDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VEXTRACTPSrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRBrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRWri")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPEXTRWrr_REV")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSLLDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSLLQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSLLWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSRADrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSRAWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSRLDrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSRLQrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPSRLWrr")>;
+def: InstRW<[HWWriteResGroup31], (instregex "VPTESTrr")>;
+
+def HWWriteResGroup32 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteVPMASKMOVr, ReadAfterLd],
- (instregex "VPMASKMOV(D|Q)(Y?)rm")>;
+def: InstRW<[HWWriteResGroup32], (instregex "CLFLUSH")>;
-// m, v,v.
-def WriteVPMASKMOVm : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 1, 1];
+def HWWriteResGroup33 : SchedWriteRes<[HWPort01,HWPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteVPMASKMOVm], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
+def: InstRW<[HWWriteResGroup33], (instregex "MMX_MOVDQ2Qrr")>;
-// PMOVMSKB.
-def WritePMOVMSKB : SchedWriteRes<[HWPort0]> {
- let Latency = 3;
+def HWWriteResGroup34 : SchedWriteRes<[HWPort06,HWPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKB(Y?)rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "BEXTR(32|64)rr")>;
+def: InstRW<[HWWriteResGroup34], (instregex "BSWAP(16|32|64)r")>;
-// PEXTR B/W/D/Q.
-// r32,x,i.
-def WritePEXTRr : SchedWriteRes<[HWPort0, HWPort5]> {
+def HWWriteResGroup35 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 2;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup35], (instregex "ADC(16|32|64)rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup35], (instregex "ADC8i8")>;
+def: InstRW<[HWWriteResGroup35], (instregex "ADC8ri")>;
+def: InstRW<[HWWriteResGroup35], (instregex "ADC8rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVAE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVB(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVG(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVGE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVL(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVLE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVNE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVNO(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVNP(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVNS(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVO(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVP(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CMOVS(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "CWD")>;
+def: InstRW<[HWWriteResGroup35], (instregex "JRCXZ")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB(16|32|64)rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB8i8")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB8ri")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SBB8rr(_REV)?")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SETAr")>;
+def: InstRW<[HWWriteResGroup35], (instregex "SETBEr")>;
+
+def HWWriteResGroup36 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPDrm0")>;
+def: InstRW<[HWWriteResGroup36], (instregex "BLENDVPSrm0")>;
+def: InstRW<[HWWriteResGroup36], (instregex "PBLENDVBrm0")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPDrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VBLENDVPSrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPDrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VMASKMOVPSrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VPBLENDVBrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVDrm")>;
+def: InstRW<[HWWriteResGroup36], (instregex "VPMASKMOVQrm")>;
+
+def HWWriteResGroup36_1 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>;
+def: InstRW<[HWWriteResGroup36_1], (instregex "VBLENDVPDYrm")>;
+def: InstRW<[HWWriteResGroup36_1], (instregex "VBLENDVPSYrm")>;
+def: InstRW<[HWWriteResGroup36_1], (instregex "VMASKMOVPDYrm")>;
+def: InstRW<[HWWriteResGroup36_1], (instregex "VMASKMOVPSYrm")>;
+def: InstRW<[HWWriteResGroup36_1], (instregex "VPBLENDVBYrm")>;
+def: InstRW<[HWWriteResGroup36_1], (instregex "VPMASKMOVDYrm")>;
+def: InstRW<[HWWriteResGroup36_1], (instregex "VPMASKMOVQYrm")>;
-// m8,x,i.
-def WritePEXTRm : SchedWriteRes<[HWPort23, HWPort4, HWPort5]> {
+def HWWriteResGroup36_2 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 7;
let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>;
+def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSDWirm")>;
+def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKSSWBirm")>;
+def: InstRW<[HWWriteResGroup36_2], (instregex "MMX_PACKUSWBirm")>;
-// VPBROADCAST B/W.
-// x, m8/16.
-def WriteVPBROADCAST128Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
- let Latency = 5;
+def HWWriteResGroup37 : SchedWriteRes<[HWPort23,HWPort0156]> {
+ let Latency = 7;
let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+ let ResourceCycles = [1,2];
}
-def : InstRW<[WriteVPBROADCAST128Ld, ReadAfterLd],
- (instregex "VPBROADCAST(B|W)rm")>;
+def: InstRW<[HWWriteResGroup37], (instregex "LEAVE64")>;
+def: InstRW<[HWWriteResGroup37], (instregex "SCASB")>;
+def: InstRW<[HWWriteResGroup37], (instregex "SCASL")>;
+def: InstRW<[HWWriteResGroup37], (instregex "SCASQ")>;
+def: InstRW<[HWWriteResGroup37], (instregex "SCASW")>;
-// y, m8/16
-def WriteVPBROADCAST256Ld : SchedWriteRes<[HWPort01, HWPort23, HWPort5]> {
+def HWWriteResGroup38 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup38], (instregex "PSLLDrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSLLQrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSLLWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSRADrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSRAWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSRLDrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSRLQrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PSRLWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "PTESTrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSLLDrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSLLQrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSLLWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSRADrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSRAWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSRLDrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSRLQrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPSRLWrm")>;
+def: InstRW<[HWWriteResGroup38], (instregex "VPTESTrm")>;
+
+def HWWriteResGroup39 : SchedWriteRes<[HWPort0,HWPort01,HWPort23]> {
let Latency = 7;
let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVPBROADCAST256Ld, ReadAfterLd],
- (instregex "VPBROADCAST(B|W)Yrm")>;
+def: InstRW<[HWWriteResGroup39], (instregex "FLDCW16m")>;
-// VPGATHERDD.
-// x.
-def WriteVPGATHERDD128 : SchedWriteRes<[]> {
- let NumMicroOps = 20;
+def HWWriteResGroup40 : SchedWriteRes<[HWPort0,HWPort23,HWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVPGATHERDD128, ReadAfterLd], (instregex "VPGATHERDDrm")>;
+def: InstRW<[HWWriteResGroup40], (instregex "LDMXCSR")>;
+def: InstRW<[HWWriteResGroup40], (instregex "VLDMXCSR")>;
-// y.
-def WriteVPGATHERDD256 : SchedWriteRes<[]> {
- let NumMicroOps = 34;
+def HWWriteResGroup41 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVPGATHERDD256, ReadAfterLd], (instregex "VPGATHERDDYrm")>;
+def: InstRW<[HWWriteResGroup41], (instregex "LRETQ")>;
+def: InstRW<[HWWriteResGroup41], (instregex "RETL")>;
+def: InstRW<[HWWriteResGroup41], (instregex "RETQ")>;
-// VPGATHERQD.
-// x.
-def WriteVPGATHERQD128 : SchedWriteRes<[]> {
- let NumMicroOps = 15;
+def HWWriteResGroup42 : SchedWriteRes<[HWPort23,HWPort06,HWPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVPGATHERQD128, ReadAfterLd], (instregex "VPGATHERQDrm")>;
+def: InstRW<[HWWriteResGroup42], (instregex "BEXTR(32|64)rm")>;
-// y.
-def WriteVPGATHERQD256 : SchedWriteRes<[]> {
- let NumMicroOps = 22;
+def HWWriteResGroup43 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup43], (instregex "ADC(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "ADC8rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVAE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVB(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVG(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVGE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVL(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVLE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVNE(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVNO(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVNP(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVNS(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVO(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVP(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "CMOVS(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "SBB(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup43], (instregex "SBB8rm")>;
+
+def HWWriteResGroup44 : SchedWriteRes<[HWPort4,HWPort6,HWPort237,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
}
-def : InstRW<[WriteVPGATHERQD256, ReadAfterLd], (instregex "VPGATHERQDYrm")>;
+def: InstRW<[HWWriteResGroup44], (instregex "CALL(16|32|64)r")>;
-// VPGATHERDQ.
-// x.
-def WriteVPGATHERDQ128 : SchedWriteRes<[]> {
- let NumMicroOps = 12;
+def HWWriteResGroup45 : SchedWriteRes<[HWPort4,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
}
-def : InstRW<[WriteVPGATHERDQ128, ReadAfterLd], (instregex "VPGATHERDQrm")>;
+def: InstRW<[HWWriteResGroup45], (instregex "CALL64pcrel32")>;
+def: InstRW<[HWWriteResGroup45], (instregex "SETAm")>;
+def: InstRW<[HWWriteResGroup45], (instregex "SETBEm")>;
-// y.
-def WriteVPGATHERDQ256 : SchedWriteRes<[]> {
- let NumMicroOps = 20;
+def HWWriteResGroup46 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup46], (instregex "ROL(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROL(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROL8m1")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROL8mi")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROR(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROR8m1")>;
+def: InstRW<[HWWriteResGroup46], (instregex "ROR8mi")>;
+
+def HWWriteResGroup47 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
}
-def : InstRW<[WriteVPGATHERDQ256, ReadAfterLd], (instregex "VPGATHERDQYrm")>;
+def: InstRW<[HWWriteResGroup47], (instregex "XADD(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup47], (instregex "XADD8rm")>;
-// VPGATHERQQ.
-// x.
-def WriteVPGATHERQQ128 : SchedWriteRes<[]> {
- let NumMicroOps = 14;
+def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,1,1];
}
-def : InstRW<[WriteVPGATHERQQ128, ReadAfterLd], (instregex "VPGATHERQQrm")>;
+def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[HWWriteResGroup48], (instregex "FARCALL64")>;
-// y.
-def WriteVPGATHERQQ256 : SchedWriteRes<[]> {
- let NumMicroOps = 22;
+def HWWriteResGroup49 : SchedWriteRes<[HWPort0]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup49], (instregex "MOVMSKPDrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "MOVMSKPSrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "PMOVMSKBrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPDYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPDrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPSYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VMOVMSKPSrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVMSKBYrr")>;
+def: InstRW<[HWWriteResGroup49], (instregex "VPMOVMSKBrr")>;
+
+def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup50], (instregex "ADDPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADDSUBPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADD_FPrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADD_FST0r")>;
+def: InstRW<[HWWriteResGroup50], (instregex "ADD_FrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "BSF(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "BSR(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CMPPDrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CMPPSrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CMPSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CMPSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "COMISDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "COMISSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CVTDQ2PSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CVTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "CVTTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "IMUL64rr(i8)?")>;
+def: InstRW<[HWWriteResGroup50], (instregex "IMUL8r")>;
+def: InstRW<[HWWriteResGroup50], (instregex "LZCNT(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)PDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)PSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)SDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MAX(C?)SSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)PDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)PSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)SDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MIN(C?)SSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MMX_CVTPI2PSirr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "MUL8r")>;
+def: InstRW<[HWWriteResGroup50], (instregex "PDEP(32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "PEXT(32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "POPCNT(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SHLD(16|32|64)rri8")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SHRD(16|32|64)rri8")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FPrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FST0r")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBR_FrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUBSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUB_FPrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUB_FST0r")>;
+def: InstRW<[HWWriteResGroup50], (instregex "SUB_FrST0")>;
+def: InstRW<[HWWriteResGroup50], (instregex "TZCNT(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "UCOMISDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "UCOMISSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPDYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VADDSUBPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDYrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPDrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSYrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPPSrri")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCMPSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCOMISDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCOMISSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTDQ2PSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VCVTTPS2DQrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PDYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)PSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)SDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMAX(C?)SSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PDYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)PSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)SDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VMIN(C?)SSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSYrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBPSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBSDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VSUBSSrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISDrr")>;
+def: InstRW<[HWWriteResGroup50], (instregex "VUCOMISSrr")>;
+
+def HWWriteResGroup50_16 : SchedWriteRes<[HWPort1,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
}
-def : InstRW<[WriteVPGATHERQQ256, ReadAfterLd], (instregex "VPGATHERQQYrm")>;
+def: InstRW<[HWWriteResGroup50_16], (instregex "IMUL16rr(i8)?")>;
-//-- Arithmetic instructions --//
+def HWWriteResGroup50_32 : SchedWriteRes<[HWPort1,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+}
+def: InstRW<[HWWriteResGroup50_32], (instregex "IMUL32rr(i8)?")>;
-////////////////////////////////////////////////////////////////////////////////
-// Horizontal add/sub instructions.
-////////////////////////////////////////////////////////////////////////////////
+def HWWriteResGroup51 : SchedWriteRes<[HWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VBROADCASTSSYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTF128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VEXTRACTI128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VINSERTF128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VINSERTI128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTBrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPBROADCASTWrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERM2F128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERM2I128rr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMPDYri")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMPSYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPERMQYri")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXBWYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXDQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVSXWQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXBWYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXDQYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWDYrr")>;
+def: InstRW<[HWWriteResGroup51], (instregex "VPMOVZXWQYrr")>;
+
+def HWWriteResGroup52 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup52], (instregex "ADDPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "ADDSUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CMPPDrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CMPPSrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CVTDQ2PSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CVTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "CVTTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MAX(C?)PDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MAX(C?)PSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MIN(C?)PDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "MIN(C?)PSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "SUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VADDSUBPSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPPDrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCMPPSrmi")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTDQ2PSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VCVTTPS2DQrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAX(C?)PDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMAX(C?)PSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMIN(C?)PDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VMIN(C?)PSrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBPDrm")>;
+def: InstRW<[HWWriteResGroup52], (instregex "VSUBPSrm")>;
+
+def HWWriteResGroup52_1 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup52_1], (instregex "ADD_F32m")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "ADD_F64m")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F16m")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F32m")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "ILD_F64m")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "SUBR_F32m")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "SUBR_F64m")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "SUB_F32m")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "SUB_F64m")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VADDPDYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VADDPSYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VADDSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VADDSUBPSYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VCMPPDYrmi")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VCMPPSYrmi")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTDQ2PSYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTPS2DQYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VCVTTPS2DQYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VMAX(C?)PDYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VMAX(C?)PSYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VMIN(C?)PDYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VMIN(C?)PSYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup52_1], (instregex "VSUBPSYrm")>;
+
+def HWWriteResGroup53 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup53], (instregex "VPERM2F128rm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERM2I128rm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMPDYmi")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMPSYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPERMQYmi")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBDYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXBWYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXDQYrm")>;
+def: InstRW<[HWWriteResGroup53], (instregex "VPMOVZXWQYrm")>;
+
+def HWWriteResGroup53_1 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXBWYrm")>;
+def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXDQYrm")>;
+def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVSXWDYrm")>;
+def: InstRW<[HWWriteResGroup53_1], (instregex "VPMOVZXWDYrm")>;
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def : WriteRes<WriteFHAdd, [HWPort1, HWPort5]> {
- let Latency = 5;
+def HWWriteResGroup54 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 3;
let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
+ let ResourceCycles = [3];
}
+def: InstRW<[HWWriteResGroup54], (instregex "XADD(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "XADD8rr")>;
+def: InstRW<[HWWriteResGroup54], (instregex "XCHG8rr")>;
-// x,m / v,v,m.
-def : WriteRes<WriteFHAddLd, [HWPort1, HWPort5, HWPort23]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 2, 1];
+def HWWriteResGroup55 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
+def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDYrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSLLVDrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDYrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRAVDrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDYrr")>;
+def: InstRW<[HWWriteResGroup55], (instregex "VPSRLVDrr")>;
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def : WriteRes<WritePHAdd, [HWPort1, HWPort5]> {
+def HWWriteResGroup56 : SchedWriteRes<[HWPort5,HWPort15]> {
let Latency = 3;
let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDSWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHADDrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBDrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "MMX_PHSUBWrr64")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHADDDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHADDSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHADDWrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHSUBDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHSUBSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "PHSUBWrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDSWrr256")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHADDWrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBDrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr128")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBSWrr256")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWYrr")>;
+def: InstRW<[HWWriteResGroup56], (instregex "VPHSUBWrr")>;
+
+def HWWriteResGroup57 : SchedWriteRes<[HWPort5,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-// v <- v,m.
-def : WriteRes<WritePHAddLd, [HWPort1, HWPort5, HWPort23]> {
- let Latency = 6;
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSDWirr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKSSWBirr")>;
+def: InstRW<[HWWriteResGroup57], (instregex "MMX_PACKUSWBirr")>;
+
+def HWWriteResGroup58 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 3;
let NumMicroOps = 3;
- let ResourceCycles = [1, 2, 1];
+ let ResourceCycles = [1,2];
}
+def: InstRW<[HWWriteResGroup58], (instregex "CLD")>;
-// PHADD|PHSUB (S) W/D.
-// v <- v,v.
-def WritePHADDSUBr : SchedWriteRes<[HWPort1, HWPort5]> {
+def HWWriteResGroup59 : SchedWriteRes<[HWPort06,HWPort0156]> {
let Latency = 3;
let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup59], (instregex "CMOVA(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup59], (instregex "CMOVBE(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL8r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCL8ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR(16|32|64)ri")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR8r1")>;
+def: InstRW<[HWWriteResGroup59], (instregex "RCR8ri")>;
+
+def HWWriteResGroup60 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup60], (instregex "ROL(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROL8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "ROR8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SAR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SAR8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHL(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHL8rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHR(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup60], (instregex "SHR8rCL")>;
+
+def HWWriteResGroup61 : SchedWriteRes<[HWPort0,HWPort4,HWPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WritePHADDSUBr], (instregex "MMX_PHADD(W?)rr64",
- "MMX_PHADDSWrr64",
- "MMX_PHSUB(W|D)rr64",
- "MMX_PHSUBSWrr64",
- "(V?)PH(ADD|SUB)(W|D)(Y?)rr",
- "(V?)PH(ADD|SUB)SWrr(256)?")>;
+def: InstRW<[HWWriteResGroup61], (instregex "FNSTSWm")>;
-// v <- v,m.
-def WritePHADDSUBm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
- let Latency = 6;
+def HWWriteResGroup62 : SchedWriteRes<[HWPort1,HWPort4,HWPort237]> {
+ let Latency = 4;
let NumMicroOps = 3;
- let ResourceCycles = [1, 2, 1];
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP16m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP32m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "ISTT_FP64m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_F16m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_F32m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_FP16m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_FP32m")>;
+def: InstRW<[HWWriteResGroup62], (instregex "IST_FP64m")>;
+
+def HWWriteResGroup63 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WritePHADDSUBm, ReadAfterLd],
- (instregex "MMX_PHADD(W?)rm64",
- "MMX_PHADDSWrm64",
- "MMX_PHSUB(W|D)rm64",
- "MMX_PHSUBSWrm64",
- "(V?)PH(ADD|SUB)(W|D)(Y?)rm",
- "(V?)PH(ADD|SUB)SWrm(128|256)?")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSLLVDYrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSRAVDYrm")>;
+def: InstRW<[HWWriteResGroup63], (instregex "VPSRLVDYrm")>;
-// PCMPGTQ.
-// v <- v,v.
-def WritePCMPGTQr : SchedWriteRes<[HWPort0]> {
- let Latency = 5;
- let NumMicroOps = 1;
+def HWWriteResGroup63_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
+def: InstRW<[HWWriteResGroup63_1], (instregex "VPSLLVDrm")>;
+def: InstRW<[HWWriteResGroup63_1], (instregex "VPSRAVDrm")>;
+def: InstRW<[HWWriteResGroup63_1], (instregex "VPSRLVDrm")>;
-// v <- v,m.
-def WritePCMPGTQm : SchedWriteRes<[HWPort0, HWPort23]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup64 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WritePCMPGTQm, ReadAfterLd], (instregex "(V?)PCMPGTQ(Y?)rm")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDSWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHADDrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBDrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBSWrm64")>;
+def: InstRW<[HWWriteResGroup64], (instregex "MMX_PHSUBWrm64")>;
-// PMULLD.
-// x,x / y,y,y.
-def WritePMULLDr : SchedWriteRes<[HWPort0]> {
+def HWWriteResGroup64_1 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
let Latency = 10;
- let NumMicroOps = 2;
- let ResourceCycles = [2];
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WritePMULLDr], (instregex "(V?)PMULLD(Y?)rr")>;
+def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDDYrm")>;
+def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDSWrm256")>;
+def: InstRW<[HWWriteResGroup64_1], (instregex "VPHADDWYrm")>;
+def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBDYrm")>;
+def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBSWrm256")>;
+def: InstRW<[HWWriteResGroup64_1], (instregex "VPHSUBWYrm")>;
-// x,m / y,y,m.
-def WritePMULLDm : SchedWriteRes<[HWPort0, HWPort23]> {
- let Latency = 10;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+def HWWriteResGroup64_2 : SchedWriteRes<[HWPort5,HWPort23,HWPort15]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDDrm")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDSWrm128")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "PHADDWrm")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBDrm")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBSWrm128")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "PHSUBWrm")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDDrm")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDSWrm128")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "VPHADDWrm")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBDrm")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBSWrm128")>;
+def: InstRW<[HWWriteResGroup64_2], (instregex "VPHSUBWrm")>;
+
+def HWWriteResGroup65 : SchedWriteRes<[HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def : InstRW<[WritePMULLDm, ReadAfterLd], (instregex "(V?)PMULLD(Y?)rm")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CMOVA(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup65], (instregex "CMOVBE(16|32|64)rm")>;
-//-- Logic instructions --//
+def HWWriteResGroup66 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL8m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCL8mi")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR8m1")>;
+def: InstRW<[HWWriteResGroup66], (instregex "RCR8mi")>;
+
+def HWWriteResGroup67 : SchedWriteRes<[HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,2,1];
+}
+def: InstRW<[HWWriteResGroup67], (instregex "ROR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup67], (instregex "ROR8mCL")>;
-// PTEST.
-// v,v.
-def WritePTESTr : SchedWriteRes<[HWPort0, HWPort5]> {
- let Latency = 2;
+def HWWriteResGroup68 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[HWWriteResGroup68], (instregex "ADC(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "ADC8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "ADD8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "AND8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "OR8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "SUB8mi")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XCHG(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XCHG8rm")>;
+def: InstRW<[HWWriteResGroup68], (instregex "XOR8mi")>;
+
+def HWWriteResGroup69 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: InstRW<[HWWriteResGroup69], (instregex "ADC(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "ADC8mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "CMPXCHG8rm")>;
+def: InstRW<[HWWriteResGroup69], (instregex "ROL(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "ROL8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SAR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SAR8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mi")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB(16|32|64)mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB8mi")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SBB8mr")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHL(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHL8mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup69], (instregex "SHR8mCL")>;
+
+def HWWriteResGroup70 : SchedWriteRes<[HWPort0,HWPort1]> {
+ let Latency = 4;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "CVTTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTSS2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSD2SIrr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SI64rr")>;
+def: InstRW<[HWWriteResGroup70], (instregex "VCVTTSS2SIrr")>;
+
+def HWWriteResGroup71 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup71], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSLLDYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSLLQYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSLLWYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRADYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRAWYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRLDYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRLQYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPSRLWYrr")>;
+def: InstRW<[HWWriteResGroup71], (instregex "VPTESTYrr")>;
+
+def HWWriteResGroup72 : SchedWriteRes<[HWPort0,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rr")>;
+def: InstRW<[HWWriteResGroup72], (instregex "FNSTSW16r")>;
-// v,m.
-def WritePTESTm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
- let Latency = 6;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup73 : SchedWriteRes<[HWPort1,HWPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup73], (instregex "CVTDQ2PDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTPD2PSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSD2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSI642SDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTSI2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "CVTTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPD2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPI2PDirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTPS2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPD2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "MMX_CVTTPS2PIirr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2DQrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTPD2PSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTPS2PHrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSD2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI642SDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SDrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTSI2SSrr")>;
+def: InstRW<[HWWriteResGroup73], (instregex "VCVTTPD2DQrr")>;
+
+def HWWriteResGroup74 : SchedWriteRes<[HWPort1,HWPort6]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WritePTESTr], (instregex "(V?)PTEST(Y?)rm")>;
+def: InstRW<[HWWriteResGroup74], (instregex "IMUL64r")>;
+def: InstRW<[HWWriteResGroup74], (instregex "MUL64r")>;
+def: InstRW<[HWWriteResGroup74], (instregex "MULX64rr")>;
-// PSLL,PSRL,PSRA W/D/Q.
-// x,x / v,v,x.
-def WritePShift : SchedWriteRes<[HWPort0, HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup74_16 : SchedWriteRes<[HWPort1,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
}
-def : InstRW<[WritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)(Y?)rr")>;
+def: InstRW<[HWWriteResGroup74_16], (instregex "IMUL16r")>;
+def: InstRW<[HWWriteResGroup74_16], (instregex "MUL16r")>;
-// PSLL,PSRL DQ.
-def : InstRW<[WriteP5], (instregex "(V?)PS(R|L)LDQ(Y?)ri")>;
+def HWWriteResGroup74_32 : SchedWriteRes<[HWPort1,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+def: InstRW<[HWWriteResGroup74_32], (instregex "IMUL32r")>;
+def: InstRW<[HWWriteResGroup74_32], (instregex "MUL32r")>;
-//-- Other --//
+def HWWriteResGroup75 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup75], (instregex "FICOM16m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOM32m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOMP16m")>;
+def: InstRW<[HWWriteResGroup75], (instregex "FICOMP32m")>;
-// EMMS.
-def WriteEMMS : SchedWriteRes<[]> {
- let Latency = 13;
- let NumMicroOps = 31;
+def HWWriteResGroup76 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "CVTTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTSS2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSD2SIrm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SI64rm")>;
+def: InstRW<[HWWriteResGroup76], (instregex "VCVTTSS2SIrm")>;
+
+def HWWriteResGroup77 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteEMMS], (instregex "MMX_EMMS")>;
+def: InstRW<[HWWriteResGroup77], (instregex "VCVTPS2PDYrm")>;
-//=== Floating Point XMM and YMM Instructions ===//
-//-- Move instructions --//
+def HWWriteResGroup77_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup77_1], (instregex "VPTESTYrm")>;
-// MOVMSKP S/D.
-// r32 <- x.
-def WriteMOVMSKPr : SchedWriteRes<[HWPort0]> {
- let Latency = 3;
+def HWWriteResGroup78 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)rr")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTDQ2PDrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2DQrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTPD2PSrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "CVTTPD2DQrm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTPD2PIirm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[HWWriteResGroup78], (instregex "VCVTDQ2PDrm")>;
-// r32 <- y.
-def WriteVMOVMSKPYr : SchedWriteRes<[HWPort0]> {
- let Latency = 2;
+def HWWriteResGroup78_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVMOVMSKPYr], (instregex "VMOVMSKP(S|D)Yrr")>;
+def: InstRW<[HWWriteResGroup78_1], (instregex "CVTSD2SSrm")>;
+def: InstRW<[HWWriteResGroup78_1], (instregex "MMX_CVTPI2PDirm")>;
+def: InstRW<[HWWriteResGroup78_1], (instregex "VCVTSD2SSrm")>;
-// VPERM2F128.
-def : InstRW<[WriteFShuffle256], (instregex "VPERM2F128rr")>;
-def : InstRW<[WriteFShuffle256Ld, ReadAfterLd], (instregex "VPERM2F128rm")>;
+def HWWriteResGroup79 : SchedWriteRes<[HWPort1,HWPort6,HWPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup79], (instregex "MULX64rm")>;
-// BLENDVP S/D.
-def : InstRW<[WriteFVarBlend], (instregex "BLENDVP(S|D)rr0")>;
-def : InstRW<[WriteFVarBlendLd, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
+def HWWriteResGroup80 : SchedWriteRes<[HWPort5,HWPort23,HWPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBYrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTBrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWYrm")>;
+def: InstRW<[HWWriteResGroup80], (instregex "VPBROADCASTWrm")>;
-// VBROADCASTF128.
-def : InstRW<[WriteLoad], (instregex "VBROADCASTF128")>;
+def HWWriteResGroup81 : SchedWriteRes<[HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def: InstRW<[HWWriteResGroup81], (instregex "FNCLEX")>;
-// EXTRACTPS.
-// r32,x,i.
-def WriteEXTRACTPSr : SchedWriteRes<[HWPort0, HWPort5]> {
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup82 : SchedWriteRes<[HWPort015,HWPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
}
-def : InstRW<[WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+def: InstRW<[HWWriteResGroup82], (instregex "VZEROUPPER")>;
-// m32,x,i.
-def WriteEXTRACTPSm : SchedWriteRes<[HWPort0, HWPort5, HWPort23]> {
+def HWWriteResGroup83 : SchedWriteRes<[HWPort1,HWPort6,HWPort0156]> {
let Latency = 4;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def : InstRW<[WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+def: InstRW<[HWWriteResGroup83], (instregex "LAR(16|32|64)rr")>;
-// VEXTRACTF128.
-// x,y,i.
-def : InstRW<[WriteFShuffle256], (instregex "VEXTRACTF128rr")>;
+def HWWriteResGroup84 : SchedWriteRes<[HWPort0,HWPort4,HWPort237,HWPort15]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VMASKMOVPSmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVDmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQYmr")>;
+def: InstRW<[HWWriteResGroup84], (instregex "VPMASKMOVQmr")>;
+
+def HWWriteResGroup85 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup85], (instregex "VCVTPS2PHmr")>;
-// m128,y,i.
-def WriteVEXTRACTF128m : SchedWriteRes<[HWPort23, HWPort4]> {
- let Latency = 4;
+def HWWriteResGroup86 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup86], (instregex "SHLD(16|32|64)mri8")>;
+def: InstRW<[HWWriteResGroup86], (instregex "SHRD(16|32|64)mri8")>;
+
+def HWWriteResGroup87 : SchedWriteRes<[HWPort1,HWPort6,HWPort23,HWPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[HWWriteResGroup87], (instregex "LAR(16|32|64)rm")>;
+def: InstRW<[HWWriteResGroup87], (instregex "LSL(16|32|64)rm")>;
+
+def HWWriteResGroup88 : SchedWriteRes<[HWPort4,HWPort237,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,4];
+}
+def: InstRW<[HWWriteResGroup88], (instregex "PUSHF16")>;
+def: InstRW<[HWWriteResGroup88], (instregex "PUSHF64")>;
+
+def HWWriteResGroup89 : SchedWriteRes<[HWPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDUBSWrr64")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMADDWDirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHRSWrr64")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHUWirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULHWirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULLWirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PMULUDQirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MMX_PSADBWirr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MUL_FPrST0")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MUL_FST0r")>;
+def: InstRW<[HWWriteResGroup89], (instregex "MUL_FrST0")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PCMPGTQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PHMINPOSUWrr128")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMADDUBSWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMADDWDrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULDQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULHRSWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULHUWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULHWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULLWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PMULUDQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "PSADBWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "RCPPSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "RCPSSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "RSQRTPSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "RSQRTSSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPCMPGTQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPHMINPOSUWrr128")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMADDUBSWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMADDWDrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULDQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHRSWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHUWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULHWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULLWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPMULUDQrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWYrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VPSADBWrr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VRCPPSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VRCPSSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTPSr")>;
+def: InstRW<[HWWriteResGroup89], (instregex "VRSQRTSSr")>;
+
+def HWWriteResGroup90 : SchedWriteRes<[HWPort01]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup90], (instregex "MULPDrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "MULPSrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "MULSDrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "MULSSrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULPDYrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULPDrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULPSYrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULPSrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULSDrr")>;
+def: InstRW<[HWWriteResGroup90], (instregex "VMULSSrr")>;
+def: InstRW<[HWWriteResGroup90],
+ (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r",
+ "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
+
+def HWWriteResGroup91 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 10;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDUBSWrm64")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMADDWDirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHRSWrm64")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHUWirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULHWirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULLWirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PMULUDQirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "MMX_PSADBWirm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "RCPSSm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "RSQRTSSm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VRCPSSm")>;
+def: InstRW<[HWWriteResGroup91], (instregex "VRSQRTSSm")>;
+
+def HWWriteResGroup91_1 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 18;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteVEXTRACTF128m], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[HWWriteResGroup91_1], (instregex "SQRTSSm")>;
+def: InstRW<[HWWriteResGroup91_1], (instregex "VDIVSSrm")>;
-// VINSERTF128.
-// y,y,x,i.
-def : InstRW<[WriteFShuffle256], (instregex "VINSERTF128rr")>;
+def HWWriteResGroup91_2 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup91_2], (instregex "PCMPGTQrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "PHMINPOSUWrm128")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "PMADDUBSWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "PMADDWDrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "PMULDQrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHRSWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHUWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "PMULHWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "PMULLWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "PMULUDQrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "PSADBWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "RCPPSm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "RSQRTPSm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPCMPGTQrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPHMINPOSUWrm128")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPMADDUBSWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPMADDWDrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULDQrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHRSWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHUWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULHWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULLWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPMULUDQrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VPSADBWrm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VRCPPSm")>;
+def: InstRW<[HWWriteResGroup91_2], (instregex "VRSQRTPSm")>;
+
+def HWWriteResGroup91_3 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F32m")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "MUL_F64m")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "VPCMPGTQYrm")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "VPMADDUBSWYrm")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "VPMADDWDYrm")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULDQYrm")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHRSWYrm")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHUWYrm")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULHWYrm")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULLWYrm")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "VPMULUDQYrm")>;
+def: InstRW<[HWWriteResGroup91_3], (instregex "VPSADBWYrm")>;
+
+def HWWriteResGroup92 : SchedWriteRes<[HWPort01,HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup92], (instregex "MULPDrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "MULPSrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VMULPDrm")>;
+def: InstRW<[HWWriteResGroup92], (instregex "VMULPSrm")>;
+def: InstRW<[HWWriteResGroup92],
+ (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m")>;
-// y,y,m128,i.
-def WriteVINSERTF128m : SchedWriteRes<[HWPort015, HWPort23]> {
- let Latency = 4;
+def HWWriteResGroup92_1 : SchedWriteRes<[HWPort01,HWPort23]> {
+ let Latency = 12;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteFShuffle256, ReadAfterLd], (instregex "VINSERTF128rm")>;
+def: InstRW<[HWWriteResGroup92_1], (instregex "VMULPDYrm")>;
+def: InstRW<[HWWriteResGroup92_1], (instregex "VMULPSYrm")>;
+def: InstRW<[HWWriteResGroup92_1],
+ (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>;
-// VMASKMOVP S/D.
-// v,v,m.
-def WriteVMASKMOVPrm : SchedWriteRes<[HWPort5, HWPort23]> {
- let Latency = 4;
+def HWWriteResGroup92_2 : SchedWriteRes<[HWPort01,HWPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup92_2], (instregex "MULSDrm")>;
+def: InstRW<[HWWriteResGroup92_2], (instregex "MULSSrm")>;
+def: InstRW<[HWWriteResGroup92_2], (instregex "VMULSDrm")>;
+def: InstRW<[HWWriteResGroup92_2], (instregex "VMULSSrm")>;
+def: InstRW<[HWWriteResGroup92_2],
+ (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
+
+def HWWriteResGroup93 : SchedWriteRes<[HWPort1,HWPort5]> {
+ let Latency = 5;
let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[HWWriteResGroup93], (instregex "CVTSI642SSrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "HADDPDrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "HADDPSrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "HSUBPDrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "HSUBPSrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VCVTSI642SSrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDYrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHADDPDrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSYrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHADDPSrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDYrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPDrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSYrr")>;
+def: InstRW<[HWWriteResGroup93], (instregex "VHSUBPSrr")>;
+
+def HWWriteResGroup94 : SchedWriteRes<[HWPort1,HWPort6,HWPort06]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVMASKMOVPrm], (instregex "VMASKMOVP(S|D)(Y?)rm")>;
+def: InstRW<[HWWriteResGroup94], (instregex "STR(16|32|64)r")>;
-// m128,x,x.
-def WriteVMASKMOVPmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
- let Latency = 13;
+def HWWriteResGroup95 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[HWWriteResGroup95], (instregex "MULX32rr")>;
+
+def HWWriteResGroup96 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[HWWriteResGroup96], (instregex "HADDPDrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "HADDPSrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "HSUBPDrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "HSUBPSrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHADDPDrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHADDPSrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPDrm")>;
+def: InstRW<[HWWriteResGroup96], (instregex "VHSUBPSrm")>;
+
+def HWWriteResGroup96_1 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 12;
let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 1, 1];
+ let ResourceCycles = [1,2,1];
}
-def : InstRW<[WriteVMASKMOVPmr], (instregex "VMASKMOVP(S|D)mr")>;
+def: InstRW<[HWWriteResGroup96_1], (instregex "VHADDPDYrm")>;
+def: InstRW<[HWWriteResGroup96_1], (instregex "VHADDPSYrm")>;
+def: InstRW<[HWWriteResGroup96_1], (instregex "VHSUBPDYrm")>;
+def: InstRW<[HWWriteResGroup96_1], (instregex "VHSUBPSYrm")>;
-// m256,y,y.
-def WriteVMASKMOVPYmr : SchedWriteRes<[HWPort0, HWPort1, HWPort4, HWPort23]> {
- let Latency = 14;
+def HWWriteResGroup97 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
+ let Latency = 10;
let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 1, 1];
+ let ResourceCycles = [1,1,1,1];
}
-def : InstRW<[WriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
+def: InstRW<[HWWriteResGroup97], (instregex "CVTTSS2SI64rm")>;
-// VGATHERDPS.
-// x.
-def WriteVGATHERDPS128 : SchedWriteRes<[]> {
- let NumMicroOps = 20;
+def HWWriteResGroup98 : SchedWriteRes<[HWPort1,HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
}
-def : InstRW<[WriteVGATHERDPS128, ReadAfterLd], (instregex "VGATHERDPSrm")>;
+def: InstRW<[HWWriteResGroup98], (instregex "MULX32rm")>;
-// y.
-def WriteVGATHERDPS256 : SchedWriteRes<[]> {
- let NumMicroOps = 34;
+def HWWriteResGroup99 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
}
-def : InstRW<[WriteVGATHERDPS256, ReadAfterLd], (instregex "VGATHERDPSYrm")>;
+def: InstRW<[HWWriteResGroup99], (instregex "PAUSE")>;
-// VGATHERQPS.
-// x.
-def WriteVGATHERQPS128 : SchedWriteRes<[]> {
- let NumMicroOps = 15;
+def HWWriteResGroup100 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
}
-def : InstRW<[WriteVGATHERQPS128, ReadAfterLd], (instregex "VGATHERQPSrm")>;
+def: InstRW<[HWWriteResGroup100], (instregex "XSETBV")>;
-// y.
-def WriteVGATHERQPS256 : SchedWriteRes<[]> {
- let NumMicroOps = 22;
+def HWWriteResGroup101 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,3];
}
-def : InstRW<[WriteVGATHERQPS256, ReadAfterLd], (instregex "VGATHERQPSYrm")>;
+def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG(16|32|64)rr")>;
+def: InstRW<[HWWriteResGroup101], (instregex "CMPXCHG8rr")>;
-// VGATHERDPD.
-// x.
-def WriteVGATHERDPD128 : SchedWriteRes<[]> {
- let NumMicroOps = 12;
+def HWWriteResGroup102 : SchedWriteRes<[HWPort1,HWPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteVGATHERDPD128, ReadAfterLd], (instregex "VGATHERDPDrm")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTDQ2PDYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2DQYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTPD2PSYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTPS2PHYrr")>;
+def: InstRW<[HWWriteResGroup102], (instregex "VCVTTPD2DQYrr")>;
-// y.
-def WriteVGATHERDPD256 : SchedWriteRes<[]> {
- let NumMicroOps = 20;
+def HWWriteResGroup103 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI16m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "ADD_FI32m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI16m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUBR_FI32m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI16m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "SUB_FI32m")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPDm")>;
+def: InstRW<[HWWriteResGroup103], (instregex "VROUNDYPSm")>;
+
+def HWWriteResGroup103_1 : SchedWriteRes<[HWPort1,HWPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDPDm")>;
+def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDPSm")>;
+def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDSDm")>;
+def: InstRW<[HWWriteResGroup103_1], (instregex "ROUNDSSm")>;
+def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDPDm")>;
+def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDPSm")>;
+def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDSDm")>;
+def: InstRW<[HWWriteResGroup103_1], (instregex "VROUNDSSm")>;
+
+def HWWriteResGroup104 : SchedWriteRes<[HWPort1,HWPort5,HWPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVGATHERDPD256, ReadAfterLd], (instregex "VGATHERDPDYrm")>;
+def: InstRW<[HWWriteResGroup104], (instregex "VCVTDQ2PDYrm")>;
-// VGATHERQPD.
-// x.
-def WriteVGATHERQPD128 : SchedWriteRes<[]> {
- let NumMicroOps = 14;
+def HWWriteResGroup105 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
-def : InstRW<[WriteVGATHERQPD128, ReadAfterLd], (instregex "VGATHERQPDrm")>;
+def: InstRW<[HWWriteResGroup105], (instregex "SHLD(16|32|64)rrCL")>;
+def: InstRW<[HWWriteResGroup105], (instregex "SHRD(16|32|64)rrCL")>;
-// y.
-def WriteVGATHERQPD256 : SchedWriteRes<[]> {
- let NumMicroOps = 22;
+def HWWriteResGroup106 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort237]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
}
-def : InstRW<[WriteVGATHERQPD256, ReadAfterLd], (instregex "VGATHERQPDYrm")>;
+def: InstRW<[HWWriteResGroup106], (instregex "VCVTPS2PHYmr")>;
-//-- Conversion instructions --//
+def HWWriteResGroup107 : SchedWriteRes<[HWPort1,HWPort6,HWPort06,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[HWWriteResGroup107], (instregex "SLDT(16|32|64)r")>;
-// CVTPD2PS.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVTPD2PSrr")>;
+def HWWriteResGroup108 : SchedWriteRes<[HWPort6,HWPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,5];
+}
+def: InstRW<[HWWriteResGroup108], (instregex "STD")>;
-// x,m128.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVTPD2PS(X?)rm")>;
+def HWWriteResGroup109 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 12;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup109], (instregex "SHLD(16|32|64)mrCL")>;
+def: InstRW<[HWWriteResGroup109], (instregex "SHRD(16|32|64)mrCL")>;
-// x,y.
-def WriteCVTPD2PSYrr : SchedWriteRes<[HWPort1, HWPort5]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup110 : SchedWriteRes<[HWPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
}
-def : InstRW<[WriteCVTPD2PSYrr], (instregex "(V?)CVTPD2PSYrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "AESDECLASTrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "AESDECrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "AESENCLASTrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "AESENCrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "VAESDECLASTrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "VAESDECrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "VAESENCLASTrr")>;
+def: InstRW<[HWWriteResGroup110], (instregex "VAESENCrr")>;
-// x,m256.
-def WriteCVTPD2PSYrm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
- let Latency = 9;
+def HWWriteResGroup111 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup111], (instregex "AESDECLASTrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "AESDECrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "AESENCLASTrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "AESENCrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "VAESDECLASTrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "VAESDECrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "VAESENCLASTrm")>;
+def: InstRW<[HWWriteResGroup111], (instregex "VAESENCrm")>;
+
+def HWWriteResGroup112 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 7;
let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+ let ResourceCycles = [1,2];
}
-def : InstRW<[WriteCVTPD2PSYrm], (instregex "(V?)CVTPD2PSYrm")>;
+def: InstRW<[HWWriteResGroup112], (instregex "MPSADBWrri")>;
+def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWYrri")>;
+def: InstRW<[HWWriteResGroup112], (instregex "VMPSADBWrri")>;
-// CVTSD2SS.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+def HWWriteResGroup113 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[HWWriteResGroup113], (instregex "MPSADBWrmi")>;
+def: InstRW<[HWWriteResGroup113], (instregex "VMPSADBWrmi")>;
-// x,m64.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+def HWWriteResGroup113_1 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[HWWriteResGroup113_1], (instregex "VMPSADBWYrmi")>;
-// CVTPS2PD.
-// x,x.
-def WriteCVTPS2PDrr : SchedWriteRes<[HWPort0, HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup114 : SchedWriteRes<[HWPort6,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,2,1,2];
}
-def : InstRW<[WriteCVTPS2PDrr], (instregex "(V?)CVTPS2PDrr")>;
+def: InstRW<[HWWriteResGroup114], (instregex "LOOP")>;
-// x,m64.
-// y,m128.
-def WriteCVTPS2PDrm : SchedWriteRes<[HWPort0, HWPort23]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup115 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteCVTPS2PDrm], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI16m")>;
+def: InstRW<[HWWriteResGroup115], (instregex "MUL_FI32m")>;
-// y,x.
-def WriteVCVTPS2PDYrr : SchedWriteRes<[HWPort0, HWPort5]> {
- let Latency = 5;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup116 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteVCVTPS2PDYrr], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[HWWriteResGroup116], (instregex "DPPDrri")>;
+def: InstRW<[HWWriteResGroup116], (instregex "VDPPDrri")>;
-// CVTSS2SD.
-// x,x.
-def WriteCVTSS2SDrr : SchedWriteRes<[HWPort0, HWPort5]> {
- let Latency = 2;
- let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+def HWWriteResGroup117 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
+ let Latency = 15;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
}
-def : InstRW<[WriteCVTSS2SDrr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+def: InstRW<[HWWriteResGroup117], (instregex "DPPDrmi")>;
+def: InstRW<[HWWriteResGroup117], (instregex "VDPPDrmi")>;
-// x,m32.
-def WriteCVTSS2SDrm : SchedWriteRes<[HWPort0, HWPort23]> {
- let Latency = 5;
+def HWWriteResGroup118 : SchedWriteRes<[HWPort0]> {
+ let Latency = 10;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [2];
}
-def : InstRW<[WriteCVTSS2SDrm], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
-
-// CVTDQ2PD.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V)?CVTDQ2PDrr")>;
-
-// y,x.
-def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVTDQ2PDYrr")>;
-
-// CVT(T)PD2DQ.
-// x,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(V?)CVT(T?)PD2DQrr")>;
-// x,m128.
-def : InstRW<[WriteP1_P5_Lat4Ld], (instregex "(V?)CVT(T?)PD2DQrm")>;
-// x,y.
-def : InstRW<[WriteP1_P5_Lat6], (instregex "VCVT(T?)PD2DQYrr")>;
-// x,m256.
-def : InstRW<[WriteP1_P5_Lat6Ld], (instregex "VCVT(T?)PD2DQYrm")>;
-
-// CVT(T)PS2PI.
-// mm,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PS2PIirr")>;
-
-// CVTPI2PD.
-// x,mm.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PI2PDirr")>;
-
-// CVT(T)PD2PI.
-// mm,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "MMX_CVT(T?)PD2PIirr")>;
-
-// CVSTSI2SS.
-// x,r32.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
-
-// CVT(T)SS2SI.
-// r32,x.
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
-// r32,m32.
-def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
-
-// CVTSI2SD.
-// x,r32/64.
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVTSI2SS(64)?rr")>;
-
-// CVTSD2SI.
-// r32/64
-def : InstRW<[WriteP0_P1_Lat4], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rr")>;
-// r32,m32.
-def : InstRW<[WriteP0_P1_Lat4Ld], (instregex "(Int_)?(V?)CVT(T?)SD2SI(64)?rm")>;
-
-// VCVTPS2PH.
-// x,v,i.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPS2PH(Y?)rr")>;
-// m,v,i.
-def : InstRW<[WriteP1_P5_Lat4Ld, WriteRMW], (instregex "VCVTPS2PH(Y?)mr")>;
-
-// VCVTPH2PS.
-// v,x.
-def : InstRW<[WriteP1_P5_Lat4], (instregex "VCVTPH2PS(Y?)rr")>;
+def: InstRW<[HWWriteResGroup118], (instregex "PMULLDrr")>;
+def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDYrr")>;
+def: InstRW<[HWWriteResGroup118], (instregex "VPMULLDrr")>;
-//-- Arithmetic instructions --//
+def HWWriteResGroup119 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup119], (instregex "PMULLDrm")>;
+def: InstRW<[HWWriteResGroup119], (instregex "VPMULLDrm")>;
-// HADD, HSUB PS/PD
-// x,x / v,v,v.
-def WriteHADDSUBPr : SchedWriteRes<[HWPort1, HWPort5]> {
- let Latency = 5;
+def HWWriteResGroup119_1 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 17;
let NumMicroOps = 3;
- let ResourceCycles = [1, 2];
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteHADDSUBPr], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rr")>;
+def: InstRW<[HWWriteResGroup119_1], (instregex "VPMULLDYrm")>;
-// x,m / v,v,m.
-def WriteHADDSUBPm : SchedWriteRes<[HWPort1, HWPort5, HWPort23]> {
- let Latency = 9;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 2, 1];
+def HWWriteResGroup120 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,1,1,4,1,2];
}
-def : InstRW<[WriteHADDSUBPm], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)rm")>;
+def: InstRW<[HWWriteResGroup120], (instregex "RCL(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup120], (instregex "RCL8mCL")>;
-// MULL SS/SD PS/PD.
-// x,x / v,v,v.
-def WriteMULr : SchedWriteRes<[HWPort01]> {
- let Latency = 5;
+def HWWriteResGroup121 : SchedWriteRes<[HWPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
}
-def : InstRW<[WriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+def: InstRW<[HWWriteResGroup121], (instregex "DIVPSrr")>;
+def: InstRW<[HWWriteResGroup121], (instregex "DIVSSrr")>;
-// x,m / v,v,m.
-def WriteMULm : SchedWriteRes<[HWPort01, HWPort23]> {
- let Latency = 9;
+def HWWriteResGroup122 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 17;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteMULm], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+def: InstRW<[HWWriteResGroup122], (instregex "DIVPSrm")>;
-// VDIVPS.
-// y,y,y.
-def WriteVDIVPSYrr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 19; // 18-21 cycles.
+def HWWriteResGroup122_1 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup122_1], (instregex "DIVSSrm")>;
+
+def HWWriteResGroup123 : SchedWriteRes<[HWPort0]> {
+ let Latency = 11;
let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [3];
}
-def : InstRW<[WriteVDIVPSYrr], (instregex "VDIVPSYrr")>;
+def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRIrr")>;
+def: InstRW<[HWWriteResGroup123], (instregex "PCMPISTRM128rr")>;
+def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRIrr")>;
+def: InstRW<[HWWriteResGroup123], (instregex "VPCMPISTRM128rr")>;
-// y,y,m256.
-def WriteVDIVPSYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 23; // 18-21 + 4 cycles.
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup124 : SchedWriteRes<[HWPort0,HWPort5]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteVDIVPSYrm, ReadAfterLd], (instregex "VDIVPSYrm")>;
+def: InstRW<[HWWriteResGroup124], (instregex "PCLMULQDQrr")>;
+def: InstRW<[HWWriteResGroup124], (instregex "VPCLMULQDQrr")>;
-// VDIVPD.
-// y,y,y.
-def WriteVDIVPDYrr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 27; // 19-35 cycles.
+def HWWriteResGroup125 : SchedWriteRes<[HWPort0,HWPort015]> {
+ let Latency = 11;
let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteVDIVPDYrr], (instregex "VDIVPDYrr")>;
+def: InstRW<[HWWriteResGroup125], (instregex "VRCPPSYr")>;
+def: InstRW<[HWWriteResGroup125], (instregex "VRSQRTPSYr")>;
-// y,y,m256.
-def WriteVDIVPDYrm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 31; // 19-35 + 4 cycles.
+def HWWriteResGroup126 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 17;
let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+ let ResourceCycles = [3,1];
}
-def : InstRW<[WriteVDIVPDYrm, ReadAfterLd], (instregex "VDIVPDYrm")>;
+def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRIrm")>;
+def: InstRW<[HWWriteResGroup126], (instregex "PCMPISTRM128rm")>;
+def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRIrm")>;
+def: InstRW<[HWWriteResGroup126], (instregex "VPCMPISTRM128rm")>;
-// VRCPPS.
-// y,y.
-def WriteVRCPPSr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 7;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+def HWWriteResGroup127 : SchedWriteRes<[HWPort0,HWPort5,HWPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
+def: InstRW<[HWWriteResGroup127], (instregex "PCLMULQDQrm")>;
+def: InstRW<[HWWriteResGroup127], (instregex "VPCLMULQDQrm")>;
-// y,m256.
-def WriteVRCPPSm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 11;
+def HWWriteResGroup128 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
+ let Latency = 18;
let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WriteVRCPPSm], (instregex "VRCPPSYm(_Int)?")>;
+def: InstRW<[HWWriteResGroup128], (instregex "VRCPPSYm")>;
+def: InstRW<[HWWriteResGroup128], (instregex "VRSQRTPSYm")>;
-// ROUND SS/SD PS/PD.
-// v,v,i.
-def WriteROUNDr : SchedWriteRes<[HWPort1]> {
- let Latency = 6;
+def HWWriteResGroup129 : SchedWriteRes<[HWPort1,HWPort06,HWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,2,3];
+}
+def: InstRW<[HWWriteResGroup129], (instregex "RCL(16|32|64)rCL")>;
+def: InstRW<[HWWriteResGroup129], (instregex "RCR(16|32|64)rCL")>;
+
+def HWWriteResGroup130 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,4,1,3];
+}
+def: InstRW<[HWWriteResGroup130], (instregex "RCL8rCL")>;
+
+def HWWriteResGroup131 : SchedWriteRes<[HWPort06,HWPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,9];
+}
+def: InstRW<[HWWriteResGroup131], (instregex "LOOPE")>;
+def: InstRW<[HWWriteResGroup131], (instregex "LOOPNE")>;
+
+def HWWriteResGroup132 : SchedWriteRes<[HWPort4,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,1,1,4,2,5];
+}
+def: InstRW<[HWWriteResGroup132], (instregex "CMPXCHG8B")>;
+
+def HWWriteResGroup133 : SchedWriteRes<[HWPort0]> {
+ let Latency = 13;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup133], (instregex "SQRTPSr")>;
+def: InstRW<[HWWriteResGroup133], (instregex "SQRTSSr")>;
+def: InstRW<[HWWriteResGroup133], (instregex "VDIVPSrr")>;
+def: InstRW<[HWWriteResGroup133], (instregex "VDIVSSrr")>;
+
+def HWWriteResGroup134 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 19;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup134], (instregex "DIVSDrm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "SQRTPSm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "VDIVPSrm")>;
+def: InstRW<[HWWriteResGroup134], (instregex "VSQRTSSm")>;
+
+def HWWriteResGroup135 : SchedWriteRes<[HWPort1,HWPort23,HWPort237,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,1,1,3,1,3];
+}
+def: InstRW<[HWWriteResGroup135], (instregex "RCR(16|32|64)mCL")>;
+def: InstRW<[HWWriteResGroup135], (instregex "RCR8mCL")>;
+
+def HWWriteResGroup136 : SchedWriteRes<[HWPort0]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup136], (instregex "DIVPDrr")>;
+def: InstRW<[HWWriteResGroup136], (instregex "DIVSDrr")>;
+def: InstRW<[HWWriteResGroup136], (instregex "VSQRTPSr")>;
+def: InstRW<[HWWriteResGroup136], (instregex "VSQRTSSr")>;
+
+def HWWriteResGroup137 : SchedWriteRes<[HWPort5]> {
+ let Latency = 14;
let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : InstRW<[WriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
+def: InstRW<[HWWriteResGroup137], (instregex "AESIMCrr")>;
+def: InstRW<[HWWriteResGroup137], (instregex "VAESIMCrr")>;
-// v,m,i.
-def WriteROUNDm : SchedWriteRes<[HWPort1, HWPort23]> {
- let Latency = 10;
+def HWWriteResGroup138 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 20;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup138], (instregex "DIVPDrm")>;
+def: InstRW<[HWWriteResGroup138], (instregex "VSQRTPSm")>;
+
+def HWWriteResGroup139 : SchedWriteRes<[HWPort5,HWPort23]> {
+ let Latency = 20;
let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+def: InstRW<[HWWriteResGroup139], (instregex "AESIMCrm")>;
+def: InstRW<[HWWriteResGroup139], (instregex "VAESIMCrm")>;
-// DPPS.
-// x,x,i / v,v,v,i.
-def WriteDPPSr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
+def HWWriteResGroup140 : SchedWriteRes<[HWPort0,HWPort1,HWPort5]> {
let Latency = 14;
let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[HWWriteResGroup140], (instregex "DPPSrri")>;
+def: InstRW<[HWWriteResGroup140], (instregex "VDPPSYrri")>;
+def: InstRW<[HWWriteResGroup140], (instregex "VDPPSrri")>;
+
+def HWWriteResGroup141 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
+ let Latency = 20;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,1,1,1];
+}
+def: InstRW<[HWWriteResGroup141], (instregex "DPPSrmi")>;
+def: InstRW<[HWWriteResGroup141], (instregex "VDPPSrmi")>;
+
+def HWWriteResGroup141_1 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort23]> {
+ let Latency = 21;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,1,1,1];
+}
+def: InstRW<[HWWriteResGroup141_1], (instregex "VDPPSYrmi")>;
+
+def HWWriteResGroup142 : SchedWriteRes<[HWPort1,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [2,3,1,4];
+}
+def: InstRW<[HWWriteResGroup142], (instregex "RCR8rCL")>;
+
+def HWWriteResGroup143 : SchedWriteRes<[HWPort23,HWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 15;
+ let ResourceCycles = [1,14];
+}
+def: InstRW<[HWWriteResGroup143], (instregex "POPF16")>;
+
+def HWWriteResGroup144 : SchedWriteRes<[HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 21;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[HWWriteResGroup144], (instregex "INSB")>;
+def: InstRW<[HWWriteResGroup144], (instregex "INSL")>;
+def: InstRW<[HWWriteResGroup144], (instregex "INSW")>;
+
+def HWWriteResGroup145 : SchedWriteRes<[HWPort5]> {
+ let Latency = 16;
+ let NumMicroOps = 16;
+ let ResourceCycles = [16];
}
-def : InstRW<[WriteDPPSr], (instregex "(V?)DPPS(Y?)rri")>;
+def: InstRW<[HWWriteResGroup145], (instregex "VZEROALL")>;
-// x,m,i / v,v,m,i.
-def WriteDPPSm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23, HWPort6]> {
+def HWWriteResGroup146 : SchedWriteRes<[HWPort0,HWPort4,HWPort5,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 22;
+ let NumMicroOps = 19;
+ let ResourceCycles = [2,1,4,1,1,4,6];
+}
+def: InstRW<[HWWriteResGroup146], (instregex "CMPXCHG16B")>;
+
+def HWWriteResGroup147 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 15;
+ let ResourceCycles = [2,1,2,4,2,4];
+}
+def: InstRW<[HWWriteResGroup147], (instregex "XCH_F")>;
+
+def HWWriteResGroup148 : SchedWriteRes<[HWPort0,HWPort5,HWPort0156]> {
let Latency = 18;
- let NumMicroOps = 6;
- let ResourceCycles = [2, 1, 1, 1, 1];
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
}
-def : InstRW<[WriteDPPSm, ReadAfterLd], (instregex "(V?)DPPS(Y?)rmi")>;
+def: InstRW<[HWWriteResGroup148], (instregex "PCMPESTRIrr")>;
+def: InstRW<[HWWriteResGroup148], (instregex "VPCMPESTRIrr")>;
-// DPPD.
-// x,x,i.
-def WriteDPPDr : SchedWriteRes<[HWPort0, HWPort1, HWPort5]> {
- let Latency = 9;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup149 : SchedWriteRes<[HWPort5,HWPort6,HWPort06,HWPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,5];
}
-def : InstRW<[WriteDPPDr], (instregex "(V?)DPPDrri")>;
+def: InstRW<[HWWriteResGroup149], (instregex "CPUID")>;
+def: InstRW<[HWWriteResGroup149], (instregex "RDTSC")>;
-// x,m,i.
-def WriteDPPDm : SchedWriteRes<[HWPort0, HWPort1, HWPort5, HWPort23]> {
- let Latency = 13;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 1, 1];
+def HWWriteResGroup150 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
}
-def : InstRW<[WriteDPPDm], (instregex "(V?)DPPDrmi")>;
+def: InstRW<[HWWriteResGroup150], (instregex "PCMPESTRIrm")>;
+def: InstRW<[HWWriteResGroup150], (instregex "VPCMPESTRIrm")>;
-// VFMADD.
-// v,v,v.
-def WriteFMADDr : SchedWriteRes<[HWPort01]> {
- let Latency = 5;
+def HWWriteResGroup151 : SchedWriteRes<[HWPort6,HWPort23,HWPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 19;
+ let ResourceCycles = [3,1,15];
+}
+def: InstRW<[HWWriteResGroup151], (instregex "XRSTOR(64)?")>;
+
+def HWWriteResGroup152 : SchedWriteRes<[HWPort0,HWPort5,HWPort015,HWPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def: InstRW<[HWWriteResGroup152], (instregex "PCMPESTRM128rr")>;
+def: InstRW<[HWWriteResGroup152], (instregex "VPCMPESTRM128rr")>;
+
+def HWWriteResGroup153 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015,HWPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+def: InstRW<[HWWriteResGroup153], (instregex "PCMPESTRM128rm")>;
+def: InstRW<[HWWriteResGroup153], (instregex "VPCMPESTRM128rm")>;
+
+def HWWriteResGroup154 : SchedWriteRes<[HWPort0]> {
+ let Latency = 20;
let NumMicroOps = 1;
+ let ResourceCycles = [1];
}
-def : InstRW<[WriteFMADDr],
- (instregex
- // 3p forms.
- "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)r(Y)?",
- // 3s forms.
- "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)r",
- // 4s/4s_int forms.
- "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
- // 4p forms.
- "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>;
-
-// v,v,m.
-def WriteFMADDm : SchedWriteRes<[HWPort01, HWPort23]> {
- let Latency = 9;
+def: InstRW<[HWWriteResGroup154], (instregex "DIV_FPrST0")>;
+def: InstRW<[HWWriteResGroup154], (instregex "DIV_FST0r")>;
+def: InstRW<[HWWriteResGroup154], (instregex "DIV_FrST0")>;
+def: InstRW<[HWWriteResGroup154], (instregex "SQRTPDr")>;
+def: InstRW<[HWWriteResGroup154], (instregex "SQRTSDr")>;
+def: InstRW<[HWWriteResGroup154], (instregex "VDIVPDrr")>;
+def: InstRW<[HWWriteResGroup154], (instregex "VDIVSDrr")>;
+
+def HWWriteResGroup155 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 27;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteFMADDm],
- (instregex
- // 3p forms.
- "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(r213|r132|r231)m(Y)?",
- // 3s forms.
- "VF(N?)M(ADD|SUB)S(S|D)(r132|r231|r213)m",
- // 4s/4s_int forms.
- "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
- // 4p forms.
- "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>;
+def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F32m")>;
+def: InstRW<[HWWriteResGroup155], (instregex "DIVR_F64m")>;
+def: InstRW<[HWWriteResGroup155], (instregex "VSQRTPDm")>;
-//-- Math instructions --//
+def HWWriteResGroup155_1 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 26;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[HWWriteResGroup155_1], (instregex "SQRTPDm")>;
+def: InstRW<[HWWriteResGroup155_1], (instregex "VDIVPDrm")>;
+def: InstRW<[HWWriteResGroup155_1], (instregex "VSQRTSDm")>;
-// VSQRTPS.
-// y,y.
-def WriteVSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 19;
- let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+def HWWriteResGroup155_2 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteVSQRTPSYr], (instregex "VSQRTPSYr")>;
+def: InstRW<[HWWriteResGroup155_2], (instregex "SQRTSDm")>;
+def: InstRW<[HWWriteResGroup155_2], (instregex "VDIVSDrm")>;
-// y,m256.
-def WriteVSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 23;
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup156 : SchedWriteRes<[HWPort5,HWPort6,HWPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,2,7];
}
-def : InstRW<[WriteVSQRTPSYm], (instregex "VSQRTPSYm")>;
+def: InstRW<[HWWriteResGroup156], (instregex "MWAITrr")>;
-// VSQRTPD.
-// y,y.
-def WriteVSQRTPDYr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 28;
+def HWWriteResGroup157 : SchedWriteRes<[HWPort0]> {
+ let Latency = 21;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup157], (instregex "VSQRTPDr")>;
+def: InstRW<[HWWriteResGroup157], (instregex "VSQRTSDr")>;
+
+def HWWriteResGroup159 : SchedWriteRes<[HWPort0,HWPort015]> {
+ let Latency = 21;
let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [2,1];
}
-def : InstRW<[WriteVSQRTPDYr], (instregex "VSQRTPDYr")>;
+def: InstRW<[HWWriteResGroup159], (instregex "VDIVPSYrr")>;
+def: InstRW<[HWWriteResGroup159], (instregex "VSQRTPSYr")>;
-// y,m256.
-def WriteVSQRTPDYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 32;
+def HWWriteResGroup160 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
+ let Latency = 28;
let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+ let ResourceCycles = [2,1,1];
}
-def : InstRW<[WriteVSQRTPDYm], (instregex "VSQRTPDYm")>;
+def: InstRW<[HWWriteResGroup160], (instregex "VDIVPSYrm")>;
+def: InstRW<[HWWriteResGroup160], (instregex "VSQRTPSYm")>;
-// RSQRT SS/PS.
-// x,x.
-def WriteRSQRTr : SchedWriteRes<[HWPort0]> {
- let Latency = 5;
+def HWWriteResGroup161 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 30;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteRSQRTr], (instregex "(V?)RSQRT(SS|PS)r(_Int)?")>;
+def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI16m")>;
+def: InstRW<[HWWriteResGroup161], (instregex "DIVR_FI32m")>;
-// x,m128.
-def WriteRSQRTm : SchedWriteRes<[HWPort0, HWPort23]> {
- let Latency = 9;
+def HWWriteResGroup162 : SchedWriteRes<[HWPort0]> {
+ let Latency = 24;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FPrST0")>;
+def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FST0r")>;
+def: InstRW<[HWWriteResGroup162], (instregex "DIVR_FrST0")>;
+
+def HWWriteResGroup163 : SchedWriteRes<[HWPort0,HWPort23]> {
+ let Latency = 31;
let NumMicroOps = 2;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1,1];
}
-def : InstRW<[WriteRSQRTm], (instregex "(V?)RSQRT(SS|PS)m(_Int)?")>;
+def: InstRW<[HWWriteResGroup163], (instregex "DIV_F32m")>;
+def: InstRW<[HWWriteResGroup163], (instregex "DIV_F64m")>;
-// RSQRTPS 256.
-// y,y.
-def WriteRSQRTPSYr : SchedWriteRes<[HWPort0, HWPort15]> {
- let Latency = 7;
+def HWWriteResGroup164 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 30;
+ let NumMicroOps = 27;
+ let ResourceCycles = [1,5,1,1,19];
+}
+def: InstRW<[HWWriteResGroup164], (instregex "XSAVE64")>;
+
+def HWWriteResGroup165 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort0156]> {
+ let Latency = 31;
+ let NumMicroOps = 28;
+ let ResourceCycles = [1,6,1,1,19];
+}
+def: InstRW<[HWWriteResGroup165], (instregex "XSAVE(OPT)?")>;
+
+def HWWriteResGroup166 : SchedWriteRes<[HWPort0,HWPort1,HWPort23]> {
+ let Latency = 34;
let NumMicroOps = 3;
- let ResourceCycles = [2, 1];
+ let ResourceCycles = [1,1,1];
}
-def : InstRW<[WriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>;
+def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI16m")>;
+def: InstRW<[HWWriteResGroup166], (instregex "DIV_FI32m")>;
-// y,m256.
-def WriteRSQRTPSYm : SchedWriteRes<[HWPort0, HWPort15, HWPort23]> {
- let Latency = 11;
- let NumMicroOps = 4;
- let ResourceCycles = [2, 1, 1];
+def HWWriteResGroup167 : SchedWriteRes<[HWPort0,HWPort5,HWPort23,HWPort015]> {
+ let Latency = 34;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,1,1];
}
-def : InstRW<[WriteRSQRTPSYm], (instregex "VRSQRTPSYm(_Int)?")>;
+def: InstRW<[HWWriteResGroup167], (instregex "AESKEYGENASSIST128rm")>;
+def: InstRW<[HWWriteResGroup167], (instregex "VAESKEYGENASSIST128rm")>;
-//-- Logic instructions --//
+def HWWriteResGroup168 : SchedWriteRes<[HWPort0,HWPort5,HWPort015]> {
+ let Latency = 29;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,7,2];
+}
+def: InstRW<[HWWriteResGroup168], (instregex "AESKEYGENASSIST128rr")>;
+def: InstRW<[HWWriteResGroup168], (instregex "VAESKEYGENASSIST128rr")>;
-// AND, ANDN, OR, XOR PS/PD.
-// x,x / v,v,v.
-def : InstRW<[WriteP5], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>;
-// x,m / v,v,m.
-def : InstRW<[WriteP5Ld, ReadAfterLd],
- (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>;
+def HWWriteResGroup170 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort06,HWPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,3,4,10];
+}
+def: InstRW<[HWWriteResGroup170], (instregex "IN(16|32)ri")>;
+def: InstRW<[HWWriteResGroup170], (instregex "IN(16|32)rr")>;
+def: InstRW<[HWWriteResGroup170], (instregex "IN8ri")>;
+def: InstRW<[HWWriteResGroup170], (instregex "IN8rr")>;
-//-- Other instructions --//
+def HWWriteResGroup171 : SchedWriteRes<[HWPort5,HWPort6,HWPort23,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 36;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,2,1,4,10];
+}
+def: InstRW<[HWWriteResGroup171], (instregex "OUT(16|32)ir")>;
+def: InstRW<[HWWriteResGroup171], (instregex "OUT(16|32)rr")>;
+def: InstRW<[HWWriteResGroup171], (instregex "OUT8ir")>;
+def: InstRW<[HWWriteResGroup171], (instregex "OUT8rr")>;
+
+def HWWriteResGroup172 : SchedWriteRes<[HWPort01,HWPort15,HWPort015,HWPort0156]> {
+ let Latency = 31;
+ let NumMicroOps = 31;
+ let ResourceCycles = [8,1,21,1];
+}
+def: InstRW<[HWWriteResGroup172], (instregex "MMX_EMMS")>;
+
+def HWWriteResGroup173 : SchedWriteRes<[HWPort0,HWPort015]> {
+ let Latency = 35;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[HWWriteResGroup173], (instregex "VDIVPDYrr")>;
+def: InstRW<[HWWriteResGroup173], (instregex "VSQRTPDYr")>;
-// VZEROUPPER.
-def WriteVZEROUPPER : SchedWriteRes<[]> {
+def HWWriteResGroup174 : SchedWriteRes<[HWPort0,HWPort23,HWPort015]> {
+ let Latency = 42;
let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[HWWriteResGroup174], (instregex "VDIVPDYrm")>;
+def: InstRW<[HWWriteResGroup174], (instregex "VSQRTPDYm")>;
+
+def HWWriteResGroup175 : SchedWriteRes<[HWPort1,HWPort4,HWPort5,HWPort6,HWPort23,HWPort237,HWPort15,HWPort0156]> {
+ let Latency = 41;
+ let NumMicroOps = 18;
+ let ResourceCycles = [1,1,2,3,1,1,1,8];
+}
+def: InstRW<[HWWriteResGroup175], (instregex "VMCLEARm")>;
+
+def HWWriteResGroup176 : SchedWriteRes<[HWPort5,HWPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 22;
+ let ResourceCycles = [2,20];
+}
+def: InstRW<[HWWriteResGroup176], (instregex "RDTSCP")>;
+
+def HWWriteResGroup177 : SchedWriteRes<[HWPort0,HWPort01,HWPort23,HWPort05,HWPort06,HWPort015,HWPort0156]> {
+ let Latency = 61;
+ let NumMicroOps = 64;
+ let ResourceCycles = [2,2,8,1,10,2,39];
+}
+def: InstRW<[HWWriteResGroup177], (instregex "FLDENVm")>;
+
+def HWWriteResGroup178 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 64;
+ let NumMicroOps = 88;
+ let ResourceCycles = [4,4,31,1,2,1,45];
+}
+def: InstRW<[HWWriteResGroup178], (instregex "FXRSTOR64")>;
+
+def HWWriteResGroup179 : SchedWriteRes<[HWPort0,HWPort6,HWPort23,HWPort05,HWPort06,HWPort15,HWPort0156]> {
+ let Latency = 64;
+ let NumMicroOps = 90;
+ let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[HWWriteResGroup179], (instregex "FXRSTOR")>;
+
+def HWWriteResGroup180 : SchedWriteRes<[HWPort5,HWPort01,HWPort0156]> {
+ let Latency = 75;
+ let NumMicroOps = 15;
+ let ResourceCycles = [6,3,6];
+}
+def: InstRW<[HWWriteResGroup180], (instregex "FNINIT")>;
+
+def HWWriteResGroup181 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort01,HWPort0156]> {
+ let Latency = 98;
+ let NumMicroOps = 32;
+ let ResourceCycles = [7,7,3,3,1,11];
+}
+def: InstRW<[HWWriteResGroup181], (instregex "DIV(16|32|64)r")>;
+
+def HWWriteResGroup182 : SchedWriteRes<[HWPort0,HWPort1,HWPort5,HWPort6,HWPort06,HWPort0156]> {
+ let Latency = 112;
+ let NumMicroOps = 66;
+ let ResourceCycles = [4,2,4,8,14,34];
+}
+def: InstRW<[HWWriteResGroup182], (instregex "IDIV(16|32|64)r")>;
+
+def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,HWPort237,HWPort06,HWPort0156]> {
+ let Latency = 115;
+ let NumMicroOps = 100;
+ let ResourceCycles = [9,9,11,8,1,11,21,30];
}
-def : InstRW<[WriteVZEROUPPER], (instregex "VZEROUPPER")>;
+def: InstRW<[HWWriteResGroup183], (instregex "FSTENVm")>;
-// VZEROALL.
-def WriteVZEROALL : SchedWriteRes<[]> {
+def HWWriteResGroup184 : SchedWriteRes<[HWPort0, HWPort5, HWPort15, HWPort015, HWPort06, HWPort23]> {
+ let Latency = 26;
let NumMicroOps = 12;
+ let ResourceCycles = [2,2,1,3,2,2];
}
-def : InstRW<[WriteVZEROALL], (instregex "VZEROALL")>;
+def: InstRW<[HWWriteResGroup184], (instrs VGATHERDPDrm,
+ VPGATHERDQrm,
+ VPGATHERDDrm)>;
-// LDMXCSR.
-def WriteLDMXCSR : SchedWriteRes<[HWPort0, HWPort6, HWPort23]> {
- let Latency = 6;
- let NumMicroOps = 3;
- let ResourceCycles = [1, 1, 1];
+def HWWriteResGroup185 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
+ let Latency = 24;
+ let NumMicroOps = 22;
+ let ResourceCycles = [5,3,4,1,5,4];
}
-def : InstRW<[WriteLDMXCSR], (instregex "(V)?LDMXCSR")>;
+def: InstRW<[HWWriteResGroup185], (instrs VGATHERQPDYrm,
+ VPGATHERQQYrm)>;
-// STMXCSR.
-def WriteSTMXCSR : SchedWriteRes<[HWPort0, HWPort4, HWPort6, HWPort237]> {
- let Latency = 7;
- let NumMicroOps = 4;
- let ResourceCycles = [1, 1, 1, 1];
+def HWWriteResGroup186 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
+ let Latency = 28;
+ let NumMicroOps = 22;
+ let ResourceCycles = [5,3,4,1,5,4];
+}
+def: InstRW<[HWWriteResGroup186], (instrs VPGATHERQDYrm)>;
+
+def HWWriteResGroup187 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 22;
+ let ResourceCycles = [5,3,4,1,5,4];
+}
+def: InstRW<[HWWriteResGroup187], (instrs VPGATHERQDrm)>;
+
+def HWWriteResGroup188 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
+ let Latency = 27;
+ let NumMicroOps = 20;
+ let ResourceCycles = [3,3,4,1,5,4];
+}
+def: InstRW<[HWWriteResGroup188], (instrs VGATHERDPDYrm,
+ VPGATHERDQYrm)>;
+
+def HWWriteResGroup189 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
+ let Latency = 27;
+ let NumMicroOps = 34;
+ let ResourceCycles = [5,3,8,1,9,8];
+}
+def: InstRW<[HWWriteResGroup189], (instrs VGATHERDPSYrm,
+ VPGATHERDDYrm)>;
+
+def HWWriteResGroup190 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
+ let Latency = 23;
+ let NumMicroOps = 14;
+ let ResourceCycles = [3,3,2,1,3,2];
+}
+def: InstRW<[HWWriteResGroup190], (instrs VGATHERQPDrm,
+ VPGATHERQQrm)>;
+
+def HWWriteResGroup191 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
+ let Latency = 28;
+ let NumMicroOps = 15;
+ let ResourceCycles = [3,3,2,1,4,2];
+}
+def: InstRW<[HWWriteResGroup191], (instrs VGATHERQPSYrm)>;
+
+def HWWriteResGroup192 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 15;
+ let ResourceCycles = [3,3,2,1,4,2];
}
-def : InstRW<[WriteSTMXCSR], (instregex "(V)?STMXCSR")>;
+def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm,
+ VGATHERDPSrm)>;
} // SchedModel
diff --git a/lib/Target/X86/X86SchedSandyBridge.td b/lib/Target/X86/X86SchedSandyBridge.td
index b8ec5883152c..4466d30f14c7 100644
--- a/lib/Target/X86/X86SchedSandyBridge.td
+++ b/lib/Target/X86/X86SchedSandyBridge.td
@@ -24,8 +24,8 @@ def SandyBridgeModel : SchedMachineModel {
// Based on the LSD (loop-stream detector) queue size.
let LoopMicroOpBufferSize = 28;
- // FIXME: SSE4 and AVX are unimplemented. This flag is set to allow
- // the scheduler to assign a default model to unrecognized opcodes.
+ // This flag is set to allow the scheduler to assign
+ // a default model to unrecognized opcodes.
let CompleteModel = 0;
}
@@ -48,6 +48,7 @@ def SBPort23 : ProcResource<2>;
def SBPort4 : ProcResource<1>;
// Many micro-ops are capable of issuing on multiple ports.
+def SBPort01 : ProcResGroup<[SBPort0, SBPort1]>;
def SBPort05 : ProcResGroup<[SBPort0, SBPort5]>;
def SBPort15 : ProcResGroup<[SBPort1, SBPort5]>;
def SBPort015 : ProcResGroup<[SBPort0, SBPort1, SBPort5]>;
@@ -115,10 +116,10 @@ def : WriteRes<WriteIDivLd, [SBPort23, SBPort0, SBDivider]> {
// Scalar and vector floating point.
defm : SBWriteResPair<WriteFAdd, SBPort1, 3>;
defm : SBWriteResPair<WriteFMul, SBPort0, 5>;
-defm : SBWriteResPair<WriteFDiv, SBPort0, 12>; // 10-14 cycles.
+defm : SBWriteResPair<WriteFDiv, SBPort0, 24>;
defm : SBWriteResPair<WriteFRcp, SBPort0, 5>;
defm : SBWriteResPair<WriteFRsqrt, SBPort0, 5>;
-defm : SBWriteResPair<WriteFSqrt, SBPort0, 15>;
+defm : SBWriteResPair<WriteFSqrt, SBPort0, 14>;
defm : SBWriteResPair<WriteCvtF2I, SBPort1, 3>;
defm : SBWriteResPair<WriteCvtI2F, SBPort1, 4>;
defm : SBWriteResPair<WriteCvtF2F, SBPort1, 3>;
@@ -134,11 +135,11 @@ def : WriteRes<WriteFVarBlendLd, [SBPort0, SBPort5, SBPort23]> {
}
// Vector integer operations.
-defm : SBWriteResPair<WriteVecShift, SBPort05, 1>;
-defm : SBWriteResPair<WriteVecLogic, SBPort015, 1>;
-defm : SBWriteResPair<WriteVecALU, SBPort15, 1>;
+defm : SBWriteResPair<WriteVecShift, SBPort5, 1>;
+defm : SBWriteResPair<WriteVecLogic, SBPort5, 1>;
+defm : SBWriteResPair<WriteVecALU, SBPort1, 3>;
defm : SBWriteResPair<WriteVecIMul, SBPort0, 5>;
-defm : SBWriteResPair<WriteShuffle, SBPort15, 1>;
+defm : SBWriteResPair<WriteShuffle, SBPort5, 1>;
defm : SBWriteResPair<WriteBlend, SBPort15, 1>;
def : WriteRes<WriteVarBlend, [SBPort1, SBPort5]> {
let Latency = 2;
@@ -148,13 +149,15 @@ def : WriteRes<WriteVarBlendLd, [SBPort1, SBPort5, SBPort23]> {
let Latency = 6;
let ResourceCycles = [1, 1, 1];
}
-def : WriteRes<WriteMPSAD, [SBPort0, SBPort1, SBPort5]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 1];
+def : WriteRes<WriteMPSAD, [SBPort0,SBPort15]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
}
-def : WriteRes<WriteMPSADLd, [SBPort0, SBPort1, SBPort5, SBPort23]> {
- let Latency = 6;
- let ResourceCycles = [1, 1, 1, 1];
+def : WriteRes<WriteMPSADLd, [SBPort0,SBPort23,SBPort15]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
}
////////////////////////////////////////////////////////////////////////////////
@@ -204,13 +207,15 @@ def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> {
}
// Packed Compare Implicit Length Strings, Return Index
-def : WriteRes<WritePCmpIStrI, [SBPort015]> {
- let Latency = 3;
+def : WriteRes<WritePCmpIStrI, [SBPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
let ResourceCycles = [3];
}
-def : WriteRes<WritePCmpIStrILd, [SBPort015, SBPort23]> {
- let Latency = 3;
- let ResourceCycles = [3, 1];
+def : WriteRes<WritePCmpIStrILd, [SBPort0,SBPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
}
// Packed Compare Explicit Length Strings, Return Index
@@ -224,22 +229,26 @@ def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
}
// AES Instructions.
-def : WriteRes<WriteAESDecEnc, [SBPort015]> {
- let Latency = 8;
- let ResourceCycles = [2];
+def : WriteRes<WriteAESDecEnc, [SBPort5,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
}
-def : WriteRes<WriteAESDecEncLd, [SBPort015, SBPort23]> {
- let Latency = 8;
- let ResourceCycles = [2, 1];
+def : WriteRes<WriteAESDecEncLd, [SBPort5,SBPort23,SBPort015]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
}
-def : WriteRes<WriteAESIMC, [SBPort015]> {
- let Latency = 8;
+def : WriteRes<WriteAESIMC, [SBPort5]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
let ResourceCycles = [2];
}
-def : WriteRes<WriteAESIMCLd, [SBPort015, SBPort23]> {
- let Latency = 8;
- let ResourceCycles = [2, 1];
+def : WriteRes<WriteAESIMCLd, [SBPort5,SBPort23]> {
+ let Latency = 18;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
}
def : WriteRes<WriteAESKeyGen, [SBPort015]> {
@@ -267,9 +276,2583 @@ def : WriteRes<WriteMicrocoded, [SBPort015]> { let Latency = 100; }
def : WriteRes<WriteFence, [SBPort23, SBPort4]>;
def : WriteRes<WriteNop, []>;
-// AVX2 is not supported on that architecture, but we should define the basic
+// AVX2 and FMA are not supported on Sandy Bridge, but we should define the basic
// scheduling resources anyway.
defm : SBWriteResPair<WriteFShuffle256, SBPort0, 1>;
defm : SBWriteResPair<WriteShuffle256, SBPort0, 1>;
defm : SBWriteResPair<WriteVarVecShift, SBPort0, 1>;
+defm : SBWriteResPair<WriteFMA, SBPort01, 5>;
+
+// Remaining SNB instrs.
+
+def SBWriteResGroup0 : SchedWriteRes<[SBPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup0], (instregex "CVTSS2SDrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSLLDri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSLLQri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSLLWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSRADri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSRAWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSRLDri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSRLQri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "PSRLWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VCVTSS2SDrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPMOVMSKBrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSLLDri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSLLQri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSLLWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSRADri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSRAWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSRLDri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSRLQri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VPSRLWri")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDYrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VTESTPDrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSYrr")>;
+def: InstRW<[SBWriteResGroup0], (instregex "VTESTPSrr")>;
+
+def SBWriteResGroup1 : SchedWriteRes<[SBPort1]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup1], (instregex "COMP_FST0r")>;
+def: InstRW<[SBWriteResGroup1], (instregex "COM_FST0r")>;
+def: InstRW<[SBWriteResGroup1], (instregex "UCOM_FPr")>;
+def: InstRW<[SBWriteResGroup1], (instregex "UCOM_Fr")>;
+
+def SBWriteResGroup2 : SchedWriteRes<[SBPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup2], (instregex "ANDNPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "ANDNPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "ANDPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "ANDPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "FDECSTP")>;
+def: InstRW<[SBWriteResGroup2], (instregex "FFREE")>;
+def: InstRW<[SBWriteResGroup2], (instregex "FINCSTP")>;
+def: InstRW<[SBWriteResGroup2], (instregex "FNOP")>;
+def: InstRW<[SBWriteResGroup2], (instregex "INSERTPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JAE_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JAE_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JA_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JA_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JBE_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JBE_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JB_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JB_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JE_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JE_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JGE_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JGE_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JG_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JG_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JLE_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JLE_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JL_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JL_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JMP64r")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JMP_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JMP_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JNE_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JNE_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JNO_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JNO_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JNP_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JNP_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JNS_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JNS_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JO_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JO_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JP_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JP_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JS_1")>;
+def: InstRW<[SBWriteResGroup2], (instregex "JS_4")>;
+def: InstRW<[SBWriteResGroup2], (instregex "LD_Frr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "LOOP")>;
+def: InstRW<[SBWriteResGroup2], (instregex "LOOPE")>;
+def: InstRW<[SBWriteResGroup2], (instregex "LOOPNE")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOV64toPQIrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVAPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVAPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVDDUPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVDI2PDIrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVHLPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVLHPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVSDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVSHDUPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVSLDUPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVSSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVUPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "MOVUPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "ORPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "ORPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "RETQ")>;
+def: InstRW<[SBWriteResGroup2], (instregex "SHUFPDrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "SHUFPSrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "ST_FPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "ST_Frr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "UNPCKHPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "UNPCKHPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "UNPCKLPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "UNPCKLPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDNPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDNPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDNPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDNPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VANDPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VEXTRACTF128rr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VINSERTF128rr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VINSERTPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVAPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVDDUPYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVDDUPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVDI2PDIrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVHLPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSHDUPYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSHDUPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSLDUPYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSLDUPrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVSSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VMOVUPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VORPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VORPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VORPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VORPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERM2F128rr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDYri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSYri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VPERMILPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPDYrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPDrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPSYrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VSHUFPSrri")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKHPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VUNPCKLPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VXORPDYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VXORPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VXORPSYrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "VXORPSrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "XORPDrr")>;
+def: InstRW<[SBWriteResGroup2], (instregex "XORPSrr")>;
+
+def SBWriteResGroup3 : SchedWriteRes<[SBPort01]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup3], (instregex "LEA(16|32|64)(_32)?r")>;
+
+def SBWriteResGroup4 : SchedWriteRes<[SBPort05]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup4], (instregex "BLENDPDrri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "BLENDPSrri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)ri8")>;
+def: InstRW<[SBWriteResGroup4], (instregex "BT(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "BTC(16|32|64)ri8")>;
+def: InstRW<[SBWriteResGroup4], (instregex "BTC(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "BTR(16|32|64)ri8")>;
+def: InstRW<[SBWriteResGroup4], (instregex "BTR(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "BTS(16|32|64)ri8")>;
+def: InstRW<[SBWriteResGroup4], (instregex "BTS(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "CDQ")>;
+def: InstRW<[SBWriteResGroup4], (instregex "CQO")>;
+def: InstRW<[SBWriteResGroup4], (instregex "LAHF")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SAHF")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SAR(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SAR8ri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETAEr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETBr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETEr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETGEr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETGr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETLEr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETLr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETNEr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETNOr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETNPr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETNSr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETOr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETPr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SETSr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SHL(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SHL(16|32|64)r1")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SHL8r1")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SHL8ri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SHR(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "SHR8ri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPDYrri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPDrri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPSYrri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VBLENDPSrri")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQAYrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQArr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQUYrr")>;
+def: InstRW<[SBWriteResGroup4], (instregex "VMOVDQUrr")>;
+
+def SBWriteResGroup5 : SchedWriteRes<[SBPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSBrr64")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSDrr64")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PABSWrr64")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PADDQirr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PALIGNR64irr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSHUFBrr64")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNBrr64")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNDrr64")>;
+def: InstRW<[SBWriteResGroup5], (instregex "MMX_PSIGNWrr64")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PABSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PABSDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PABSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PACKSSDWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PACKSSWBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PACKUSDWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PACKUSWBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PADDBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PADDDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PADDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PADDSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PADDSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PADDUSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PADDUSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PADDWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PALIGNRrri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PAVGBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PAVGWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PBLENDWrri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PCMPEQWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PCMPGTWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMAXSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMAXSDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMAXSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMAXUBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMAXUDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMAXUWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMINSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMINSDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMINSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMINUBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMINUDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMINUWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXBWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXWDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVSXWQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXBWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXWDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PMOVZXWQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSHUFBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSHUFDri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSHUFHWri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSHUFLWri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSIGNBrr128")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSIGNDrr128")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSIGNWrr128")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSLLDQri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSRLDQri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSUBBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSUBDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSUBQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSUBSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSUBSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSUBUSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSUBUSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PSUBWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHBWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHQDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKHWDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLBWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLQDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "PUNPCKLWDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPABSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPABSDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPABSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPACKSSDWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPACKSSWBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPACKUSDWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPACKUSWBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPADDBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPADDDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPADDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPADDSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPADDSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPADDUSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPADDUSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPADDWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPALIGNRrri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPAVGBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPAVGWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPBLENDWrri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPCMPEQWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPCMPGTWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMAXSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMAXUWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMINSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMINSDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMINSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMINUBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMINUDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMINUWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXBWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXWDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVSXWQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXBWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXWDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPMOVZXWQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFDri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFHWri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSHUFLWri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNBrr128")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNDrr128")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSIGNWrr128")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSLLDQri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSRLDQri")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSUBBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSUBDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSUBQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSUBSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSUBSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSUBUSBrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSUBUSWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPSUBWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHBWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHQDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKHWDrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLBWrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLQDQrr")>;
+def: InstRW<[SBWriteResGroup5], (instregex "VPUNPCKLWDrr")>;
+
+def SBWriteResGroup6 : SchedWriteRes<[SBPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup6], (instregex "ADD(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "ADD(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "ADD8i8")>;
+def: InstRW<[SBWriteResGroup6], (instregex "ADD8ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "ADD8rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "AND(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "AND(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "AND8i8")>;
+def: InstRW<[SBWriteResGroup6], (instregex "AND8ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "AND8rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "CBW")>;
+def: InstRW<[SBWriteResGroup6], (instregex "CMC")>;
+def: InstRW<[SBWriteResGroup6], (instregex "CMP(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "CMP(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "CMP8i8")>;
+def: InstRW<[SBWriteResGroup6], (instregex "CMP8ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "CMP8rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "CWDE")>;
+def: InstRW<[SBWriteResGroup6], (instregex "DEC(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup6], (instregex "DEC8r")>;
+def: InstRW<[SBWriteResGroup6], (instregex "INC(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup6], (instregex "INC8r")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOV(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOV8ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOV8rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVDQArr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVDQUrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVPQI2QIrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr16")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr32")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVSX(16|32|64)rr8")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVZX(16|32|64)rr16")>;
+def: InstRW<[SBWriteResGroup6], (instregex "MOVZX(16|32|64)rr8")>;
+def: InstRW<[SBWriteResGroup6], (instregex "NEG(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup6], (instregex "NEG8r")>;
+def: InstRW<[SBWriteResGroup6], (instregex "NOT(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup6], (instregex "NOT8r")>;
+def: InstRW<[SBWriteResGroup6], (instregex "OR(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "OR(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "OR8i8")>;
+def: InstRW<[SBWriteResGroup6], (instregex "OR8ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "OR8rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "PANDNrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "PANDrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "PORrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "PXORrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "STC")>;
+def: InstRW<[SBWriteResGroup6], (instregex "SUB(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "SUB(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "SUB8i8")>;
+def: InstRW<[SBWriteResGroup6], (instregex "SUB8ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "SUB8rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "TEST(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "TEST8i8")>;
+def: InstRW<[SBWriteResGroup6], (instregex "TEST8ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "TEST8rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VMOVPQI2QIrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VMOVZPQILo2PQIrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VPANDNrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VPANDrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VPORrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "VPXORrr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "XOR(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "XOR(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup6], (instregex "XOR8i8")>;
+def: InstRW<[SBWriteResGroup6], (instregex "XOR8ri")>;
+def: InstRW<[SBWriteResGroup6], (instregex "XOR8rr")>;
+
+def SBWriteResGroup7 : SchedWriteRes<[SBPort0]> {
+ let Latency = 2;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup7], (instregex "MOVMSKPDrr")>;
+def: InstRW<[SBWriteResGroup7], (instregex "MOVMSKPSrr")>;
+def: InstRW<[SBWriteResGroup7], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[SBWriteResGroup7], (instregex "MOVPQIto64rr")>;
+def: InstRW<[SBWriteResGroup7], (instregex "PMOVMSKBrr")>;
+def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPDYrr")>;
+def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPDrr")>;
+def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPSYrr")>;
+def: InstRW<[SBWriteResGroup7], (instregex "VMOVMSKPSrr")>;
+def: InstRW<[SBWriteResGroup7], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[SBWriteResGroup7], (instregex "VMOVPQIto64rr")>;
+
+def SBWriteResGroup9 : SchedWriteRes<[SBPort05]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPDrr0")>;
+def: InstRW<[SBWriteResGroup9], (instregex "BLENDVPSrr0")>;
+def: InstRW<[SBWriteResGroup9], (instregex "ROL(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup9], (instregex "ROL8ri")>;
+def: InstRW<[SBWriteResGroup9], (instregex "ROR(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup9], (instregex "ROR8ri")>;
+def: InstRW<[SBWriteResGroup9], (instregex "SETAr")>;
+def: InstRW<[SBWriteResGroup9], (instregex "SETBEr")>;
+def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPDYrr")>;
+def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPDrr")>;
+def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPSYrr")>;
+def: InstRW<[SBWriteResGroup9], (instregex "VBLENDVPSrr")>;
+
+def SBWriteResGroup10 : SchedWriteRes<[SBPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SBWriteResGroup10], (instregex "VPBLENDVBrr")>;
+
+def SBWriteResGroup11 : SchedWriteRes<[SBPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SBWriteResGroup11], (instregex "SCASB")>;
+def: InstRW<[SBWriteResGroup11], (instregex "SCASL")>;
+def: InstRW<[SBWriteResGroup11], (instregex "SCASQ")>;
+def: InstRW<[SBWriteResGroup11], (instregex "SCASW")>;
+
+def SBWriteResGroup12 : SchedWriteRes<[SBPort0,SBPort1]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup12], (instregex "COMISDrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "COMISSrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "UCOMISDrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "UCOMISSrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "VCOMISDrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "VCOMISSrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "VUCOMISDrr")>;
+def: InstRW<[SBWriteResGroup12], (instregex "VUCOMISSrr")>;
+
+def SBWriteResGroup13 : SchedWriteRes<[SBPort0,SBPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup13], (instregex "CVTPS2PDrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "PTESTrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "VCVTPS2PDrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "VPTESTYrr")>;
+def: InstRW<[SBWriteResGroup13], (instregex "VPTESTrr")>;
+
+def SBWriteResGroup14 : SchedWriteRes<[SBPort0,SBPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup14], (instregex "PSLLDrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "PSLLQrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "PSLLWrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "PSRADrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "PSRAWrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "PSRLDrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "PSRLQrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "PSRLWrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "VPSLLDrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "VPSLLQrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "VPSLLWrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "VPSRADrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "VPSRAWrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "VPSRLDrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "VPSRLQrr")>;
+def: InstRW<[SBWriteResGroup14], (instregex "VPSRLWrr")>;
+
+def SBWriteResGroup15 : SchedWriteRes<[SBPort0,SBPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup15], (instregex "CWD")>;
+def: InstRW<[SBWriteResGroup15], (instregex "FNSTSW16r")>;
+
+def SBWriteResGroup16 : SchedWriteRes<[SBPort1,SBPort05]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup16], (instregex "BSWAP(16|32|64)r")>;
+
+def SBWriteResGroup17 : SchedWriteRes<[SBPort5,SBPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup17], (instregex "PINSRBrr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "PINSRDrr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "PINSRQrr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "PINSRWrri")>;
+def: InstRW<[SBWriteResGroup17], (instregex "VPINSRBrr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "VPINSRDrr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "VPINSRQrr")>;
+def: InstRW<[SBWriteResGroup17], (instregex "VPINSRWrri")>;
+
+def SBWriteResGroup18 : SchedWriteRes<[SBPort5,SBPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup18], (instregex "JRCXZ")>;
+def: InstRW<[SBWriteResGroup18], (instregex "MMX_MOVDQ2Qrr")>;
+
+def SBWriteResGroup19 : SchedWriteRes<[SBPort05,SBPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup19], (instregex "ADC(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ADC(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ADC8ri")>;
+def: InstRW<[SBWriteResGroup19], (instregex "ADC8rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVAE(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVB(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVE(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVG(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVGE(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVL(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVLE(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVNE(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVNO(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVNP(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVNS(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVO(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVP(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "CMOVS(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SBB(16|32|64)ri")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SBB(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SBB8ri")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SBB8rr")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SHLD(16|32|64)rri8")>;
+def: InstRW<[SBWriteResGroup19], (instregex "SHRD(16|32|64)rri8")>;
+
+def SBWriteResGroup20 : SchedWriteRes<[SBPort0]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMADDUBSWrr64")>;
+def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMULHRSWrr64")>;
+def: InstRW<[SBWriteResGroup20], (instregex "MMX_PMULUDQirr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "PMADDUBSWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "PMADDWDrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "PMULDQrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "PMULHRSWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "PMULHUWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "PMULHWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "PMULLDrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "PMULLWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "PMULUDQrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "PSADBWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VPMADDUBSWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VPMADDWDrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VPMULDQrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VPMULHRSWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VPMULHUWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VPMULHWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VPMULLDrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VPMULLWrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VPMULUDQrr")>;
+def: InstRW<[SBWriteResGroup20], (instregex "VPSADBWrr")>;
+
+def SBWriteResGroup21 : SchedWriteRes<[SBPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup21], (instregex "ADDPDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ADDPSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ADDSDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ADDSSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ADDSUBPDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ADDSUBPSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ADD_FPrST0")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ADD_FST0r")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ADD_FrST0")>;
+def: InstRW<[SBWriteResGroup21], (instregex "BSF(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "BSR(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "CMPPDrri")>;
+def: InstRW<[SBWriteResGroup21], (instregex "CMPPSrri")>;
+def: InstRW<[SBWriteResGroup21], (instregex "CMPSDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "CMPSSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "CRC32r(16|32|64)r8")>;
+def: InstRW<[SBWriteResGroup21], (instregex "CRC32r(16|32|64)r64")>;
+def: InstRW<[SBWriteResGroup21], (instregex "CVTDQ2PSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "CVTPS2DQrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "CVTTPS2DQrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)PDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)PSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)SDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MAX(C?)SSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)PDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)PSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)SDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MIN(C?)SSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPI2PSirr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTPS2PIirr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MMX_CVTTPS2PIirr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "MUL8r")>;
+def: InstRW<[SBWriteResGroup21], (instregex "POPCNT(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "PUSHFS64")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ROUNDPDr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ROUNDPSr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ROUNDSDr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "ROUNDSSr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SUBPDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SUBPSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FPrST0")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FST0r")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SUBR_FrST0")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SUBSDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SUBSSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SUB_FPrST0")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SUB_FST0r")>;
+def: InstRW<[SBWriteResGroup21], (instregex "SUB_FrST0")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VADDPDYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VADDPDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VADDPSYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VADDPSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VADDSDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VADDSSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPDYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPSYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VADDSUBPSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCMPPDYrri")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCMPPDrri")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCMPPSYrri")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCMPPSrri")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCMPSDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCMPSSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCVTDQ2PSYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCVTDQ2PSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCVTPS2DQYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCVTPS2DQrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCVTTPS2DQYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VCVTTPS2DQrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PDYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PSYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)PSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)SDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMAX(C?)SSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PDYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PSYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)PSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)SDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VMIN(C?)SSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VROUNDPDr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VROUNDPSr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VROUNDSDr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VROUNDSSr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VROUNDYPDr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VROUNDYPSr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VSUBPDYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VSUBPDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSYrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VSUBPSrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VSUBSDrr")>;
+def: InstRW<[SBWriteResGroup21], (instregex "VSUBSSrr")>;
+
+def SBWriteResGroup22 : SchedWriteRes<[SBPort0,SBPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup22], (instregex "EXTRACTPSrr")>;
+def: InstRW<[SBWriteResGroup22], (instregex "VEXTRACTPSrr")>;
+
+def SBWriteResGroup23 : SchedWriteRes<[SBPort0,SBPort15]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup23], (instregex "PEXTRBrr")>;
+def: InstRW<[SBWriteResGroup23], (instregex "PEXTRDrr")>;
+def: InstRW<[SBWriteResGroup23], (instregex "PEXTRQrr")>;
+def: InstRW<[SBWriteResGroup23], (instregex "PEXTRWri")>;
+def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRBrr")>;
+def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRDrr")>;
+def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRQrr")>;
+def: InstRW<[SBWriteResGroup23], (instregex "VPEXTRWri")>;
+
+def SBWriteResGroup23_2 : SchedWriteRes<[SBPort05]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SBWriteResGroup23_2], (instregex "ROL(16|32|64)rCL")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "ROL8rCL")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "ROR(16|32|64)rCL")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "ROR8rCL")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "SAR(16|32|64)rCL")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "SAR8rCL")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "SHL(16|32|64)rCL")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "SHL8rCL")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "SHR(16|32|64)rCL")>;
+def: InstRW<[SBWriteResGroup23_2], (instregex "SHR8rCL")>;
+
+def SBWriteResGroup24 : SchedWriteRes<[SBPort15]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDSWrr64")>;
+def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDWrr64")>;
+def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHADDrr64")>;
+def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBDrr64")>;
+def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBSWrr64")>;
+def: InstRW<[SBWriteResGroup24], (instregex "MMX_PHSUBWrr64")>;
+def: InstRW<[SBWriteResGroup24], (instregex "PHADDDrr")>;
+def: InstRW<[SBWriteResGroup24], (instregex "PHADDSWrr128")>;
+def: InstRW<[SBWriteResGroup24], (instregex "PHADDWrr")>;
+def: InstRW<[SBWriteResGroup24], (instregex "PHSUBDrr")>;
+def: InstRW<[SBWriteResGroup24], (instregex "PHSUBSWrr128")>;
+def: InstRW<[SBWriteResGroup24], (instregex "PHSUBWrr")>;
+def: InstRW<[SBWriteResGroup24], (instregex "VPHADDDrr")>;
+def: InstRW<[SBWriteResGroup24], (instregex "VPHADDSWrr128")>;
+def: InstRW<[SBWriteResGroup24], (instregex "VPHADDWrr")>;
+def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBDrr")>;
+def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBSWrr128")>;
+def: InstRW<[SBWriteResGroup24], (instregex "VPHSUBWrr")>;
+
+def SBWriteResGroup25 : SchedWriteRes<[SBPort015]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SBWriteResGroup25], (instregex "ADC8i8")>;
+def: InstRW<[SBWriteResGroup25], (instregex "LEAVE64")>;
+def: InstRW<[SBWriteResGroup25], (instregex "OUT32rr")>;
+def: InstRW<[SBWriteResGroup25], (instregex "OUT8rr")>;
+def: InstRW<[SBWriteResGroup25], (instregex "SBB8i8")>;
+def: InstRW<[SBWriteResGroup25], (instregex "XADD(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup25], (instregex "XADD8rr")>;
+
+def SBWriteResGroup25_2 : SchedWriteRes<[SBPort5,SBPort05]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVBE_F")>;
+def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVB_F")>;
+def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVE_F")>;
+def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNBE_F")>;
+def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNB_F")>;
+def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNE_F")>;
+def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVNP_F")>;
+def: InstRW<[SBWriteResGroup25_2], (instregex "CMOVP_F")>;
+
+def SBWriteResGroup26 : SchedWriteRes<[SBPort05,SBPort015]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup26], (instregex "CMOVA(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup26], (instregex "CMOVBE(16|32|64)rr")>;
+
+def SBWriteResGroup26_2 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup26_2], (instregex "COM_FIPr")>;
+def: InstRW<[SBWriteResGroup26_2], (instregex "COM_FIr")>;
+def: InstRW<[SBWriteResGroup26_2], (instregex "UCOM_FIPr")>;
+def: InstRW<[SBWriteResGroup26_2], (instregex "UCOM_FIr")>;
+
+def SBWriteResGroup27 : SchedWriteRes<[SBPort0,SBPort1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup27], (instregex "MUL(16|32|64)r")>;
+
+def SBWriteResGroup28 : SchedWriteRes<[SBPort1,SBPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup28], (instregex "CVTDQ2PDrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "CVTPD2DQrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "CVTPD2PSrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "CVTSD2SSrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "CVTSI642SDrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "CVTSI2SDrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "CVTTPD2DQrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTPD2PIirr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTPI2PDirr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "MMX_CVTTPD2PIirr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTDQ2PDYrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTDQ2PDrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2DQYrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2DQrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2PSYrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTPD2PSrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTSD2SSrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTSI642SDrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTSI2SDrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTTPD2DQYrr")>;
+def: InstRW<[SBWriteResGroup28], (instregex "VCVTTPD2DQrr")>;
+
+def SBWriteResGroup29 : SchedWriteRes<[SBPort1,SBPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup29], (instregex "MOV64sr")>;
+
+def SBWriteResGroup29_2 : SchedWriteRes<[SBPort5,SBPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup29_2], (instregex "OUT32ir")>;
+def: InstRW<[SBWriteResGroup29_2], (instregex "OUT8ir")>;
+def: InstRW<[SBWriteResGroup29_2], (instregex "PAUSE")>;
+
+def SBWriteResGroup29_3 : SchedWriteRes<[SBPort05,SBPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+def: InstRW<[SBWriteResGroup29_3], (instregex "SHLD(16|32|64)rrCL")>;
+def: InstRW<[SBWriteResGroup29_3], (instregex "SHRD(16|32|64)rrCL")>;
+
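+// 5-cycle single-uop Port0 operations: FP multiply, reciprocal/rsqrt estimates,
+// PCMPGTQ and PHMINPOSUW (register forms).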
+def SBWriteResGroup30 : SchedWriteRes<[SBPort0]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup30], (instregex "MULPDrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "MULPSrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "MULSDrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "MULSSrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "MUL_FPrST0")>;
+def: InstRW<[SBWriteResGroup30], (instregex "MUL_FST0r")>;
+def: InstRW<[SBWriteResGroup30], (instregex "MUL_FrST0")>;
+def: InstRW<[SBWriteResGroup30], (instregex "PCMPGTQrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "PHMINPOSUWrr128")>;
+def: InstRW<[SBWriteResGroup30], (instregex "RCPPSr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "RCPSSr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "RSQRTPSr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "RSQRTSSr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VMULPDYrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VMULPDrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VMULPSYrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VMULPSrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VMULSDrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VMULSSrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VPCMPGTQrr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VPHMINPOSUWrr128")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VRCPPSr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VRCPSSr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VRSQRTPSr")>;
+def: InstRW<[SBWriteResGroup30], (instregex "VRSQRTSSr")>;
+
+def SBWriteResGroup31 : SchedWriteRes<[SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup31], (instregex "MOV(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOV8rm")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm16")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm32")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVSX(16|32|64)rm8")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVZX(16|32|64)rm16")>;
+def: InstRW<[SBWriteResGroup31], (instregex "MOVZX(16|32|64)rm8")>;
+def: InstRW<[SBWriteResGroup31], (instregex "PREFETCH")>;
+
+def SBWriteResGroup32 : SchedWriteRes<[SBPort0,SBPort1]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup32], (instregex "CVTSD2SI64rr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "CVTSD2SIrr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "CVTSS2SI64rr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "CVTSS2SIrr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "CVTTSD2SI64rr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "CVTTSD2SIrr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "CVTTSS2SI64rr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "CVTTSS2SIrr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "VCVTSD2SIrr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "VCVTSS2SI64rr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "VCVTSS2SIrr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSD2SIrr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSS2SI64rr")>;
+def: InstRW<[SBWriteResGroup32], (instregex "VCVTTSS2SIrr")>;
+
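+// Plain stores: one store-address uop on Port23 plus one store-data uop on Port4.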
+def SBWriteResGroup33 : SchedWriteRes<[SBPort4,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup33], (instregex "MOV(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOV8mr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVAPDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVAPSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVDQAmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVDQUmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVHPDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVHPSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVLPDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVLPSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVNTDQmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVNTI_64mr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVNTImr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVNTPDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVNTPSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVPDI2DImr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVPQI2QImr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVPQIto64mr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVSDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVSSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVUPDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "MOVUPSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "PUSH64i8")>;
+def: InstRW<[SBWriteResGroup33], (instregex "PUSH(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPDYmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPSYmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVAPSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQAYmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQAmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQUYmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVDQUmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVHPDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVHPSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVLPDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVLPSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTDQYmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTDQmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPDYmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPSYmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVNTPSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVPDI2DImr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVPQI2QImr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVPQIto64mr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVSDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVSSmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPDYmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPDmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPSYmr")>;
+def: InstRW<[SBWriteResGroup33], (instregex "VMOVUPSmr")>;
+
+def SBWriteResGroup34 : SchedWriteRes<[SBPort0,SBPort15]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup34], (instregex "MPSADBWrri")>;
+def: InstRW<[SBWriteResGroup34], (instregex "VMPSADBWrri")>;
+
+def SBWriteResGroup35 : SchedWriteRes<[SBPort1,SBPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup35], (instregex "CLI")>;
+def: InstRW<[SBWriteResGroup35], (instregex "CVTSI642SSrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "CVTSI2SSrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "HADDPDrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "HADDPSrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "HSUBPDrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "HSUBPSrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VCVTSI642SSrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VCVTSI2SSrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VHADDPDYrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VHADDPDrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VHADDPSYrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VHADDPSrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPDYrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPDrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPSYrr")>;
+def: InstRW<[SBWriteResGroup35], (instregex "VHSUBPSrr")>;
+
+def SBWriteResGroup35_2 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP16m")>;
+def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP32m")>;
+def: InstRW<[SBWriteResGroup35_2], (instregex "ISTT_FP64m")>;
+def: InstRW<[SBWriteResGroup35_2], (instregex "PUSHGS64")>;
+
+def SBWriteResGroup36 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup36], (instregex "CALL64pcrel32")>;
+def: InstRW<[SBWriteResGroup36], (instregex "CALL(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup36], (instregex "EXTRACTPSmr")>;
+def: InstRW<[SBWriteResGroup36], (instregex "VEXTRACTPSmr")>;
+
+def SBWriteResGroup37 : SchedWriteRes<[SBPort4,SBPort01,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPDYmr")>;
+def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPSYmr")>;
+def: InstRW<[SBWriteResGroup37], (instregex "VMASKMOVPSmr")>;
+
+def SBWriteResGroup38 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup38], (instregex "SETAEm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETBm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETEm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETGEm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETGm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETLEm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETLm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETNEm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETNOm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETNPm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETNSm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETOm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETPm")>;
+def: InstRW<[SBWriteResGroup38], (instregex "SETSm")>;
+
+def SBWriteResGroup39 : SchedWriteRes<[SBPort4,SBPort23,SBPort15]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup39], (instregex "PEXTRBmr")>;
+def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRBmr")>;
+def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRDmr")>;
+def: InstRW<[SBWriteResGroup39], (instregex "VPEXTRWmr")>;
+
+def SBWriteResGroup40 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup40], (instregex "MOV8mi")>;
+def: InstRW<[SBWriteResGroup40], (instregex "STOSB")>;
+def: InstRW<[SBWriteResGroup40], (instregex "STOSL")>;
+def: InstRW<[SBWriteResGroup40], (instregex "STOSQ")>;
+def: InstRW<[SBWriteResGroup40], (instregex "STOSW")>;
+
+def SBWriteResGroup41 : SchedWriteRes<[SBPort5,SBPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup41], (instregex "FNINIT")>;
+
+def SBWriteResGroup42 : SchedWriteRes<[SBPort05,SBPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG(16|32|64)rr")>;
+def: InstRW<[SBWriteResGroup42], (instregex "CMPXCHG8rr")>;
+
+def SBWriteResGroup43 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup43], (instregex "SETAm")>;
+def: InstRW<[SBWriteResGroup43], (instregex "SETBEm")>;
+
+def SBWriteResGroup44 : SchedWriteRes<[SBPort0,SBPort4,SBPort5,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SBWriteResGroup44], (instregex "LDMXCSR")>;
+def: InstRW<[SBWriteResGroup44], (instregex "STMXCSR")>;
+def: InstRW<[SBWriteResGroup44], (instregex "VLDMXCSR")>;
+def: InstRW<[SBWriteResGroup44], (instregex "VSTMXCSR")>;
+
+def SBWriteResGroup45 : SchedWriteRes<[SBPort0,SBPort4,SBPort23,SBPort15]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SBWriteResGroup45], (instregex "PEXTRDmr")>;
+def: InstRW<[SBWriteResGroup45], (instregex "PEXTRQmr")>;
+def: InstRW<[SBWriteResGroup45], (instregex "VPEXTRQmr")>;
+def: InstRW<[SBWriteResGroup45], (instregex "PUSHF16")>;
+def: InstRW<[SBWriteResGroup45], (instregex "PUSHF64")>;
+
+def SBWriteResGroup46 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SBWriteResGroup46], (instregex "CLFLUSH")>;
+
+def SBWriteResGroup47 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SBWriteResGroup47], (instregex "FXRSTOR")>;
+
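+// Single-uop vector and GPR-to-XMM loads (plus POP): one Port23 uop, 6-cycle latency.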
+def SBWriteResGroup48 : SchedWriteRes<[SBPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup48], (instregex "LDDQUrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOV64toPQIrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVAPDrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVAPSrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVDDUPrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVDI2PDIrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVDQArm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVDQUrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVNTDQArm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVQI2PQIrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVSDrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVSHDUPrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVSLDUPrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVSSrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVUPDrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "MOVUPSrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "POP(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VLDDQUYrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VLDDQUrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOV64toPQIrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVAPDrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVAPSrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVDDUPrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVDI2PDIrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVDQArm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVDQUrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVNTDQArm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVQI2PQIrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVSDrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVSHDUPrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVSLDUPrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVSSrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVUPDrm")>;
+def: InstRW<[SBWriteResGroup48], (instregex "VMOVUPSrm")>;
+
+def SBWriteResGroup49 : SchedWriteRes<[SBPort5,SBPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup49], (instregex "JMP(16|32|64)m")>;
+def: InstRW<[SBWriteResGroup49], (instregex "MOV16sm")>;
+
+def SBWriteResGroup50 : SchedWriteRes<[SBPort23,SBPort05]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup50], (instregex "BT(16|32|64)mi8")>;
+
+def SBWriteResGroup51 : SchedWriteRes<[SBPort23,SBPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSBrm64")>;
+def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSDrm64")>;
+def: InstRW<[SBWriteResGroup51], (instregex "MMX_PABSWrm64")>;
+def: InstRW<[SBWriteResGroup51], (instregex "MMX_PALIGNR64irm")>;
+def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSHUFBrm64")>;
+def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNBrm64")>;
+def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNDrm64")>;
+def: InstRW<[SBWriteResGroup51], (instregex "MMX_PSIGNWrm64")>;
+
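+// Integer ALU reg,mem forms: a load uop on Port23 plus one ALU uop on Port015.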
+def SBWriteResGroup52 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup52], (instregex "ADD(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "ADD8rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "AND(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "AND8rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup52], (instregex "CMP(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "CMP8mi")>;
+def: InstRW<[SBWriteResGroup52], (instregex "CMP8mr")>;
+def: InstRW<[SBWriteResGroup52], (instregex "CMP8rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "LODSL")>;
+def: InstRW<[SBWriteResGroup52], (instregex "LODSQ")>;
+def: InstRW<[SBWriteResGroup52], (instregex "OR(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "OR8rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "SUB(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "SUB8rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "XOR(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup52], (instregex "XOR8rm")>;
+
+def SBWriteResGroup53 : SchedWriteRes<[SBPort4,SBPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup53], (instregex "ST_F32m")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ST_F64m")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ST_FP32m")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ST_FP64m")>;
+def: InstRW<[SBWriteResGroup53], (instregex "ST_FP80m")>;
+
+def SBWriteResGroup54 : SchedWriteRes<[SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSDYrm")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VBROADCASTSSYrm")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VMOVAPDYrm")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VMOVAPSYrm")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VMOVDDUPYrm")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VMOVDQAYrm")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VMOVDQUYrm")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VMOVSHDUPYrm")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VMOVSLDUPYrm")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VMOVUPDYrm")>;
+def: InstRW<[SBWriteResGroup54], (instregex "VMOVUPSYrm")>;
+
+def SBWriteResGroup55 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup55], (instregex "CVTPS2PDrm")>;
+def: InstRW<[SBWriteResGroup55], (instregex "CVTSS2SDrm")>;
+def: InstRW<[SBWriteResGroup55], (instregex "VCVTPS2PDYrm")>;
+def: InstRW<[SBWriteResGroup55], (instregex "VCVTPS2PDrm")>;
+def: InstRW<[SBWriteResGroup55], (instregex "VCVTSS2SDrm")>;
+def: InstRW<[SBWriteResGroup55], (instregex "VTESTPDrm")>;
+def: InstRW<[SBWriteResGroup55], (instregex "VTESTPSrm")>;
+
+def SBWriteResGroup56 : SchedWriteRes<[SBPort5,SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup56], (instregex "ANDNPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "ANDNPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "ANDPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "ANDPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "INSERTPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "MOVHPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "MOVHPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "MOVLPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "MOVLPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "ORPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "ORPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "SHUFPDrmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "SHUFPSrmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "UNPCKHPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "UNPCKHPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "UNPCKLPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "UNPCKLPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VANDNPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VANDNPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VANDPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VANDPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VBROADCASTF128")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VINSERTPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VMOVHPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VMOVHPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VMOVLPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VMOVLPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VORPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VORPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPDmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPSmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VPERMILPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VSHUFPDrmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VSHUFPSrmi")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKHPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKHPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKLPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VUNPCKLPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VXORPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "VXORPSrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "XORPDrm")>;
+def: InstRW<[SBWriteResGroup56], (instregex "XORPSrm")>;
+
+def SBWriteResGroup57 : SchedWriteRes<[SBPort5,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup57], (instregex "AESDECLASTrr")>;
+def: InstRW<[SBWriteResGroup57], (instregex "AESDECrr")>;
+def: InstRW<[SBWriteResGroup57], (instregex "AESENCLASTrr")>;
+def: InstRW<[SBWriteResGroup57], (instregex "AESENCrr")>;
+def: InstRW<[SBWriteResGroup57], (instregex "VAESDECLASTrr")>;
+def: InstRW<[SBWriteResGroup57], (instregex "VAESDECrr")>;
+def: InstRW<[SBWriteResGroup57], (instregex "VAESENCLASTrr")>;
+def: InstRW<[SBWriteResGroup57], (instregex "VAESENCrr")>;
+
+def SBWriteResGroup58 : SchedWriteRes<[SBPort23,SBPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup58], (instregex "BLENDPDrmi")>;
+def: InstRW<[SBWriteResGroup58], (instregex "BLENDPSrmi")>;
+def: InstRW<[SBWriteResGroup58], (instregex "VBLENDPDrmi")>;
+def: InstRW<[SBWriteResGroup58], (instregex "VBLENDPSrmi")>;
+def: InstRW<[SBWriteResGroup58], (instregex "VINSERTF128rm")>;
+
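+// Folded-load forms of single-uop Port15 vector-integer operations:
+// load uop on Port23 plus the ALU uop, 7-cycle latency.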
+def SBWriteResGroup59 : SchedWriteRes<[SBPort23,SBPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup59], (instregex "MMX_PADDQirm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PABSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PABSDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PABSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PACKSSDWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PACKSSWBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PACKUSDWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PACKUSWBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PADDBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PADDDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PADDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PADDSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PADDSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PADDUSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PADDUSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PADDWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PALIGNRrmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PAVGBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PAVGWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PBLENDWrmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PCMPEQWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PCMPGTWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PINSRBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PINSRDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PINSRQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PINSRWrmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMAXSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMAXSDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMAXSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMAXUBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMAXUDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMAXUWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMINSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMINSDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMINSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMINUBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMINUDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMINUWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXBWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXWDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVSXWQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXBWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXWDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PMOVZXWQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSHUFBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSHUFDmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSHUFHWmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSHUFLWmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSIGNBrm128")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSIGNDrm128")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSIGNWrm128")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSUBBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSUBDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSUBQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSUBSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSUBSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSUBUSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSUBUSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PSUBWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHBWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHQDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKHWDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLBWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLQDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "PUNPCKLWDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPABSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPABSDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPABSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPACKSSDWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPACKSSWBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPACKUSDWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPACKUSWBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPADDBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPADDDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPADDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPADDSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPADDSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPADDUSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPADDUSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPADDWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPALIGNRrmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPAVGBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPAVGWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPBLENDWrmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPCMPEQWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPCMPGTWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPINSRBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPINSRDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPINSRQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPINSRWrmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMAXSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMAXUWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMINSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMINSDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMINSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMINUBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMINUDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMINUWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXBWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXWDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVSXWQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXBWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXWDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPMOVZXWQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFDmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFHWmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSHUFLWmi")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNBrm128")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNDrm128")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSIGNWrm128")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSUBBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSUBDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSUBQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSUBSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSUBSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSUBUSBrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSUBUSWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPSUBWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHBWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHQDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKHWDrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLBWrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLQDQrm")>;
+def: InstRW<[SBWriteResGroup59], (instregex "VPUNPCKLWDrm")>;
+
+def SBWriteResGroup60 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup60], (instregex "PANDNrm")>;
+def: InstRW<[SBWriteResGroup60], (instregex "PANDrm")>;
+def: InstRW<[SBWriteResGroup60], (instregex "PORrm")>;
+def: InstRW<[SBWriteResGroup60], (instregex "PXORrm")>;
+def: InstRW<[SBWriteResGroup60], (instregex "VPANDNrm")>;
+def: InstRW<[SBWriteResGroup60], (instregex "VPANDrm")>;
+def: InstRW<[SBWriteResGroup60], (instregex "VPORrm")>;
+def: InstRW<[SBWriteResGroup60], (instregex "VPXORrm")>;
+
+def SBWriteResGroup61 : SchedWriteRes<[SBPort0,SBPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup61], (instregex "VRCPPSYr")>;
+def: InstRW<[SBWriteResGroup61], (instregex "VRSQRTPSYr")>;
+
+def SBWriteResGroup62 : SchedWriteRes<[SBPort5,SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup62], (instregex "VERRm")>;
+def: InstRW<[SBWriteResGroup62], (instregex "VERWm")>;
+
+def SBWriteResGroup63 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup63], (instregex "LODSB")>;
+def: InstRW<[SBWriteResGroup63], (instregex "LODSW")>;
+
+def SBWriteResGroup64 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup64], (instregex "FARJMP64")>;
+
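+// Memory forms of ADC/SBB and most CMOVcc: load uop plus one uop each on
+// Port05 and Port015.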
+def SBWriteResGroup65 : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup65], (instregex "ADC(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "ADC8rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVAE(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVB(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVE(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVG(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVGE(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVL(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVLE(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVNE(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVNO(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVNP(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVNS(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVO(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVP(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "CMOVS(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "SBB(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup65], (instregex "SBB8rm")>;
+
+def SBWriteResGroup66 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup66], (instregex "FNSTSWm")>;
+
+def SBWriteResGroup67 : SchedWriteRes<[SBPort1,SBPort5,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup67], (instregex "SLDT(16|32|64)r")>;
+def: InstRW<[SBWriteResGroup67], (instregex "STR(16|32|64)r")>;
+
+def SBWriteResGroup68 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup68], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[SBWriteResGroup68], (instregex "FNSTCW16m")>;
+
+def SBWriteResGroup69 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup69], (instregex "BTC(16|32|64)mi8")>;
+def: InstRW<[SBWriteResGroup69], (instregex "BTR(16|32|64)mi8")>;
+def: InstRW<[SBWriteResGroup69], (instregex "BTS(16|32|64)mi8")>;
+def: InstRW<[SBWriteResGroup69], (instregex "SAR(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup69], (instregex "SAR8mi")>;
+def: InstRW<[SBWriteResGroup69], (instregex "SHL(16|32|64)m1")>;
+def: InstRW<[SBWriteResGroup69], (instregex "SHL(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup69], (instregex "SHL8m1")>;
+def: InstRW<[SBWriteResGroup69], (instregex "SHL8mi")>;
+def: InstRW<[SBWriteResGroup69], (instregex "SHR(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup69], (instregex "SHR8mi")>;
+
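+// 4-uop memory-operand ALU group: load and store-address uops on Port23,
+// one ALU uop on Port015, and a store-data uop on Port4.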
+def SBWriteResGroup70 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup70], (instregex "ADD(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "ADD(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "ADD8mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "ADD8mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "AND(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "AND(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "AND8mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "AND8mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "DEC(16|32|64)m")>;
+def: InstRW<[SBWriteResGroup70], (instregex "DEC8m")>;
+def: InstRW<[SBWriteResGroup70], (instregex "INC(16|32|64)m")>;
+def: InstRW<[SBWriteResGroup70], (instregex "INC8m")>;
+def: InstRW<[SBWriteResGroup70], (instregex "NEG(16|32|64)m")>;
+def: InstRW<[SBWriteResGroup70], (instregex "NEG8m")>;
+def: InstRW<[SBWriteResGroup70], (instregex "NOT(16|32|64)m")>;
+def: InstRW<[SBWriteResGroup70], (instregex "NOT8m")>;
+def: InstRW<[SBWriteResGroup70], (instregex "OR(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "OR(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "OR8mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "OR8mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "SUB(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "SUB(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "SUB8mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "SUB8mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "TEST(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "TEST8mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "TEST8mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "XOR(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "XOR(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup70], (instregex "XOR8mi")>;
+def: InstRW<[SBWriteResGroup70], (instregex "XOR8mr")>;
+
+def SBWriteResGroup71 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup71], (instregex "MMX_PMADDUBSWrm64")>;
+def: InstRW<[SBWriteResGroup71], (instregex "MMX_PMULHRSWrm64")>;
+def: InstRW<[SBWriteResGroup71], (instregex "VTESTPDYrm")>;
+def: InstRW<[SBWriteResGroup71], (instregex "VTESTPSYrm")>;
+
+def SBWriteResGroup72 : SchedWriteRes<[SBPort1,SBPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup72], (instregex "BSF(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup72], (instregex "BSR(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup72], (instregex "CRC32r(16|32|64)m64")>;
+def: InstRW<[SBWriteResGroup72], (instregex "CRC32r(16|32|64)m8")>;
+def: InstRW<[SBWriteResGroup72], (instregex "FCOM32m")>;
+def: InstRW<[SBWriteResGroup72], (instregex "FCOM64m")>;
+def: InstRW<[SBWriteResGroup72], (instregex "FCOMP32m")>;
+def: InstRW<[SBWriteResGroup72], (instregex "FCOMP64m")>;
+def: InstRW<[SBWriteResGroup72], (instregex "MUL8m")>;
+
+def SBWriteResGroup73 : SchedWriteRes<[SBPort5,SBPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup73], (instregex "VANDNPDYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VANDNPSYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VANDPDYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VANDPSYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VORPDYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VORPSYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VPERM2F128rm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPDYmi")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPDYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPSYmi")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VPERMILPSYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VSHUFPDYrmi")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VSHUFPSYrmi")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKHPDYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKHPSYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKLPDYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VUNPCKLPSYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VXORPDYrm")>;
+def: InstRW<[SBWriteResGroup73], (instregex "VXORPSYrm")>;
+
+def SBWriteResGroup74 : SchedWriteRes<[SBPort23,SBPort05]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup74], (instregex "VBLENDPDYrmi")>;
+def: InstRW<[SBWriteResGroup74], (instregex "VBLENDPSYrmi")>;
+
+def SBWriteResGroup75 : SchedWriteRes<[SBPort23,SBPort05]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPDrm0")>;
+def: InstRW<[SBWriteResGroup75], (instregex "BLENDVPSrm0")>;
+def: InstRW<[SBWriteResGroup75], (instregex "VBLENDVPDrm")>;
+def: InstRW<[SBWriteResGroup75], (instregex "VBLENDVPSrm")>;
+def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPDrm")>;
+def: InstRW<[SBWriteResGroup75], (instregex "VMASKMOVPSrm")>;
+
+def SBWriteResGroup76 : SchedWriteRes<[SBPort23,SBPort15]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup76], (instregex "PBLENDVBrr0")>;
+def: InstRW<[SBWriteResGroup76], (instregex "VPBLENDVBrm")>;
+
+def SBWriteResGroup77 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup77], (instregex "COMISDrm")>;
+def: InstRW<[SBWriteResGroup77], (instregex "COMISSrm")>;
+def: InstRW<[SBWriteResGroup77], (instregex "UCOMISDrm")>;
+def: InstRW<[SBWriteResGroup77], (instregex "UCOMISSrm")>;
+def: InstRW<[SBWriteResGroup77], (instregex "VCOMISDrm")>;
+def: InstRW<[SBWriteResGroup77], (instregex "VCOMISSrm")>;
+def: InstRW<[SBWriteResGroup77], (instregex "VUCOMISDrm")>;
+def: InstRW<[SBWriteResGroup77], (instregex "VUCOMISSrm")>;
+
+def SBWriteResGroup78 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup78], (instregex "PTESTrm")>;
+def: InstRW<[SBWriteResGroup78], (instregex "VPTESTrm")>;
+
+def SBWriteResGroup79 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup79], (instregex "PSLLDrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "PSLLQrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "PSLLWrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "PSRADrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "PSRAWrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "PSRLDrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "PSRLQrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "PSRLWrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "VPSLLDrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "VPSLLQrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "VPSLLWrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "VPSRADrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "VPSRAWrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "VPSRLDrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "VPSRLQrm")>;
+def: InstRW<[SBWriteResGroup79], (instregex "VPSRLWrm")>;
+
+def SBWriteResGroup80 : SchedWriteRes<[SBPort23,SBPort15]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDSWrm64")>;
+def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDWrm64")>;
+def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHADDrm64")>;
+def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBDrm64")>;
+def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBSWrm64")>;
+def: InstRW<[SBWriteResGroup80], (instregex "MMX_PHSUBWrm64")>;
+
+def SBWriteResGroup81 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup81], (instregex "CMPXCHG8rm")>;
+
+def SBWriteResGroup82 : SchedWriteRes<[SBPort23,SBPort05,SBPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup82], (instregex "CMOVA(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup82], (instregex "CMOVBE(16|32|64)rm")>;
+
+def SBWriteResGroup83 : SchedWriteRes<[SBPort23,SBPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,3];
+}
+def: InstRW<[SBWriteResGroup83], (instregex "CMPSB")>;
+def: InstRW<[SBWriteResGroup83], (instregex "CMPSL")>;
+def: InstRW<[SBWriteResGroup83], (instregex "CMPSQ")>;
+def: InstRW<[SBWriteResGroup83], (instregex "CMPSW")>;
+
+def SBWriteResGroup84 : SchedWriteRes<[SBPort4,SBPort5,SBPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,2];
+}
+def: InstRW<[SBWriteResGroup84], (instregex "FLDCW16m")>;
+
+def SBWriteResGroup85 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,2];
+}
+def: InstRW<[SBWriteResGroup85], (instregex "ROL(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup85], (instregex "ROL8mi")>;
+def: InstRW<[SBWriteResGroup85], (instregex "ROR(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup85], (instregex "ROR8mi")>;
+
+def SBWriteResGroup86 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,2];
+}
+def: InstRW<[SBWriteResGroup86], (instregex "MOVSB")>;
+def: InstRW<[SBWriteResGroup86], (instregex "MOVSL")>;
+def: InstRW<[SBWriteResGroup86], (instregex "MOVSQ")>;
+def: InstRW<[SBWriteResGroup86], (instregex "MOVSW")>;
+def: InstRW<[SBWriteResGroup86], (instregex "XADD(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup86], (instregex "XADD8rm")>;
+
+def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SBWriteResGroup87], (instregex "FARCALL64")>;
+
+def SBWriteResGroup88 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SBWriteResGroup88], (instregex "SHLD(16|32|64)mri8")>;
+def: InstRW<[SBWriteResGroup88], (instregex "SHRD(16|32|64)mri8")>;
+
+def SBWriteResGroup89 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup89], (instregex "MMX_PMULUDQirm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "PMADDUBSWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "PMADDWDrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "PMULDQrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "PMULHRSWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "PMULHUWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "PMULHWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "PMULLDrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "PMULLWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "PMULUDQrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "PSADBWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VPMADDUBSWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VPMADDWDrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VPMULDQrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VPMULHRSWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VPMULHUWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VPMULHWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VPMULLDrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VPMULLWrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VPMULUDQrm")>;
+def: InstRW<[SBWriteResGroup89], (instregex "VPSADBWrm")>;
+
+def SBWriteResGroup90 : SchedWriteRes<[SBPort1,SBPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup90], (instregex "ADDPDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "ADDPSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "ADDSDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "ADDSSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "ADDSUBPDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "ADDSUBPSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CMPPDrmi")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CMPPSrmi")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CMPSDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CMPSSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTDQ2PSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTPS2DQrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTSI642SDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTSI2SDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "CVTTPS2DQrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)PDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)PSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)SDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MAX(C?)SSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)PDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)PSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)SDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MIN(C?)SSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTPI2PSirm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTPS2PIirm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "MMX_CVTTPS2PIirm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "POPCNT(16|32|64)rm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "ROUNDPDm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "ROUNDPSm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "ROUNDSDm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "ROUNDSSm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "SUBPDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "SUBPSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "SUBSDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "SUBSSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VADDPDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VADDPSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VADDSDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VADDSSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VADDSUBPDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VADDSUBPSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VCMPPDrmi")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VCMPPSrmi")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VCMPSDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VCMPSSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VCVTDQ2PSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VCVTPS2DQrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VCVTSI642SDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VCVTSI2SDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VCVTTPS2DQrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)PDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)PSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)SDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VMAX(C?)SSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)PDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)PSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)SDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VMIN(C?)SSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VROUNDPDm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VROUNDPSm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VROUNDSDm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VROUNDSSm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VSUBPDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VSUBPSrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VSUBSDrm")>;
+def: InstRW<[SBWriteResGroup90], (instregex "VSUBSSrm")>;
+
+def SBWriteResGroup91 : SchedWriteRes<[SBPort23,SBPort05]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SBWriteResGroup91], (instregex "VBLENDVPDYrm")>;
+def: InstRW<[SBWriteResGroup91], (instregex "VBLENDVPSYrm")>;
+def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPDYrm")>;
+def: InstRW<[SBWriteResGroup91], (instregex "VMASKMOVPSYrm")>;
+
+def SBWriteResGroup92 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup92], (instregex "DPPDrri")>;
+def: InstRW<[SBWriteResGroup92], (instregex "VDPPDrri")>;
+
+def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup93], (instregex "CVTSD2SI64rm")>;
+def: InstRW<[SBWriteResGroup93], (instregex "CVTSD2SIrm")>;
+def: InstRW<[SBWriteResGroup93], (instregex "CVTSS2SI64rm")>;
+def: InstRW<[SBWriteResGroup93], (instregex "CVTSS2SIrm")>;
+def: InstRW<[SBWriteResGroup93], (instregex "CVTTSD2SI64rm")>;
+def: InstRW<[SBWriteResGroup93], (instregex "CVTTSD2SIrm")>;
+def: InstRW<[SBWriteResGroup93], (instregex "CVTTSS2SI64rm")>;
+def: InstRW<[SBWriteResGroup93], (instregex "CVTTSS2SIrm")>;
+def: InstRW<[SBWriteResGroup93], (instregex "MUL(16|32|64)m")>;
+
+def SBWriteResGroup94 : SchedWriteRes<[SBPort0,SBPort5,SBPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup94], (instregex "VPTESTYrm")>;
+
+def SBWriteResGroup95 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup95], (instregex "LD_F32m")>;
+def: InstRW<[SBWriteResGroup95], (instregex "LD_F64m")>;
+def: InstRW<[SBWriteResGroup95], (instregex "LD_F80m")>;
+
+def SBWriteResGroup96 : SchedWriteRes<[SBPort23,SBPort15]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SBWriteResGroup96], (instregex "PHADDDrm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "PHADDSWrm128")>;
+def: InstRW<[SBWriteResGroup96], (instregex "PHADDWrm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "PHSUBDrm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "PHSUBSWrm128")>;
+def: InstRW<[SBWriteResGroup96], (instregex "PHSUBWrm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VPHADDDrm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VPHADDSWrm128")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VPHADDWrm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBDrm")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBSWrm128")>;
+def: InstRW<[SBWriteResGroup96], (instregex "VPHSUBWrm")>;
+
+def SBWriteResGroup97 : SchedWriteRes<[SBPort1,SBPort4,SBPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup97], (instregex "IST_F16m")>;
+def: InstRW<[SBWriteResGroup97], (instregex "IST_F32m")>;
+def: InstRW<[SBWriteResGroup97], (instregex "IST_FP16m")>;
+def: InstRW<[SBWriteResGroup97], (instregex "IST_FP32m")>;
+def: InstRW<[SBWriteResGroup97], (instregex "IST_FP64m")>;
+
+def SBWriteResGroup97_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,2,3];
+}
+def: InstRW<[SBWriteResGroup97_2], (instregex "ROL(16|32|64)mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "ROL8mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "ROR(16|32|64)mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "ROR8mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "SAR(16|32|64)mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "SAR8mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "SHL(16|32|64)mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "SHL8mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "SHR(16|32|64)mCL")>;
+def: InstRW<[SBWriteResGroup97_2], (instregex "SHR8mCL")>;
+
+def SBWriteResGroup98 : SchedWriteRes<[SBPort4,SBPort23,SBPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,2,3];
+}
+def: InstRW<[SBWriteResGroup98], (instregex "ADC(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup98], (instregex "ADC8mi")>;
+def: InstRW<[SBWriteResGroup98], (instregex "SBB(16|32|64)mi")>;
+def: InstRW<[SBWriteResGroup98], (instregex "SBB8mi")>;
+
+def SBWriteResGroup99 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,2,2,1];
+}
+def: InstRW<[SBWriteResGroup99], (instregex "ADC(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup99], (instregex "ADC8mr")>;
+def: InstRW<[SBWriteResGroup99], (instregex "SBB(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup99], (instregex "SBB8mr")>;
+
+def SBWriteResGroup100 : SchedWriteRes<[SBPort4,SBPort5,SBPort23,SBPort05,SBPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,2,1,1];
+}
+def: InstRW<[SBWriteResGroup100], (instregex "BT(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup100], (instregex "BTC(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup100], (instregex "BTR(16|32|64)mr")>;
+def: InstRW<[SBWriteResGroup100], (instregex "BTS(16|32|64)mr")>;
+
+def SBWriteResGroup101 : SchedWriteRes<[SBPort1,SBPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup101], (instregex "ADD_F32m")>;
+def: InstRW<[SBWriteResGroup101], (instregex "ADD_F64m")>;
+def: InstRW<[SBWriteResGroup101], (instregex "ILD_F16m")>;
+def: InstRW<[SBWriteResGroup101], (instregex "ILD_F32m")>;
+def: InstRW<[SBWriteResGroup101], (instregex "ILD_F64m")>;
+def: InstRW<[SBWriteResGroup101], (instregex "SUBR_F32m")>;
+def: InstRW<[SBWriteResGroup101], (instregex "SUBR_F64m")>;
+def: InstRW<[SBWriteResGroup101], (instregex "SUB_F32m")>;
+def: InstRW<[SBWriteResGroup101], (instregex "SUB_F64m")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VADDPDYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VADDPSYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VADDSUBPDYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VADDSUBPSYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VCMPPDYrmi")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VCMPPSYrmi")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VCVTDQ2PSYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VCVTPS2DQYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VCVTTPS2DQYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VMAX(C?)PDYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VMAX(C?)PSYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VMIN(C?)PDYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VMIN(C?)PSYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VROUNDYPDm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VROUNDYPSm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VSUBPDYrm")>;
+def: InstRW<[SBWriteResGroup101], (instregex "VSUBPSYrm")>;
+
+def SBWriteResGroup102 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup102], (instregex "VCVTSD2SI64rm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VCVTSD2SIrm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VCVTSS2SI64rm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VCVTSS2SIrm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSD2SI64rm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSD2SIrm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSS2SI64rm")>;
+def: InstRW<[SBWriteResGroup102], (instregex "VCVTTSS2SIrm")>;
+
+def SBWriteResGroup103 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup103], (instregex "CVTDQ2PDrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "CVTPD2DQrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "CVTPD2PSrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "CVTSD2SSrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "CVTSI642SSrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "CVTSI2SSrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "CVTTPD2DQrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTPD2PIirm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTPI2PDirm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "MMX_CVTTPD2PIirm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VCVTDQ2PDYrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VCVTDQ2PDrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VCVTPD2DQrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VCVTPD2PSrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VCVTSD2SSrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VCVTSI642SSrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VCVTSI2SSrm")>;
+def: InstRW<[SBWriteResGroup103], (instregex "VCVTTPD2DQrm")>;
+
+def SBWriteResGroup103_2 : SchedWriteRes<[SBPort4,SBPort23,SBPort05,SBPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,2,3,1];
+}
+def: InstRW<[SBWriteResGroup103_2], (instregex "SHLD(16|32|64)mrCL")>;
+def: InstRW<[SBWriteResGroup103_2], (instregex "SHRD(16|32|64)mrCL")>;
+
+def SBWriteResGroup104 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup104], (instregex "MULPDrm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "MULPSrm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "MULSDrm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "MULSSrm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "PCMPGTQrm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "PHMINPOSUWrm128")>;
+def: InstRW<[SBWriteResGroup104], (instregex "RCPPSm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "RCPSSm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "RSQRTPSm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "RSQRTSSm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VMULPDrm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VMULPSrm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VMULSDrm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VMULSSrm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VPCMPGTQrm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VPHMINPOSUWrm128")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VRCPPSm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VRCPSSm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VRSQRTPSm")>;
+def: InstRW<[SBWriteResGroup104], (instregex "VRSQRTSSm")>;
+
+def SBWriteResGroup105 : SchedWriteRes<[SBPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SBWriteResGroup105], (instregex "PCMPISTRIrr")>;
+def: InstRW<[SBWriteResGroup105], (instregex "PCMPISTRM128rr")>;
+def: InstRW<[SBWriteResGroup105], (instregex "VPCMPISTRIrr")>;
+def: InstRW<[SBWriteResGroup105], (instregex "VPCMPISTRM128rr")>;
+
+def SBWriteResGroup106 : SchedWriteRes<[SBPort1,SBPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup106], (instregex "FICOM16m")>;
+def: InstRW<[SBWriteResGroup106], (instregex "FICOM32m")>;
+def: InstRW<[SBWriteResGroup106], (instregex "FICOMP16m")>;
+def: InstRW<[SBWriteResGroup106], (instregex "FICOMP32m")>;
+
+def SBWriteResGroup107 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup107], (instregex "VCVTPD2DQYrm")>;
+def: InstRW<[SBWriteResGroup107], (instregex "VCVTPD2PSYrm")>;
+def: InstRW<[SBWriteResGroup107], (instregex "VCVTTPD2DQYrm")>;
+
+def SBWriteResGroup108 : SchedWriteRes<[SBPort0,SBPort23,SBPort15]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SBWriteResGroup108], (instregex "MPSADBWrmi")>;
+def: InstRW<[SBWriteResGroup108], (instregex "VMPSADBWrmi")>;
+
+def SBWriteResGroup109 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup109], (instregex "HADDPDrm")>;
+def: InstRW<[SBWriteResGroup109], (instregex "HADDPSrm")>;
+def: InstRW<[SBWriteResGroup109], (instregex "HSUBPDrm")>;
+def: InstRW<[SBWriteResGroup109], (instregex "HSUBPSrm")>;
+def: InstRW<[SBWriteResGroup109], (instregex "VHADDPDrm")>;
+def: InstRW<[SBWriteResGroup109], (instregex "VHADDPSrm")>;
+def: InstRW<[SBWriteResGroup109], (instregex "VHSUBPDrm")>;
+def: InstRW<[SBWriteResGroup109], (instregex "VHSUBPSrm")>;
+
+def SBWriteResGroup110 : SchedWriteRes<[SBPort5]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SBWriteResGroup110], (instregex "AESIMCrr")>;
+def: InstRW<[SBWriteResGroup110], (instregex "VAESIMCrr")>;
+
+def SBWriteResGroup111 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup111], (instregex "MUL_F32m")>;
+def: InstRW<[SBWriteResGroup111], (instregex "MUL_F64m")>;
+def: InstRW<[SBWriteResGroup111], (instregex "VMULPDYrm")>;
+def: InstRW<[SBWriteResGroup111], (instregex "VMULPSYrm")>;
+
+def SBWriteResGroup112 : SchedWriteRes<[SBPort0,SBPort1,SBPort5]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup112], (instregex "DPPSrri")>;
+def: InstRW<[SBWriteResGroup112], (instregex "VDPPSYrri")>;
+def: InstRW<[SBWriteResGroup112], (instregex "VDPPSrri")>;
+
+def SBWriteResGroup113 : SchedWriteRes<[SBPort1,SBPort5,SBPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SBWriteResGroup113], (instregex "VHADDPDYrm")>;
+def: InstRW<[SBWriteResGroup113], (instregex "VHADDPSYrm")>;
+def: InstRW<[SBWriteResGroup113], (instregex "VHSUBPDYrm")>;
+def: InstRW<[SBWriteResGroup113], (instregex "VHSUBPSYrm")>;
+
+def SBWriteResGroup114 : SchedWriteRes<[SBPort1,SBPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup114], (instregex "ADD_FI16m")>;
+def: InstRW<[SBWriteResGroup114], (instregex "ADD_FI32m")>;
+def: InstRW<[SBWriteResGroup114], (instregex "SUBR_FI16m")>;
+def: InstRW<[SBWriteResGroup114], (instregex "SUBR_FI32m")>;
+def: InstRW<[SBWriteResGroup114], (instregex "SUB_FI16m")>;
+def: InstRW<[SBWriteResGroup114], (instregex "SUB_FI32m")>;
+
+def SBWriteResGroup115 : SchedWriteRes<[SBPort5,SBPort23,SBPort015]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup115], (instregex "AESDECLASTrm")>;
+def: InstRW<[SBWriteResGroup115], (instregex "AESDECrm")>;
+def: InstRW<[SBWriteResGroup115], (instregex "AESENCLASTrm")>;
+def: InstRW<[SBWriteResGroup115], (instregex "AESENCrm")>;
+def: InstRW<[SBWriteResGroup115], (instregex "VAESDECLASTrm")>;
+def: InstRW<[SBWriteResGroup115], (instregex "VAESDECrm")>;
+def: InstRW<[SBWriteResGroup115], (instregex "VAESENCLASTrm")>;
+def: InstRW<[SBWriteResGroup115], (instregex "VAESENCrm")>;
+
+def SBWriteResGroup116 : SchedWriteRes<[SBPort0]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup116], (instregex "DIVPSrr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "DIVSSrr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "SQRTPSr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "SQRTSSr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "VDIVPSrr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "VDIVSSrr")>;
+def: InstRW<[SBWriteResGroup116], (instregex "VSQRTPSr")>;
+
+def SBWriteResGroup117 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup117], (instregex "VSQRTSSm")>;
+
+def SBWriteResGroup118 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SBWriteResGroup118], (instregex "VRCPPSYm")>;
+def: InstRW<[SBWriteResGroup118], (instregex "VRSQRTPSYm")>;
+
+def SBWriteResGroup119 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI16m")>;
+def: InstRW<[SBWriteResGroup119], (instregex "MUL_FI32m")>;
+
+def SBWriteResGroup120 : SchedWriteRes<[SBPort0,SBPort1,SBPort5,SBPort23]> {
+ let Latency = 15;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SBWriteResGroup120], (instregex "DPPDrmi")>;
+def: InstRW<[SBWriteResGroup120], (instregex "VDPPDrmi")>;
+
+def SBWriteResGroup121 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+def: InstRW<[SBWriteResGroup121], (instregex "PCMPISTRIrm")>;
+def: InstRW<[SBWriteResGroup121], (instregex "PCMPISTRM128rm")>;
+def: InstRW<[SBWriteResGroup121], (instregex "VPCMPISTRIrm")>;
+def: InstRW<[SBWriteResGroup121], (instregex "VPCMPISTRM128rm")>;
+
+def SBWriteResGroup122 : SchedWriteRes<[SBPort5,SBPort23]> {
+ let Latency = 18;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup122], (instregex "AESIMCrm")>;
+def: InstRW<[SBWriteResGroup122], (instregex "VAESIMCrm")>;
+
+def SBWriteResGroup123 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 20;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup123], (instregex "DIVPSrm")>;
+def: InstRW<[SBWriteResGroup123], (instregex "DIVSSrm")>;
+def: InstRW<[SBWriteResGroup123], (instregex "SQRTPSm")>;
+def: InstRW<[SBWriteResGroup123], (instregex "SQRTSSm")>;
+def: InstRW<[SBWriteResGroup123], (instregex "VDIVPSrm")>;
+def: InstRW<[SBWriteResGroup123], (instregex "VDIVSSrm")>;
+def: InstRW<[SBWriteResGroup123], (instregex "VSQRTPSm")>;
+
+def SBWriteResGroup124 : SchedWriteRes<[SBPort0]> {
+ let Latency = 21;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup124], (instregex "VSQRTSDr")>;
+
+def SBWriteResGroup125 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup125], (instregex "VSQRTSDm")>;
+
+def SBWriteResGroup126 : SchedWriteRes<[SBPort0]> {
+ let Latency = 22;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup126], (instregex "DIVPDrr")>;
+def: InstRW<[SBWriteResGroup126], (instregex "DIVSDrr")>;
+def: InstRW<[SBWriteResGroup126], (instregex "SQRTPDr")>;
+def: InstRW<[SBWriteResGroup126], (instregex "SQRTSDr")>;
+def: InstRW<[SBWriteResGroup126], (instregex "VDIVPDrr")>;
+def: InstRW<[SBWriteResGroup126], (instregex "VDIVSDrr")>;
+def: InstRW<[SBWriteResGroup126], (instregex "VSQRTPDr")>;
+
+def SBWriteResGroup127 : SchedWriteRes<[SBPort0]> {
+ let Latency = 24;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FPrST0")>;
+def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FST0r")>;
+def: InstRW<[SBWriteResGroup127], (instregex "DIVR_FrST0")>;
+def: InstRW<[SBWriteResGroup127], (instregex "DIV_FPrST0")>;
+def: InstRW<[SBWriteResGroup127], (instregex "DIV_FST0r")>;
+def: InstRW<[SBWriteResGroup127], (instregex "DIV_FrST0")>;
+
+def SBWriteResGroup128 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 28;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup128], (instregex "DIVPDrm")>;
+def: InstRW<[SBWriteResGroup128], (instregex "DIVSDrm")>;
+def: InstRW<[SBWriteResGroup128], (instregex "SQRTPDm")>;
+def: InstRW<[SBWriteResGroup128], (instregex "SQRTSDm")>;
+def: InstRW<[SBWriteResGroup128], (instregex "VDIVPDrm")>;
+def: InstRW<[SBWriteResGroup128], (instregex "VDIVSDrm")>;
+def: InstRW<[SBWriteResGroup128], (instregex "VSQRTPDm")>;
+
+def SBWriteResGroup129 : SchedWriteRes<[SBPort0,SBPort05]> {
+ let Latency = 29;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup129], (instregex "VDIVPSYrr")>;
+def: InstRW<[SBWriteResGroup129], (instregex "VSQRTPSYr")>;
+
+def SBWriteResGroup130 : SchedWriteRes<[SBPort0,SBPort23]> {
+ let Latency = 31;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SBWriteResGroup130], (instregex "DIVR_F32m")>;
+def: InstRW<[SBWriteResGroup130], (instregex "DIVR_F64m")>;
+def: InstRW<[SBWriteResGroup130], (instregex "DIV_F32m")>;
+def: InstRW<[SBWriteResGroup130], (instregex "DIV_F64m")>;
+
+def SBWriteResGroup131 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
+ let Latency = 34;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SBWriteResGroup131], (instregex "DIVR_FI16m")>;
+def: InstRW<[SBWriteResGroup131], (instregex "DIVR_FI32m")>;
+def: InstRW<[SBWriteResGroup131], (instregex "DIV_FI16m")>;
+def: InstRW<[SBWriteResGroup131], (instregex "DIV_FI32m")>;
+
+def SBWriteResGroup132 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> {
+ let Latency = 36;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SBWriteResGroup132], (instregex "VDIVPSYrm")>;
+def: InstRW<[SBWriteResGroup132], (instregex "VSQRTPSYm")>;
+
+def SBWriteResGroup133 : SchedWriteRes<[SBPort0,SBPort05]> {
+ let Latency = 45;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SBWriteResGroup133], (instregex "VDIVPDYrr")>;
+def: InstRW<[SBWriteResGroup133], (instregex "VSQRTPDYr")>;
+
+def SBWriteResGroup134 : SchedWriteRes<[SBPort0,SBPort23,SBPort05]> {
+ let Latency = 52;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SBWriteResGroup134], (instregex "VDIVPDYrm")>;
+def: InstRW<[SBWriteResGroup134], (instregex "VSQRTPDYm")>;
+
+def SBWriteResGroup135 : SchedWriteRes<[SBPort0]> {
+ let Latency = 114;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SBWriteResGroup135], (instregex "VSQRTSSr")>;
+
} // SchedModel
diff --git a/lib/Target/X86/X86SchedSkylakeClient.td b/lib/Target/X86/X86SchedSkylakeClient.td
new file mode 100644
index 000000000000..9a417b2d3e82
--- /dev/null
+++ b/lib/Target/X86/X86SchedSkylakeClient.td
@@ -0,0 +1,3993 @@
+//=- X86SchedSkylakeClient.td - X86 Skylake Client Scheduling -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Skylake Client to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SkylakeClientModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and Skylake can
+ // decode 6 instructions per cycle.
+ let IssueWidth = 6;
+ let MicroOpBufferSize = 224; // Based on the reorder buffer.
+ let LoadLatency = 5;
+ let MispredictPenalty = 14;
+
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
+ // This flag is set to allow the scheduler to assign a default model to
+ // unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = SkylakeClientModel in {
+
+// Skylake Client can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def SKLPort0 : ProcResource<1>;
+def SKLPort1 : ProcResource<1>;
+def SKLPort2 : ProcResource<1>;
+def SKLPort3 : ProcResource<1>;
+def SKLPort4 : ProcResource<1>;
+def SKLPort5 : ProcResource<1>;
+def SKLPort6 : ProcResource<1>;
+def SKLPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SKLPort01 : ProcResGroup<[SKLPort0, SKLPort1]>;
+def SKLPort23 : ProcResGroup<[SKLPort2, SKLPort3]>;
+def SKLPort237 : ProcResGroup<[SKLPort2, SKLPort3, SKLPort7]>;
+def SKLPort04 : ProcResGroup<[SKLPort0, SKLPort4]>;
+def SKLPort05 : ProcResGroup<[SKLPort0, SKLPort5]>;
+def SKLPort06 : ProcResGroup<[SKLPort0, SKLPort6]>;
+def SKLPort15 : ProcResGroup<[SKLPort1, SKLPort5]>;
+def SKLPort16 : ProcResGroup<[SKLPort1, SKLPort6]>;
+def SKLPort56 : ProcResGroup<[SKLPort5, SKLPort6]>;
+def SKLPort015 : ProcResGroup<[SKLPort0, SKLPort1, SKLPort5]>;
+def SKLPort056 : ProcResGroup<[SKLPort0, SKLPort5, SKLPort6]>;
+def SKLPort0156 : ProcResGroup<[SKLPort0, SKLPort1, SKLPort5, SKLPort6]>;
+
+// 60 Entry Unified Scheduler
+def SKLPortAny : ProcResGroup<[SKLPort0, SKLPort1, SKLPort2, SKLPort3, SKLPort4,
+ SKLPort5, SKLPort6, SKLPort7]> {
+ let BufferSize=60;
+}
+
+// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 5>;
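+// Illustrative example (not part of the model data): for a folded-load op such
+// as "addps (%rdi), %xmm0", the load begins as soon as the address is ready,
+// so the producer of %xmm0 may complete up to 5 cycles after this instruction
+// issues without introducing a stall.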
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SKLWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [SKLPort23, ExePort]> {
+ let Latency = !add(Lat, 5);
+ }
+}
+
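+// As a rough sketch of what this multiclass expands to, take the WriteALU
+// instantiation below:
+//   defm : SKLWriteResPair<WriteALU, SKLPort0156, 1>;
+// It defines a register form, WriteRes<WriteALU, [SKLPort0156]> with
+// Latency = 1, plus the .Folded (load) variant on [SKLPort23, SKLPort0156]
+// with Latency = !add(1, 5) = 6.
+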
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SKLPort4]>;
+
+// Arithmetic.
+defm : SKLWriteResPair<WriteALU, SKLPort0156, 1>; // Simple integer ALU op.
+defm : SKLWriteResPair<WriteIMul, SKLPort1, 3>; // Integer multiplication.
+def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
+def SKLDivider : ProcResource<1>; // Integer division issued on port 0.
+def : WriteRes<WriteIDiv, [SKLPort0, SKLDivider]> { // Integer division.
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [SKLPort23, SKLPort0, SKLDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
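+// Explanatory note (not scheduling data): ResourceCycles gives the number of
+// cycles each listed resource stays reserved, so [1, 1, 10] above keeps
+// SKLDivider busy for 10 cycles and back-to-back divisions are throughput
+// limited by the divider rather than by port 0.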
+
+def : WriteRes<WriteLEA, [SKLPort15]>; // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm : SKLWriteResPair<WriteShift, SKLPort06, 1>;
+
+// Loads, stores, and moves, not folded with other operations.
+def : WriteRes<WriteLoad, [SKLPort23]> { let Latency = 5; }
+def : WriteRes<WriteStore, [SKLPort237, SKLPort4]>;
+def : WriteRes<WriteMove, [SKLPort0156]>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def : WriteRes<WriteZero, []>;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm : SKLWriteResPair<WriteJump, SKLPort06, 1>;
+
+// Floating point. This covers both scalar and vector operations.
+defm : SKLWriteResPair<WriteFAdd, SKLPort1, 3>; // Floating point add/sub/compare.
+defm : SKLWriteResPair<WriteFMul, SKLPort0, 5>; // Floating point multiplication.
+defm : SKLWriteResPair<WriteFDiv, SKLPort0, 12>; // Floating point division (10-14 cycles).
+defm : SKLWriteResPair<WriteFSqrt, SKLPort0, 15>; // Floating point square root.
+defm : SKLWriteResPair<WriteFRcp, SKLPort0, 5>; // Floating point reciprocal estimate.
+defm : SKLWriteResPair<WriteFRsqrt, SKLPort0, 5>; // Floating point reciprocal square root estimate.
+defm : SKLWriteResPair<WriteFMA, SKLPort01, 4>; // Fused Multiply Add.
+defm : SKLWriteResPair<WriteFShuffle, SKLPort5, 1>; // Floating point vector shuffles.
+defm : SKLWriteResPair<WriteFBlend, SKLPort015, 1>; // Floating point vector blends.
+def : WriteRes<WriteFVarBlend, [SKLPort5]> { // Fp vector variable blends.
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [SKLPort5, SKLPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+// FMA Scheduling helper class.
+// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm : SKLWriteResPair<WriteVecALU, SKLPort15, 1>; // Vector integer ALU op, no logicals.
+defm : SKLWriteResPair<WriteVecShift, SKLPort0, 1>; // Vector integer shifts.
+defm : SKLWriteResPair<WriteVecIMul, SKLPort0, 5>; // Vector integer multiply.
+defm : SKLWriteResPair<WriteShuffle, SKLPort5, 1>; // Vector shuffles.
+defm : SKLWriteResPair<WriteBlend, SKLPort15, 1>; // Vector blends.
+
+def : WriteRes<WriteVarBlend, [SKLPort5]> { // Vector variable blends.
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [SKLPort5, SKLPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteMPSAD, [SKLPort0, SKLPort5]> { // Vector MPSAD.
+ let Latency = 6;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteMPSADLd, [SKLPort23, SKLPort0, SKLPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 2];
+}
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm : SKLWriteResPair<WriteVecLogic, SKLPort015, 1>; // Vector and/or/xor.
+
+// Conversion between integer and float.
+defm : SKLWriteResPair<WriteCvtF2I, SKLPort1, 3>; // Float -> Integer.
+defm : SKLWriteResPair<WriteCvtI2F, SKLPort1, 4>; // Integer -> Float.
+defm : SKLWriteResPair<WriteCvtF2F, SKLPort1, 3>; // Float -> Float size conversion.
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SKLPort0]> {
+ let Latency = 10;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [SKLPort0, SKLPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 1];
+}
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [SKLPort0, SKLPort16, SKLPort5]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 2, 4];
+}
+def : WriteRes<WritePCmpEStrMLd, [SKLPort05, SKLPort16, SKLPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [6, 2, 1];
+}
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [SKLPort0]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [SKLPort0, SKLPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 1];
+}
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [SKLPort05, SKLPort16]> {
+ let Latency = 11;
+ let ResourceCycles = [6, 2];
+}
+def : WriteRes<WritePCmpEStrILd, [SKLPort0, SKLPort16, SKLPort5, SKLPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 2, 2, 1];
+}
+
+// AES instructions.
+def : WriteRes<WriteAESDecEnc, [SKLPort5]> { // Decryption, encryption.
+ let Latency = 7;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [SKLPort5, SKLPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteAESIMC, [SKLPort5]> { // InvMixColumn.
+ let Latency = 14;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [SKLPort5, SKLPort23]> {
+ let Latency = 14;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteAESKeyGen, [SKLPort0, SKLPort5]> { // Key Generation.
+ let Latency = 10;
+ let ResourceCycles = [2, 8];
+}
+def : WriteRes<WriteAESKeyGenLd, [SKLPort0, SKLPort5, SKLPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 7, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [SKLPort0, SKLPort5]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteCLMulLd, [SKLPort0, SKLPort5, SKLPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1, 1];
+}
+
+// Catch-all for expensive system instructions.
+def : WriteRes<WriteSystem, [SKLPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
+
+// AVX2.
+defm : SKLWriteResPair<WriteFShuffle256, SKLPort5, 3>; // Fp 256-bit width vector shuffles.
+defm : SKLWriteResPair<WriteShuffle256, SKLPort5, 3>; // 256-bit width vector shuffles.
+def : WriteRes<WriteVarVecShift, [SKLPort0, SKLPort5]> { // Variable vector shifts.
+ let Latency = 2;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteVarVecShiftLd, [SKLPort0, SKLPort5, SKLPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1, 1];
+}
+
+// Old microcoded instructions that nobody uses.
+def : WriteRes<WriteMicrocoded, [SKLPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def : WriteRes<WriteFence, [SKLPort23, SKLPort4]>;
+
+// Nop: not very useful except that it provides a model for nops!
+def : WriteRes<WriteNop, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [SKLPort1]> {
+ let Latency = 3;
+}
+
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [SKLPort1, SKLPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [SKLPort15]>;
+
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [SKLPort15, SKLPort23]> {
+ let Latency = 5;
+ let ResourceCycles = [1, 1];
+}
+
+// Remaining instrs.
+
+def SKLWriteResGroup1 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDSBirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDSWirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDUSBirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PADDUSWirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PAVGBirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PAVGWirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQBirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQDirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPEQWirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTBirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTDirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PCMPGTWirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMAXSWirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMAXUBirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMINSWirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PMINUBirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLDri")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLDrr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLQri")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLQrr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLWri")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSLLWrr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRADri")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRADrr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRAWri")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRAWrr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLDri")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLDrr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLQri")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLQrr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLWri")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSRLWrr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBSBirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBSWirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBUSBirr")>;
+def: InstRW<[SKLWriteResGroup1], (instregex "MMX_PSUBUSWirr")>;
+
+def SKLWriteResGroup2 : SchedWriteRes<[SKLPort1]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
+
+def SKLWriteResGroup3 : SchedWriteRes<[SKLPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup3], (instregex "COMP_FST0r")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "COM_FST0r")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "INSERTPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_MOVD64rr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_MOVD64to64rr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PALIGNR64irr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PSHUFBrr64")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PSHUFWri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOV64toPQIrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVDDUPrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVDI2PDIrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVHLPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVLHPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVSHDUPrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVSLDUPrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PACKSSDWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PACKSSWBrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PACKUSDWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PACKUSWBrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PALIGNRrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PBLENDWrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXBWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXWDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVSXWQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXBWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXWDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PMOVZXWQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFBrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFDri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFHWri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PSHUFLWri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PSLLDQri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PSRLDQri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHBWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHQDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKHWDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLBWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLQDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "PUNPCKLWDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "SHUFPDrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "SHUFPSrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "UCOM_FPr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "UCOM_Fr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKHPDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKHPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKLPDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "UNPCKLPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VBROADCASTSSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VINSERTPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDDUPYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDDUPrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVHLPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVLHPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSHDUPrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVSLDUPrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSDWYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSDWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSWBYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPACKSSWBrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSDWYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSDWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSWBYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPACKUSWBrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPALIGNRYrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPALIGNRrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPBLENDWYrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPBLENDWrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPBROADCASTDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPBROADCASTQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDYri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSYri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPERMILPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXBWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXWDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVSXWQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXBWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXWDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPMOVZXWQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFBYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFBrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFDYri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFDri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFHWYri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFHWri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFLWYri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSHUFLWri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSLLDQYri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSLLDQri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSRLDQYri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPSRLDQri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHBWYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHBWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHDQYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHQDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHWDYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKHWDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLBWYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLBWrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLDQYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLQDQrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLWDYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VPUNPCKLWDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPDYrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPDrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPSYrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VSHUFPSrri")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPDYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPSYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKHPSrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPDYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPDrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPSYrr")>;
+def: InstRW<[SKLWriteResGroup3], (instregex "VUNPCKLPSrr")>;
+
+def SKLWriteResGroup4 : SchedWriteRes<[SKLPort6]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup4], (instregex "JMP(16|32|64)r")>;
+
+def SKLWriteResGroup5 : SchedWriteRes<[SKLPort01]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup5], (instregex "PABSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PABSDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PABSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PADDSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PADDSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PADDUSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PADDUSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PAVGBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PAVGWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQQrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PCMPEQWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PCMPGTWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMAXSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMAXUWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMINSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMINSDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMINSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMINUBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMINUDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PMINUWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNBrr128")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNDrr128")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSIGNWrr128")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSLLDri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSLLQri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSLLWri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSRADri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSRAWri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSRLDri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSRLQri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSRLWri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSUBSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSUBSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSUBUSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "PSUBUSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPABSBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPABSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPABSDYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPABSDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPABSWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPABSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPADDSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPADDUSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPAVGWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQDYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQQYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQQrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPEQWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTDYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPCMPGTWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSDYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUDYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMAXUWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSDYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUDYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPMINUWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNBYrr256")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNBrr128")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNDYrr256")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNDrr128")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNWYrr256")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSIGNWrr128")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLDYri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLDri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLQYri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLQri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVDYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVQYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLVQrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLWYri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSLLWri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRADYri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRADri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAVDYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAVDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAWYri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRAWri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLDYri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLDri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLQYri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLQri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVDYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVDrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVQYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLVQrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLWYri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSRLWri")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBSWrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSBYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSBrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSWYrr")>;
+def: InstRW<[SKLWriteResGroup5], (instregex "VPSUBUSWrr")>;
+
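+// SKLWriteResGroup6: single-uop, 1-cycle MMX arithmetic/logic and simple x87
+// ops (FINCSTP, FNOP) that issue on port 0 or 5.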
+def SKLWriteResGroup6 : SchedWriteRes<[SKLPort05]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup6], (instregex "FINCSTP")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "FNOP")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSBrr64")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSDrr64")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PABSWrr64")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDBirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDDirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDQirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PADDWirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PANDNirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PANDirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PORirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNBrr64")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNDrr64")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSIGNWrr64")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBBirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBDirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBQirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PSUBWirr")>;
+def: InstRW<[SKLWriteResGroup6], (instregex "MMX_PXORirr")>;
+
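+// SKLWriteResGroup7: single-uop, 1-cycle operations restricted to ports 0/6:
+// flag-based ALU ops (ADC/SBB, ADCX/ADOX), bit tests, CMOVcc, conditional and
+// direct jumps, SETcc, and shifts by one or by immediate.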
+def SKLWriteResGroup7 : SchedWriteRes<[SKLPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "ADC8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "ADCX(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "ADOX(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "BT(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CDQ")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CLAC")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVAE(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVB(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVE(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVG(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVGE(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVL(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVLE(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNE(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNO(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNP(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVNS(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVO(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVP(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CMOVS(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "CQO")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JAE_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JAE_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JA_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JA_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JBE_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JBE_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JB_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JB_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JE_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JE_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JGE_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JGE_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JG_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JG_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JLE_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JLE_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JL_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JL_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JMP_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JMP_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JNE_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JNE_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JNO_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JNO_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JNP_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JNP_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JNS_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JNS_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JO_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JO_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JP_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JP_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JS_1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "JS_4")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "RORX(32|64)ri")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SAR8r1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SAR8ri")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SARX(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SBB8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETAEr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETBr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETEr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETGEr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETGr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETLEr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETLr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETNEr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETNOr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETNPr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETNSr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETOr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETPr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SETSr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SHL8r1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SHL8ri")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SHLX(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SHR8r1")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SHR8ri")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "SHRX(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup7], (instregex "STAC")>;
+
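+// SKLWriteResGroup8: single-uop, 1-cycle BMI register ops (ANDN, BLSI, BLSMSK,
+// BLSR, BZHI) and register-form LEA, issued on port 1 or 5.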
+def SKLWriteResGroup8 : SchedWriteRes<[SKLPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup8], (instregex "ANDN(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup8], (instregex "BLSI(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup8], (instregex "BLSMSK(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup8], (instregex "BLSR(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup8], (instregex "BZHI(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>;
+
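+// SKLWriteResGroup9: single-uop, 1-cycle vector bitwise logic, blends, register
+// moves and integer add/sub that can issue on any of ports 0, 1 or 5.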
+def SKLWriteResGroup9 : SchedWriteRes<[SKLPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup9], (instregex "ANDNPDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "ANDNPSrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "ANDPDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "ANDPSrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "BLENDPDrri")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "BLENDPSrri")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPDrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVAPSrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVPQI2QIrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "MOVSSrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "ORPDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "ORPSrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PADDBrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PADDDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PADDQrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PADDWrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PANDNrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PANDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PORrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PSUBBrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PSUBDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PSUBQrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PSUBWrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "PXORrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPDYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPSYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VANDNPSrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VANDPDYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VANDPDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VANDPSYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VANDPSrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPDYrri")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPDrri")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPSYrri")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VBLENDPSrri")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPDrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVAPSrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVSSrr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VORPDYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VORPDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VORPSYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VORPSrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPADDBYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPADDBrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPADDDYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPADDDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPADDQYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPADDQrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPADDWYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPADDWrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPANDNYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPANDNrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPANDYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPANDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPBLENDDYrri")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPBLENDDrri")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPORYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPORrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBBYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBBrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBDYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBQYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBQrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBWYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPSUBWrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPXORYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VPXORrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VXORPDYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VXORPDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VXORPSYrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "VXORPSrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "XORPDrr")>;
+def: InstRW<[SKLWriteResGroup9], (instregex "XORPSrr")>;
+
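+// SKLWriteResGroup10: single-uop, 1-cycle scalar integer ALU, compare and move
+// instructions that can issue on any of the four ALU ports (0, 1, 5 or 6).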
+def SKLWriteResGroup10 : SchedWriteRes<[SKLPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "ADD8i8")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "ADD8ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "AND8i8")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "AND8ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "AND8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CBW")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CLC")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CMC")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CMP8i8")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CMP8ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "CWDE")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "DEC(16|32|64)r")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "DEC8r")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "INC(16|32|64)r")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "INC8r")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "LAHF")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "NEG(16|32|64)r")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "NEG8r")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "NOOP")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "NOT(16|32|64)r")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "NOT8r")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "OR8i8")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "OR8ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "OR8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SAHF")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SGDT64m")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SIDT64m")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SLDT64m")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SMSW16m")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "STC")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "STRm")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SUB8i8")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SUB8ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "SYSCALL")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "TEST8i8")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "TEST8ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "TEST8rr")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "XOR8i8")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "XOR8ri")>;
+def: InstRW<[SKLWriteResGroup10], (instregex "XOR8rr(_REV)?")>;
+
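+// SKLWriteResGroup11: plain stores; one store-data uop (port 4) plus one
+// store-address uop (port 2, 3 or 7).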
+def SKLWriteResGroup11 : SchedWriteRes<[SKLPort4,SKLPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup11], (instregex "FBSTPm")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVD64mr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVNTQmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MMX_MOVQ64mr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOV(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOV8mi")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOV8mr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVAPDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVAPSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVDQAmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVDQUmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVHPDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVHPSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVLPDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVLPSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTDQmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTI_64mr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTImr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTPDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVNTPSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVPDI2DImr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVPQI2QImr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVPQIto64mr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVSDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVSSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVUPDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "MOVUPSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP32m")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP64m")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "ST_FP80m")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VEXTRACTI128mr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPDYmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPSYmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVAPSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQAYmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQAmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQUYmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVDQUmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVHPDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVHPSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVLPDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVLPSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTDQYmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTDQmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPDYmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPSYmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVNTPSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPDI2DImr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPQI2QImr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVPQIto64mr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVSDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVSSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPDYmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPDmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPSYmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMOVUPSmr")>;
+def: InstRW<[SKLWriteResGroup11], (instregex "VMPTRSTm")>;
+
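+// SKLWriteResGroup12: 2-cycle single-uop port-0 operations: (U)COMIS compares,
+// MOVMSK/PMOVMSKB, vector-to-GPR moves and VTESTPS/PD.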
+def SKLWriteResGroup12 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 2;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup12], (instregex "COMISDrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "COMISSrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_MOVD64grr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MMX_PMOVMSKBrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MOVMSKPDrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MOVMSKPSrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "MOVPQIto64rr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "PMOVMSKBrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "UCOMISDrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "UCOMISSrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VCOMISDrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VCOMISSrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPDYrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPDrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPSYrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VMOVMSKPSrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VMOVPQIto64rr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VPMOVMSKBYrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VPMOVMSKBrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPDYrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPDrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPSYrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VTESTPSrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VUCOMISDrr")>;
+def: InstRW<[SKLWriteResGroup12], (instregex "VUCOMISSrr")>;
+
+def SKLWriteResGroup13 : SchedWriteRes<[SKLPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "MMX_PINSRWirri")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "PINSRBrr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "PINSRDrr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "PINSRQrr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "PINSRWrri")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRBrr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRDrr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRQrr")>;
+def: InstRW<[SKLWriteResGroup13], (instregex "VPINSRWrri")>;
+
+def SKLWriteResGroup14 : SchedWriteRes<[SKLPort05]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup14], (instregex "FDECSTP")>;
+def: InstRW<[SKLWriteResGroup14], (instregex "MMX_MOVDQ2Qrr")>;
+
+def SKLWriteResGroup15 : SchedWriteRes<[SKLPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup15], (instregex "CMOVA(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "CMOVBE(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "ROL(16|32|64)r1")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "ROL(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "ROL8r1")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "ROL8ri")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "ROR(16|32|64)r1")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "ROR(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "ROR8r1")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "ROR8ri")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "SETAr")>;
+def: InstRW<[SKLWriteResGroup15], (instregex "SETBEr")>;
+
+def SKLWriteResGroup16 : SchedWriteRes<[SKLPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup16], (instregex "BLENDVPDrr0")>;
+def: InstRW<[SKLWriteResGroup16], (instregex "BLENDVPSrr0")>;
+def: InstRW<[SKLWriteResGroup16], (instregex "PBLENDVBrr0")>;
+def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPDYrr")>;
+def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPDrr")>;
+def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPSYrr")>;
+def: InstRW<[SKLWriteResGroup16], (instregex "VBLENDVPSrr")>;
+def: InstRW<[SKLWriteResGroup16], (instregex "VPBLENDVBYrr")>;
+def: InstRW<[SKLWriteResGroup16], (instregex "VPBLENDVBrr")>;
+
+def SKLWriteResGroup17 : SchedWriteRes<[SKLPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup17], (instregex "LFENCE")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "WAIT")>;
+def: InstRW<[SKLWriteResGroup17], (instregex "XGETBV")>;
+
+def SKLWriteResGroup18 : SchedWriteRes<[SKLPort0,SKLPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup18], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVDQU")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPDYmr")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPSYmr")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "VMASKMOVPSmr")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVDYmr")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVDmr")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVQYmr")>;
+def: InstRW<[SKLWriteResGroup18], (instregex "VPMASKMOVQmr")>;
+
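+// SKLWriteResGroup19: XMM shifts by a vector count register split into one
+// port-5 uop and one port-0/1 uop, 2-cycle latency.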
+def SKLWriteResGroup19 : SchedWriteRes<[SKLPort5,SKLPort01]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup19], (instregex "PSLLDrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "PSLLQrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "PSLLWrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "PSRADrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "PSRAWrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "PSRLDrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "PSRLQrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "PSRLWrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLDrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLQrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "VPSLLWrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "VPSRADrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "VPSRAWrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLDrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLQrr")>;
+def: InstRW<[SKLWriteResGroup19], (instregex "VPSRLWrr")>;
+
+def SKLWriteResGroup20 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup20], (instregex "CLFLUSH")>;
+
+def SKLWriteResGroup21 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup21], (instregex "SFENCE")>;
+
+def SKLWriteResGroup22 : SchedWriteRes<[SKLPort06,SKLPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup22], (instregex "BEXTR(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup22], (instregex "BSWAP(16|32|64)r")>;
+
+def SKLWriteResGroup23 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup23], (instregex "ADC8i8")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "ADC8ri")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "CWD")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "JRCXZ")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "SBB8i8")>;
+def: InstRW<[SKLWriteResGroup23], (instregex "SBB8ri")>;
+
+def SKLWriteResGroup24 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup24], (instregex "EXTRACTPSmr")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRBmr")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRDmr")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRQmr")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "PEXTRWmr")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "STMXCSR")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "VEXTRACTPSmr")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRBmr")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRDmr")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRQmr")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "VPEXTRWmr")>;
+def: InstRW<[SKLWriteResGroup24], (instregex "VSTMXCSR")>;
+
+def SKLWriteResGroup25 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup25], (instregex "FNSTCW16m")>;
+
+def SKLWriteResGroup26 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup26], (instregex "SETAEm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETBm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETEm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETGEm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETGm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETLEm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETLm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETNEm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETNOm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETNPm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETNSm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETOm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETPm")>;
+def: InstRW<[SKLWriteResGroup26], (instregex "SETSm")>;
+
+def SKLWriteResGroup27 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup27], (instregex "MOVBE(16|32|64)mr")>;
+
+def SKLWriteResGroup28 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup28], (instregex "PUSH(16|32|64)r(mr)?")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "PUSH64i8")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "STOSB")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "STOSL")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "STOSQ")>;
+def: InstRW<[SKLWriteResGroup28], (instregex "STOSW")>;
+
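+// SKLWriteResGroup29: 3-cycle single-uop port-1 operations: BSF/BSR, integer
+// multiply, LZCNT/TZCNT/POPCNT, PDEP/PEXT, and SHLD/SHRD by immediate.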
+def SKLWriteResGroup29 : SchedWriteRes<[SKLPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup29], (instregex "BSF(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "BSR(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "IMUL64rr(i8)?")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "IMUL8r")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "LZCNT(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "MUL8r")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "PDEP(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "PEXT(32|64)rr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "POPCNT(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "SHLD(16|32|64)rri8")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "SHRD(16|32|64)rri8")>;
+def: InstRW<[SKLWriteResGroup29], (instregex "TZCNT(16|32|64)rr")>;
+
+def SKLWriteResGroup29_16 : SchedWriteRes<[SKLPort1, SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup29_16], (instregex "IMUL16rr(i8)?")>;
+
+def SKLWriteResGroup29_32 : SchedWriteRes<[SKLPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+}
+def: InstRW<[SKLWriteResGroup29_32], (instregex "IMUL32rr(i8)?")>;
+
+def SKLWriteResGroup30 : SchedWriteRes<[SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FPrST0")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FST0r")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "ADD_FrST0")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "MMX_PSADBWirr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "PCMPGTQrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "PSADBWrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FPrST0")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FST0r")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "SUBR_FrST0")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FPrST0")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FST0r")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "SUB_FrST0")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VBROADCASTSDYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VBROADCASTSSYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VEXTRACTF128rr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VEXTRACTI128rr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VINSERTF128rr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VINSERTI128rr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTBYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTBrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTDYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTQYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTWYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPBROADCASTWrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPCMPGTQYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPCMPGTQrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPERM2F128rr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPERM2I128rr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPERMDYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPERMPDYri")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPERMPSYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPERMQYri")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBDYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBQYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXBWYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXDQYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXWDYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVSXWQYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBDYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBQYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXBWYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXDQYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXWDYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPMOVZXWQYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPSADBWYrr")>;
+def: InstRW<[SKLWriteResGroup30], (instregex "VPSADBWrr")>;
+
+def SKLWriteResGroup31 : SchedWriteRes<[SKLPort0,SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup31], (instregex "EXTRACTPSrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "MMX_PEXTRWirri")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRBrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRDrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRQrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRWri")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "PEXTRWrr_REV")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "PTESTrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VEXTRACTPSrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRBrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRDrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRQrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRWri")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VPEXTRWrr_REV")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VPTESTYrr")>;
+def: InstRW<[SKLWriteResGroup31], (instregex "VPTESTrr")>;
+
+def SKLWriteResGroup32 : SchedWriteRes<[SKLPort0,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup32], (instregex "FNSTSW16r")>;
+
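+// SKLWriteResGroup33: shifts and rotates by CL expand to three port-0/6 uops
+// with 3-cycle latency.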
+def SKLWriteResGroup33 : SchedWriteRes<[SKLPort06]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SKLWriteResGroup33], (instregex "ROL(16|32|64)rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "ROL8rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "ROR(16|32|64)rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "ROR8rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "SAR(16|32|64)rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "SAR8rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "SHL(16|32|64)rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "SHL8rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "SHR(16|32|64)rCL")>;
+def: InstRW<[SKLWriteResGroup33], (instregex "SHR8rCL")>;
+
+def SKLWriteResGroup34 : SchedWriteRes<[SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SKLWriteResGroup34], (instregex "XADD(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "XADD8rr")>;
+def: InstRW<[SKLWriteResGroup34], (instregex "XCHG8rr")>;
+
+def SKLWriteResGroup35 : SchedWriteRes<[SKLPort0,SKLPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PHADDSWrr64")>;
+def: InstRW<[SKLWriteResGroup35], (instregex "MMX_PHSUBSWrr64")>;
+
+def SKLWriteResGroup36 : SchedWriteRes<[SKLPort5,SKLPort01]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup36], (instregex "PHADDSWrr128")>;
+def: InstRW<[SKLWriteResGroup36], (instregex "PHSUBSWrr128")>;
+def: InstRW<[SKLWriteResGroup36], (instregex "VPHADDSWrr128")>;
+def: InstRW<[SKLWriteResGroup36], (instregex "VPHADDSWrr256")>;
+def: InstRW<[SKLWriteResGroup36], (instregex "VPHSUBSWrr128")>;
+def: InstRW<[SKLWriteResGroup36], (instregex "VPHSUBSWrr256")>;
+
+def SKLWriteResGroup37 : SchedWriteRes<[SKLPort5,SKLPort05]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHADDWrr64")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHADDrr64")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHSUBDrr64")>;
+def: InstRW<[SKLWriteResGroup37], (instregex "MMX_PHSUBWrr64")>;
+
+def SKLWriteResGroup38 : SchedWriteRes<[SKLPort5,SKLPort015]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup38], (instregex "PHADDDrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "PHADDWrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "PHSUBDrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "PHSUBWrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDDYrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDDrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDWYrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPHADDWrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBDYrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBDrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBWYrr")>;
+def: InstRW<[SKLWriteResGroup38], (instregex "VPHSUBWrr")>;
+
+def SKLWriteResGroup39 : SchedWriteRes<[SKLPort5,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSDWirr")>;
+def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKSSWBirr")>;
+def: InstRW<[SKLWriteResGroup39], (instregex "MMX_PACKUSWBirr")>;
+
+def SKLWriteResGroup40 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup40], (instregex "CLD")>;
+
+def SKLWriteResGroup41 : SchedWriteRes<[SKLPort237,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup41], (instregex "MFENCE")>;
+
+def SKLWriteResGroup42 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup42], (instregex "RCL(16|32|64)r1")>;
+def: InstRW<[SKLWriteResGroup42], (instregex "RCL(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup42], (instregex "RCL8r1")>;
+def: InstRW<[SKLWriteResGroup42], (instregex "RCL8ri")>;
+def: InstRW<[SKLWriteResGroup42], (instregex "RCR(16|32|64)r1")>;
+def: InstRW<[SKLWriteResGroup42], (instregex "RCR(16|32|64)ri")>;
+def: InstRW<[SKLWriteResGroup42], (instregex "RCR8r1")>;
+def: InstRW<[SKLWriteResGroup42], (instregex "RCR8ri")>;
+
+def SKLWriteResGroup43 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort237]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup43], (instregex "FNSTSWm")>;
+
+def SKLWriteResGroup44 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SKLWriteResGroup44], (instregex "SETAm")>;
+def: InstRW<[SKLWriteResGroup44], (instregex "SETBEm")>;
+
+def SKLWriteResGroup45 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort237,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup45], (instregex "CALL(16|32|64)r")>;
+
+def SKLWriteResGroup46 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup46], (instregex "CALL64pcrel32")>;
+
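+// SKLWriteResGroup47: 4-cycle single-uop port-0 operations: AES rounds, MMX
+// multiplies, x87 multiply, and reciprocal / reciprocal-sqrt estimates.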
+def SKLWriteResGroup47 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup47], (instregex "AESDECLASTrr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "AESDECrr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "AESENCLASTrr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "AESENCrr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMADDUBSWrr64")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMADDWDirr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHRSWrr64")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHUWirr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULHWirr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULLWirr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MMX_PMULUDQirr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FPrST0")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FST0r")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "MUL_FrST0")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "RCPPSr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "RCPSSr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "RSQRTPSr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "RSQRTSSr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "VAESDECLASTrr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "VAESDECrr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "VAESENCLASTrr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "VAESENCrr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "VRCPPSYr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "VRCPPSr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "VRCPSSr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTPSYr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTPSr")>;
+def: InstRW<[SKLWriteResGroup47], (instregex "VRSQRTSSr")>;
+
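+// SKLWriteResGroup48: 4-cycle single-uop FP add/sub/mul and FMA register forms,
+// issued on port 0 or 1.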
+def SKLWriteResGroup48 : SchedWriteRes<[SKLPort01]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup48], (instregex "ADDPDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "ADDPSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "ADDSDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "ADDSSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "ADDSUBPDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "ADDSUBPSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "MULPDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "MULPSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "MULSDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "MULSSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "SUBPDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "SUBPSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "SUBSDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "SUBSSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VADDPDYrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VADDPDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VADDPSYrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VADDPSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VADDSDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VADDSSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPDYrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPSYrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VADDSUBPSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VMULPDYrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VMULPDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VMULPSYrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VMULPSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VMULSDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VMULSSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPDYrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPSYrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VSUBPSrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VSUBSDrr")>;
+def: InstRW<[SKLWriteResGroup48], (instregex "VSUBSSrr")>;
+def: InstRW<[SKLWriteResGroup48],
+ (instregex
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)(Y)?r",
+ "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
+
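+// SKLWriteResGroup49: 4-cycle single-uop operations on the SKLPort015 group
+// (ports 0/1/5): FP compares, min/max, PS<->DQ conversions and vector integer
+// multiplies.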
+def SKLWriteResGroup49 : SchedWriteRes<[SKLPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup49], (instregex "CMPPDrri")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "CMPPSrri")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "CMPSDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "CMPSSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "CVTDQ2PSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "CVTPS2DQrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "CVTTPS2DQrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)PDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)PSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)SDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "MAX(C?)SSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)PDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)PSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)SDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "MIN(C?)SSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "PHMINPOSUWrr128")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "PMADDUBSWrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "PMADDWDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "PMULDQrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "PMULHRSWrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "PMULHUWrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "PMULHWrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "PMULLWrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "PMULUDQrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPDYrri")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPDrri")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPSYrri")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCMPPSrri")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCMPSDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCMPSSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCVTDQ2PSYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCVTDQ2PSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCVTPS2DQYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCVTPS2DQrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCVTTPS2DQYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VCVTTPS2DQrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PDYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PSYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)PSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)SDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMAX(C?)SSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PDYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PSYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)PSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)SDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VMIN(C?)SSrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPHMINPOSUWrr128")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDUBSWYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDUBSWrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDWDYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMADDWDrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULDQYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULDQrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHRSWYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHRSWrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHUWYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHUWrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHWYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULHWrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULLWYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULLWrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULUDQYrr")>;
+def: InstRW<[SKLWriteResGroup49], (instregex "VPMULUDQrr")>;
+
+def SKLWriteResGroup50 : SchedWriteRes<[SKLPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup50], (instregex "MPSADBWrri")>;
+def: InstRW<[SKLWriteResGroup50], (instregex "VMPSADBWYrri")>;
+def: InstRW<[SKLWriteResGroup50], (instregex "VMPSADBWrri")>;
+
+def SKLWriteResGroup51 : SchedWriteRes<[SKLPort1,SKLPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup51], (instregex "IMUL64r")>;
+def: InstRW<[SKLWriteResGroup51], (instregex "MUL64r")>;
+def: InstRW<[SKLWriteResGroup51], (instregex "MULX64rr")>;
+
+def SKLWriteResGroup51_16 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def: InstRW<[SKLWriteResGroup51_16], (instregex "IMUL16r")>;
+def: InstRW<[SKLWriteResGroup51_16], (instregex "MUL16r")>;
+
+def SKLWriteResGroup52 : SchedWriteRes<[SKLPort5,SKLPort01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLDYrr")>;
+def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLQYrr")>;
+def: InstRW<[SKLWriteResGroup52], (instregex "VPSLLWYrr")>;
+def: InstRW<[SKLWriteResGroup52], (instregex "VPSRADYrr")>;
+def: InstRW<[SKLWriteResGroup52], (instregex "VPSRAWYrr")>;
+def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLDYrr")>;
+def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLQYrr")>;
+def: InstRW<[SKLWriteResGroup52], (instregex "VPSRLWYrr")>;
+
+def SKLWriteResGroup53 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP16m")>;
+def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP32m")>;
+def: InstRW<[SKLWriteResGroup53], (instregex "ISTT_FP64m")>;
+def: InstRW<[SKLWriteResGroup53], (instregex "IST_F16m")>;
+def: InstRW<[SKLWriteResGroup53], (instregex "IST_F32m")>;
+def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP16m")>;
+def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP32m")>;
+def: InstRW<[SKLWriteResGroup53], (instregex "IST_FP64m")>;
+
+def SKLWriteResGroup54 : SchedWriteRes<[SKLPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def: InstRW<[SKLWriteResGroup54], (instregex "FNCLEX")>;
+
+def SKLWriteResGroup55 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKLWriteResGroup55], (instregex "PAUSE")>;
+
+def SKLWriteResGroup56 : SchedWriteRes<[SKLPort015,SKLPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKLWriteResGroup56], (instregex "VZEROUPPER")>;
+
+def SKLWriteResGroup57 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SKLWriteResGroup57], (instregex "LAR(16|32|64)rr")>;
+
+def SKLWriteResGroup58 : SchedWriteRes<[SKLPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64rm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVD64to64rm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MMX_MOVQ64rm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOV(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOV64toPQIrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOV8rm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVDDUPrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVDI2PDIrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVQI2PQIrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVSDrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVSSrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm16")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm32")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVSX(16|32|64)rm8")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVZX(16|32|64)rm16")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "MOVZX(16|32|64)rm8")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHNTA")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT0")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT1")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "PREFETCHT2")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "VMOV64toPQIrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "VMOVDDUPrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "VMOVDI2PDIrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "VMOVQI2PQIrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "VMOVSDrm")>;
+def: InstRW<[SKLWriteResGroup58], (instregex "VMOVSSrm")>;
+
+def SKLWriteResGroup59 : SchedWriteRes<[SKLPort0,SKLPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup59], (instregex "CVTDQ2PDrr")>;
+def: InstRW<[SKLWriteResGroup59], (instregex "MMX_CVTPI2PDirr")>;
+def: InstRW<[SKLWriteResGroup59], (instregex "VCVTDQ2PDrr")>;
+
+def SKLWriteResGroup60 : SchedWriteRes<[SKLPort5,SKLPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup60], (instregex "CVTPD2DQrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "CVTPD2PSrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "CVTPS2PDrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "CVTSD2SSrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI642SDrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI2SDrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "CVTSI2SSrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "CVTSS2SDrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "CVTTPD2DQrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTPD2PIirr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTPS2PIirr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTTPD2PIirr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "MMX_CVTTPS2PIirr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPD2DQrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPD2PSrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPH2PSrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPS2PDrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTPS2PHrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSD2SSrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI642SDrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI2SDrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSI2SSrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTSS2SDrr")>;
+def: InstRW<[SKLWriteResGroup60], (instregex "VCVTTPD2DQrr")>;
+
+def SKLWriteResGroup61 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup61], (instregex "STR(16|32|64)r")>;
+
+def SKLWriteResGroup62 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup62], (instregex "IMUL32r")>;
+def: InstRW<[SKLWriteResGroup62], (instregex "MUL32r")>;
+def: InstRW<[SKLWriteResGroup62], (instregex "MULX32rr")>;
+
+def SKLWriteResGroup63 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[SKLWriteResGroup63], (instregex "XSETBV")>;
+
+def SKLWriteResGroup64 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,3];
+}
+def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG(16|32|64)rr")>;
+def: InstRW<[SKLWriteResGroup64], (instregex "CMPXCHG8rr")>;
+
+def SKLWriteResGroup65 : SchedWriteRes<[SKLPort4,SKLPort237,SKLPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,4];
+}
+def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF16")>;
+def: InstRW<[SKLWriteResGroup65], (instregex "PUSHF64")>;
+
+def SKLWriteResGroup66 : SchedWriteRes<[SKLPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup66], (instregex "PCLMULQDQrr")>;
+def: InstRW<[SKLWriteResGroup66], (instregex "VPCLMULQDQrr")>;
+
+def SKLWriteResGroup67 : SchedWriteRes<[SKLPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup67], (instregex "LDDQUrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "MOVAPDrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "MOVAPSrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "MOVDQArm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "MOVDQUrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "MOVNTDQArm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "MOVSHDUPrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "MOVSLDUPrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "MOVUPDrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "MOVUPSrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VLDDQUrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VMOVAPDrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VMOVAPSrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VMOVDQArm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VMOVDQUrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VMOVNTDQArm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VMOVSHDUPrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VMOVSLDUPrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VMOVUPDrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VMOVUPSrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VPBROADCASTDrm")>;
+def: InstRW<[SKLWriteResGroup67], (instregex "VPBROADCASTQrm")>;
+
+def SKLWriteResGroup68 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup68], (instregex "MMX_CVTPI2PSirr")>;
+
+def SKLWriteResGroup69 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSBirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDSWirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDUSBirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PADDUSWirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PAVGBirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PAVGWirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQBirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQDirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPEQWirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTBirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTDirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PCMPGTWirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMAXSWirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMAXUBirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMINSWirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PMINUBirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLDrm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLQrm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSLLWrm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRADrm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRAWrm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLDrm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLQrm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSRLWrm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBSBirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBSWirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBUSBirm")>;
+def: InstRW<[SKLWriteResGroup69], (instregex "MMX_PSUBUSWirm")>;
+
+def SKLWriteResGroup70 : SchedWriteRes<[SKLPort0,SKLPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup70], (instregex "CVTSD2SI64rr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "CVTSD2SIrr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "CVTSS2SI64rr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "CVTSS2SIrr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "CVTTSD2SI64rr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "CVTTSD2SIrr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSD2SIrr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSS2SI64rr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "VCVTSS2SIrr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[SKLWriteResGroup70], (instregex "VCVTTSD2SIrr")>;
+
+def SKLWriteResGroup71 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PALIGNR64irm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PINSRWirmi")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PSHUFBrm64")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PSHUFWmi")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHBWirm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHDQirm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKHWDirm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLBWirm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLDQirm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MMX_PUNPCKLWDirm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MOVHPDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MOVHPSrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MOVLPDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "MOVLPSrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PINSRBrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PINSRDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PINSRQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PINSRWrmi")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXBWrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXDQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXWDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVSXWQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXBWrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXDQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXWDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "PMOVZXWQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VMOVHPDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VMOVHPSrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VMOVLPDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VMOVLPSrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRBrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPINSRWrmi")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXBWrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXDQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXWDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVSXWQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXBWrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXDQrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXWDrm")>;
+def: InstRW<[SKLWriteResGroup71], (instregex "VPMOVZXWQrm")>;
+
+def SKLWriteResGroup72 : SchedWriteRes<[SKLPort6,SKLPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup72], (instregex "FARJMP64")>;
+def: InstRW<[SKLWriteResGroup72], (instregex "JMP(16|32|64)m")>;
+
+def SKLWriteResGroup73 : SchedWriteRes<[SKLPort23,SKLPort05]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSBrm64")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSDrm64")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PABSWrm64")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDBirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDDirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDQirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PADDWirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PANDNirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PANDirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PORirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNBrm64")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNDrm64")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSIGNWrm64")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBBirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBDirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBQirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PSUBWirm")>;
+def: InstRW<[SKLWriteResGroup73], (instregex "MMX_PXORirm")>;
+
+def SKLWriteResGroup74 : SchedWriteRes<[SKLPort23,SKLPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup74], (instregex "ADC(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "ADC8rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "ADCX(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "ADOX(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "BT(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVAE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVB(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVG(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVGE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVL(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVLE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNE(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNO(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNP(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVNS(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVO(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVP(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "CMOVS(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "RORX32mi")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "RORX64mi")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "SARX32rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "SARX64rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "SBB(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "SBB8rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "SHLX32rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "SHLX64rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "SHRX32rm")>;
+def: InstRW<[SKLWriteResGroup74], (instregex "SHRX64rm")>;
+
+def SKLWriteResGroup75 : SchedWriteRes<[SKLPort23,SKLPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup75], (instregex "ANDN(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup75], (instregex "BLSI(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup75], (instregex "BLSMSK(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup75], (instregex "BLSR(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup75], (instregex "BZHI(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup75], (instregex "MOVBE(16|32|64)rm")>;
+
+def SKLWriteResGroup76 : SchedWriteRes<[SKLPort23,SKLPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup76], (instregex "ADD(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "ADD8rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "AND(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "AND8rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "CMP(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "CMP8mi")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "CMP8mr")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "CMP8rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "OR(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "OR8rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "POP(16|32|64)r(mr)?")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "SUB(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "SUB8rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "TEST(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "TEST8mi")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "TEST8mr")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "XOR(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup76], (instregex "XOR8rm")>;
+
+def SKLWriteResGroup77 : SchedWriteRes<[SKLPort5,SKLPort01]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup77], (instregex "HADDPDrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "HADDPSrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "HSUBPDrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "HSUBPSrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPDYrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPDrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPSYrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "VHADDPSrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPDYrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPDrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPSYrr")>;
+def: InstRW<[SKLWriteResGroup77], (instregex "VHSUBPSrr")>;
+
+def SKLWriteResGroup78 : SchedWriteRes<[SKLPort5,SKLPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup78], (instregex "CVTSI642SSrr")>;
+def: InstRW<[SKLWriteResGroup78], (instregex "VCVTSI642SSrr")>;
+
+def SKLWriteResGroup79 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKLWriteResGroup79], (instregex "SHLD(16|32|64)rrCL")>;
+def: InstRW<[SKLWriteResGroup79], (instregex "SHRD(16|32|64)rrCL")>;
+
+def SKLWriteResGroup80 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort06,SKLPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup80], (instregex "SLDT(16|32|64)r")>;
+
+def SKLWriteResGroup81 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup81], (instregex "VCVTPS2PHmr")>;
+
+def SKLWriteResGroup82 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup82], (instregex "BTC(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "BTR(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "BTS(16|32|64)mi8")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SAR(16|32|64)m1")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SAR(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SAR8m1")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SAR8mi")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SHL(16|32|64)m1")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SHL(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SHL8m1")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SHL8mi")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SHR(16|32|64)m1")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SHR(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SHR8m1")>;
+def: InstRW<[SKLWriteResGroup82], (instregex "SHR8mi")>;
+
+def SKLWriteResGroup83 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup83], (instregex "ADD(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "ADD(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "ADD8mi")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "ADD8mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "AND(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "AND(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "AND8mi")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "AND8mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "DEC(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "DEC8m")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "INC(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "INC8m")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "NEG(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "NEG8m")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "NOT(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "NOT8m")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "OR(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "OR(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "OR8mi")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "OR8mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "POP(16|32|64)rmm")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "PUSH(16|32|64)rmm")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "SUB(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "SUB(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "SUB8mi")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "SUB8mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "XOR(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "XOR(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "XOR8mi")>;
+def: InstRW<[SKLWriteResGroup83], (instregex "XOR8mr")>;
+
+def SKLWriteResGroup84 : SchedWriteRes<[SKLPort6,SKLPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,5];
+}
+def: InstRW<[SKLWriteResGroup84], (instregex "STD")>;
+
+def SKLWriteResGroup85 : SchedWriteRes<[SKLPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup85], (instregex "LD_F32m")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "LD_F64m")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "LD_F80m")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTF128")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTI128")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTSDYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VBROADCASTSSYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VLDDQUYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VMOVAPDYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VMOVAPSYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDDUPYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDQAYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VMOVDQUYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VMOVNTDQAYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VMOVSHDUPYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VMOVSLDUPYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VMOVUPDYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VMOVUPSYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VPBROADCASTDYrm")>;
+def: InstRW<[SKLWriteResGroup85], (instregex "VPBROADCASTQYrm")>;
+
+def SKLWriteResGroup86 : SchedWriteRes<[SKLPort0,SKLPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup86], (instregex "VCVTDQ2PDYrr")>;
+
+def SKLWriteResGroup87 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup87], (instregex "COMISDrm")>;
+def: InstRW<[SKLWriteResGroup87], (instregex "COMISSrm")>;
+def: InstRW<[SKLWriteResGroup87], (instregex "UCOMISDrm")>;
+def: InstRW<[SKLWriteResGroup87], (instregex "UCOMISSrm")>;
+def: InstRW<[SKLWriteResGroup87], (instregex "VCOMISDrm")>;
+def: InstRW<[SKLWriteResGroup87], (instregex "VCOMISSrm")>;
+def: InstRW<[SKLWriteResGroup87], (instregex "VUCOMISDrm")>;
+def: InstRW<[SKLWriteResGroup87], (instregex "VUCOMISSrm")>;
+
+def SKLWriteResGroup88 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup88], (instregex "INSERTPSrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PACKSSDWrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PACKSSWBrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PACKUSDWrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PACKUSWBrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PALIGNRrmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PBLENDWrmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFBrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFDmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFHWmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PSHUFLWmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHBWrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHDQrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHQDQrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKHWDrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLBWrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLDQrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLQDQrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "PUNPCKLWDrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "SHUFPDrmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "SHUFPSrmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKHPDrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKHPSrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKLPDrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "UNPCKLPSrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VINSERTPSrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPACKSSDWrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPACKSSWBrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPACKUSDWrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPACKUSWBrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPALIGNRrmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPBLENDWrmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPBROADCASTBrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPBROADCASTWrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPDmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPDrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPSmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPERMILPSrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFBrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFDmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFHWmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPSHUFLWmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHBWrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHDQrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHQDQrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKHWDrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLBWrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLDQrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLQDQrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VPUNPCKLWDrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VSHUFPDrmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VSHUFPSrmi")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKHPDrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKHPSrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKLPDrm")>;
+def: InstRW<[SKLWriteResGroup88], (instregex "VUNPCKLPSrm")>;
+
+def SKLWriteResGroup89 : SchedWriteRes<[SKLPort5,SKLPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2DQYrr")>;
+def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPD2PSYrr")>;
+def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPH2PSYrr")>;
+def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[SKLWriteResGroup89], (instregex "VCVTPS2PHYrr")>;
+def: InstRW<[SKLWriteResGroup89], (instregex "VCVTTPD2DQYrr")>;
+
+def SKLWriteResGroup90 : SchedWriteRes<[SKLPort01,SKLPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup90], (instregex "PABSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PABSDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PABSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PADDSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PADDSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PADDUSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PADDUSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PAVGBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PAVGWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQQrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PCMPEQWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PCMPGTWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMAXSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMAXUWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMINSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMINSDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMINSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMINUBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMINUDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PMINUWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNBrm128")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNDrm128")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSIGNWrm128")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSLLDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSLLQrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSLLWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSRADrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSRAWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSRLDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSRLQrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSRLWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSUBSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSUBSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSUBUSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "PSUBUSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPABSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPABSDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPABSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPADDSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPADDSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPADDUSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPADDUSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPAVGBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPAVGWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQQrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPEQWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPCMPGTWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMAXUWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMINSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPMINUWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNBrm128")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNDrm128")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSIGNWrm128")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLQrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLVDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLVQrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSLLWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSRADrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSRAVDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSRAWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLQrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLVDrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLVQrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSRLWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBSWrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBUSBrm")>;
+def: InstRW<[SKLWriteResGroup90], (instregex "VPSUBUSWrm")>;
+
+def SKLWriteResGroup91 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup91], (instregex "ANDNPDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "ANDNPSrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "ANDPDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "ANDPSrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "BLENDPDrmi")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "BLENDPSrmi")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "ORPDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "ORPSrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PADDBrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PADDDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PADDQrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PADDWrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PANDNrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PANDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PORrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PSUBBrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PSUBDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PSUBQrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PSUBWrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "PXORrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VANDNPDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VANDNPSrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VANDPDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VANDPSrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VBLENDPDrmi")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VBLENDPSrmi")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VINSERTF128rm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VINSERTI128rm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VMASKMOVPDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VMASKMOVPSrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VORPDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VORPSrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPADDBrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPADDDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPADDQrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPADDWrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPANDNrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPANDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPBLENDDrmi")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPMASKMOVDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPMASKMOVQrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPORrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBBrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBQrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPSUBWrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VPXORrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VXORPDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "VXORPSrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "XORPDrm")>;
+def: InstRW<[SKLWriteResGroup91], (instregex "XORPSrm")>;
+
+def SKLWriteResGroup92 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSDWirm")>;
+def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKSSWBirm")>;
+def: InstRW<[SKLWriteResGroup92], (instregex "MMX_PACKUSWBirm")>;
+
+def SKLWriteResGroup93 : SchedWriteRes<[SKLPort23,SKLPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup93], (instregex "CMOVA(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup93], (instregex "CMOVBE(16|32|64)rm")>;
+
+def SKLWriteResGroup94 : SchedWriteRes<[SKLPort23,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup94], (instregex "LEAVE64")>;
+def: InstRW<[SKLWriteResGroup94], (instregex "SCASB")>;
+def: InstRW<[SKLWriteResGroup94], (instregex "SCASL")>;
+def: InstRW<[SKLWriteResGroup94], (instregex "SCASQ")>;
+def: InstRW<[SKLWriteResGroup94], (instregex "SCASW")>;
+
+def SKLWriteResGroup95 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup95], (instregex "CVTTSS2SI64rr")>;
+def: InstRW<[SKLWriteResGroup95], (instregex "CVTTSS2SIrr")>;
+def: InstRW<[SKLWriteResGroup95], (instregex "VCVTTSS2SI64rr")>;
+def: InstRW<[SKLWriteResGroup95], (instregex "VCVTTSS2SIrr")>;
+
+def SKLWriteResGroup96 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup96], (instregex "FLDCW16m")>;
+
+def SKLWriteResGroup97 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup97], (instregex "LDMXCSR")>;
+def: InstRW<[SKLWriteResGroup97], (instregex "VLDMXCSR")>;
+
+def SKLWriteResGroup98 : SchedWriteRes<[SKLPort6,SKLPort23,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup98], (instregex "LRETQ")>;
+def: InstRW<[SKLWriteResGroup98], (instregex "RETQ")>;
+
+def SKLWriteResGroup99 : SchedWriteRes<[SKLPort23,SKLPort06,SKLPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup99], (instregex "BEXTR(32|64)rm")>;
+
+def SKLWriteResGroup100 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKLWriteResGroup100], (instregex "ROL(16|32|64)m1")>;
+def: InstRW<[SKLWriteResGroup100], (instregex "ROL(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup100], (instregex "ROL8m1")>;
+def: InstRW<[SKLWriteResGroup100], (instregex "ROL8mi")>;
+def: InstRW<[SKLWriteResGroup100], (instregex "ROR(16|32|64)m1")>;
+def: InstRW<[SKLWriteResGroup100], (instregex "ROR(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup100], (instregex "ROR8m1")>;
+def: InstRW<[SKLWriteResGroup100], (instregex "ROR8mi")>;
+
+def SKLWriteResGroup101 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKLWriteResGroup101], (instregex "XADD(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup101], (instregex "XADD8rm")>;
+
+def SKLWriteResGroup102 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup102], (instregex "FARCALL64")>;
+
+def SKLWriteResGroup103 : SchedWriteRes<[SKLPort6,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,3,1,2];
+}
+def: InstRW<[SKLWriteResGroup103], (instregex "LOOP")>;
+
+def SKLWriteResGroup104 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup104], (instregex "AESIMCrr")>;
+def: InstRW<[SKLWriteResGroup104], (instregex "VAESIMCrr")>;
+
+def SKLWriteResGroup105 : SchedWriteRes<[SKLPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKLWriteResGroup105], (instregex "PMULLDrr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDPDr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDPSr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDSDr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "ROUNDSSr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "VPMULLDYrr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "VPMULLDrr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDPDr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDPSr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDSDr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDSSr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDYPDr")>;
+def: InstRW<[SKLWriteResGroup105], (instregex "VROUNDYPSr")>;
+
+def SKLWriteResGroup106 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup106], (instregex "VTESTPDrm")>;
+def: InstRW<[SKLWriteResGroup106], (instregex "VTESTPSrm")>;
+
+def SKLWriteResGroup107 : SchedWriteRes<[SKLPort1,SKLPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup107], (instregex "BSF(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "BSR(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "IMUL64m")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "IMUL(32|64)rm(i8)?")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "IMUL8m")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "LZCNT(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "MUL(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "MUL8m")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "PDEP(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "PEXT(32|64)rm")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "POPCNT(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup107], (instregex "TZCNT(16|32|64)rm")>;
+
+def SKLWriteResGroup107_16 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup107_16], (instregex "IMUL16rm(i8)?")>;
+
+def SKLWriteResGroup107_16_2 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> {
+ let Latency = 3;
+ let NumMicroOps = 5;
+}
+def: InstRW<[SKLWriteResGroup107_16_2], (instregex "IMUL16m")>;
+def: InstRW<[SKLWriteResGroup107_16_2], (instregex "MUL16m")>;
+
+def SKLWriteResGroup107_32 : SchedWriteRes<[SKLPort1, SKLPort0156, SKLPort23]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup107_32], (instregex "IMUL32m")>;
+def: InstRW<[SKLWriteResGroup107_32], (instregex "MUL32m")>;
+
+def SKLWriteResGroup108 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup108], (instregex "FCOM32m")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "FCOM64m")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "FCOMP32m")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "FCOMP64m")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "MMX_PSADBWirm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPACKSSDWYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPACKSSWBYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPACKUSDWYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPACKUSWBYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPALIGNRYrmi")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPBLENDWYrmi")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPBROADCASTBYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPBROADCASTWYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPDYmi")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPDYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPSYmi")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPERMILPSYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXBDYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXBQYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPMOVSXWQYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFBYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFDYmi")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFHWYmi")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPSHUFLWYmi")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHBWYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHDQYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHQDQYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKHWDYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLBWYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLDQYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLQDQYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VPUNPCKLWDYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VSHUFPDYrmi")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VSHUFPSYrmi")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKHPDYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKHPSYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKLPDYrm")>;
+def: InstRW<[SKLWriteResGroup108], (instregex "VUNPCKLPSYrm")>;
+
+def SKLWriteResGroup109 : SchedWriteRes<[SKLPort01,SKLPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup109], (instregex "VPABSBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPABSDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPABSWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPADDSBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPADDSWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPADDUSBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPADDUSWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPAVGBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPAVGWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQQYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPEQWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPCMPGTWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXSWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMAXUWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMINSWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPMINUWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNBYrm256")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNDYrm256")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSIGNWYrm256")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLQYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLVDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLVQYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSLLWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSRADYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSRAVDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSRAWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLQYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLVDYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLVQYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSRLWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBSBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBSWYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBUSBYrm")>;
+def: InstRW<[SKLWriteResGroup109], (instregex "VPSUBUSWYrm")>;
+
+def SKLWriteResGroup110 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup110], (instregex "VANDNPDYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VANDNPSYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VANDPDYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VANDPSYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VBLENDPDYrmi")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VBLENDPSYrmi")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VMASKMOVPDYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VMASKMOVPSYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VORPDYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VORPSYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPADDBYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPADDDYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPADDQYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPADDWYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPANDNYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPANDYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPBLENDDYrmi")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPMASKMOVDYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPMASKMOVQYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPORYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBBYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBDYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBQYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPSUBWYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VPXORYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VXORPDYrm")>;
+def: InstRW<[SKLWriteResGroup110], (instregex "VXORPSYrm")>;
+
+def SKLWriteResGroup111 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup111], (instregex "BLENDVPDrm0")>;
+def: InstRW<[SKLWriteResGroup111], (instregex "BLENDVPSrm0")>;
+def: InstRW<[SKLWriteResGroup111], (instregex "PBLENDVBrm0")>;
+def: InstRW<[SKLWriteResGroup111], (instregex "VBLENDVPDrm")>;
+def: InstRW<[SKLWriteResGroup111], (instregex "VBLENDVPSrm")>;
+def: InstRW<[SKLWriteResGroup111], (instregex "VPBLENDVBYrm")>;
+def: InstRW<[SKLWriteResGroup111], (instregex "VPBLENDVBrm")>;
+
+def SKLWriteResGroup112 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PHADDSWrm64")>;
+def: InstRW<[SKLWriteResGroup112], (instregex "MMX_PHSUBSWrm64")>;
+
+def SKLWriteResGroup113 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort05]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHADDWrm64")>;
+def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHADDrm64")>;
+def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHSUBDrm64")>;
+def: InstRW<[SKLWriteResGroup113], (instregex "MMX_PHSUBWrm64")>;
+
+def SKLWriteResGroup114 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort237,SKLPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup114], (instregex "VCVTPS2PHYmr")>;
+
+def SKLWriteResGroup115 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,3];
+}
+def: InstRW<[SKLWriteResGroup115], (instregex "ROR(16|32|64)mCL")>;
+def: InstRW<[SKLWriteResGroup115], (instregex "ROR8mCL")>;
+
+def SKLWriteResGroup116 : SchedWriteRes<[SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKLWriteResGroup116], (instregex "RCL(16|32|64)m1")>;
+def: InstRW<[SKLWriteResGroup116], (instregex "RCL(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup116], (instregex "RCL8m1")>;
+def: InstRW<[SKLWriteResGroup116], (instregex "RCL8mi")>;
+def: InstRW<[SKLWriteResGroup116], (instregex "RCR(16|32|64)m1")>;
+def: InstRW<[SKLWriteResGroup116], (instregex "RCR(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup116], (instregex "RCR8m1")>;
+def: InstRW<[SKLWriteResGroup116], (instregex "RCR8mi")>;
+
+def SKLWriteResGroup117 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[SKLWriteResGroup117], (instregex "ROL(16|32|64)mCL")>;
+def: InstRW<[SKLWriteResGroup117], (instregex "ROL8mCL")>;
+def: InstRW<[SKLWriteResGroup117], (instregex "SAR(16|32|64)mCL")>;
+def: InstRW<[SKLWriteResGroup117], (instregex "SAR8mCL")>;
+def: InstRW<[SKLWriteResGroup117], (instregex "SHL(16|32|64)mCL")>;
+def: InstRW<[SKLWriteResGroup117], (instregex "SHL8mCL")>;
+def: InstRW<[SKLWriteResGroup117], (instregex "SHR(16|32|64)mCL")>;
+def: InstRW<[SKLWriteResGroup117], (instregex "SHR8mCL")>;
+
+def SKLWriteResGroup118 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[SKLWriteResGroup118], (instregex "ADC(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup118], (instregex "ADC8mi")>;
+
+def SKLWriteResGroup119 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: InstRW<[SKLWriteResGroup119], (instregex "ADC(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup119], (instregex "ADC8mr")>;
+def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup119], (instregex "CMPXCHG8rm")>;
+def: InstRW<[SKLWriteResGroup119], (instregex "SBB(16|32|64)mi")>;
+def: InstRW<[SKLWriteResGroup119], (instregex "SBB(16|32|64)mr")>;
+def: InstRW<[SKLWriteResGroup119], (instregex "SBB8mi")>;
+def: InstRW<[SKLWriteResGroup119], (instregex "SBB8mr")>;
+
+def SKLWriteResGroup120 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup120], (instregex "MMX_CVTPI2PSirm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMADDUBSWrm64")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMADDWDirm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHRSWrm64")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHUWirm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULHWirm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULLWirm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "MMX_PMULUDQirm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "RCPSSm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "RSQRTSSm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "VRCPSSm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "VRSQRTSSm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "VTESTPDYrm")>;
+def: InstRW<[SKLWriteResGroup120], (instregex "VTESTPSYrm")>;
+
+def SKLWriteResGroup121 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup121], (instregex "PCMPGTQrm")>;
+def: InstRW<[SKLWriteResGroup121], (instregex "PSADBWrm")>;
+def: InstRW<[SKLWriteResGroup121], (instregex "VPCMPGTQrm")>;
+def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXBWYrm")>;
+def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXDQYrm")>;
+def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVSXWDYrm")>;
+def: InstRW<[SKLWriteResGroup121], (instregex "VPMOVZXWDYrm")>;
+def: InstRW<[SKLWriteResGroup121], (instregex "VPSADBWrm")>;
+
+def SKLWriteResGroup122 : SchedWriteRes<[SKLPort01,SKLPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup122], (instregex "ADDSDrm")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "ADDSSrm")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "MULSDrm")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "MULSSrm")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "SUBSDrm")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "SUBSSrm")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "VADDSDrm")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "VADDSSrm")>;
+def: InstRW<[SKLWriteResGroup122],
+ (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "VMULSDrm")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "VMULSSrm")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "VSUBSDrm")>;
+def: InstRW<[SKLWriteResGroup122], (instregex "VSUBSSrm")>;
+
+def SKLWriteResGroup123 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup123], (instregex "CMPSDrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "CMPSSrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "CVTPS2PDrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "MAX(C?)SDrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "MAX(C?)SSrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "MIN(C?)SDrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "MIN(C?)SSrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVTPS2PIirm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "MMX_CVTTPS2PIirm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "VCMPSDrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "VCMPSSrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "VCVTPH2PSrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "VCVTPS2PDrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "VMAX(C?)SDrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "VMAX(C?)SSrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "VMIN(C?)SDrm")>;
+def: InstRW<[SKLWriteResGroup123], (instregex "VMIN(C?)SSrm")>;
+
+def SKLWriteResGroup124 : SchedWriteRes<[SKLPort5,SKLPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup124], (instregex "DPPDrri")>;
+def: InstRW<[SKLWriteResGroup124], (instregex "VDPPDrri")>;
+
+def SKLWriteResGroup125 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup125], (instregex "VBLENDVPDYrm")>;
+def: InstRW<[SKLWriteResGroup125], (instregex "VBLENDVPSYrm")>;
+
+def SKLWriteResGroup126 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup126], (instregex "PTESTrm")>;
+def: InstRW<[SKLWriteResGroup126], (instregex "VPTESTrm")>;
+
+def SKLWriteResGroup127 : SchedWriteRes<[SKLPort1,SKLPort5,SKLPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup127], (instregex "MULX64rm")>;
+
+def SKLWriteResGroup128 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKLWriteResGroup128], (instregex "PHADDSWrm128")>;
+def: InstRW<[SKLWriteResGroup128], (instregex "PHSUBSWrm128")>;
+def: InstRW<[SKLWriteResGroup128], (instregex "VPHADDSWrm128")>;
+def: InstRW<[SKLWriteResGroup128], (instregex "VPHSUBSWrm128")>;
+
+def SKLWriteResGroup129 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKLWriteResGroup129], (instregex "PHADDDrm")>;
+def: InstRW<[SKLWriteResGroup129], (instregex "PHADDWrm")>;
+def: InstRW<[SKLWriteResGroup129], (instregex "PHSUBDrm")>;
+def: InstRW<[SKLWriteResGroup129], (instregex "PHSUBWrm")>;
+def: InstRW<[SKLWriteResGroup129], (instregex "VPHADDDrm")>;
+def: InstRW<[SKLWriteResGroup129], (instregex "VPHADDWrm")>;
+def: InstRW<[SKLWriteResGroup129], (instregex "VPHSUBDrm")>;
+def: InstRW<[SKLWriteResGroup129], (instregex "VPHSUBWrm")>;
+
+def SKLWriteResGroup130 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup130], (instregex "SHLD(16|32|64)mri8")>;
+def: InstRW<[SKLWriteResGroup130], (instregex "SHRD(16|32|64)mri8")>;
+
+def SKLWriteResGroup131 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKLWriteResGroup131], (instregex "LAR(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup131], (instregex "LSL(16|32|64)rm")>;
+
+def SKLWriteResGroup132 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup132], (instregex "AESDECLASTrm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "AESDECrm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "AESENCLASTrm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "AESENCrm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "RCPPSm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "RSQRTPSm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "VAESDECLASTrm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "VAESDECrm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "VAESENCLASTrm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "VAESENCrm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "VRCPPSm")>;
+def: InstRW<[SKLWriteResGroup132], (instregex "VRSQRTPSm")>;
+
+def SKLWriteResGroup133 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup133], (instregex "ADD_F32m")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "ADD_F64m")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F16m")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F32m")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "ILD_F64m")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "SUBR_F32m")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "SUBR_F64m")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "SUB_F32m")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "SUB_F64m")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPCMPGTQYrm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPERM2F128rm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPERM2I128rm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPERMDYrm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPERMPDYmi")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPERMPSYrm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPERMQYmi")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBDYrm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBQYrm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXBWYrm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXDQYrm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPMOVZXWQYrm")>;
+def: InstRW<[SKLWriteResGroup133], (instregex "VPSADBWYrm")>;
+
+def SKLWriteResGroup134 : SchedWriteRes<[SKLPort01,SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup134], (instregex "ADDPDrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "ADDPSrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "ADDSUBPDrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "ADDSUBPSrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "MULPDrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "MULPSrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "SUBPDrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "SUBPSrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "VADDPDrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "VADDPSrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "VADDSUBPDrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "VADDSUBPSrm")>;
+def: InstRW<[SKLWriteResGroup134],
+ (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "VMULPDrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "VMULPSrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "VSUBPDrm")>;
+def: InstRW<[SKLWriteResGroup134], (instregex "VSUBPSrm")>;
+
+def SKLWriteResGroup135 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup135], (instregex "CMPPDrmi")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "CMPPSrmi")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "CVTDQ2PSrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "CVTPS2DQrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "CVTSS2SDrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "CVTTPS2DQrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "MAX(C?)PDrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "MAX(C?)PSrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "MIN(C?)PDrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "MIN(C?)PSrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "PHMINPOSUWrm128")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "PMADDUBSWrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "PMADDWDrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "PMULDQrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "PMULHRSWrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "PMULHUWrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "PMULHWrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "PMULLWrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "PMULUDQrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VCMPPDrmi")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VCMPPSrmi")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VCVTDQ2PSrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VCVTPH2PSYrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VCVTPS2DQrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VCVTSS2SDrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VCVTTPS2DQrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VMAX(C?)PDrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VMAX(C?)PSrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VMIN(C?)PDrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VMIN(C?)PSrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VPHMINPOSUWrm128")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VPMADDUBSWrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VPMADDWDrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VPMULDQrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHRSWrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHUWrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VPMULHWrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VPMULLWrm")>;
+def: InstRW<[SKLWriteResGroup135], (instregex "VPMULUDQrm")>;
+
+def SKLWriteResGroup136 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SKLWriteResGroup136], (instregex "PCMPISTRIrr")>;
+def: InstRW<[SKLWriteResGroup136], (instregex "PCMPISTRM128rr")>;
+def: InstRW<[SKLWriteResGroup136], (instregex "VPCMPISTRIrr")>;
+def: InstRW<[SKLWriteResGroup136], (instregex "VPCMPISTRM128rr")>;
+
+def SKLWriteResGroup137 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup137], (instregex "MPSADBWrmi")>;
+def: InstRW<[SKLWriteResGroup137], (instregex "VMPSADBWrmi")>;
+
+def SKLWriteResGroup138 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup138], (instregex "MMX_CVTPI2PDirm")>;
+def: InstRW<[SKLWriteResGroup138], (instregex "VPTESTYrm")>;
+
+def SKLWriteResGroup139 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup139], (instregex "CVTSD2SSrm")>;
+def: InstRW<[SKLWriteResGroup139], (instregex "VCVTSD2SSrm")>;
+
+def SKLWriteResGroup140 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKLWriteResGroup140], (instregex "VPHADDSWrm256")>;
+def: InstRW<[SKLWriteResGroup140], (instregex "VPHSUBSWrm256")>;
+
+def SKLWriteResGroup141 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKLWriteResGroup141], (instregex "VPHADDDYrm")>;
+def: InstRW<[SKLWriteResGroup141], (instregex "VPHADDWYrm")>;
+def: InstRW<[SKLWriteResGroup141], (instregex "VPHSUBDYrm")>;
+def: InstRW<[SKLWriteResGroup141], (instregex "VPHSUBWYrm")>;
+
+def SKLWriteResGroup142 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort06,SKLPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup142], (instregex "MULX32rm")>;
+
+def SKLWriteResGroup143 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,3];
+}
+def: InstRW<[SKLWriteResGroup143], (instregex "ADD8mi")>;
+def: InstRW<[SKLWriteResGroup143], (instregex "AND8mi")>;
+def: InstRW<[SKLWriteResGroup143], (instregex "OR8mi")>;
+def: InstRW<[SKLWriteResGroup143], (instregex "SUB8mi")>;
+def: InstRW<[SKLWriteResGroup143], (instregex "XCHG(16|32|64)rm")>;
+def: InstRW<[SKLWriteResGroup143], (instregex "XCHG8rm")>;
+def: InstRW<[SKLWriteResGroup143], (instregex "XOR8mi")>;
+
+def SKLWriteResGroup144 : SchedWriteRes<[SKLPort05,SKLPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 10;
+ let ResourceCycles = [9,1];
+}
+def: InstRW<[SKLWriteResGroup144], (instregex "MMX_EMMS")>;
+
+def SKLWriteResGroup145 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup145], (instregex "DIVPSrr")>;
+def: InstRW<[SKLWriteResGroup145], (instregex "DIVSSrr")>;
+def: InstRW<[SKLWriteResGroup145], (instregex "VDIVPSYrr")>;
+def: InstRW<[SKLWriteResGroup145], (instregex "VDIVPSrr")>;
+def: InstRW<[SKLWriteResGroup145], (instregex "VDIVSSrr")>;
+
+def SKLWriteResGroup146 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F32m")>;
+def: InstRW<[SKLWriteResGroup146], (instregex "MUL_F64m")>;
+def: InstRW<[SKLWriteResGroup146], (instregex "VRCPPSYm")>;
+def: InstRW<[SKLWriteResGroup146], (instregex "VRSQRTPSYm")>;
+
+def SKLWriteResGroup147 : SchedWriteRes<[SKLPort01,SKLPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup147], (instregex "VADDPDYrm")>;
+def: InstRW<[SKLWriteResGroup147], (instregex "VADDPSYrm")>;
+def: InstRW<[SKLWriteResGroup147], (instregex "VADDSUBPDYrm")>;
+def: InstRW<[SKLWriteResGroup147], (instregex "VADDSUBPSYrm")>;
+def: InstRW<[SKLWriteResGroup147],
+ (instregex "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym")>;
+def: InstRW<[SKLWriteResGroup147], (instregex "VMULPDYrm")>;
+def: InstRW<[SKLWriteResGroup147], (instregex "VMULPSYrm")>;
+def: InstRW<[SKLWriteResGroup147], (instregex "VSUBPDYrm")>;
+def: InstRW<[SKLWriteResGroup147], (instregex "VSUBPSYrm")>;
+
+def SKLWriteResGroup148 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup148], (instregex "VCMPPDYrmi")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VCMPPSYrmi")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VCVTDQ2PSYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VCVTPS2DQYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VCVTPS2PDYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VCVTTPS2DQYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VMAX(C?)PDYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VMAX(C?)PSYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VMIN(C?)PDYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VMIN(C?)PSYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VPMADDUBSWYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VPMADDWDYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VPMULDQYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHRSWYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHUWYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VPMULHWYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VPMULLWYrm")>;
+def: InstRW<[SKLWriteResGroup148], (instregex "VPMULUDQYrm")>;
+
+def SKLWriteResGroup149 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup149], (instregex "FICOM16m")>;
+def: InstRW<[SKLWriteResGroup149], (instregex "FICOM32m")>;
+def: InstRW<[SKLWriteResGroup149], (instregex "FICOMP16m")>;
+def: InstRW<[SKLWriteResGroup149], (instregex "FICOMP32m")>;
+def: InstRW<[SKLWriteResGroup149], (instregex "VMPSADBWYrmi")>;
+
+def SKLWriteResGroup150 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup150], (instregex "CVTDQ2PDrm")>;
+def: InstRW<[SKLWriteResGroup150], (instregex "VCVTDQ2PDrm")>;
+
+def SKLWriteResGroup151 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup151], (instregex "CVTSD2SI64rm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "CVTSD2SIrm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "CVTSS2SI64rm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "CVTSS2SIrm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSD2SI64rm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSD2SIrm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "CVTTSS2SIrm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSD2SI64rm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSD2SIrm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSS2SI64rm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "VCVTSS2SIrm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSD2SI64rm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSD2SIrm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSS2SI64rm")>;
+def: InstRW<[SKLWriteResGroup151], (instregex "VCVTTSS2SIrm")>;
+
+def SKLWriteResGroup152 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2DQrm")>;
+def: InstRW<[SKLWriteResGroup152], (instregex "CVTPD2PSrm")>;
+def: InstRW<[SKLWriteResGroup152], (instregex "CVTTPD2DQrm")>;
+def: InstRW<[SKLWriteResGroup152], (instregex "MMX_CVTPD2PIirm")>;
+def: InstRW<[SKLWriteResGroup152], (instregex "MMX_CVTTPD2PIirm")>;
+
+def SKLWriteResGroup153 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: InstRW<[SKLWriteResGroup153], (instregex "SHLD(16|32|64)mrCL")>;
+def: InstRW<[SKLWriteResGroup153], (instregex "SHRD(16|32|64)mrCL")>;
+
+def SKLWriteResGroup154 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,3,2];
+}
+def: InstRW<[SKLWriteResGroup154], (instregex "RCL(16|32|64)rCL")>;
+def: InstRW<[SKLWriteResGroup154], (instregex "RCR(16|32|64)rCL")>;
+
+def SKLWriteResGroup155 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,5,1,2];
+}
+def: InstRW<[SKLWriteResGroup155], (instregex "RCL8rCL")>;
+
+def SKLWriteResGroup156 : SchedWriteRes<[SKLPort06,SKLPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,9];
+}
+def: InstRW<[SKLWriteResGroup156], (instregex "LOOPE")>;
+def: InstRW<[SKLWriteResGroup156], (instregex "LOOPNE")>;
+
+def SKLWriteResGroup157 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 12;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTPSYr")>;
+def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTPSr")>;
+def: InstRW<[SKLWriteResGroup157], (instregex "VSQRTSSr")>;
+
+def SKLWriteResGroup158 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup158], (instregex "PCLMULQDQrm")>;
+def: InstRW<[SKLWriteResGroup158], (instregex "VPCLMULQDQrm")>;
+
+def SKLWriteResGroup159 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKLWriteResGroup159], (instregex "HADDPDrm")>;
+def: InstRW<[SKLWriteResGroup159], (instregex "HADDPSrm")>;
+def: InstRW<[SKLWriteResGroup159], (instregex "HSUBPDrm")>;
+def: InstRW<[SKLWriteResGroup159], (instregex "HSUBPSrm")>;
+def: InstRW<[SKLWriteResGroup159], (instregex "VHADDPDrm")>;
+def: InstRW<[SKLWriteResGroup159], (instregex "VHADDPSrm")>;
+def: InstRW<[SKLWriteResGroup159], (instregex "VHSUBPDrm")>;
+def: InstRW<[SKLWriteResGroup159], (instregex "VHSUBPSrm")>;
+
+def SKLWriteResGroup160 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup160], (instregex "CVTTSS2SI64rm")>;
+
+def SKLWriteResGroup161 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 13;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup161], (instregex "SQRTPSr")>;
+def: InstRW<[SKLWriteResGroup161], (instregex "SQRTSSr")>;
+
+def SKLWriteResGroup162 : SchedWriteRes<[SKLPort5,SKLPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup162], (instregex "ADD_FI16m")>;
+def: InstRW<[SKLWriteResGroup162], (instregex "ADD_FI32m")>;
+def: InstRW<[SKLWriteResGroup162], (instregex "SUBR_FI16m")>;
+def: InstRW<[SKLWriteResGroup162], (instregex "SUBR_FI32m")>;
+def: InstRW<[SKLWriteResGroup162], (instregex "SUB_FI16m")>;
+def: InstRW<[SKLWriteResGroup162], (instregex "SUB_FI32m")>;
+
+def SKLWriteResGroup163 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup163], (instregex "VCVTDQ2PDYrm")>;
+
+def SKLWriteResGroup164 : SchedWriteRes<[SKLPort5,SKLPort015]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKLWriteResGroup164], (instregex "DPPSrri")>;
+def: InstRW<[SKLWriteResGroup164], (instregex "VDPPSYrri")>;
+def: InstRW<[SKLWriteResGroup164], (instregex "VDPPSrri")>;
+
+def SKLWriteResGroup165 : SchedWriteRes<[SKLPort5,SKLPort01,SKLPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKLWriteResGroup165], (instregex "VHADDPDYrm")>;
+def: InstRW<[SKLWriteResGroup165], (instregex "VHADDPSYrm")>;
+def: InstRW<[SKLWriteResGroup165], (instregex "VHSUBPDYrm")>;
+def: InstRW<[SKLWriteResGroup165], (instregex "VHSUBPSYrm")>;
+
+def SKLWriteResGroup166 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup166], (instregex "DIVPDrr")>;
+def: InstRW<[SKLWriteResGroup166], (instregex "DIVSDrr")>;
+def: InstRW<[SKLWriteResGroup166], (instregex "VDIVPDYrr")>;
+def: InstRW<[SKLWriteResGroup166], (instregex "VDIVPDrr")>;
+def: InstRW<[SKLWriteResGroup166], (instregex "VDIVSDrr")>;
+
+def SKLWriteResGroup167 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKLWriteResGroup167], (instregex "AESIMCrm")>;
+def: InstRW<[SKLWriteResGroup167], (instregex "VAESIMCrm")>;
+
+def SKLWriteResGroup168 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup168], (instregex "PMULLDrm")>;
+def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDPDm")>;
+def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDPSm")>;
+def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDSDm")>;
+def: InstRW<[SKLWriteResGroup168], (instregex "ROUNDSSm")>;
+def: InstRW<[SKLWriteResGroup168], (instregex "VPMULLDrm")>;
+def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDPDm")>;
+def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDPSm")>;
+def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDSDm")>;
+def: InstRW<[SKLWriteResGroup168], (instregex "VROUNDSSm")>;
+
+def SKLWriteResGroup169 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI16m")>;
+def: InstRW<[SKLWriteResGroup169], (instregex "MUL_FI32m")>;
+
+def SKLWriteResGroup170 : SchedWriteRes<[SKLPort1,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [2,4,1,3];
+}
+def: InstRW<[SKLWriteResGroup170], (instregex "RCR8rCL")>;
+
+def SKLWriteResGroup171 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 15;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FPrST0")>;
+def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FST0r")>;
+def: InstRW<[SKLWriteResGroup171], (instregex "DIVR_FrST0")>;
+
+def SKLWriteResGroup172 : SchedWriteRes<[SKLPort23,SKLPort015]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKLWriteResGroup172], (instregex "VPMULLDYrm")>;
+def: InstRW<[SKLWriteResGroup172], (instregex "VROUNDYPDm")>;
+def: InstRW<[SKLWriteResGroup172], (instregex "VROUNDYPSm")>;
+
+def SKLWriteResGroup173 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+ let Latency = 15;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SKLWriteResGroup173], (instregex "DPPDrmi")>;
+def: InstRW<[SKLWriteResGroup173], (instregex "VDPPDrmi")>;
+
+def SKLWriteResGroup174 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 15;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,1,1,5,1,1];
+}
+def: InstRW<[SKLWriteResGroup174], (instregex "RCL(16|32|64)mCL")>;
+def: InstRW<[SKLWriteResGroup174], (instregex "RCL8mCL")>;
+
+def SKLWriteResGroup175 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup175], (instregex "DIVSSrm")>;
+def: InstRW<[SKLWriteResGroup175], (instregex "VDIVSSrm")>;
+
+def SKLWriteResGroup176 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+def: InstRW<[SKLWriteResGroup176], (instregex "PCMPISTRIrm")>;
+def: InstRW<[SKLWriteResGroup176], (instregex "PCMPISTRM128rm")>;
+def: InstRW<[SKLWriteResGroup176], (instregex "VPCMPISTRIrm")>;
+def: InstRW<[SKLWriteResGroup176], (instregex "VPCMPISTRM128rm")>;
+
+def SKLWriteResGroup177 : SchedWriteRes<[SKLPort4,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,1,1,4,2,5];
+}
+def: InstRW<[SKLWriteResGroup177], (instregex "CMPXCHG8B")>;
+
+def SKLWriteResGroup178 : SchedWriteRes<[SKLPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 16;
+ let ResourceCycles = [16];
+}
+def: InstRW<[SKLWriteResGroup178], (instregex "VZEROALL")>;
+
+def SKLWriteResGroup179 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup179], (instregex "DIVPSrm")>;
+def: InstRW<[SKLWriteResGroup179], (instregex "VDIVPSrm")>;
+def: InstRW<[SKLWriteResGroup179], (instregex "VSQRTSSm")>;
+
+def SKLWriteResGroup180 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 15;
+ let ResourceCycles = [2,1,2,4,2,4];
+}
+def: InstRW<[SKLWriteResGroup180], (instregex "XCH_F")>;
+
+def SKLWriteResGroup181 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 18;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTPDYr")>;
+def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTPDr")>;
+def: InstRW<[SKLWriteResGroup181], (instregex "VSQRTSDr")>;
+
+def SKLWriteResGroup182 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 18;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup182], (instregex "SQRTSSm")>;
+def: InstRW<[SKLWriteResGroup182], (instregex "VDIVPSYrm")>;
+def: InstRW<[SKLWriteResGroup182], (instregex "VSQRTPSm")>;
+
+def SKLWriteResGroup183 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
+}
+def: InstRW<[SKLWriteResGroup183], (instregex "PCMPESTRIrr")>;
+def: InstRW<[SKLWriteResGroup183], (instregex "VPCMPESTRIrr")>;
+
+def SKLWriteResGroup184 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,5];
+}
+def: InstRW<[SKLWriteResGroup184], (instregex "CPUID")>;
+def: InstRW<[SKLWriteResGroup184], (instregex "RDTSC")>;
+
+def SKLWriteResGroup185 : SchedWriteRes<[SKLPort1,SKLPort23,SKLPort237,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,1,1,4,1,2];
+}
+def: InstRW<[SKLWriteResGroup185], (instregex "RCR(16|32|64)mCL")>;
+def: InstRW<[SKLWriteResGroup185], (instregex "RCR8mCL")>;
+
+def SKLWriteResGroup186 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 19;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup186], (instregex "DIVSDrm")>;
+def: InstRW<[SKLWriteResGroup186], (instregex "SQRTPSm")>;
+def: InstRW<[SKLWriteResGroup186], (instregex "VDIVSDrm")>;
+def: InstRW<[SKLWriteResGroup186], (instregex "VSQRTPSYm")>;
+
+def SKLWriteResGroup187 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+ let Latency = 19;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,3];
+}
+def: InstRW<[SKLWriteResGroup187], (instregex "DPPSrmi")>;
+def: InstRW<[SKLWriteResGroup187], (instregex "VDPPSrmi")>;
+
+def SKLWriteResGroup188 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015,SKLPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def: InstRW<[SKLWriteResGroup188], (instregex "PCMPESTRM128rr")>;
+def: InstRW<[SKLWriteResGroup188], (instregex "VPCMPESTRM128rr")>;
+
+def SKLWriteResGroup189 : SchedWriteRes<[SKLPort0]> {
+ let Latency = 20;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FPrST0")>;
+def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FST0r")>;
+def: InstRW<[SKLWriteResGroup189], (instregex "DIV_FrST0")>;
+def: InstRW<[SKLWriteResGroup189], (instregex "SQRTPDr")>;
+def: InstRW<[SKLWriteResGroup189], (instregex "SQRTSDr")>;
+
+def SKLWriteResGroup190 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 20;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup190], (instregex "DIVPDrm")>;
+def: InstRW<[SKLWriteResGroup190], (instregex "VDIVPDrm")>;
+
+def SKLWriteResGroup191 : SchedWriteRes<[SKLPort5,SKLPort23,SKLPort015]> {
+ let Latency = 20;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,3];
+}
+def: InstRW<[SKLWriteResGroup191], (instregex "VDPPSYrmi")>;
+
+def SKLWriteResGroup192 : SchedWriteRes<[SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[SKLWriteResGroup192], (instregex "INSB")>;
+def: InstRW<[SKLWriteResGroup192], (instregex "INSL")>;
+def: InstRW<[SKLWriteResGroup192], (instregex "INSW")>;
+
+def SKLWriteResGroup193 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,2,7];
+}
+def: InstRW<[SKLWriteResGroup193], (instregex "MWAITrr")>;
+
+def SKLWriteResGroup194 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort015]> {
+ let Latency = 20;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,2];
+}
+def: InstRW<[SKLWriteResGroup194], (instregex "AESKEYGENASSIST128rr")>;
+def: InstRW<[SKLWriteResGroup194], (instregex "VAESKEYGENASSIST128rr")>;
+
+def SKLWriteResGroup195 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup195], (instregex "VDIVPDYrm")>;
+
+def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 22;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F32m")>;
+def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F64m")>;
+
+def SKLWriteResGroup196_1 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKLWriteResGroup196_1], (instrs VGATHERDPSrm,
+ VGATHERDPDrm,
+ VGATHERQPDrm,
+ VGATHERQPSrm,
+ VPGATHERDDrm,
+ VPGATHERDQrm,
+ VPGATHERQDrm,
+ VPGATHERQQrm)>;
+
+def SKLWriteResGroup196_2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKLWriteResGroup196_2], (instrs VGATHERDPSYrm,
+ VGATHERQPDYrm,
+ VGATHERQPSYrm,
+ VPGATHERDDYrm,
+ VPGATHERDQYrm,
+ VPGATHERQDYrm,
+ VPGATHERQQYrm,
+ VGATHERDPDYrm)>;
+
+def SKLWriteResGroup197 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 23;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup197], (instregex "VSQRTSDm")>;
+
+def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 19;
+ let ResourceCycles = [2,1,4,1,1,4,6];
+}
+def: InstRW<[SKLWriteResGroup198], (instregex "CMPXCHG16B")>;
+
+def SKLWriteResGroup199 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 24;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup199], (instregex "VSQRTPDm")>;
+
+def SKLWriteResGroup200 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def: InstRW<[SKLWriteResGroup200], (instregex "PCMPESTRIrm")>;
+def: InstRW<[SKLWriteResGroup200], (instregex "VPCMPESTRIrm")>;
+
+def SKLWriteResGroup201 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup201], (instregex "SQRTSDm")>;
+def: InstRW<[SKLWriteResGroup201], (instregex "VSQRTPDYm")>;
+
+def SKLWriteResGroup202 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI16m")>;
+def: InstRW<[SKLWriteResGroup202], (instregex "DIV_FI32m")>;
+
+def SKLWriteResGroup203 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015,SKLPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+def: InstRW<[SKLWriteResGroup203], (instregex "PCMPESTRM128rm")>;
+def: InstRW<[SKLWriteResGroup203], (instregex "VPCMPESTRM128rm")>;
+
+def SKLWriteResGroup204 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,1,1];
+}
+def: InstRW<[SKLWriteResGroup204], (instregex "AESKEYGENASSIST128rm")>;
+def: InstRW<[SKLWriteResGroup204], (instregex "VAESKEYGENASSIST128rm")>;
+
+def SKLWriteResGroup205 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 26;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup205], (instregex "SQRTPDm")>;
+
+def SKLWriteResGroup206 : SchedWriteRes<[SKLPort0,SKLPort23]> {
+ let Latency = 27;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F32m")>;
+def: InstRW<[SKLWriteResGroup206], (instregex "DIVR_F64m")>;
+
+def SKLWriteResGroup207 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23,SKLPort0156]> {
+ let Latency = 28;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,4,1,1];
+}
+def: InstRW<[SKLWriteResGroup207], (instregex "IDIV(16|32|64)m")>;
+def: InstRW<[SKLWriteResGroup207], (instregex "IDIV8m")>;
+
+def SKLWriteResGroup208 : SchedWriteRes<[SKLPort0,SKLPort5,SKLPort23]> {
+ let Latency = 30;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI16m")>;
+def: InstRW<[SKLWriteResGroup208], (instregex "DIVR_FI32m")>;
+
+def SKLWriteResGroup209 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort06,SKLPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,3,4,10];
+}
+def: InstRW<[SKLWriteResGroup209], (instregex "IN(16|32)ri")>;
+def: InstRW<[SKLWriteResGroup209], (instregex "IN(16|32)rr")>;
+def: InstRW<[SKLWriteResGroup209], (instregex "IN8ri")>;
+def: InstRW<[SKLWriteResGroup209], (instregex "IN8rr")>;
+
+def SKLWriteResGroup210 : SchedWriteRes<[SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,2,1,4,10];
+}
+def: InstRW<[SKLWriteResGroup210], (instregex "OUT(16|32)ir")>;
+def: InstRW<[SKLWriteResGroup210], (instregex "OUT(16|32)rr")>;
+def: InstRW<[SKLWriteResGroup210], (instregex "OUT8ir")>;
+def: InstRW<[SKLWriteResGroup210], (instregex "OUT8rr")>;
+
+def SKLWriteResGroup211 : SchedWriteRes<[SKLPort1,SKLPort6,SKLPort23,SKLPort0156]> {
+ let Latency = 37;
+ let NumMicroOps = 31;
+ let ResourceCycles = [1,8,1,21];
+}
+def: InstRW<[SKLWriteResGroup211], (instregex "XRSTOR(64)?")>;
+
+def SKLWriteResGroup212 : SchedWriteRes<[SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort23,SKLPort237,SKLPort15,SKLPort0156]> {
+ let Latency = 40;
+ let NumMicroOps = 18;
+ let ResourceCycles = [1,1,2,3,1,1,1,8];
+}
+def: InstRW<[SKLWriteResGroup212], (instregex "VMCLEARm")>;
+
+def SKLWriteResGroup213 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 41;
+ let NumMicroOps = 39;
+ let ResourceCycles = [1,10,1,1,26];
+}
+def: InstRW<[SKLWriteResGroup213], (instregex "XSAVE64")>;
+
+def SKLWriteResGroup214 : SchedWriteRes<[SKLPort5,SKLPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 22;
+ let ResourceCycles = [2,20];
+}
+def: InstRW<[SKLWriteResGroup214], (instregex "RDTSCP")>;
+
+def SKLWriteResGroup215 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 40;
+ let ResourceCycles = [1,11,1,1,26];
+}
+def: InstRW<[SKLWriteResGroup215], (instregex "^XSAVE$", "XSAVEC", "XSAVES")>;
+
+def SKLWriteResGroup216 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,SKLPort0156]> {
+ let Latency = 46;
+ let NumMicroOps = 44;
+ let ResourceCycles = [1,11,1,1,30];
+}
+def: InstRW<[SKLWriteResGroup216], (instregex "XSAVEOPT")>;
+
+def SKLWriteResGroup217 : SchedWriteRes<[SKLPort0,SKLPort23,SKLPort05,SKLPort06,SKLPort0156]> {
+ let Latency = 62;
+ let NumMicroOps = 64;
+ let ResourceCycles = [2,8,5,10,39];
+}
+def: InstRW<[SKLWriteResGroup217], (instregex "FLDENVm")>;
+
+def SKLWriteResGroup218 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 88;
+ let ResourceCycles = [4,4,31,1,2,1,45];
+}
+def: InstRW<[SKLWriteResGroup218], (instregex "FXRSTOR64")>;
+
+def SKLWriteResGroup219 : SchedWriteRes<[SKLPort0,SKLPort6,SKLPort23,SKLPort05,SKLPort06,SKLPort15,SKLPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 90;
+ let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[SKLWriteResGroup219], (instregex "FXRSTOR")>;
+
+def SKLWriteResGroup220 : SchedWriteRes<[SKLPort5,SKLPort05,SKLPort0156]> {
+ let Latency = 75;
+ let NumMicroOps = 15;
+ let ResourceCycles = [6,3,6];
+}
+def: InstRW<[SKLWriteResGroup220], (instregex "FNINIT")>;
+
+def SKLWriteResGroup221 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort05,SKLPort0156]> {
+ let Latency = 76;
+ let NumMicroOps = 32;
+ let ResourceCycles = [7,2,8,3,1,11];
+}
+def: InstRW<[SKLWriteResGroup221], (instregex "DIV(16|32|64)r")>;
+
+def SKLWriteResGroup222 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort5,SKLPort6,SKLPort06,SKLPort0156]> {
+ let Latency = 102;
+ let NumMicroOps = 66;
+ let ResourceCycles = [4,2,4,8,14,34];
+}
+def: InstRW<[SKLWriteResGroup222], (instregex "IDIV(16|32|64)r")>;
+
+def SKLWriteResGroup223 : SchedWriteRes<[SKLPort0,SKLPort1,SKLPort4,SKLPort5,SKLPort6,SKLPort237,SKLPort06,SKLPort0156]> {
+ let Latency = 106;
+ let NumMicroOps = 100;
+ let ResourceCycles = [9,1,11,16,1,11,21,30];
+}
+def: InstRW<[SKLWriteResGroup223], (instregex "FSTENVm")>;
+
+} // SchedModel
diff --git a/lib/Target/X86/X86SchedSkylakeServer.td b/lib/Target/X86/X86SchedSkylakeServer.td
new file mode 100755
index 000000000000..439a2ffa36a4
--- /dev/null
+++ b/lib/Target/X86/X86SchedSkylakeServer.td
@@ -0,0 +1,6500 @@
+//=- X86SchedSkylakeServer.td - X86 Skylake Server Scheduling -*- tablegen -*-=//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Skylake Server to support
+// instruction scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def SkylakeServerModel : SchedMachineModel {
+ // All x86 instructions are modeled as a single micro-op, and Skylake can
+ // decode 6 instructions per cycle.
+ let IssueWidth = 6;
+ let MicroOpBufferSize = 224; // Based on the reorder buffer.
+ let LoadLatency = 5;
+ let MispredictPenalty = 14;
+
+ // Based on the LSD (loop-stream detector) queue size and benchmarking data.
+ let LoopMicroOpBufferSize = 50;
+
+ // This flag is set to allow the scheduler to assign a default model to
+ // unrecognized opcodes.
+ let CompleteModel = 0;
+}
+
+let SchedModel = SkylakeServerModel in {
+
+// Skylake Server can issue micro-ops to 8 different ports in one cycle.
+
+// Ports 0, 1, 5, and 6 handle all computation.
+// Port 4 gets the data half of stores. Store data can be available later than
+// the store address, but since we don't model the latency of stores, we can
+// ignore that.
+// Ports 2 and 3 are identical. They handle loads and the address half of
+// stores. Port 7 can handle address calculations.
+def SKXPort0 : ProcResource<1>;
+def SKXPort1 : ProcResource<1>;
+def SKXPort2 : ProcResource<1>;
+def SKXPort3 : ProcResource<1>;
+def SKXPort4 : ProcResource<1>;
+def SKXPort5 : ProcResource<1>;
+def SKXPort6 : ProcResource<1>;
+def SKXPort7 : ProcResource<1>;
+
+// Many micro-ops are capable of issuing on multiple ports.
+def SKXPort01 : ProcResGroup<[SKXPort0, SKXPort1]>;
+def SKXPort23 : ProcResGroup<[SKXPort2, SKXPort3]>;
+def SKXPort237 : ProcResGroup<[SKXPort2, SKXPort3, SKXPort7]>;
+def SKXPort04 : ProcResGroup<[SKXPort0, SKXPort4]>;
+def SKXPort05 : ProcResGroup<[SKXPort0, SKXPort5]>;
+def SKXPort06 : ProcResGroup<[SKXPort0, SKXPort6]>;
+def SKXPort15 : ProcResGroup<[SKXPort1, SKXPort5]>;
+def SKXPort16 : ProcResGroup<[SKXPort1, SKXPort6]>;
+def SKXPort56 : ProcResGroup<[SKXPort5, SKXPort6]>;
+def SKXPort015 : ProcResGroup<[SKXPort0, SKXPort1, SKXPort5]>;
+def SKXPort056 : ProcResGroup<[SKXPort0, SKXPort5, SKXPort6]>;
+def SKXPort0156: ProcResGroup<[SKXPort0, SKXPort1, SKXPort5, SKXPort6]>;
+
+// 60 Entry Unified Scheduler
+def SKXPortAny : ProcResGroup<[SKXPort0, SKXPort1, SKXPort2, SKXPort3, SKXPort4,
+ SKXPort5, SKXPort6, SKXPort7]> {
+ let BufferSize=60;
+}
+
+// Loads are 5 cycles, so ReadAfterLd registers needn't be available until 5
+// cycles after the memory operand.
+def : ReadAdvance<ReadAfterLd, 5>;
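+// As an illustrative sketch (an assumption about typical usage, not an extra
+// rule): in a load-op such as ADD32rm, the register source read by the ALU
+// micro-op is tagged ReadAfterLd, so the scheduler only requires it 5 cycles
+// after the address operands, hiding part of the load latency when that
+// register comes from a still-in-flight producer.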
+
+// Many SchedWrites are defined in pairs with and without a folded load.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as two micro-ops when queued in the reservation station.
+// This multiclass defines the resource usage for variants with and without
+// folded loads.
+multiclass SKXWriteResPair<X86FoldableSchedWrite SchedRW,
+ ProcResourceKind ExePort,
+ int Lat> {
+ // Register variant is using a single cycle on ExePort.
+ def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
+
+ // Memory variant also uses a cycle on port 2/3 and adds 5 cycles to the
+ // latency.
+ def : WriteRes<SchedRW.Folded, [SKXPort23, ExePort]> {
+ let Latency = !add(Lat, 5);
+ }
+}
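+// As a rough illustration of what the multiclass above generates, the pair
+// `defm : SKXWriteResPair<WriteALU, SKXPort0156, 1>` defined below expands to
+// approximately:
+//   def : WriteRes<WriteALU,   [SKXPort0156]>            { let Latency = 1; }
+//   def : WriteRes<WriteALULd, [SKXPort23, SKXPort0156]> { let Latency = 6; }
+// i.e. the folded-load form adds one port 2/3 cycle and 5 cycles of latency.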
+
+// A folded store needs a cycle on port 4 for the store data, but it does not
+// need an extra port 2/3 cycle to recompute the address.
+def : WriteRes<WriteRMW, [SKXPort4]>;
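+// Sketch of the intent (assuming the usual read-modify-write breakdown): for
+// an instruction like ADD32mr, the load micro-op already pays a port 2/3
+// cycle for the address, and the store reuses that address, so the WriteRMW
+// component above only charges the store-data cycle on port 4.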
+
+// Arithmetic.
+defm : SKXWriteResPair<WriteALU, SKXPort0156, 1>; // Simple integer ALU op.
+defm : SKXWriteResPair<WriteIMul, SKXPort1, 3>; // Integer multiplication.
+def : WriteRes<WriteIMulH, []> { let Latency = 3; } // Integer multiplication, high part.
+def SKXDivider : ProcResource<1>; // Integer division issued on port 0.
+def : WriteRes<WriteIDiv, [SKXPort0, SKXDivider]> { // Integer division.
+ let Latency = 25;
+ let ResourceCycles = [1, 10];
+}
+def : WriteRes<WriteIDivLd, [SKXPort23, SKXPort0, SKXDivider]> {
+ let Latency = 29;
+ let ResourceCycles = [1, 1, 10];
+}
+
+def : WriteRes<WriteLEA, [SKXPort15]>; // LEA instructions can't fold loads.
+
+// Integer shifts and rotates.
+defm : SKXWriteResPair<WriteShift, SKXPort06, 1>;
+
+// Loads, stores, and moves, not folded with other operations.
+def : WriteRes<WriteLoad, [SKXPort23]> { let Latency = 5; }
+def : WriteRes<WriteStore, [SKXPort237, SKXPort4]>;
+def : WriteRes<WriteMove, [SKXPort0156]>;
+
+// Idioms that clear a register, like xorps %xmm0, %xmm0.
+// These can often bypass execution ports completely.
+def : WriteRes<WriteZero, []>;
+
+// Branches don't produce values, so they have no latency, but they still
+// consume resources. Indirect branches can fold loads.
+defm : SKXWriteResPair<WriteJump, SKXPort06, 1>;
+
+// Floating point. This covers both scalar and vector operations.
+defm : SKXWriteResPair<WriteFAdd, SKXPort1, 3>; // Floating point add/sub/compare.
+defm : SKXWriteResPair<WriteFMul, SKXPort0, 5>; // Floating point multiplication.
+defm : SKXWriteResPair<WriteFDiv,  SKXPort0, 12>; // Floating point division (10-14 cycles).
+defm : SKXWriteResPair<WriteFSqrt, SKXPort0, 15>; // Floating point square root.
+defm : SKXWriteResPair<WriteFRcp, SKXPort0, 5>; // Floating point reciprocal estimate.
+defm : SKXWriteResPair<WriteFRsqrt, SKXPort0, 5>; // Floating point reciprocal square root estimate.
+defm : SKXWriteResPair<WriteFMA, SKXPort015, 4>; // Fused Multiply Add.
+defm : SKXWriteResPair<WriteFShuffle, SKXPort5, 1>; // Floating point vector shuffles.
+defm : SKXWriteResPair<WriteFBlend, SKXPort015, 1>; // Floating point vector blends.
+def : WriteRes<WriteFVarBlend, [SKXPort5]> { // Fp vector variable blends.
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteFVarBlendLd, [SKXPort5, SKXPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+// FMA Scheduling helper class.
+// class FMASC { X86FoldableSchedWrite Sched = WriteFAdd; }
+
+// Vector integer operations.
+defm : SKXWriteResPair<WriteVecALU, SKXPort15, 1>; // Vector integer ALU op, no logicals.
+defm : SKXWriteResPair<WriteVecShift, SKXPort0, 1>; // Vector integer shifts.
+defm : SKXWriteResPair<WriteVecIMul, SKXPort0, 5>; // Vector integer multiply.
+defm : SKXWriteResPair<WriteShuffle, SKXPort5, 1>; // Vector shuffles.
+defm : SKXWriteResPair<WriteBlend, SKXPort15, 1>; // Vector blends.
+
+def : WriteRes<WriteVarBlend, [SKXPort5]> { // Vector variable blends.
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteVarBlendLd, [SKXPort5, SKXPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1];
+}
+
+def : WriteRes<WriteMPSAD, [SKXPort0, SKXPort5]> { // Vector MPSAD.
+ let Latency = 6;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteMPSADLd, [SKXPort23, SKXPort0, SKXPort5]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 1, 2];
+}
+
+// Vector bitwise operations.
+// These are often used on both floating point and integer vectors.
+defm : SKXWriteResPair<WriteVecLogic, SKXPort015, 1>; // Vector and/or/xor.
+
+// Conversion between integer and float.
+defm : SKXWriteResPair<WriteCvtF2I, SKXPort1, 3>; // Float -> Integer.
+defm : SKXWriteResPair<WriteCvtI2F, SKXPort1, 4>; // Integer -> Float.
+defm : SKXWriteResPair<WriteCvtF2F, SKXPort1, 3>; // Float -> Float size conversion.
+
+// String instructions.
+// Packed Compare Implicit Length Strings, Return Mask
+def : WriteRes<WritePCmpIStrM, [SKXPort0]> {
+ let Latency = 10;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrMLd, [SKXPort0, SKXPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 1];
+}
+// Packed Compare Explicit Length Strings, Return Mask
+def : WriteRes<WritePCmpEStrM, [SKXPort0, SKXPort16, SKXPort5]> {
+ let Latency = 10;
+ let ResourceCycles = [3, 2, 4];
+}
+def : WriteRes<WritePCmpEStrMLd, [SKXPort05, SKXPort16, SKXPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [6, 2, 1];
+}
+// Packed Compare Implicit Length Strings, Return Index
+def : WriteRes<WritePCmpIStrI, [SKXPort0]> {
+ let Latency = 11;
+ let ResourceCycles = [3];
+}
+def : WriteRes<WritePCmpIStrILd, [SKXPort0, SKXPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 1];
+}
+// Packed Compare Explicit Length Strings, Return Index
+def : WriteRes<WritePCmpEStrI, [SKXPort05, SKXPort16]> {
+ let Latency = 11;
+ let ResourceCycles = [6, 2];
+}
+def : WriteRes<WritePCmpEStrILd, [SKXPort0, SKXPort16, SKXPort5, SKXPort23]> {
+ let Latency = 11;
+ let ResourceCycles = [3, 2, 2, 1];
+}
+
+// AES instructions.
+def : WriteRes<WriteAESDecEnc, [SKXPort5]> { // Decryption, encryption.
+ let Latency = 7;
+ let ResourceCycles = [1];
+}
+def : WriteRes<WriteAESDecEncLd, [SKXPort5, SKXPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+def : WriteRes<WriteAESIMC, [SKXPort5]> { // InvMixColumn.
+ let Latency = 14;
+ let ResourceCycles = [2];
+}
+def : WriteRes<WriteAESIMCLd, [SKXPort5, SKXPort23]> {
+ let Latency = 14;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteAESKeyGen, [SKXPort0, SKXPort5]> { // Key Generation.
+ let Latency = 10;
+ let ResourceCycles = [2, 8];
+}
+def : WriteRes<WriteAESKeyGenLd, [SKXPort0, SKXPort5, SKXPort23]> {
+ let Latency = 10;
+ let ResourceCycles = [2, 7, 1];
+}
+
+// Carry-less multiplication instructions.
+def : WriteRes<WriteCLMul, [SKXPort0, SKXPort5]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteCLMulLd, [SKXPort0, SKXPort5, SKXPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 1, 1];
+}
+
+// Catch-all for expensive system instructions.
+def : WriteRes<WriteSystem, [SKXPort0156]> { let Latency = 100; } // def WriteSystem : SchedWrite;
+
+// AVX2.
+defm : SKXWriteResPair<WriteFShuffle256, SKXPort5, 3>; // Fp 256-bit width vector shuffles.
+defm : SKXWriteResPair<WriteShuffle256, SKXPort5, 3>; // 256-bit width vector shuffles.
+def : WriteRes<WriteVarVecShift, [SKXPort0, SKXPort5]> { // Variable vector shifts.
+ let Latency = 2;
+ let ResourceCycles = [2, 1];
+}
+def : WriteRes<WriteVarVecShiftLd, [SKXPort0, SKXPort5, SKXPort23]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 1, 1];
+}
+
+// Old microcoded instructions that nobody uses.
+def : WriteRes<WriteMicrocoded, [SKXPort0156]> { let Latency = 100; } // def WriteMicrocoded : SchedWrite;
+
+// Fence instructions.
+def : WriteRes<WriteFence, [SKXPort23, SKXPort4]>;
+
+// Nop, not very useful except that it provides a model for nops!
+def : WriteRes<WriteNop, []>;
+
+////////////////////////////////////////////////////////////////////////////////
+// Horizontal add/sub instructions.
+////////////////////////////////////////////////////////////////////////////////
+// HADD, HSUB PS/PD
+// x,x / v,v,v.
+def : WriteRes<WriteFHAdd, [SKXPort1]> {
+ let Latency = 3;
+}
+
+// x,m / v,v,m.
+def : WriteRes<WriteFHAddLd, [SKXPort1, SKXPort23]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 1];
+}
+
+// PHADD|PHSUB (S) W/D.
+// v <- v,v.
+def : WriteRes<WritePHAdd, [SKXPort15]>;
+
+// v <- v,m.
+def : WriteRes<WritePHAddLd, [SKXPort15, SKXPort23]> {
+ let Latency = 5;
+ let ResourceCycles = [1, 1];
+}
+
+// Remaining instrs.
+
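+// Each SKXWriteResGroupN below is a SchedWriteRes describing a single issue
+// pattern (ports, Latency, NumMicroOps, ResourceCycles); the InstRW entries
+// that follow bind concrete opcodes to that pattern by regex, with the
+// (b?)(k?)(z?) alternations also covering the broadcast/masked/zeroing
+// AVX-512 name suffixes.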
+def SKXWriteResGroup1 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup1], (instregex "KANDBrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KANDDrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KANDNBrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KANDNDrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KANDNQrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KANDNWrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KANDQrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KANDWrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KMOVBkk")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KMOVDkk")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KMOVQkk")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KMOVWkk")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KNOTBrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KNOTDrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KNOTQrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KNOTWrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KORBrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KORDrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KORQrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KORWrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KXNORBrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KXNORDrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KXNORQrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KXNORWrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KXORBrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KXORDrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KXORQrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "KXORWrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDSBirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDSWirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDUSBirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PADDUSWirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PAVGBirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PAVGWirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQBirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQDirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPEQWirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTBirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTDirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PCMPGTWirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMAXSWirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMAXUBirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMINSWirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PMINUBirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLDri")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLDrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLQri")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLQrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLWri")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSLLWrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRADri")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRADrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRAWri")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRAWrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLDri")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLDrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLQri")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLQrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLWri")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSRLWrr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBSBirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBSWirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBUSBirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "MMX_PSUBUSWirr")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVB2MZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVD2MZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVQ2MZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup1], (instregex "VPMOVW2MZrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup2 : SchedWriteRes<[SKXPort1]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup2], (instregex "MMX_MASKMOVQ64")>;
+
+def SKXWriteResGroup3 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup3], (instregex "COMP_FST0r")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "COM_FST0r")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "INSERTPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "KMOVBkr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "KMOVDkr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "KMOVQkr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "KMOVWkr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_MOVD64rr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_MOVD64to64rr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PALIGNR64irr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PSHUFBrr64")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PSHUFWri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHBWirr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHDQirr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKHWDirr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLBWirr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLDQirr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MMX_PUNPCKLWDirr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOV64toPQIrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVDDUPrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVDI2PDIrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVHLPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVLHPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVSDrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVSHDUPrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVSLDUPrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPDrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "MOVUPSrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PACKSSDWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PACKSSWBrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PACKUSDWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PACKUSWBrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PALIGNRrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PBLENDWrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXBWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXWDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVSXWQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXBWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXWDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PMOVZXWQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFBrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFDri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFHWri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PSHUFLWri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PSLLDQri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PSRLDQri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHBWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHQDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKHWDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLBWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLQDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "PUNPCKLWDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "SHUFPDrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "SHUFPSrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "UCOM_FPr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "UCOM_Fr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKHPDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKHPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKLPDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "UNPCKLPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VBROADCASTI32X2Z128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VBROADCASTSSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VINSERTPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VINSERTPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOV64toPQIZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOV64toPQIrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDDUPrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDI2PDIZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVDI2PDIrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVHLPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVHLPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVLHPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVLHPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSDrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSHDUPrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSLDUPrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVSSZrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPDrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VMOVUPSrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSDWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKSSWBrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSDWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPACKUSWBrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRYrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPALIGNRrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPBLENDWYrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPBLENDWrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPBROADCASTDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPBROADCASTQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDYri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSYri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPERMILPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXBWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXWDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVSXWQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXBWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXWDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPMOVZXWQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFBrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDYri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFDri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWYri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFHWri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWYri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSHUFLWri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQYri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSLLDQri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQYri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPSRLDQri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHBWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHQDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKHWDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLBWrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLQDQrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VPUNPCKLWDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDYrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPDrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSYrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VSHUFPSrri")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKHPSrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPDrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSYrr")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup3], (instregex "VUNPCKLPSrr")>;
+
+def SKXWriteResGroup4 : SchedWriteRes<[SKXPort6]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup4], (instregex "JMP(16|32|64)r")>;
+
+def SKXWriteResGroup5 : SchedWriteRes<[SKXPort01]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup5], (instregex "PABSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PABSDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PABSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PADDSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PADDSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PADDUSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PADDUSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PAVGBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PAVGWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQQrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PCMPEQWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PCMPGTWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMAXSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMAX(C?)SDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMAXSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMAXUWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMINSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMIN(C?)SDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMINSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMINUBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMINUDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PMINUWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNBrr128")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNDrr128")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSIGNWrr128")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSLLDri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSLLQri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSLLWri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSRADri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSRAWri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSRLDri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSRLQri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSRLWri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSUBSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSUBSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSUBUSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "PSUBUSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPABSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPADDUSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPAVGWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQDYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQQYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQQrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPEQWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTDYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPCMPGTWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAX(C?)SDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMAXUWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMIN(C?)SDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPMINUWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLDZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLQZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPROLVQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORDZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORQZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPRORVQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNBYrr256")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNBrr128")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNDYrr256")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNDrr128")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNWYrr256")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSIGNWrr128")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDYri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLDri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQYri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLQri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVQrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLVWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWYri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZ128ri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZ256ri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSLLWri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADYri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRADri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAQZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAVWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWYri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZ128ri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZ256ri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRAWri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDYri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLDri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQYri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZ128r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLQri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVDrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVQrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLVWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWYri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZ128ri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZ256ri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSRLWri")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBSWrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSBrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWYrr")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup5], (instregex "VPSUBUSWrr")>;
+
+def SKXWriteResGroup6 : SchedWriteRes<[SKXPort05]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup6], (instregex "FINCSTP")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "FNOP")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_MOVQ64rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSBrr64")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSDrr64")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PABSWrr64")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDBirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDDirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDQirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PADDWirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PANDNirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PANDirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PORirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNBrr64")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNDrr64")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSIGNWrr64")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBBirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBDirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBQirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PSUBWirr")>;
+def: InstRW<[SKXWriteResGroup6], (instregex "MMX_PXORirr")>;
+
+def SKXWriteResGroup7 : SchedWriteRes<[SKXPort06]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "ADC(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "ADC8rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "ADCX(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "ADOX(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)ri8")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "BT(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "BTC(16|32|64)ri8")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "BTC(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "BTR(16|32|64)ri8")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "BTR(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "BTS(16|32|64)ri8")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "BTS(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CDQ")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CLAC")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVAE(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVB(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVE(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVG(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVGE(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVL(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVLE(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNE(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNO(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNP(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVNS(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVO(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVP(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CMOVS(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "CQO")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JAE_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JAE_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JA_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JA_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JBE_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JBE_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JB_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JB_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JE_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JE_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JGE_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JGE_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JG_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JG_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JLE_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JLE_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JL_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JL_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JMP_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JMP_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JNE_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JNE_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JNO_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JNO_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JNP_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JNP_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JNS_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JNS_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JO_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JO_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JP_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JP_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JS_1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "JS_4")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "RORX(32|64)ri")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SAR(16|32|64)r1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SAR(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SAR8r1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SAR8ri")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SARX(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SBB(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SBB8rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETAEr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETBr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETEr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETGEr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETGr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETLEr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETLr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETNEr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETNOr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETNPr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETNSr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETOr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETPr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SETSr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SHL(16|32|64)r1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SHL(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SHL8r1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SHL8ri")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SHLX(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SHR(16|32|64)r1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SHR(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SHR8r1")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SHR8ri")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "SHRX(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup7], (instregex "STAC")>;
+
+def SKXWriteResGroup8 : SchedWriteRes<[SKXPort15]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup8], (instregex "ANDN(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup8], (instregex "BLSI(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup8], (instregex "BLSMSK(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup8], (instregex "BLSR(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup8], (instregex "BZHI(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup8], (instregex "LEA(16|32|64)(_32)?r")>;
+
+def SKXWriteResGroup9 : SchedWriteRes<[SKXPort015]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup9], (instregex "ANDNPDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "ANDNPSrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "ANDPDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "ANDPSrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "BLENDPDrri")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "BLENDPSrri")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPDrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVAPSrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQArr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVDQUrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVPQI2QIrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "MOVSSrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "ORPDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "ORPSrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PADDBrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PADDDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PADDQrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PADDWrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PANDNrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PANDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PORrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PSUBBrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PSUBDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PSUBQrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PSUBWrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "PXORrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDNPSrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDPDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VANDPSrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDMPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPDYrri")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPDrri")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPSYrri")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VBLENDPSrri")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZ256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDZrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPDrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVAPSrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Z256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA32Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Z256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQA64Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQAYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQArr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Z256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU16Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Z256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU32Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Z256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU64Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Z256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQU8Zrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUYrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVDQUrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVPQI(2Q|Lo2PQ)IZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVPQI2QIrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVSSrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZ256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPDZrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ128rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZ256rr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVUPSZrr(b?)(k?)(z?)(_REV)?")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VMOVZPQILo2PQIrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VORPDYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VORPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VORPDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VORPSYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VORPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VORPSrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDBrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDQrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPADDWrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDNrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPANDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDDYrri")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDDrri")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPBLENDMWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPORDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPORQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPORYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPORrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBBrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBQrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPSUBWrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPTERNLOGQZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPXORDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPXORQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPXORYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VPXORrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VXORPDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSYrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "VXORPSrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "XORPDrr")>;
+def: InstRW<[SKXWriteResGroup9], (instregex "XORPSrr")>;
+
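+// Simple 1-uop, 1-cycle GPR instructions that issue on any of ports 0, 1, 5
+// or 6: ALU ops, moves, sign/zero extensions, flag manipulation and NOP.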
+def SKXWriteResGroup10 : SchedWriteRes<[SKXPort0156]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "ADD(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "ADD8i8")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "ADD8ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "ADD8rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "AND(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "AND8i8")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "AND8ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "AND8rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CBW")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CLC")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CMC")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CMP(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CMP8i8")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CMP8ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CMP8rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "CWDE")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "DEC(16|32|64)r")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "DEC8r")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "INC(16|32|64)r")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "INC8r")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "LAHF")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOV(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOV8ri(_alt)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOV8rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr16")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr32")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOVSX(16|32|64)rr8")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOVZX(16|32|64)rr16")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "MOVZX(16|32|64)rr8")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "NEG(16|32|64)r")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "NEG8r")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "NOOP")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "NOT(16|32|64)r")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "NOT8r")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "OR(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "OR8i8")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "OR8ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "OR8rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SAHF")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SGDT64m")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SIDT64m")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SLDT64m")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SMSW16m")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "STC")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "STRm")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SUB(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SUB8i8")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SUB8ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SUB8rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "SYSCALL")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "TEST(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "TEST8i8")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "TEST8ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "TEST8rr")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "XCHG(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "XOR(16|32|64)rr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "XOR8i8")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "XOR8ri")>;
+def: InstRW<[SKXWriteResGroup10], (instregex "XOR8rr(_REV)?")>;
+
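+// Plain stores: one store-data uop on port 4 plus one store-address uop on
+// port 2, 3 or 7.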
+def SKXWriteResGroup11 : SchedWriteRes<[SKXPort4,SKXPort237]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup11], (instregex "FBSTPm")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "KMOVBmk")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "KMOVDmk")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "KMOVQmk")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "KMOVWmk")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVD64mr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVNTQmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MMX_MOVQ64mr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOV(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOV8mi")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOV8mr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVAPDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVAPSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVDQAmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVDQUmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVHPDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVHPSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVLPDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVLPSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTDQmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTI_64mr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTImr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTPDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVNTPSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVPDI2DImr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVPQI2QImr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVPQIto64mr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVSDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVSSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVUPDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "MOVUPSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP32m")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP64m")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "ST_FP80m")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF128mr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x4Z256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x4Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF32x8Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x2Z256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x2Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTF64x4Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI128mr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x4Z256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x4Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI32x8Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x2Z256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x2Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VEXTRACTI64x4Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDYmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSYmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVAPSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Z128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Z256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA32Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Z128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Z256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQA64Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQAYmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQAmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Z128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Z256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU16Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Z128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Z256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU32Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Z128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Z256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU64Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU8Z128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQU8Z256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQUYmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVDQUmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPSZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVHPSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPSZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVLPSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQYmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTDQmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDYmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSYmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVNTPSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPDI2DIZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPDI2DImr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQI(2QI|to64)Zmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQI2QImr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVPQIto64mr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSSZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVSSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDYmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPDmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSYmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMOVUPSmr")>;
+def: InstRW<[SKXWriteResGroup11], (instregex "VMPTRSTm")>;
+
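+// 1-uop, 2-cycle port-0 ops that move data or flags from the SIMD domain to
+// the integer domain: (U)COMIS* compares into EFLAGS, MOVMSK forms and
+// XMM/MMX-to-GPR moves.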
+def SKXWriteResGroup12 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 2;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup12], (instregex "COMISDrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "COMISSrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "MMX_MOVD64from64rr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "MMX_MOVD64grr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "MMX_PMOVMSKBrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "MOVMSKPDrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "MOVMSKPSrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "MOVPDI2DIrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "MOVPQIto64rr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "PMOVMSKBrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "UCOMISDrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "UCOMISSrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISDZrr(b?)")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISDrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISSZrr(b?)")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VCOMISSrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPDYrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPDrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPSYrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VMOVMSKPSrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPDI2DIZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPDI2DIrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPQIto64Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VMOVPQIto64rr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VPMOVMSKBYrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VPMOVMSKBrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPDYrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPDrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPSYrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VTESTPSrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISDZrr(b?)")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISDrr")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISSZrr(b?)")>;
+def: InstRW<[SKXWriteResGroup12], (instregex "VUCOMISSrr")>;
+
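+// Two port-5 uops, 2-cycle latency: GPR-to-vector element inserts (PINSR*)
+// and MOVQ2DQ.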
+def SKXWriteResGroup13 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup13], (instregex "MMX_MOVQ2DQrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "MMX_PINSRWirri")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "PINSRBrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "PINSRDrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "PINSRQrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "PINSRWrri")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRBrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRDrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRQrr")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup13], (instregex "VPINSRWrri")>;
+
+def SKXWriteResGroup14 : SchedWriteRes<[SKXPort05]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup14], (instregex "FDECSTP")>;
+def: InstRW<[SKXWriteResGroup14], (instregex "MMX_MOVDQ2Qrr")>;
+
+def SKXWriteResGroup15 : SchedWriteRes<[SKXPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup15], (instregex "CMOVA(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "CMOVBE(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROL(16|32|64)r1")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROL(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROL8r1")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROL8ri")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROR(16|32|64)r1")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROR(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROR8r1")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "ROR8ri")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "SETAr")>;
+def: InstRW<[SKXWriteResGroup15], (instregex "SETBEr")>;
+
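+// Variable blends (BLENDVPD/PS, PBLENDVB and their VEX forms): two uops on
+// ports 0, 1 or 5.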
+def SKXWriteResGroup16 : SchedWriteRes<[SKXPort015]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup16], (instregex "BLENDVPDrr0")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "BLENDVPSrr0")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "PBLENDVBrr0")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPDYrr")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPDrr")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPSYrr")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VBLENDVPSrr")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VPBLENDVBYrr")>;
+def: InstRW<[SKXWriteResGroup16], (instregex "VPBLENDVBrr")>;
+
+def SKXWriteResGroup17 : SchedWriteRes<[SKXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup17], (instregex "LFENCE")>;
+def: InstRW<[SKXWriteResGroup17], (instregex "WAIT")>;
+def: InstRW<[SKXWriteResGroup17], (instregex "XGETBV")>;
+
+def SKXWriteResGroup18 : SchedWriteRes<[SKXPort0,SKXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup18], (instregex "MMX_MASKMOVQ64")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVDQU")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPDYmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPDmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPSYmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VMASKMOVPSmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVDYmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVDmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVQYmr")>;
+def: InstRW<[SKXWriteResGroup18], (instregex "VPMASKMOVQmr")>;
+
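+// XMM shifts by a vector (register) shift count: one uop on port 5 plus one
+// on port 0 or 1.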
+def SKXWriteResGroup19 : SchedWriteRes<[SKXPort5,SKXPort01]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup19], (instregex "PSLLDrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSLLQrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSLLWrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSRADrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSRAWrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSRLDrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSRLQrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "PSRLWrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLDrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLQrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSLLWrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRADZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRADrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRAWrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLDrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLQrr")>;
+def: InstRW<[SKXWriteResGroup19], (instregex "VPSRLWrr")>;
+
+def SKXWriteResGroup20 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup20], (instregex "CLFLUSH")>;
+
+def SKXWriteResGroup21 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup21], (instregex "SFENCE")>;
+
+def SKXWriteResGroup22 : SchedWriteRes<[SKXPort06,SKXPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup22], (instregex "BEXTR(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup22], (instregex "BSWAP(16|32|64)r")>;
+
+def SKXWriteResGroup23 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup23], (instregex "ADC8i8")>;
+def: InstRW<[SKXWriteResGroup23], (instregex "ADC8ri")>;
+def: InstRW<[SKXWriteResGroup23], (instregex "CWD")>;
+def: InstRW<[SKXWriteResGroup23], (instregex "JRCXZ")>;
+def: InstRW<[SKXWriteResGroup23], (instregex "SBB8i8")>;
+def: InstRW<[SKXWriteResGroup23], (instregex "SBB8ri")>;
+
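+// Vector element extracts to memory (EXTRACTPS/PEXTR*) and STMXCSR: a port-5
+// extract uop plus the store-address and store-data uops.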
+def SKXWriteResGroup24 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup24], (instregex "EXTRACTPSmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRBmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRDmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRQmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "PEXTRWmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "STMXCSR")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VEXTRACTPSZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VEXTRACTPSmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRBmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRDmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRQZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRQmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRWZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VPEXTRWmr")>;
+def: InstRW<[SKXWriteResGroup24], (instregex "VSTMXCSR")>;
+
+def SKXWriteResGroup25 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup25], (instregex "FNSTCW16m")>;
+
+def SKXWriteResGroup26 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup26], (instregex "SETAEm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETBm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETEm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETGEm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETGm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETLEm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETLm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETNEm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETNOm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETNPm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETNSm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETOm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETPm")>;
+def: InstRW<[SKXWriteResGroup26], (instregex "SETSm")>;
+
+def SKXWriteResGroup27 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup27], (instregex "MOVBE(16|32|64)mr")>;
+
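+// Register pushes, PUSH imm8 and STOS: a store (ports 4 and 2/3/7) plus one
+// uop on any ALU port.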
+def SKXWriteResGroup28 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> {
+ let Latency = 2;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup28], (instregex "PUSH(16|32|64)r(mr)?")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "PUSH64i8")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "STOSB")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "STOSL")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "STOSQ")>;
+def: InstRW<[SKXWriteResGroup28], (instregex "STOSW")>;
+
+def SKXWriteResGroup29 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort15]> {
+ let Latency = 2;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,2,1];
+}
+def: InstRW<[SKXWriteResGroup29], (instregex "VMOVDQU8Zmr(b?)(k?)(z?)")>;
+
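+// 1-uop, 3-cycle port-0 ops on AVX-512 mask registers: KADD, KMOV from mask
+// to GPR, KORTEST and KTEST.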
+def SKXWriteResGroup30 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup30], (instregex "KADDBrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KADDDrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KADDQrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KADDWrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KMOVBrk")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KMOVDrk")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KMOVQrk")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KMOVWrk")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTBrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTDrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTQrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KORTESTWrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KTESTBrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KTESTDrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KTESTQrr")>;
+def: InstRW<[SKXWriteResGroup30], (instregex "KTESTWrr")>;
+
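+// 1-uop, 3-cycle port-1 ops: integer multiplies, BSF/BSR, LZCNT/TZCNT,
+// POPCNT, PDEP/PEXT and immediate-count double shifts (SHLD/SHRD).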
+def SKXWriteResGroup31 : SchedWriteRes<[SKXPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup31], (instregex "BSF(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "BSR(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "IMUL64rr(i8)?")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "IMUL8r")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "LZCNT(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "MUL8r")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "PDEP(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "PEXT(32|64)rr")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "POPCNT(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "SHLD(16|32|64)rri8")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "SHRD(16|32|64)rri8")>;
+def: InstRW<[SKXWriteResGroup31], (instregex "TZCNT(16|32|64)rr")>;
+
+def SKXWriteResGroup31_16 : SchedWriteRes<[SKXPort1,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup31_16], (instregex "IMUL16rr(i8)?")>;
+
+def SKXWriteResGroup31_32 : SchedWriteRes<[SKXPort1]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup31_32], (instregex "IMUL32rr(i8)?")>;
+
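+// 1-uop, 3-cycle port-5 ops: x87 add/sub, mask-register shifts and unpacks,
+// broadcasts, cross-lane permutes/extracts/inserts, AVX-512 compares into
+// mask registers, packed sign/zero extensions and PSADBW.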
+def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FPrST0")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FST0r")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "ADD_FrST0")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLBri")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLDri")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLQri")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTLWri")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRBri")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRDri")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRQri")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KSHIFTRWri")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKBWrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKDQrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "KUNPCKWDrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "MMX_PSADBWirr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "PCMPGTQrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "PSADBWrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FPrST0")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FST0r")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "SUBR_FrST0")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FPrST0")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FST0r")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "SUB_FrST0")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VALIGNQZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTF32X2Z256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTF32X2Zr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTI32X2Z256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTI32X2Zr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSDZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VBROADCASTSSZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VCMPPSZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSDZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VCMPSSZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VDBPSADBWZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF128rr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x4Z256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x4Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF32x8Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x2Z256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x2Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTF64x4Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI128rr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x4Z256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x4Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI32x8Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x2Z256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x2Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VEXTRACTI64x4Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSSDrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VFPCLASSSSrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF128rr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x4Z256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x4Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF32x8Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x2Z256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x2Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTF64x4Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI128rr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x4Z256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x4Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI32x8Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x2Z256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x2Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VINSERTI64x4Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTBrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTDrZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTQrZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPBROADCASTWrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPBZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPEQWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTQrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPGTWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPQZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUBZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUQZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPUWZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPCMPWZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERM2F128rr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERM2I128rr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2D128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2D256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Drr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PD128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PD256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PDrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PS128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PS256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2PSrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Q128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Q256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMI2Qrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDYri")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQYri")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZ256r(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2D128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2D256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Drr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PD128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PD256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PDrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PS128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PS256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2PSrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Q128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Q256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPERMT2Qrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXSQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMAXUQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMINSQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMINUQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVQDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXBWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVSXWQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXBWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPMOVZXWQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWYrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPSADBWrr")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTMWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VPTESTNMWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF32X4Z256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF32X4Zrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF64X2Z256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFF64X2Zrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI32X4Z256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI32X4Zrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI64X2Z256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup32], (instregex "VSHUFI64X2Zrri(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup33 : SchedWriteRes<[SKXPort0,SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup33], (instregex "EXTRACTPSrr")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "MMX_PEXTRWirri")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRBrr")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRDrr")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRQrr")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRWri")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "PEXTRWrr_REV")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "PTESTrr")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VEXTRACTPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VEXTRACTPSrr")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRBrr")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRDrr")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRQrr")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWZrr(_REV)?")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWri")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPEXTRWrr_REV")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPTESTYrr")>;
+def: InstRW<[SKXWriteResGroup33], (instregex "VPTESTrr")>;
+
+def SKXWriteResGroup34 : SchedWriteRes<[SKXPort0,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup34], (instregex "FNSTSW16r")>;
+
+def SKXWriteResGroup35 : SchedWriteRes<[SKXPort06]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SKXWriteResGroup35], (instregex "ROL(16|32|64)rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "ROL8rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "ROR(16|32|64)rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "ROR8rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "SAR(16|32|64)rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "SAR8rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "SHL(16|32|64)rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "SHL8rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "SHR(16|32|64)rCL")>;
+def: InstRW<[SKXWriteResGroup35], (instregex "SHR8rCL")>;
+
+def SKXWriteResGroup36 : SchedWriteRes<[SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SKXWriteResGroup36], (instregex "XADD(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup36], (instregex "XADD8rr")>;
+def: InstRW<[SKXWriteResGroup36], (instregex "XCHG8rr")>;
+
+def SKXWriteResGroup37 : SchedWriteRes<[SKXPort0,SKXPort5]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PHADDSWrr64")>;
+def: InstRW<[SKXWriteResGroup37], (instregex "MMX_PHSUBSWrr64")>;
+
+def SKXWriteResGroup38 : SchedWriteRes<[SKXPort5,SKXPort01]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup38], (instregex "PHADDSWrr128")>;
+def: InstRW<[SKXWriteResGroup38], (instregex "PHSUBSWrr128")>;
+def: InstRW<[SKXWriteResGroup38], (instregex "VPHADDSWrr128")>;
+def: InstRW<[SKXWriteResGroup38], (instregex "VPHADDSWrr256")>;
+def: InstRW<[SKXWriteResGroup38], (instregex "VPHSUBSWrr128")>;
+def: InstRW<[SKXWriteResGroup38], (instregex "VPHSUBSWrr256")>;
+
+def SKXWriteResGroup39 : SchedWriteRes<[SKXPort5,SKXPort05]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHADDWrr64")>;
+def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHADDrr64")>;
+def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHSUBDrr64")>;
+def: InstRW<[SKXWriteResGroup39], (instregex "MMX_PHSUBWrr64")>;
+
+def SKXWriteResGroup40 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup40], (instregex "PHADDDrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "PHADDWrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "PHSUBDrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "PHSUBWrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDDYrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDDrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDWYrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "VPHADDWrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBDYrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBDrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBWYrr")>;
+def: InstRW<[SKXWriteResGroup40], (instregex "VPHSUBWrr")>;
+
+def SKXWriteResGroup41 : SchedWriteRes<[SKXPort5,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSDWirr")>;
+def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKSSWBirr")>;
+def: InstRW<[SKXWriteResGroup41], (instregex "MMX_PACKUSWBirr")>;
+
+def SKXWriteResGroup42 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup42], (instregex "CLD")>;
+
+def SKXWriteResGroup43 : SchedWriteRes<[SKXPort237,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup43], (instregex "MFENCE")>;
+
+def SKXWriteResGroup44 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup44], (instregex "RCL(16|32|64)r1")>;
+def: InstRW<[SKXWriteResGroup44], (instregex "RCL(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup44], (instregex "RCL8r1")>;
+def: InstRW<[SKXWriteResGroup44], (instregex "RCL8ri")>;
+def: InstRW<[SKXWriteResGroup44], (instregex "RCR(16|32|64)r1")>;
+def: InstRW<[SKXWriteResGroup44], (instregex "RCR(16|32|64)ri")>;
+def: InstRW<[SKXWriteResGroup44], (instregex "RCR8r1")>;
+def: InstRW<[SKXWriteResGroup44], (instregex "RCR8ri")>;
+
+def SKXWriteResGroup45 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237]> {
+ let Latency = 3;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup45], (instregex "FNSTSWm")>;
+
+def SKXWriteResGroup46 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SKXWriteResGroup46], (instregex "SETAm")>;
+def: InstRW<[SKXWriteResGroup46], (instregex "SETBEm")>;
+
+def SKXWriteResGroup47 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort237,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup47], (instregex "CALL(16|32|64)r")>;
+
+def SKXWriteResGroup48 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 3;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup48], (instregex "CALL64pcrel32")>;
+
+def SKXWriteResGroup49 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup49], (instregex "AESDECLASTrr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "AESDECrr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "AESENCLASTrr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "AESENCrr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMADDUBSWrr64")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMADDWDirr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHRSWrr64")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHUWirr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULHWirr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULLWirr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "MMX_PMULUDQirr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FPrST0")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FST0r")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "MUL_FrST0")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "RCPPSr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "RCPSSr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "RSQRTPSr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "RSQRTSSr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VAESDECLASTrr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VAESDECrr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VAESENCLASTrr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VAESENCrr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PDZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PDZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PSZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14PSZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14SDrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRCP14SSrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRCPPSYr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRCPPSr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRCPSSr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PDZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PDZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PSZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14PSZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14SDrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRT14SSrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTPSYr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTPSr")>;
+def: InstRW<[SKXWriteResGroup49], (instregex "VRSQRTSSr")>;
+
+def SKXWriteResGroup50 : SchedWriteRes<[SKXPort015]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup50], (instregex "ADDPDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "ADDPSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "ADDSDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "ADDSSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "ADDSUBPDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "ADDSUBPSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "CMPPDrri")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "CMPPSrri")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "CMPSDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "CMPSSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "CVTDQ2PSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "CVTPS2DQrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "CVTTPS2DQrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)PDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)PSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)SDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MAX(C?)SSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)PDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)PSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)SDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MIN(C?)SSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MULPDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MULPSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MULSDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "MULSSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "PHMINPOSUWrr128")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "PMADDUBSWrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "PMADDWDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "PMULDQrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "PMULHRSWrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "PMULHUWrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "PMULHWrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "PMULLWrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "PMULUDQrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "SUBPDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "SUBPSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "SUBSDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "SUBSSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDPDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDPSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDSDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDSSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPDYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPSYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VADDSUBPSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPDYrri")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPDrri")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPSYrri")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCMPPSrri")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCMPSDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCMPSSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTDQ2PSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2QQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPD2UQQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2DQrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTPS2UDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTQQ2PDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2QQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPD2UQQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2DQrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTTPS2UDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUDQ2PSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VCVTUQQ2PDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMPSZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMSDrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VFIXUPIMMSSrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50],
+ (instregex
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Yr",
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z128r(b?)(k?)(z?)",
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z256r(b?)(k?)(z?)",
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Zr(b?)(k?)(z?)",
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)r",
+ "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)Zr(b?)(_Int)?(k?)(z?)",
+ "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)r")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPDr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPPSr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPSDr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETEXPSSr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTPSZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTSDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VGETMANTSSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)PSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SDZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SSZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMAX(C?)SSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)PSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SDZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SSZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMIN(C?)SSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULPDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULPSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULSDZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULSDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULSSZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VMULSSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPHMINPOSUWrr128")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPLZCNTQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDUBSWrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMADDWDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULDQrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHRSWrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHUWrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULHWrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULLWrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VPMULUDQrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VRANGEPSZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VRANGESDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VRANGESSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCEPSZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCESDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VREDUCESSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFSDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSCALEFSSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSYrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBPSrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSDZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSDrr")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSSZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup50], (instregex "VSUBSSrr")>;
+
+def SKXWriteResGroup51 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup51], (instregex "MPSADBWrri")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VEXPANDPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VMPSADBWYrri")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VMPSADBWrri")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPEXPANDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVDWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVQWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSDWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSQWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVSWBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSDWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSQWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVUSWBZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup51], (instregex "VPMOVWBZrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup52 : SchedWriteRes<[SKXPort1,SKXPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup52], (instregex "IMUL(32|64)r")>;
+def: InstRW<[SKXWriteResGroup52], (instregex "MUL(32|64)r")>;
+def: InstRW<[SKXWriteResGroup52], (instregex "MULX64rr")>;
+
+def SKXWriteResGroup52_16 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+def: InstRW<[SKXWriteResGroup52_16], (instregex "IMUL16r")>;
+def: InstRW<[SKXWriteResGroup52_16], (instregex "MUL16r")>;
+
+def SKXWriteResGroup53 : SchedWriteRes<[SKXPort5,SKXPort01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDYrr")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQYrr")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWYrr")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSLLWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADYrr")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRADZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWYrr")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRAWZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDYrr")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQYrr")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWYrr")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup53], (instregex "VPSRLWZrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup54 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP16m")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP32m")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "ISTT_FP64m")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "IST_F16m")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "IST_F32m")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP16m")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP32m")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "IST_FP64m")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup54], (instregex "VPMOVQDZmr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup55 : SchedWriteRes<[SKXPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+def: InstRW<[SKXWriteResGroup55], (instregex "FNCLEX")>;
+
+def SKXWriteResGroup56 : SchedWriteRes<[SKXPort015,SKXPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKXWriteResGroup56], (instregex "VZEROUPPER")>;
+
+def SKXWriteResGroup57 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort0156]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SKXWriteResGroup57], (instregex "LAR(16|32|64)rr")>;
+
+def SKXWriteResGroup58 : SchedWriteRes<[SKXPort23]> {
+ let Latency = 5;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64from64rm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64rm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVD64to64rm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MMX_MOVQ64rm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOV(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOV64toPQIrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOV8rm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVDDUPrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVDI2PDIrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVQI2PQIrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVSDrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVSSrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm16")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm32")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVSX(16|32|64)rm8")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVZX(16|32|64)rm16")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "MOVZX(16|32|64)rm8")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHNTA")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT0")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT1")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "PREFETCHT2")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "VMOV64toPQIrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "VMOVDDUPrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "VMOVDI2PDIrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "VMOVQI2PQIrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "VMOVSDrm")>;
+def: InstRW<[SKXWriteResGroup58], (instregex "VMOVSSrm")>;
+
+def SKXWriteResGroup59 : SchedWriteRes<[SKXPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup59], (instregex "VCVTSD2SSZrr(b?)(_Int)?(k?)(z?)")>;
+
+def SKXWriteResGroup60 : SchedWriteRes<[SKXPort0,SKXPort5]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup60], (instregex "CVTDQ2PDrr")>;
+def: InstRW<[SKXWriteResGroup60], (instregex "MMX_CVTPI2PDirr")>;
+def: InstRW<[SKXWriteResGroup60], (instregex "VCVTDQ2PDrr")>;
+
+def SKXWriteResGroup61 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup61], (instregex "CVTPD2DQrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "CVTPD2PSrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "CVTPS2PDrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "CVTSD2SSrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI642SDrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI2SDrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "CVTSI2SSrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "CVTSS2SDrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "CVTTPD2DQrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTPD2PIirr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTPS2PIirr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTTPD2PIirr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "MMX_CVTTPS2PIirr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTDQ2PDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2DQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2DQrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2PSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2PSrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPD2UDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPH2PSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPH2PSrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PDrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PHZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2PHrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2QQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTPS2UQQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTQQ2PSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSD2SSrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI642SDrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SDrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI2SSrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSI642SDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSS2SDZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTSS2SDrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2DQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2DQrr")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPD2UDQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPS2QQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTTPS2UQQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUDQ2PDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUQQ2PSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI2SDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI2SSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup61], (instregex "VCVTUSI642SDZrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup62 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup62], (instregex "VPCONFLICTQZ128rr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup63 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup63], (instregex "STR(16|32|64)r")>;
+
+def SKXWriteResGroup64 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup64], (instregex "MULX32rr")>;
+
+def SKXWriteResGroup65 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort015]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup65], (instregex "VCVTPS2PHZmr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup66 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVDWZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVQWZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSDWZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSQWZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVSWBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSDWZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSQWZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVUSWBZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup66], (instregex "VPMOVWBZmr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup67 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[SKXWriteResGroup67], (instregex "XSETBV")>;
+
+def SKXWriteResGroup68 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 5;
+ let ResourceCycles = [2,3];
+}
+def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG(16|32|64)rr")>;
+def: InstRW<[SKXWriteResGroup68], (instregex "CMPXCHG8rr")>;
+
+def SKXWriteResGroup69 : SchedWriteRes<[SKXPort4,SKXPort237,SKXPort0156]> {
+ let Latency = 5;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,4];
+}
+def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF16")>;
+def: InstRW<[SKXWriteResGroup69], (instregex "PUSHF64")>;
+
+def SKXWriteResGroup70 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup70], (instregex "PCLMULQDQrr")>;
+def: InstRW<[SKXWriteResGroup70], (instregex "VPCLMULQDQrr")>;
+
+def SKXWriteResGroup71 : SchedWriteRes<[SKXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup71], (instregex "LDDQUrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "MOVAPDrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "MOVAPSrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "MOVDQArm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "MOVDQUrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "MOVNTDQArm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "MOVSHDUPrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "MOVSLDUPrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "MOVUPDrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "MOVUPSrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VBROADCASTSSrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VLDDQUrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VMOVAPDrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VMOVAPSrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VMOVDQArm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VMOVDQUrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VMOVNTDQArm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VMOVSHDUPrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VMOVSLDUPrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VMOVUPDrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VMOVUPSrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VPBROADCASTDrm")>;
+def: InstRW<[SKXWriteResGroup71], (instregex "VPBROADCASTQrm")>;
+
+def SKXWriteResGroup72 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup72], (instregex "MMX_CVTPI2PSirr")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VCOMPRESSPSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VPCOMPRESSQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup72], (instregex "VPERMWZrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup73 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSBirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDSWirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDUSBirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PADDUSWirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PAVGBirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PAVGWirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQBirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQDirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPEQWirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTBirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTDirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PCMPGTWirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMAXSWirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMAXUBirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMINSWirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PMINUBirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLDrm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLQrm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSLLWrm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRADrm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRAWrm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLDrm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLQrm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSRLWrm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBSBirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBSWirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBUSBirm")>;
+def: InstRW<[SKXWriteResGroup73], (instregex "MMX_PSUBUSWirm")>;
+
+def SKXWriteResGroup74 : SchedWriteRes<[SKXPort0,SKXPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup74], (instregex "CVTSD2SI64rr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "CVTSD2SIrr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "CVTSS2SI64rr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "CVTSS2SIrr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "CVTTSD2SI64rr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "CVTTSD2SIrr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SI64Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SI64rr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SIZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2SIrr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2USI64Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSD2USIZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SI64Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SI64rr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SIZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2SIrr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTSS2USIZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SI64Zrr(b?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SI64rr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SIZrr(b?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2SIrr")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2USI64Zrr(b?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSD2USIZrr(b?)")>;
+def: InstRW<[SKXWriteResGroup74], (instregex "VCVTTSS2USIZrr(b?)")>;
+
+def SKXWriteResGroup75 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PALIGNR64irm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PINSRWirmi")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PSHUFBrm64")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PSHUFWmi")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHBWirm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHDQirm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKHWDirm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLBWirm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLDQirm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MMX_PUNPCKLWDirm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MOVHPDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MOVHPSrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MOVLPDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "MOVLPSrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PINSRBrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PINSRDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PINSRQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PINSRWrmi")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXBWrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXDQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXWDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVSXWQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXBWrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXDQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXWDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "PMOVZXWQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VMOVHPSrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VMOVLPSrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRBrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPINSRWrmi")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXBWrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXDQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXWDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVSXWQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXBWrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXDQrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXWDrm")>;
+def: InstRW<[SKXWriteResGroup75], (instregex "VPMOVZXWQrm")>;
+
+def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup76], (instregex "FARJMP64")>;
+def: InstRW<[SKXWriteResGroup76], (instregex "JMP(16|32|64)m")>;
+
+def SKXWriteResGroup77 : SchedWriteRes<[SKXPort23,SKXPort05]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSBrm64")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSDrm64")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PABSWrm64")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDBirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDDirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDQirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PADDWirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PANDNirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PANDirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PORirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNBrm64")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNDrm64")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSIGNWrm64")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBBirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBDirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBQirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PSUBWirm")>;
+def: InstRW<[SKXWriteResGroup77], (instregex "MMX_PXORirm")>;
+
+def SKXWriteResGroup78 : SchedWriteRes<[SKXPort23,SKXPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup78], (instregex "ADC(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "ADC8rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "ADCX(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "ADOX(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "BT(16|32|64)mi8")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVAE(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVB(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVE(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVG(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVGE(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVL(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVLE(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNE(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNO(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNP(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVNS(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVO(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVP(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "CMOVS(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "RORX(32|64)mi")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "SARX(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "SBB(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "SBB8rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "SHLX(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup78], (instregex "SHRX(32|64)rm")>;
+
+def SKXWriteResGroup79 : SchedWriteRes<[SKXPort23,SKXPort15]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup79], (instregex "ANDN(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup79], (instregex "BLSI(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup79], (instregex "BLSMSK(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup79], (instregex "BLSR(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup79], (instregex "BZHI(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup79], (instregex "MOVBE(16|32|64)rm")>;
+
+def SKXWriteResGroup80 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup80], (instregex "VMOV(64to|QI2)PQIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup80], (instregex "VMOVDI2PDIZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup81 : SchedWriteRes<[SKXPort23,SKXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup81], (instregex "ADD(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "ADD8rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "AND(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "AND8rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "CMP(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "CMP8mi")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "CMP8mr")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "CMP8rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "OR(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "OR8rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "POP(16|32|64)r(mr)?")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "SUB(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "SUB8rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "TEST(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "TEST8mi")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "TEST8mr")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "XOR(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup81], (instregex "XOR8rm")>;
+
+def SKXWriteResGroup82 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup82], (instregex "CVTSI642SSrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "HADDPDrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "HADDPSrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "HSUBPDrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "HSUBPSrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VCVTSI642SSrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VCVTSI642SSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VCVTUSI642SSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPDYrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPDrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPSYrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VHADDPSrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPDYrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPDrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPSYrr")>;
+def: InstRW<[SKXWriteResGroup82], (instregex "VHSUBPSrr")>;
+
+def SKXWriteResGroup83 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKXWriteResGroup83], (instregex "SHLD(16|32|64)rrCL")>;
+def: InstRW<[SKXWriteResGroup83], (instregex "SHRD(16|32|64)rrCL")>;
+
+def SKXWriteResGroup84 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort06,SKXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup84], (instregex "SLDT(16|32|64)r")>;
+
+def SKXWriteResGroup85 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup85], (instregex "VCVTPS2PHmr")>;
+
+def SKXWriteResGroup86 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup86], (instregex "BTC(16|32|64)mi8")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "BTR(16|32|64)mi8")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "BTS(16|32|64)mi8")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SAR(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SAR(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SAR8m1")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SAR8mi")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SHL(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SHL(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SHL8m1")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SHL8mi")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SHR(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SHR(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SHR8m1")>;
+def: InstRW<[SKXWriteResGroup86], (instregex "SHR8mi")>;
+
+def SKXWriteResGroup87 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup87], (instregex "ADD(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "ADD(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "ADD8mi")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "ADD8mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "AND(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "AND(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "AND8mi")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "AND8mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "DEC(16|32|64)m")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "DEC8m")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "INC(16|32|64)m")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "INC8m")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "NEG(16|32|64)m")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "NEG8m")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "NOT(16|32|64)m")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "NOT8m")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "OR(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "OR(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "OR8mi")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "OR8mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "POP(16|32|64)rmm")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "PUSH(16|32|64)rmm")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "SUB(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "SUB(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "SUB8mi")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "SUB8mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "XOR(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "XOR(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "XOR8mi")>;
+def: InstRW<[SKXWriteResGroup87], (instregex "XOR8mr")>;
+
+def SKXWriteResGroup88 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
+ let Latency = 6;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,5];
+}
+def: InstRW<[SKXWriteResGroup88], (instregex "STD")>;
+
+def SKXWriteResGroup89 : SchedWriteRes<[SKXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup89], (instregex "LD_F32m")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "LD_F64m")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "LD_F80m")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTF128")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTI128")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTSDYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VBROADCASTSSYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VLDDQUYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVAPDYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVAPSYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDDUPYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDQAYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVDQUYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVNTDQAYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVNTDQAZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVSHDUPYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVSLDUPYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVUPDYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VMOVUPSYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VPBROADCASTDYrm")>;
+def: InstRW<[SKXWriteResGroup89], (instregex "VPBROADCASTQYrm")>;
+
+def SKXWriteResGroup90 : SchedWriteRes<[SKXPort0,SKXPort5]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup90], (instregex "VCVTDQ2PDYrr")>;
+
+def SKXWriteResGroup91 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup91], (instregex "COMISDrm")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "COMISSrm")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "UCOMISDrm")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "UCOMISSrm")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISDrm")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "VCOMISSrm")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISDrm")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup91], (instregex "VUCOMISSrm")>;
+
+def SKXWriteResGroup92 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup92], (instregex "INSERTPSrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PACKSSDWrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PACKSSWBrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PACKUSDWrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PACKUSWBrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PALIGNRrmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PBLENDWrmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFBrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFDmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFHWmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PSHUFLWmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHBWrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHDQrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHQDQrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKHWDrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLBWrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLDQrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLQDQrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "PUNPCKLWDrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "SHUFPDrmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "SHUFPSrmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKHPDrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKHPSrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKLPDrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "UNPCKLPSrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VINSERTPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VINSERTPSrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VMOVSSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSDWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSDWrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSWBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPACKSSWBrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSDWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSDWrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSWBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPACKUSWBrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPALIGNRZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPALIGNRrmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPBLENDWrmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTBZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTBrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTWZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPBROADCASTWrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPDrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPERMILPSrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFBrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFDZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFDmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFHWZ128mi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFHWmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFLWZ128mi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPSHUFLWmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPSLLDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPSRLDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHBWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHBWrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHDQrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHQDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHQDQrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHWDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKHWDrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLBWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLBWrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLDQrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLQDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLQDQrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLWDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VPUNPCKLWDrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPDrmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPSZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VSHUFPSrmi")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPDrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKHPSrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPDrm")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup92], (instregex "VUNPCKLPSrm")>;
+
+def SKXWriteResGroup93 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTDQ2PDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQYrr")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2DQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSYrr")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2PSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2UDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPD2UDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSYrr")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPH2PSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDYrr")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHYrr")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2PHZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2QQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2QQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2UQQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTPS2UQQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTQQ2PSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTQQ2PSZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQYrr")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2DQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2UDQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPD2UDQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2QQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2QQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2UQQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTTPS2UQQZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUDQ2PDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUDQ2PDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUQQ2PSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup93], (instregex "VCVTUQQ2PSZrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup94 : SchedWriteRes<[SKXPort01,SKXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup94], (instregex "PABSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PABSDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PABSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PADDSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PADDSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PADDUSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PADDUSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PAVGBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PAVGWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQQrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PCMPEQWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PCMPGTWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMAXSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMAX(C?)SDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMAXSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMAXUWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMINSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMIN(C?)SDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMINSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMINUBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMINUDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PMINUWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNBrm128")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNDrm128")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSIGNWrm128")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSLLDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSLLQrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSLLWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSRADrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSRAWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSRLDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSRLQrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSRLWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSUBSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSUBSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSUBUSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "PSUBUSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPABSBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPABSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPABSDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPABSDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPABSQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPABSWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPABSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPADDSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPADDUSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPAVGWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQQrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPEQWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPCMPGTWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAX(C?)SDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAX(C?)SDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMAXUWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMIN(C?)SDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMIN(C?)SDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMINSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPMINUWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPROLDZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPROLQZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPROLVDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPROLVQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPRORDZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPRORQZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPRORVDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPRORVQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNBrm128")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNDrm128")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSIGNWrm128")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLQrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVQrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLVWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWZ128mi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSLLWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRADrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAQZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAVWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWZ128mi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRAWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQZ128m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLQrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVDrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVQrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLVWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWZ128mi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSRLWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBSWrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSBrm")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup94], (instregex "VPSUBUSWrm")>;
+
+def SKXWriteResGroup95 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup95], (instregex "ANDNPDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "ANDNPSrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "ANDPDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "ANDPSrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "BLENDPDrmi")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "BLENDPSrmi")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "ORPDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "ORPSrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PADDBrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PADDDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PADDQrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PADDWrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PANDNrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PANDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PORrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PSUBBrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PSUBDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PSUBQrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PSUBWrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "PXORrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VANDNPSrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VANDPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VANDPDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VANDPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VANDPSrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDMPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDPDrmi")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VBLENDPSrmi")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VBROADCASTI32X2Z128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VBROADCASTSSZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VINSERTF128rm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VINSERTI128rm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMASKMOVPDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMASKMOVPSrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVAPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVAPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDDUPZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQA32Z128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQA64Z128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU16Z128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU32Z128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU64Z128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVDQU8Z128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVNTDQAZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVSHDUPZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVSLDUPZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVUPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VMOVUPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VORPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VORPDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VORPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VORPSrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPADDBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPADDBrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPADDDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPADDDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPADDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPADDQrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPADDWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPADDWrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPANDDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPANDNrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPANDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPANDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDDrmi")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPBLENDMWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPBROADCASTDZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPBROADCASTQZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPMASKMOVDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPMASKMOVQrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPORDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPORQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPORrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBBrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBQrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPSUBWrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPTERNLOGDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPTERNLOGQZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPXORDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPXORQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VPXORrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VXORPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VXORPDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VXORPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "VXORPSrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "XORPDrm")>;
+def: InstRW<[SKXWriteResGroup95], (instregex "XORPSrm")>;
+
+def SKXWriteResGroup96 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSDWirm")>;
+def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKSSWBirm")>;
+def: InstRW<[SKXWriteResGroup96], (instregex "MMX_PACKUSWBirm")>;
+
+def SKXWriteResGroup97 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2W256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup97], (instregex "VPERMI2Wrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2W128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2W256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup97], (instregex "VPERMT2Wrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup98 : SchedWriteRes<[SKXPort23,SKXPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup98], (instregex "CMOVA(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup98], (instregex "CMOVBE(16|32|64)rm")>;
+
+def SKXWriteResGroup99 : SchedWriteRes<[SKXPort23,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup99], (instregex "LEAVE64")>;
+def: InstRW<[SKXWriteResGroup99], (instregex "SCASB")>;
+def: InstRW<[SKXWriteResGroup99], (instregex "SCASL")>;
+def: InstRW<[SKXWriteResGroup99], (instregex "SCASQ")>;
+def: InstRW<[SKXWriteResGroup99], (instregex "SCASW")>;
+
+def SKXWriteResGroup100 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup100], (instregex "CVTTSS2SI64rr")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "CVTTSS2SIrr")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTSS2USI64Zrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SI64Zrr(b?)")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SI64rr")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SIZrr(b?)")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2SIrr")>;
+def: InstRW<[SKXWriteResGroup100], (instregex "VCVTTSS2USI64Zrr(b?)")>;
+
+def SKXWriteResGroup101 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup101], (instregex "FLDCW16m")>;
+
+def SKXWriteResGroup102 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup102], (instregex "LDMXCSR")>;
+def: InstRW<[SKXWriteResGroup102], (instregex "VLDMXCSR")>;
+
+def SKXWriteResGroup103 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOVBkm")>;
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOVDkm")>;
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOVQkm")>;
+def: InstRW<[SKXWriteResGroup103], (instregex "KMOVWkm")>;
+
+def SKXWriteResGroup104 : SchedWriteRes<[SKXPort6,SKXPort23,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup104], (instregex "LRETQ")>;
+def: InstRW<[SKXWriteResGroup104], (instregex "RETQ")>;
+
+def SKXWriteResGroup105 : SchedWriteRes<[SKXPort23,SKXPort06,SKXPort15]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup105], (instregex "BEXTR(32|64)rm")>;
+
+def SKXWriteResGroup106 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237]> {
+ let Latency = 7;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VCOMPRESSPSZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSDZmr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZ128mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZ256mr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup106], (instregex "VPCOMPRESSQZmr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup107 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL8m1")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROL8mi")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROR(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROR(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROR8m1")>;
+def: InstRW<[SKXWriteResGroup107], (instregex "ROR8mi")>;
+
+def SKXWriteResGroup108 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup108], (instregex "XADD(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup108], (instregex "XADD8rm")>;
+
+def SKXWriteResGroup109 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m")>;
+def: InstRW<[SKXWriteResGroup109], (instregex "FARCALL64")>;
+
+def SKXWriteResGroup110 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,2,2,2];
+}
+def: InstRW<[SKXWriteResGroup110], (instrs VPSCATTERDQZ128mr,
+ VPSCATTERQQZ128mr,
+ VSCATTERDPDZ128mr,
+ VSCATTERQPDZ128mr)>;
+
+def SKXWriteResGroup111 : SchedWriteRes<[SKXPort6,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 7;
+ let ResourceCycles = [1,3,1,2];
+}
+def: InstRW<[SKXWriteResGroup111], (instregex "LOOP")>;
+
+def SKXWriteResGroup112 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 11;
+ let ResourceCycles = [1,4,4,2];
+}
+def: InstRW<[SKXWriteResGroup112], (instrs VPSCATTERDQZ256mr,
+ VPSCATTERQQZ256mr,
+ VSCATTERDPDZ256mr,
+ VSCATTERQPDZ256mr)>;
+
+def SKXWriteResGroup113 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 19;
+ let ResourceCycles = [1,8,8,2];
+}
+def: InstRW<[SKXWriteResGroup113], (instrs VPSCATTERDQZmr,
+ VPSCATTERQQZmr,
+ VSCATTERDPDZmr,
+ VSCATTERQPDZmr)>;
+
+def SKXWriteResGroup114 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+ let Latency = 7;
+ let NumMicroOps = 36;
+ let ResourceCycles = [1,16,1,16,2];
+}
+def: InstRW<[SKXWriteResGroup114], (instrs VSCATTERDPSZmr)>;
+
+def SKXWriteResGroup115 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup115], (instregex "AESIMCrr")>;
+def: InstRW<[SKXWriteResGroup115], (instregex "VAESIMCrr")>;
+
+def SKXWriteResGroup116 : SchedWriteRes<[SKXPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def: InstRW<[SKXWriteResGroup116], (instregex "PMULLDrr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDPDr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDPSr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDSDr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "ROUNDSSr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDYrr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VPMULLDrr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPDZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZ128rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZ256rri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALEPSZrri(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALESDr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VRNDSCALESSr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDPDr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDPSr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDSDr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDSSr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDYPDr")>;
+def: InstRW<[SKXWriteResGroup116], (instregex "VROUNDYPSr")>;
+
+def SKXWriteResGroup117 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup117], (instregex "VTESTPDrm")>;
+def: InstRW<[SKXWriteResGroup117], (instregex "VTESTPSrm")>;
+
+def SKXWriteResGroup118 : SchedWriteRes<[SKXPort1,SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup118], (instregex "BSF(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "BSR(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "IMUL64m")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "IMUL(32|64)rm(i8)?")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "IMUL8m")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "LZCNT(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "MUL(16|32|64)m")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "MUL8m")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "PDEP(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "PEXT(32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "POPCNT(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup118], (instregex "TZCNT(16|32|64)rm")>;
+
+def SKXWriteResGroup118_16_1 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup118_16_1], (instregex "IMUL16rm(i8)?")>;
+
+def SKXWriteResGroup118_16_2 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+def: InstRW<[SKXWriteResGroup118_16_2], (instregex "IMUL16m")>;
+def: InstRW<[SKXWriteResGroup118_16_2], (instregex "MUL16m")>;
+
+def SKXWriteResGroup118_32 : SchedWriteRes<[SKXPort1, SKXPort0156, SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup118_32], (instregex "IMUL32m")>;
+def: InstRW<[SKXWriteResGroup118_32], (instregex "MUL32m")>;
+
+def SKXWriteResGroup119 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOM32m")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOM64m")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOMP32m")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "FCOMP64m")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "MMX_PSADBWirm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VFPCLASSSDrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSDWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKSSWBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSDWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPACKUSWBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRYrmi")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRZ256rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPALIGNRZrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPBLENDWYrmi")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTBZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPBROADCASTWZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDYmi")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSYmi")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPERMILPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXBDYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXBQYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPMOVSXWQYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDYmi")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFDZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWYmi")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWZ256mi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFHWZmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWYmi")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWZ256mi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSHUFLWZmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSLLDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSLLDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSRLDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPSRLDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHBWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHQDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKHWDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLBWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLQDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VPUNPCKLWDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDYrmi")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPDZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSYrmi")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VSHUFPSZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKHPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSYrm")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup119], (instregex "VUNPCKLPSZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup120 : SchedWriteRes<[SKXPort01,SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPABSWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDSWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPADDUSWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPAVGWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQQYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPEQWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPCMPGTWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAX(C?)SDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXSWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMAXUWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMIN(C?)SDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINSWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPMINUWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPROLDZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPROLDZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPROLQZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPROLQZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPROLVQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPRORDZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPRORDZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPRORQZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPRORQZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPRORVQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNBYrm256")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNDYrm256")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSIGNWYrm256")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLVWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZ256mi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSLLWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRADZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAVWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZ256mi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRAWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLVWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZ256mi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSRLWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBSWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWYrm")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup120], (instregex "VPSUBUSWZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup121 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDNPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VANDPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDMPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDPDYrmi")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBLENDPSYrmi")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X2Z256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X2Zm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X4Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X4rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF32X8rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X2Z128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X2rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTF64X4rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X2Z256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X2Zm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X4Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X4rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI32X8rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X2Z128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X2rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTI64X4rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSDZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSDZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSSZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VBROADCASTSSZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x4Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x4Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF32x8Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x2Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x2Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTF64x4Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x4Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x4Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI32x8Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x2Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x2Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VINSERTI64x4Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMASKMOVPDYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMASKMOVPSYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVAPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDDUPZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDDUPZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA32Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA32Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA64Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQA64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU16Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU16Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU32Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU32Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU64Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU8Z256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVDQU8Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVNTDQAZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSHDUPZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSHDUPZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSLDUPZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVSLDUPZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VMOVUPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VORPDYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VORPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VORPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VORPSYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VORPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VORPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPADDWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPANDDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPANDDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPANDNYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPANDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPANDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPANDYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDDYrmi")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBLENDMWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTDZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTDZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTQZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPBROADCASTQZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPMASKMOVDYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPMASKMOVQYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPORDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPORDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPORQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPORQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPORYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBWYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPSUBWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGDZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGDZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGQZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPTERNLOGQZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPXORDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPXORDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPXORQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPXORQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VPXORYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VXORPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSYrm")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup121], (instregex "VXORPSZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup122 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup122], (instregex "BLENDVPDrm0")>;
+def: InstRW<[SKXWriteResGroup122], (instregex "BLENDVPSrm0")>;
+def: InstRW<[SKXWriteResGroup122], (instregex "PBLENDVBrm0")>;
+def: InstRW<[SKXWriteResGroup122], (instregex "VBLENDVPDrm")>;
+def: InstRW<[SKXWriteResGroup122], (instregex "VBLENDVPSrm")>;
+def: InstRW<[SKXWriteResGroup122], (instregex "VPBLENDVBYrm")>;
+def: InstRW<[SKXWriteResGroup122], (instregex "VPBLENDVBrm")>;
+
+def SKXWriteResGroup123 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,2,1];
+}
+def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PHADDSWrm64")>;
+def: InstRW<[SKXWriteResGroup123], (instregex "MMX_PHSUBSWrm64")>;
+
+def SKXWriteResGroup124 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort05]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHADDWrm64")>;
+def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHADDrm64")>;
+def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHSUBDrm64")>;
+def: InstRW<[SKXWriteResGroup124], (instregex "MMX_PHSUBWrm64")>;
+
+def SKXWriteResGroup125 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort237,SKXPort015]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup125], (instregex "VCVTPS2PHYmr")>;
+
+def SKXWriteResGroup126 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,3];
+}
+def: InstRW<[SKXWriteResGroup126], (instregex "ROR(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup126], (instregex "ROR8mCL")>;
+
+def SKXWriteResGroup127 : SchedWriteRes<[SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL8m1")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCL8mi")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCR(16|32|64)m1")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCR(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCR8m1")>;
+def: InstRW<[SKXWriteResGroup127], (instregex "RCR8mi")>;
+
+def SKXWriteResGroup128 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[SKXWriteResGroup128], (instregex "ROL(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "ROL8mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SAR(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SAR8mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SHL(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SHL8mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SHR(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup128], (instregex "SHR8mCL")>;
+
+def SKXWriteResGroup129 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,3];
+}
+def: InstRW<[SKXWriteResGroup129], (instregex "ADC(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup129], (instregex "ADC8mi")>;
+
+def SKXWriteResGroup130 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: InstRW<[SKXWriteResGroup130], (instregex "ADC(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "ADC8mr")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "CMPXCHG8rm")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "SBB(16|32|64)mi")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "SBB(16|32|64)mr")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "SBB8mi")>;
+def: InstRW<[SKXWriteResGroup130], (instregex "SBB8mr")>;
+
+def SKXWriteResGroup131 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,2,1,2,2];
+}
+def: InstRW<[SKXWriteResGroup131], (instrs VPSCATTERQDZ128mr,
+ VPSCATTERQDZ256mr,
+ VSCATTERQPSZ128mr,
+ VSCATTERQPSZ256mr)>;
+
+def SKXWriteResGroup132 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 12;
+ let ResourceCycles = [1,4,1,4,2];
+}
+def: InstRW<[SKXWriteResGroup132], (instrs VPSCATTERDDZ128mr,
+ VSCATTERDPSZ128mr)>;
+
+def SKXWriteResGroup133 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 20;
+ let ResourceCycles = [1,8,1,8,2];
+}
+def: InstRW<[SKXWriteResGroup133], (instrs VPSCATTERDDZ256mr,
+ VSCATTERDPSZ256mr)>;
+
+def SKXWriteResGroup134 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort237,SKXPort0156]> {
+ let Latency = 8;
+ let NumMicroOps = 36;
+ let ResourceCycles = [1,16,1,16,2];
+}
+def: InstRW<[SKXWriteResGroup134], (instrs VPSCATTERDDZmr)>;
+
+def SKXWriteResGroup135 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_CVTPI2PSirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMADDUBSWrm64")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMADDWDirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHRSWrm64")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHUWirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULHWirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULLWirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "MMX_PMULUDQirm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "RCPSSm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "RSQRTSSm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "VRCPSSm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "VRSQRTSSm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "VTESTPDYrm")>;
+def: InstRW<[SKXWriteResGroup135], (instregex "VTESTPSYrm")>;
+
+def SKXWriteResGroup136 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup136], (instregex "PCMPGTQrm")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "PSADBWrm")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VALIGNQZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VCMPPDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VCMPPSZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VCMPSDZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VCMPSSZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VDBPSADBWZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VFPCLASSSSrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPBZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPDZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPEQWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTQrm")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPGTWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPQZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUBZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUDZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUQZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPUWZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPCMPWZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2D128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2PD128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2PS128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPERMI2Q128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2D128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2PD128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2PS128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPERMT2Q128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMAXSQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMAXUQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMINSQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMINUQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBWYrm")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXBWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXDQYrm")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWDYrm")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVSXWQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXBWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWDYrm")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPMOVZXWQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPSADBWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPSADBWrm")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTMWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMBZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup136], (instregex "VPTESTNMWZ128rm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup137], (instregex "ADDSDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "ADDSSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "CMPSDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "CMPSSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "CVTPS2PDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "MAX(C?)SDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "MAX(C?)SSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "MIN(C?)SDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "MIN(C?)SSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVTPS2PIirm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "MMX_CVTTPS2PIirm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "MULSDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "MULSSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "SUBSDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "SUBSSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VADDSDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VADDSSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VCMPSDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VCMPSSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VCVTPH2PSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VCVTPS2PDrm")>;
+def: InstRW<[SKXWriteResGroup137],
+ (instregex "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)m")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VMAX(C?)SDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VMAX(C?)SSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VMIN(C?)SDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VMIN(C?)SSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VMULSDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VMULSSrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VSUBSDrm")>;
+def: InstRW<[SKXWriteResGroup137], (instregex "VSUBSSrm")>;
+
+def SKXWriteResGroup138 : SchedWriteRes<[SKXPort0,SKXPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup138], (instregex "VRCP14PDZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup138], (instregex "VRCP14PSZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup138], (instregex "VRSQRT14PDZr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup138], (instregex "VRSQRT14PSZr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup139 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup139], (instregex "DPPDrri")>;
+def: InstRW<[SKXWriteResGroup139], (instregex "VDPPDrri")>;
+
+def SKXWriteResGroup140 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup140], (instregex "VBLENDVPDYrm")>;
+def: InstRW<[SKXWriteResGroup140], (instregex "VBLENDVPSYrm")>;
+
+def SKXWriteResGroup141 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup141], (instregex "PTESTrm")>;
+def: InstRW<[SKXWriteResGroup141], (instregex "VPTESTrm")>;
+
+def SKXWriteResGroup142 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup142], (instregex "MULX64rm")>;
+
+def SKXWriteResGroup143 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup143], (instregex "PHADDSWrm128")>;
+def: InstRW<[SKXWriteResGroup143], (instregex "PHSUBSWrm128")>;
+def: InstRW<[SKXWriteResGroup143], (instregex "VPHADDSWrm128")>;
+def: InstRW<[SKXWriteResGroup143], (instregex "VPHSUBSWrm128")>;
+
+def SKXWriteResGroup144 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup144], (instregex "PHADDDrm")>;
+def: InstRW<[SKXWriteResGroup144], (instregex "PHADDWrm")>;
+def: InstRW<[SKXWriteResGroup144], (instregex "PHSUBDrm")>;
+def: InstRW<[SKXWriteResGroup144], (instregex "PHSUBWrm")>;
+def: InstRW<[SKXWriteResGroup144], (instregex "VPHADDDrm")>;
+def: InstRW<[SKXWriteResGroup144], (instregex "VPHADDWrm")>;
+def: InstRW<[SKXWriteResGroup144], (instregex "VPHSUBDrm")>;
+def: InstRW<[SKXWriteResGroup144], (instregex "VPHSUBWrm")>;
+
+def SKXWriteResGroup145 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup145], (instregex "SHLD(16|32|64)mri8")>;
+def: InstRW<[SKXWriteResGroup145], (instregex "SHRD(16|32|64)mri8")>;
+
+def SKXWriteResGroup146 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> {
+ let Latency = 9;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroup146], (instregex "LAR(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup146], (instregex "LSL(16|32|64)rm")>;
+
+def SKXWriteResGroup147 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup147], (instregex "AESDECLASTrm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "AESDECrm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "AESENCLASTrm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "AESENCrm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "RCPPSm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "RSQRTPSm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VAESDECLASTrm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VAESDECrm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VAESENCLASTrm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VAESENCrm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14PDZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14PSZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14SDrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VRCP14SSrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VRCPPSm")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14PDZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14PSZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14SDrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRT14SSrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup147], (instregex "VRSQRTPSm")>;
+
+def SKXWriteResGroup148 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup148], (instregex "ADD_F32m")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "ADD_F64m")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F16m")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F32m")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "ILD_F64m")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "SUBR_F32m")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "SUBR_F64m")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "SUB_F32m")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "SUB_F64m")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNDZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNDZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNQZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VALIGNQZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPDZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPDZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPSZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VCMPPSZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VDBPSADBWZ256rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VDBPSADBWZrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPBZ256rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPBZrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPDZ256rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPDZrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPEQWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQYrm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPGTWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPQZ256rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPQZrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUBZ256rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUBZrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUDZ256rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUDZrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUQZ256rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUQZrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUWZ256rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPUWZrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPWZ256rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPCMPWZrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERM2F128rm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERM2I128rm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDYrm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2D256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Drm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PD256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PDrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PS256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2PSrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Q256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMI2Qrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDYmi")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSYrm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQYmi")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZ256m(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2D256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Drm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PD256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PDrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PS256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2PSrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Q256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPERMT2Qrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXSQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXSQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXUQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMAXUQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMINSQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMINSQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMINUQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMINUQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXBWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVSXWQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDYrm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQYrm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWYrm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXBWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQYrm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQYrm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPMOVZXWQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWYrm")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPSADBWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTMWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMBZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMBZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VPTESTNMWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF32X4Z256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF32X4Zrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF64X2Z256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFF64X2Zrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI32X4Z256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI32X4Zrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI64X2Z256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup148], (instregex "VSHUFI64X2Zrm(b?)i(k?)(z?)")>;
+
+def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup149], (instregex "ADDPDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "ADDPSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "ADDSUBPDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "ADDSUBPSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "CMPPDrmi")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "CMPPSrmi")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "CVTDQ2PSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "CVTPS2DQrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "CVTSS2SDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "CVTTPS2DQrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "MAX(C?)PDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "MAX(C?)PSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "MIN(C?)PDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "MIN(C?)PSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "MULPDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "MULPSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "PHMINPOSUWrm128")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "PMADDUBSWrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "PMADDWDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "PMULDQrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "PMULHRSWrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "PMULHUWrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "PMULHWrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "PMULLWrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "PMULUDQrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "SUBPDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "SUBPSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VADDPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VADDPDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VADDPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VADDPSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VADDSDZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VADDSSZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VADDSUBPDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VADDSUBPSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCMPPDrmi")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCMPPSrmi")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTDQ2PSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPD2QQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPD2UQQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPH2PSYrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPH2PSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2DQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2DQrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2PDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2QQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2UDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTPS2UQQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTQQ2PDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTQQ2PSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTSS2SDZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTSS2SDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPD2QQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPD2UQQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2DQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2DQrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2QQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2UDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTTPS2UQQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUDQ2PDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUDQ2PSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUQQ2PDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VCVTUQQ2PSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMPDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMPSZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMSDrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VFIXUPIMMSSrmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149],
+ (instregex
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z128m(b?)(k?)(z?)",
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)m",
+ "VF(N)?M(ADD|SUB)(132|213|231)S(D|S)Zm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPPDZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPPSZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPSDm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VGETEXPSSm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTPDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTPSZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTSDZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VGETMANTSSZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)PSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)SDZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMAX(C?)SSZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)PSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)SDZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMIN(C?)SSZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMULPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMULPDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMULPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMULPSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMULSDZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VMULSSZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPHMINPOSUWrm128")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPLZCNTDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPLZCNTQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDUBSWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDUBSWrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDWDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMADDWDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULDQrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHRSWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHRSWrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHUWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHUWrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULHWrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULLWZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULLWrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULUDQZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VPMULUDQrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VRANGEPDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VRANGEPSZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VRANGESDZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VRANGESSZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCEPDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCEPSZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCESDZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VREDUCESSZ128rmi(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFSDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VSCALEFSSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPDrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VSUBPSrm")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VSUBSDZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup149], (instregex "VSUBSSZrm(_Int)?(k?)(z?)")>;
+
+def SKXWriteResGroup150 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SKXWriteResGroup150], (instregex "PCMPISTRIrr")>;
+def: InstRW<[SKXWriteResGroup150], (instregex "PCMPISTRM128rr")>;
+def: InstRW<[SKXWriteResGroup150], (instregex "VPCMPISTRIrr")>;
+def: InstRW<[SKXWriteResGroup150], (instregex "VPCMPISTRM128rr")>;
+
+def SKXWriteResGroup151 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup151], (instregex "MPSADBWrmi")>;
+def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup151], (instregex "VEXPANDPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup151], (instregex "VMPSADBWrmi")>;
+def: InstRW<[SKXWriteResGroup151], (instregex "VPEXPANDDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup151], (instregex "VPEXPANDQZ128rm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup152 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup152], (instregex "MMX_CVTPI2PDirm")>;
+def: InstRW<[SKXWriteResGroup152], (instregex "VPTESTYrm")>;
+
+def SKXWriteResGroup153 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup153], (instregex "CVTSD2SSrm")>;
+def: InstRW<[SKXWriteResGroup153], (instregex "VCVTSD2SSrm")>;
+
+def SKXWriteResGroup154 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup154], (instregex "VPHADDSWrm256")>;
+def: InstRW<[SKXWriteResGroup154], (instregex "VPHSUBSWrm256")>;
+
+def SKXWriteResGroup155 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup155], (instregex "VPHADDDYrm")>;
+def: InstRW<[SKXWriteResGroup155], (instregex "VPHADDWYrm")>;
+def: InstRW<[SKXWriteResGroup155], (instregex "VPHSUBDYrm")>;
+def: InstRW<[SKXWriteResGroup155], (instregex "VPHSUBWYrm")>;
+
+def SKXWriteResGroup156 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort06,SKXPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup156], (instregex "MULX32rm")>;
+
+def SKXWriteResGroup157 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,3];
+}
+def: InstRW<[SKXWriteResGroup157], (instregex "ADD8mi")>;
+def: InstRW<[SKXWriteResGroup157], (instregex "AND8mi")>;
+def: InstRW<[SKXWriteResGroup157], (instregex "OR8mi")>;
+def: InstRW<[SKXWriteResGroup157], (instregex "SUB8mi")>;
+def: InstRW<[SKXWriteResGroup157], (instregex "XCHG(16|32|64)rm")>;
+def: InstRW<[SKXWriteResGroup157], (instregex "XCHG8rm")>;
+def: InstRW<[SKXWriteResGroup157], (instregex "XOR8mi")>;
+
+def SKXWriteResGroup158 : SchedWriteRes<[SKXPort05,SKXPort0156]> {
+ let Latency = 10;
+ let NumMicroOps = 10;
+ let ResourceCycles = [9,1];
+}
+def: InstRW<[SKXWriteResGroup158], (instregex "MMX_EMMS")>;
+
+def SKXWriteResGroup159 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 11;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup159], (instregex "DIVPSrr")>;
+def: InstRW<[SKXWriteResGroup159], (instregex "DIVSSrr")>;
+def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSYrr")>;
+def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup159], (instregex "VDIVPSrr")>;
+def: InstRW<[SKXWriteResGroup159], (instregex "VDIVSSZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup159], (instregex "VDIVSSrr")>;
+
+def SKXWriteResGroup160 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F32m")>;
+def: InstRW<[SKXWriteResGroup160], (instregex "MUL_F64m")>;
+def: InstRW<[SKXWriteResGroup160], (instregex "VRCP14PDZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup160], (instregex "VRCP14PSZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup160], (instregex "VRCPPSYm")>;
+def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRT14PDZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRT14PSZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup160], (instregex "VRSQRTPSYm")>;
+
+def SKXWriteResGroup161 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VADDPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VADDPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VADDSUBPDYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VADDSUBPSYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCMPPDYrmi")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCMPPSYrmi")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTDQ2PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2QQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2QQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2UQQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPD2UQQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPH2PSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPH2PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2DQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2PDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2QQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTPS2UQQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTQQ2PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2QQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2QQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2UQQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPD2UQQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2DQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2QQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTTPS2UQQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUDQ2PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VCVTUQQ2PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPDZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPDZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPSZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VFIXUPIMMPSZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161],
+ (instregex
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Ym",
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Z256m(b?)(k?)(z?)",
+ "VF(N)?M(ADD|SUB|ADDSUB|SUBADD)(132|213|231)P(D|S)Zm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPDZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPDm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPSZ256m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VGETEXPPSm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPDZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPDZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPSZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VGETMANTPSZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMAX(C?)PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMIN(C?)PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMULPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VMULPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPLZCNTQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDUBSWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMADDWDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHRSWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHUWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULHWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULLWZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VPMULUDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPDZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPDZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPSZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VRANGEPSZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPDZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPDZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPSZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VREDUCEPSZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VSCALEFPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSYrm")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup161], (instregex "VSUBPSZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup162 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup162], (instregex "FICOM16m")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "FICOM32m")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "FICOMP16m")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "FICOMP32m")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "VEXPANDPSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "VMPSADBWYrmi")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup162], (instregex "VPEXPANDQZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup163 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup163], (instregex "VCVTSD2SSZrm(_Int)?(k?)(z?)")>;
+
+def SKXWriteResGroup164 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup164], (instregex "CVTDQ2PDrm")>;
+def: InstRW<[SKXWriteResGroup164], (instregex "VCVTDQ2PDrm")>;
+
+def SKXWriteResGroup165 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup165], (instregex "CVTSD2SI64rm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "CVTSD2SIrm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "CVTSS2SI64rm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "CVTSS2SIrm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSD2SI64rm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSD2SIrm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "CVTTSS2SIrm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SI64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SI64rm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2SIrm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSD2USI64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SI64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SI64rm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2SIrm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTSS2USIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SI64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SI64rm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2SIrm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSD2USI64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SI64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SI64rm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2SIrm")>;
+def: InstRW<[SKXWriteResGroup165], (instregex "VCVTTSS2USIZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup166 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2DQrm")>;
+def: InstRW<[SKXWriteResGroup166], (instregex "CVTPD2PSrm")>;
+def: InstRW<[SKXWriteResGroup166], (instregex "CVTTPD2DQrm")>;
+def: InstRW<[SKXWriteResGroup166], (instregex "MMX_CVTPD2PIirm")>;
+def: InstRW<[SKXWriteResGroup166], (instregex "MMX_CVTTPD2PIirm")>;
+
+def SKXWriteResGroup167 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup167], (instregex "VPCONFLICTQZ128rm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup168 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 6;
+ let ResourceCycles = [1,1,1,2,1];
+}
+def: InstRW<[SKXWriteResGroup168], (instregex "SHLD(16|32|64)mrCL")>;
+def: InstRW<[SKXWriteResGroup168], (instregex "SHRD(16|32|64)mrCL")>;
+
+def SKXWriteResGroup169 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 7;
+ let ResourceCycles = [2,3,2];
+}
+def: InstRW<[SKXWriteResGroup169], (instregex "RCL(16|32|64)rCL")>;
+def: InstRW<[SKXWriteResGroup169], (instregex "RCR(16|32|64)rCL")>;
+
+def SKXWriteResGroup170 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 9;
+ let ResourceCycles = [1,5,1,2];
+}
+def: InstRW<[SKXWriteResGroup170], (instregex "RCL8rCL")>;
+
+def SKXWriteResGroup171 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
+ let Latency = 11;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,9];
+}
+def: InstRW<[SKXWriteResGroup171], (instregex "LOOPE")>;
+def: InstRW<[SKXWriteResGroup171], (instregex "LOOPNE")>;
+
+def SKXWriteResGroup172 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 12;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup172], (instregex "SQRTPSr")>;
+def: InstRW<[SKXWriteResGroup172], (instregex "SQRTSSr")>;
+def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSYr")>;
+def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTPSr")>;
+def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTSSZr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup172], (instregex "VSQRTSSr")>;
+
+def SKXWriteResGroup173 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup173], (instregex "PCLMULQDQrm")>;
+def: InstRW<[SKXWriteResGroup173], (instregex "VPCLMULQDQrm")>;
+
+def SKXWriteResGroup174 : SchedWriteRes<[SKXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [3];
+}
+def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQZrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup175 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup175], (instregex "VPERMWZ128rm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup176 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup176], (instregex "VCVTSD2USIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup176], (instregex "VCVTSS2USI64Zrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup176], (instregex "VCVTTSD2USIZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup176], (instregex "VCVTTSS2USI64Zrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup177 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup177], (instregex "VCVTPS2QQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup177], (instregex "VCVTPS2UQQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup177], (instregex "VCVTTPS2QQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup177], (instregex "VCVTTPS2UQQZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup178 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup178], (instregex "HADDPDrm")>;
+def: InstRW<[SKXWriteResGroup178], (instregex "HADDPSrm")>;
+def: InstRW<[SKXWriteResGroup178], (instregex "HSUBPDrm")>;
+def: InstRW<[SKXWriteResGroup178], (instregex "HSUBPSrm")>;
+def: InstRW<[SKXWriteResGroup178], (instregex "VHADDPDrm")>;
+def: InstRW<[SKXWriteResGroup178], (instregex "VHADDPSrm")>;
+def: InstRW<[SKXWriteResGroup178], (instregex "VHSUBPDrm")>;
+def: InstRW<[SKXWriteResGroup178], (instregex "VHSUBPSrm")>;
+
+def SKXWriteResGroup179 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup179], (instregex "CVTTSS2SI64rm")>;
+
+def SKXWriteResGroup180 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup180], (instregex "ADD_FI16m")>;
+def: InstRW<[SKXWriteResGroup180], (instregex "ADD_FI32m")>;
+def: InstRW<[SKXWriteResGroup180], (instregex "SUBR_FI16m")>;
+def: InstRW<[SKXWriteResGroup180], (instregex "SUBR_FI32m")>;
+def: InstRW<[SKXWriteResGroup180], (instregex "SUB_FI16m")>;
+def: InstRW<[SKXWriteResGroup180], (instregex "SUB_FI32m")>;
+def: InstRW<[SKXWriteResGroup180], (instregex "VPERMWZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup180], (instregex "VPERMWZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup181 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup181], (instregex "VCVTDQ2PDYrm")>;
+
+def SKXWriteResGroup182 : SchedWriteRes<[SKXPort5,SKXPort015]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKXWriteResGroup182], (instregex "DPPSrri")>;
+def: InstRW<[SKXWriteResGroup182], (instregex "VDPPSYrri")>;
+def: InstRW<[SKXWriteResGroup182], (instregex "VDPPSrri")>;
+
+def SKXWriteResGroup183 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 13;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup183], (instregex "VHADDPDYrm")>;
+def: InstRW<[SKXWriteResGroup183], (instregex "VHADDPSYrm")>;
+def: InstRW<[SKXWriteResGroup183], (instregex "VHSUBPDYrm")>;
+def: InstRW<[SKXWriteResGroup183], (instregex "VHSUBPSYrm")>;
+def: InstRW<[SKXWriteResGroup183], (instregex "VPERMI2W128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup183], (instregex "VPERMT2W128rm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup184 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 14;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup184], (instregex "DIVPDrr")>;
+def: InstRW<[SKXWriteResGroup184], (instregex "DIVSDrr")>;
+def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDYrr")>;
+def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup184], (instregex "VDIVPDrr")>;
+def: InstRW<[SKXWriteResGroup184], (instregex "VDIVSDZrr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup184], (instregex "VDIVSDrr")>;
+
+def SKXWriteResGroup185 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup185], (instregex "AESIMCrm")>;
+def: InstRW<[SKXWriteResGroup185], (instregex "VAESIMCrm")>;
+
+def SKXWriteResGroup186 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup186], (instregex "PMULLDrm")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDPDm")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDPSm")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDSDm")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "ROUNDSSm")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "VPMULLDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "VPMULLDrm")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALEPDZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALEPSZ128rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALESDm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "VRNDSCALESSm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDPDm")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDPSm")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDSDm")>;
+def: InstRW<[SKXWriteResGroup186], (instregex "VROUNDSSm")>;
+
+def SKXWriteResGroup187 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI16m")>;
+def: InstRW<[SKXWriteResGroup187], (instregex "MUL_FI32m")>;
+
+def SKXWriteResGroup188 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 14;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2DQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup188], (instregex "VCVTPD2UDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup188], (instregex "VCVTQQ2PSZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup188], (instregex "VCVTTPD2DQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup188], (instregex "VCVTTPD2UDQZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup188], (instregex "VCVTUQQ2PSZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup189 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 14;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2W256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup189], (instregex "VPERMI2Wrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup189], (instregex "VPERMT2W256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup189], (instregex "VPERMT2Wrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup190 : SchedWriteRes<[SKXPort1,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 14;
+ let NumMicroOps = 10;
+ let ResourceCycles = [2,4,1,3];
+}
+def: InstRW<[SKXWriteResGroup190], (instregex "RCR8rCL")>;
+
+def SKXWriteResGroup191 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 15;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FPrST0")>;
+def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FST0r")>;
+def: InstRW<[SKXWriteResGroup191], (instregex "DIVR_FrST0")>;
+
+def SKXWriteResGroup192 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 15;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,2];
+}
+def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDYrm")>;
+def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup192], (instregex "VPMULLDZrm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPDZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPDZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPSZ256rm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup192], (instregex "VRNDSCALEPSZrm(b?)i(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup192], (instregex "VROUNDYPDm")>;
+def: InstRW<[SKXWriteResGroup192], (instregex "VROUNDYPSm")>;
+
+def SKXWriteResGroup193 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 15;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,1,2];
+}
+def: InstRW<[SKXWriteResGroup193], (instregex "DPPDrmi")>;
+def: InstRW<[SKXWriteResGroup193], (instregex "VDPPDrmi")>;
+
+def SKXWriteResGroup194 : SchedWriteRes<[SKXPort1,SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
+ let Latency = 15;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,2,2,1,2];
+}
+def: InstRW<[SKXWriteResGroup194], (instregex "VPCONFLICTDZ128rm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup195 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 15;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,1,1,5,1,1];
+}
+def: InstRW<[SKXWriteResGroup195], (instregex "RCL(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup195], (instregex "RCL8mCL")>;
+
+def SKXWriteResGroup196 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup196], (instregex "DIVSSrm")>;
+def: InstRW<[SKXWriteResGroup196], (instregex "VDIVSSrm")>;
+
+def SKXWriteResGroup197 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [3,1];
+}
+def: InstRW<[SKXWriteResGroup197], (instregex "PCMPISTRIrm")>;
+def: InstRW<[SKXWriteResGroup197], (instregex "PCMPISTRM128rm")>;
+def: InstRW<[SKXWriteResGroup197], (instregex "VPCMPISTRIrm")>;
+def: InstRW<[SKXWriteResGroup197], (instregex "VPCMPISTRM128rm")>;
+
+def SKXWriteResGroup198 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
+ let Latency = 16;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup198], (instregex "VRCP14PDZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup198], (instregex "VRCP14PSZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup198], (instregex "VRSQRT14PDZm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup198], (instregex "VRSQRT14PSZm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup199 : SchedWriteRes<[SKXPort4,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 14;
+ let ResourceCycles = [1,1,1,4,2,5];
+}
+def: InstRW<[SKXWriteResGroup199], (instregex "CMPXCHG8B")>;
+
+def SKXWriteResGroup200 : SchedWriteRes<[SKXPort0156]> {
+ let Latency = 16;
+ let NumMicroOps = 16;
+ let ResourceCycles = [16];
+}
+def: InstRW<[SKXWriteResGroup200], (instregex "VZEROALL")>;
+
+def SKXWriteResGroup201 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup201], (instregex "DIVPSrm")>;
+def: InstRW<[SKXWriteResGroup201], (instregex "SQRTSSm")>;
+def: InstRW<[SKXWriteResGroup201], (instregex "VDIVPSZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup201], (instregex "VDIVPSrm")>;
+def: InstRW<[SKXWriteResGroup201], (instregex "VDIVSSZrm(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup201], (instregex "VSQRTSSm")>;
+
+def SKXWriteResGroup202 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 15;
+ let ResourceCycles = [2,1,2,4,2,4];
+}
+def: InstRW<[SKXWriteResGroup202], (instregex "XCH_F")>;
+
+def SKXWriteResGroup203 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 18;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup203], (instregex "SQRTPDr")>;
+def: InstRW<[SKXWriteResGroup203], (instregex "SQRTSDr")>;
+def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDYr")>;
+def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDZ128r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDZ256r(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTPDr")>;
+def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTSDZr(b?)(_Int)?(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup203], (instregex "VSQRTSDr")>;
+
+def SKXWriteResGroup204 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 18;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup204], (instregex "SQRTPSm")>;
+def: InstRW<[SKXWriteResGroup204], (instregex "VDIVPSYrm")>;
+def: InstRW<[SKXWriteResGroup204], (instregex "VDIVPSZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTPSZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTPSm")>;
+def: InstRW<[SKXWriteResGroup204], (instregex "VSQRTSSZm(_Int)?(k?)(z?)")>;
+
+def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 18;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKXWriteResGroup205], (instregex "VPMULLQZ128rm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup206 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [4,3,1];
+}
+def: InstRW<[SKXWriteResGroup206], (instregex "PCMPESTRIrr")>;
+def: InstRW<[SKXWriteResGroup206], (instregex "VPCMPESTRIrr")>;
+
+def SKXWriteResGroup207 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort06,SKXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,5];
+}
+def: InstRW<[SKXWriteResGroup207], (instregex "CPUID")>;
+def: InstRW<[SKXWriteResGroup207], (instregex "RDTSC")>;
+
+def SKXWriteResGroup208 : SchedWriteRes<[SKXPort1,SKXPort23,SKXPort237,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 18;
+ let NumMicroOps = 11;
+ let ResourceCycles = [2,1,1,4,1,2];
+}
+def: InstRW<[SKXWriteResGroup208], (instregex "RCR(16|32|64)mCL")>;
+def: InstRW<[SKXWriteResGroup208], (instregex "RCR8mCL")>;
+
+def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 19;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup209], (instregex "DIVSDrm")>;
+def: InstRW<[SKXWriteResGroup209], (instregex "VDIVSDrm")>;
+def: InstRW<[SKXWriteResGroup209], (instregex "VSQRTPSYm")>;
+def: InstRW<[SKXWriteResGroup209], (instregex "VSQRTPSZ256m(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup210 : SchedWriteRes<[SKXPort0,SKXPort015]> {
+ let Latency = 19;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup210], (instregex "VSQRTPSZr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort015]> {
+ let Latency = 19;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup212 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 19;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,3];
+}
+def: InstRW<[SKXWriteResGroup212], (instregex "DPPSrmi")>;
+def: InstRW<[SKXWriteResGroup212], (instregex "VDPPSrmi")>;
+
+def SKXWriteResGroup213 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015,SKXPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def: InstRW<[SKXWriteResGroup213], (instregex "PCMPESTRM128rr")>;
+def: InstRW<[SKXWriteResGroup213], (instregex "VPCMPESTRM128rr")>;
+
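+// Gather pseudo-group: SchedWriteRes<[]> with NumMicroOps = 0 attaches only
+// a fixed 20-cycle latency to these EVEX gather forms; no port resources
+// are modeled for them here.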
+def SKXWriteResGroup214 : SchedWriteRes<[]> {
+ let Latency = 20;
+ let NumMicroOps = 0;
+}
+def: InstRW<[SKXWriteResGroup214], (instrs VGATHERDPSZ128rm,
+ VGATHERQPSZrm,
+ VPGATHERDDZ128rm)>;
+
+def SKXWriteResGroup215 : SchedWriteRes<[SKXPort0]> {
+ let Latency = 20;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FPrST0")>;
+def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FST0r")>;
+def: InstRW<[SKXWriteResGroup215], (instregex "DIV_FrST0")>;
+
+def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 20;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup216], (instregex "DIVPDrm")>;
+def: InstRW<[SKXWriteResGroup216], (instregex "VDIVPDZ128rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup216], (instregex "VDIVPDrm")>;
+def: InstRW<[SKXWriteResGroup216], (instregex "VDIVSDZrm(_Int)?(k?)(z?)")>;
+
+def SKXWriteResGroup217 : SchedWriteRes<[SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 20;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,1,3];
+}
+def: InstRW<[SKXWriteResGroup217], (instregex "VDPPSYrmi")>;
+
+def SKXWriteResGroup218 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroup218], (instrs VGATHERQPSZ128rm,
+ VGATHERQPSZ256rm,
+ VPGATHERQDZ128rm,
+ VPGATHERQDZ256rm)>;
+
+def SKXWriteResGroup219 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 8;
+ let ResourceCycles = [1,1,1,1,1,1,2];
+}
+def: InstRW<[SKXWriteResGroup219], (instregex "INSB")>;
+def: InstRW<[SKXWriteResGroup219], (instregex "INSL")>;
+def: InstRW<[SKXWriteResGroup219], (instregex "INSW")>;
+
+def SKXWriteResGroup220 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort0156]> {
+ let Latency = 20;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,2,7];
+}
+def: InstRW<[SKXWriteResGroup220], (instregex "MWAITrr")>;
+
+def SKXWriteResGroup221 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort015]> {
+ let Latency = 20;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,2];
+}
+def: InstRW<[SKXWriteResGroup221], (instregex "AESKEYGENASSIST128rr")>;
+def: InstRW<[SKXWriteResGroup221], (instregex "VAESKEYGENASSIST128rr")>;
+
+def SKXWriteResGroup222 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 21;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup222], (instregex "VDIVPDYrm")>;
+def: InstRW<[SKXWriteResGroup222], (instregex "VDIVPDZ256rm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup223 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 22;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F32m")>;
+def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F64m")>;
+
+def SKXWriteResGroup224 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 22;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroup224], (instrs VGATHERDPDZ128rm,
+ VGATHERQPDZ128rm,
+ VPGATHERDQZ128rm,
+ VPGATHERQQZ128rm)>;
+
+def SKXWriteResGroup224_2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroup224_2], (instrs VGATHERDPSrm,
+                                             VGATHERDPDrm,
+                                             VGATHERQPDrm,
+                                             VGATHERQPSrm,
+                                             VPGATHERDDrm,
+                                             VPGATHERDQrm,
+                                             VPGATHERQDrm,
+                                             VPGATHERQQrm)>;
+
+def SKXWriteResGroup224_3 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroup224_3], (instrs VGATHERDPSYrm,
+                                             VGATHERQPDYrm,
+                                             VGATHERQPSYrm,
+                                             VPGATHERDDYrm,
+                                             VPGATHERDQYrm,
+                                             VPGATHERQDYrm,
+                                             VPGATHERQQYrm,
+                                             VGATHERDPDYrm)>;
+
+def SKXWriteResGroup225 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 14;
+ let ResourceCycles = [5,5,4];
+}
+def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTDZ128rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup225], (instregex "VPCONFLICTQZ256rr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup226 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 23;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup226], (instregex "SQRTSDm")>;
+def: InstRW<[SKXWriteResGroup226], (instregex "VSQRTSDm")>;
+
+def SKXWriteResGroup227 : SchedWriteRes<[SKXPort0,SKXPort015]> {
+ let Latency = 23;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup227], (instregex "VDIVPDZrr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup227], (instregex "VDIVPSZrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup228 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort5,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 23;
+ let NumMicroOps = 19;
+ let ResourceCycles = [2,1,4,1,1,4,6];
+}
+def: InstRW<[SKXWriteResGroup228], (instregex "CMPXCHG16B")>;
+
+def SKXWriteResGroup229 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 24;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup229], (instregex "SQRTPDm")>;
+def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTPDZ128m(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTPDm")>;
+def: InstRW<[SKXWriteResGroup229], (instregex "VSQRTSDZm(_Int)?(k?)(z?)")>;
+
+def SKXWriteResGroup230 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
+ let Latency = 24;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup230], (instregex "VDIVPSZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup231 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort0156]> {
+ let Latency = 24;
+ let NumMicroOps = 9;
+ let ResourceCycles = [4,3,1,1];
+}
+def: InstRW<[SKXWriteResGroup231], (instregex "PCMPESTRIrm")>;
+def: InstRW<[SKXWriteResGroup231], (instregex "VPCMPESTRIrm")>;
+
+def SKXWriteResGroup232 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup232], (instregex "VSQRTPDYm")>;
+def: InstRW<[SKXWriteResGroup232], (instregex "VSQRTPDZ256m(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup233 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 25;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI16m")>;
+def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI32m")>;
+
+def SKXWriteResGroup234 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroup234], (instrs VGATHERDPDZ256rm,
+ VGATHERQPDZ256rm,
+ VPGATHERDQZ256rm,
+ VPGATHERQDZrm,
+ VPGATHERQQZ256rm)>;
+
+def SKXWriteResGroup235 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 10;
+ let ResourceCycles = [4,3,1,1,1];
+}
+def: InstRW<[SKXWriteResGroup235], (instregex "PCMPESTRM128rm")>;
+def: InstRW<[SKXWriteResGroup235], (instregex "VPCMPESTRM128rm")>;
+
+def SKXWriteResGroup236 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort015]> {
+ let Latency = 25;
+ let NumMicroOps = 11;
+ let ResourceCycles = [3,6,1,1];
+}
+def: InstRW<[SKXWriteResGroup236], (instregex "AESKEYGENASSIST128rm")>;
+def: InstRW<[SKXWriteResGroup236], (instregex "VAESKEYGENASSIST128rm")>;
+
+def SKXWriteResGroup237 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
+ let Latency = 26;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup237], (instregex "VSQRTPSZm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup238 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 26;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroup238], (instrs VGATHERDPDZrm,
+ VGATHERQPDZrm,
+ VPGATHERDQZrm,
+ VPGATHERQQZrm)>;
+
+def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> {
+ let Latency = 27;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F32m")>;
+def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F64m")>;
+
+def SKXWriteResGroup240 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 27;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroup240], (instrs VGATHERDPSZ256rm,
+ VPGATHERDDZ256rm)>;
+
+def SKXWriteResGroup241 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23,SKXPort0156]> {
+ let Latency = 28;
+ let NumMicroOps = 8;
+ let ResourceCycles = [2,4,1,1];
+}
+def: InstRW<[SKXWriteResGroup241], (instregex "IDIV(16|32|64)m")>;
+def: InstRW<[SKXWriteResGroup241], (instregex "IDIV8m")>;
+
+def SKXWriteResGroup242 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
+ let Latency = 29;
+ let NumMicroOps = 15;
+ let ResourceCycles = [5,5,1,4];
+}
+def: InstRW<[SKXWriteResGroup242], (instregex "VPCONFLICTQZ256rm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup243 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
+ let Latency = 30;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,1,1];
+}
+def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI16m")>;
+def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI32m")>;
+
+def SKXWriteResGroup244 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
+ let Latency = 30;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup244], (instregex "VDIVPDZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 30;
+ let NumMicroOps = 5;
+ let ResourceCycles = [1,2,1,1];
+}
+def: InstRW<[SKXWriteResGroup245], (instrs VGATHERDPSZrm,
+ VPGATHERDDZrm)>;
+
+def SKXWriteResGroup246 : SchedWriteRes<[SKXPort0,SKXPort015]> {
+ let Latency = 31;
+ let NumMicroOps = 3;
+ let ResourceCycles = [2,1];
+}
+def: InstRW<[SKXWriteResGroup246], (instregex "VSQRTPDZr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup247 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort06,SKXPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,3,4,10];
+}
+def: InstRW<[SKXWriteResGroup247], (instregex "IN(16|32)ri")>;
+def: InstRW<[SKXWriteResGroup247], (instregex "IN(16|32)rr")>;
+def: InstRW<[SKXWriteResGroup247], (instregex "IN8ri")>;
+def: InstRW<[SKXWriteResGroup247], (instregex "IN8rr")>;
+
+def SKXWriteResGroup248 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 35;
+ let NumMicroOps = 23;
+ let ResourceCycles = [1,5,2,1,4,10];
+}
+def: InstRW<[SKXWriteResGroup248], (instregex "OUT(16|32)ir")>;
+def: InstRW<[SKXWriteResGroup248], (instregex "OUT(16|32)rr")>;
+def: InstRW<[SKXWriteResGroup248], (instregex "OUT8ir")>;
+def: InstRW<[SKXWriteResGroup248], (instregex "OUT8rr")>;
+
+def SKXWriteResGroup249 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
+ let Latency = 37;
+ let NumMicroOps = 21;
+ let ResourceCycles = [9,7,5];
+}
+def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTDZ256rr(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup249], (instregex "VPCONFLICTQZrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup250 : SchedWriteRes<[SKXPort1,SKXPort6,SKXPort23,SKXPort0156]> {
+ let Latency = 37;
+ let NumMicroOps = 31;
+ let ResourceCycles = [1,8,1,21];
+}
+def: InstRW<[SKXWriteResGroup250], (instregex "XRSTOR(64)?")>;
+
+def SKXWriteResGroup251 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015]> {
+ let Latency = 38;
+ let NumMicroOps = 4;
+ let ResourceCycles = [2,1,1];
+}
+def: InstRW<[SKXWriteResGroup251], (instregex "VSQRTPDZm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup252 : SchedWriteRes<[SKXPort1,SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort15,SKXPort0156]> {
+ let Latency = 40;
+ let NumMicroOps = 18;
+ let ResourceCycles = [1,1,2,3,1,1,1,8];
+}
+def: InstRW<[SKXWriteResGroup252], (instregex "VMCLEARm")>;
+
+def SKXWriteResGroup253 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 41;
+ let NumMicroOps = 39;
+ let ResourceCycles = [1,10,1,1,26];
+}
+def: InstRW<[SKXWriteResGroup253], (instregex "XSAVE64")>;
+
+def SKXWriteResGroup254 : SchedWriteRes<[SKXPort5,SKXPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 22;
+ let ResourceCycles = [2,20];
+}
+def: InstRW<[SKXWriteResGroup254], (instregex "RDTSCP")>;
+
+def SKXWriteResGroup255 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,SKXPort0156]> {
+ let Latency = 42;
+ let NumMicroOps = 40;
+ let ResourceCycles = [1,11,1,1,26];
+}
+def: InstRW<[SKXWriteResGroup255], (instregex "XSAVE")>;
+
+def SKXWriteResGroup256 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
+ let Latency = 44;
+ let NumMicroOps = 22;
+ let ResourceCycles = [9,7,1,5];
+}
+def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTDZ256rm(b?)(k?)(z?)")>;
+def: InstRW<[SKXWriteResGroup256], (instregex "VPCONFLICTQZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup258 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort05,SKXPort06,SKXPort0156]> {
+ let Latency = 62;
+ let NumMicroOps = 64;
+ let ResourceCycles = [2,8,5,10,39];
+}
+def: InstRW<[SKXWriteResGroup258], (instregex "FLDENVm")>;
+
+def SKXWriteResGroup259 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 88;
+ let ResourceCycles = [4,4,31,1,2,1,45];
+}
+def: InstRW<[SKXWriteResGroup259], (instregex "FXRSTOR64")>;
+
+def SKXWriteResGroup260 : SchedWriteRes<[SKXPort0,SKXPort6,SKXPort23,SKXPort05,SKXPort06,SKXPort15,SKXPort0156]> {
+ let Latency = 63;
+ let NumMicroOps = 90;
+ let ResourceCycles = [4,2,33,1,2,1,47];
+}
+def: InstRW<[SKXWriteResGroup260], (instregex "FXRSTOR")>;
+
+def SKXWriteResGroup261 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
+ let Latency = 67;
+ let NumMicroOps = 35;
+ let ResourceCycles = [17,11,7];
+}
+def: InstRW<[SKXWriteResGroup261], (instregex "VPCONFLICTDZrr(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup262 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
+ let Latency = 74;
+ let NumMicroOps = 36;
+ let ResourceCycles = [17,11,1,7];
+}
+def: InstRW<[SKXWriteResGroup262], (instregex "VPCONFLICTDZrm(b?)(k?)(z?)")>;
+
+def SKXWriteResGroup263 : SchedWriteRes<[SKXPort5,SKXPort05,SKXPort0156]> {
+ let Latency = 75;
+ let NumMicroOps = 15;
+ let ResourceCycles = [6,3,6];
+}
+def: InstRW<[SKXWriteResGroup263], (instregex "FNINIT")>;
+
+def SKXWriteResGroup264 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort05,SKXPort0156]> {
+ let Latency = 76;
+ let NumMicroOps = 32;
+ let ResourceCycles = [7,2,8,3,1,11];
+}
+def: InstRW<[SKXWriteResGroup264], (instregex "DIV(16|32|64)r")>;
+
+def SKXWriteResGroup265 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKXPort06,SKXPort0156]> {
+ let Latency = 102;
+ let NumMicroOps = 66;
+ let ResourceCycles = [4,2,4,8,14,34];
+}
+def: InstRW<[SKXWriteResGroup265], (instregex "IDIV(16|32|64)r")>;
+
+def SKXWriteResGroup266 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort4,SKXPort5,SKXPort6,SKXPort237,SKXPort06,SKXPort0156]> {
+ let Latency = 106;
+ let NumMicroOps = 100;
+ let ResourceCycles = [9,1,11,16,1,11,21,30];
+}
+def: InstRW<[SKXWriteResGroup266], (instregex "FSTENVm")>;
+
+def SKXWriteResGroup267 : SchedWriteRes<[SKXPort6,SKXPort0156]> {
+ let Latency = 140;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
+}
+def: InstRW<[SKXWriteResGroup267], (instregex "PAUSE")>;
+} // SchedModel
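The SKX overrides above all follow the same two-part idiom: a SchedWriteRes record names the ports an instruction occupies together with its latency, micro-op count and per-port cycles, and InstRW lines then bind instruction-name regexes (or explicit instrs lists) to that record. A minimal sketch of the idiom follows; the group name and the opcode pattern are made up for illustration only:

def SKXWriteResGroupExample : SchedWriteRes<[SKXPort0,SKXPort23]> {
  let Latency = 10;            // cycles from operands ready to result ready
  let NumMicroOps = 2;         // uops issued for every matched instruction
  let ResourceCycles = [1,1];  // one cycle on SKXPort0, one on SKXPort23
}
// Bind every opcode whose name matches the regex to the group above.
def: InstRW<[SKXWriteResGroupExample], (instregex "EXAMPLEOPrm")>;

The trailing (b?)(k?)(z?) in many of the AVX-512 patterns simply folds the broadcast, merge-masked and zero-masked name variants into a single regex.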
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index d831a7974359..2e21a97541b2 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -299,6 +299,7 @@ def IIC_SSE_SHUFP : InstrItinClass;
def IIC_SSE_PSHUF_RI : InstrItinClass;
def IIC_SSE_PSHUF_MI : InstrItinClass;
+def IIC_SSE_PACK : InstrItinClass;
def IIC_SSE_UNPCK : InstrItinClass;
def IIC_SSE_MOVMSK : InstrItinClass;
@@ -384,8 +385,6 @@ def IIC_SSE_CVT_PD_RR : InstrItinClass;
def IIC_SSE_CVT_PD_RM : InstrItinClass;
def IIC_SSE_CVT_PS_RR : InstrItinClass;
def IIC_SSE_CVT_PS_RM : InstrItinClass;
-def IIC_SSE_CVT_PI2PS_RR : InstrItinClass;
-def IIC_SSE_CVT_PI2PS_RM : InstrItinClass;
def IIC_SSE_CVT_Scalar_RR : InstrItinClass;
def IIC_SSE_CVT_Scalar_RM : InstrItinClass;
def IIC_SSE_CVT_SS2SI32_RM : InstrItinClass;
@@ -395,6 +394,8 @@ def IIC_SSE_CVT_SS2SI64_RR : InstrItinClass;
def IIC_SSE_CVT_SD2SI_RM : InstrItinClass;
def IIC_SSE_CVT_SD2SI_RR : InstrItinClass;
+def IIC_AVX_ZERO : InstrItinClass;
+
// MMX
def IIC_MMX_MOV_MM_RM : InstrItinClass;
def IIC_MMX_MOV_REG_MM : InstrItinClass;
@@ -425,12 +426,21 @@ def IIC_MMX_PSHUF : InstrItinClass;
def IIC_MMX_PEXTR : InstrItinClass;
def IIC_MMX_PINSRW : InstrItinClass;
def IIC_MMX_MASKMOV : InstrItinClass;
-
+def IIC_MMX_MOVMSK : InstrItinClass;
def IIC_MMX_CVT_PD_RR : InstrItinClass;
def IIC_MMX_CVT_PD_RM : InstrItinClass;
def IIC_MMX_CVT_PS_RR : InstrItinClass;
def IIC_MMX_CVT_PS_RM : InstrItinClass;
+def IIC_3DNOW_FALU_RM : InstrItinClass;
+def IIC_3DNOW_FALU_RR : InstrItinClass;
+def IIC_3DNOW_FCVT_F2I_RM : InstrItinClass;
+def IIC_3DNOW_FCVT_F2I_RR : InstrItinClass;
+def IIC_3DNOW_FCVT_I2F_RM : InstrItinClass;
+def IIC_3DNOW_FCVT_I2F_RR : InstrItinClass;
+def IIC_3DNOW_MISC_FUNC_REG : InstrItinClass;
+def IIC_3DNOW_MISC_FUNC_MEM : InstrItinClass;
+
def IIC_CMPX_LOCK : InstrItinClass;
def IIC_CMPX_LOCK_8 : InstrItinClass;
def IIC_CMPX_LOCK_8B : InstrItinClass;
@@ -439,6 +449,7 @@ def IIC_CMPX_LOCK_16B : InstrItinClass;
def IIC_XADD_LOCK_MEM : InstrItinClass;
def IIC_XADD_LOCK_MEM8 : InstrItinClass;
+def IIC_FCMOV : InstrItinClass;
def IIC_FILD : InstrItinClass;
def IIC_FLD : InstrItinClass;
def IIC_FLD80 : InstrItinClass;
@@ -467,6 +478,8 @@ def IIC_FXTRACT : InstrItinClass;
def IIC_FPREM1 : InstrItinClass;
def IIC_FPSTP : InstrItinClass;
def IIC_FPREM : InstrItinClass;
+def IIC_FSIGN : InstrItinClass;
+def IIC_FSQRT : InstrItinClass;
def IIC_FYL2XP1 : InstrItinClass;
def IIC_FSINCOS : InstrItinClass;
def IIC_FRNDINT : InstrItinClass;
@@ -483,16 +496,31 @@ def IIC_INT : InstrItinClass;
def IIC_INT3 : InstrItinClass;
def IIC_INVD : InstrItinClass;
def IIC_INVLPG : InstrItinClass;
+def IIC_INVPCID : InstrItinClass;
def IIC_IRET : InstrItinClass;
def IIC_HLT : InstrItinClass;
def IIC_LXS : InstrItinClass;
def IIC_LTR : InstrItinClass;
+def IIC_MPX : InstrItinClass;
+def IIC_PKU : InstrItinClass;
+def IIC_PTWRITE : InstrItinClass;
+def IIC_RDPID : InstrItinClass;
+def IIC_RDRAND : InstrItinClass;
+def IIC_RDSEED : InstrItinClass;
def IIC_RDTSC : InstrItinClass;
+def IIC_RDTSCP : InstrItinClass;
def IIC_RSM : InstrItinClass;
def IIC_SIDT : InstrItinClass;
def IIC_SGDT : InstrItinClass;
def IIC_SLDT : InstrItinClass;
+def IIC_SMAP : InstrItinClass;
+def IIC_SMX : InstrItinClass;
def IIC_STR : InstrItinClass;
+def IIC_SKINIT : InstrItinClass;
+def IIC_SVM : InstrItinClass;
+def IIC_VMX : InstrItinClass;
+def IIC_CLGI : InstrItinClass;
+def IIC_STGI : InstrItinClass;
def IIC_SWAPGS : InstrItinClass;
def IIC_SYSCALL : InstrItinClass;
def IIC_SYS_ENTER_EXIT : InstrItinClass;
@@ -522,6 +550,8 @@ def IIC_PUSH_CS : InstrItinClass;
def IIC_PUSH_SR : InstrItinClass;
def IIC_POP_SR : InstrItinClass;
def IIC_POP_SR_SS : InstrItinClass;
+def IIC_SEGMENT_BASE_R : InstrItinClass;
+def IIC_SEGMENT_BASE_W : InstrItinClass;
def IIC_VERR : InstrItinClass;
def IIC_VERW_REG : InstrItinClass;
def IIC_VERW_MEM : InstrItinClass;
@@ -547,6 +577,10 @@ def IIC_PUSH_A : InstrItinClass;
def IIC_BSWAP : InstrItinClass;
def IIC_BIT_SCAN_MEM : InstrItinClass;
def IIC_BIT_SCAN_REG : InstrItinClass;
+def IIC_LZCNT_RR : InstrItinClass;
+def IIC_LZCNT_RM : InstrItinClass;
+def IIC_TZCNT_RR : InstrItinClass;
+def IIC_TZCNT_RM : InstrItinClass;
def IIC_MOVS : InstrItinClass;
def IIC_STOS : InstrItinClass;
def IIC_SCAS : InstrItinClass;
@@ -659,10 +693,3 @@ def GenericPostRAModel : GenericX86Model {
let PostRAScheduler = 1;
}
-include "X86ScheduleAtom.td"
-include "X86SchedSandyBridge.td"
-include "X86SchedHaswell.td"
-include "X86ScheduleSLM.td"
-include "X86ScheduleZnver1.td"
-include "X86ScheduleBtVer2.td"
-
diff --git a/lib/Target/X86/X86ScheduleAtom.td b/lib/Target/X86/X86ScheduleAtom.td
index a5b440182aa9..e052ad98104c 100644
--- a/lib/Target/X86/X86ScheduleAtom.td
+++ b/lib/Target/X86/X86ScheduleAtom.td
@@ -212,6 +212,7 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_SSE_PSHUF_RI, [InstrStage<1, [Port0]>] >,
InstrItinData<IIC_SSE_PSHUF_MI, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_SSE_PACK, [InstrStage<1, [Port0]>] >,
InstrItinData<IIC_SSE_UNPCK, [InstrStage<1, [Port0]>] >,
InstrItinData<IIC_SSE_SQRTPS_RR, [InstrStage<70, [Port0, Port1]>] >,
@@ -337,6 +338,7 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_MMX_PEXTR, [InstrStage<4, [Port0, Port1]>] >,
InstrItinData<IIC_MMX_PINSRW, [InstrStage<1, [Port0]>] >,
InstrItinData<IIC_MMX_MASKMOV, [InstrStage<1, [Port0]>] >,
+ InstrItinData<IIC_MMX_MOVMSK, [InstrStage<3, [Port0]>] >,
// conversions
// from/to PD
InstrItinData<IIC_MMX_CVT_PD_RR, [InstrStage<7, [Port0, Port1]>] >,
@@ -362,6 +364,7 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_FST80, [InstrStage<5, [Port0, Port1]>] >,
InstrItinData<IIC_FIST, [InstrStage<6, [Port0, Port1]>] >,
+ InstrItinData<IIC_FCMOV, [InstrStage<9, [Port0, Port1]>] >,
InstrItinData<IIC_FLDZ, [InstrStage<1, [Port0, Port1]>] >,
InstrItinData<IIC_FUCOM, [InstrStage<1, [Port1]>] >,
InstrItinData<IIC_FUCOMI, [InstrStage<9, [Port0, Port1]>] >,
@@ -392,6 +395,8 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_FXSAVE, [InstrStage<140, [Port0, Port1]>] >,
InstrItinData<IIC_FXRSTOR, [InstrStage<141, [Port0, Port1]>] >,
InstrItinData<IIC_FXCH, [InstrStage<1, [Port0], 0>, InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FSIGN, [InstrStage<1, [Port1]>] >,
+ InstrItinData<IIC_FSQRT, [InstrStage<71, [Port0, Port1]>] >,
// System instructions
InstrItinData<IIC_CPUID, [InstrStage<121, [Port0, Port1]>] >,
@@ -404,6 +409,7 @@ def AtomItineraries : ProcessorItineraries<
InstrItinData<IIC_LXS, [InstrStage<10, [Port0, Port1]>] >,
InstrItinData<IIC_LTR, [InstrStage<83, [Port0, Port1]>] >,
InstrItinData<IIC_RDTSC, [InstrStage<30, [Port0, Port1]>] >,
+ InstrItinData<IIC_RDTSCP, [InstrStage<30, [Port0, Port1]>] >,
InstrItinData<IIC_RSM, [InstrStage<741, [Port0, Port1]>] >,
InstrItinData<IIC_SIDT, [InstrStage<4, [Port0, Port1]>] >,
InstrItinData<IIC_SGDT, [InstrStage<4, [Port0, Port1]>] >,
diff --git a/lib/Target/X86/X86ScheduleBtVer2.td b/lib/Target/X86/X86ScheduleBtVer2.td
index 9dcc968a1a7a..6ea81a25e41c 100644
--- a/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/lib/Target/X86/X86ScheduleBtVer2.td
@@ -135,6 +135,30 @@ def : WriteRes<WriteLEA, [JALU01]>;
defm : JWriteResIntPair<WriteShift, JALU01, 1>;
+def WriteSHLDrri : SchedWriteRes<[JALU01]> {
+ let Latency = 3;
+ let ResourceCycles = [6];
+ let NumMicroOps = 6;
+}
+def: InstRW<[WriteSHLDrri], (instregex "SHLD(16|32|64)rri8")>;
+def: InstRW<[WriteSHLDrri], (instregex "SHRD(16|32|64)rri8")>;
+
+def WriteSHLDrrCL : SchedWriteRes<[JALU01]> {
+ let Latency = 4;
+ let ResourceCycles = [8];
+ let NumMicroOps = 7;
+}
+def: InstRW<[WriteSHLDrrCL], (instregex "SHLD(16|32|64)rrCL")>;
+def: InstRW<[WriteSHLDrrCL], (instregex "SHRD(16|32|64)rrCL")>;
+
+def WriteSHLDm : SchedWriteRes<[JLAGU, JALU01]> {
+ let Latency = 9;
+ let ResourceCycles = [1, 22];
+ let NumMicroOps = 8;
+}
+def: InstRW<[WriteSHLDm], (instregex "SHLD(16|32|64)mr(i8|CL)")>;
+def: InstRW<[WriteSHLDm], (instregex "SHRD(16|32|64)mr(i8|CL)")>;
+
////////////////////////////////////////////////////////////////////////////////
// Loads, stores, and moves, not folded with other operations.
// FIXME: Split x86 and SSE load/store/moves
@@ -142,7 +166,10 @@ defm : JWriteResIntPair<WriteShift, JALU01, 1>;
def : WriteRes<WriteLoad, [JLAGU]> { let Latency = 5; }
def : WriteRes<WriteStore, [JSAGU]>;
-def : WriteRes<WriteMove, [JAny]>;
+def : WriteRes<WriteMove, [JALU01]>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
////////////////////////////////////////////////////////////////////////////////
// Idioms that clear a register, like xorps %xmm0, %xmm0.
@@ -168,6 +195,7 @@ defm : JWriteResIntPair<WriteJump, JALU01, 1>;
defm : JWriteResFpuPair<WriteFAdd, JFPU0, 3>;
defm : JWriteResFpuPair<WriteFMul, JFPU1, 2>;
+defm : JWriteResFpuPair<WriteFMA, JFPU1, 2>; // NOTE: Doesn't exist on Jaguar.
defm : JWriteResFpuPair<WriteFRcp, JFPU1, 2>;
defm : JWriteResFpuPair<WriteFRsqrt, JFPU1, 2>;
defm : JWriteResFpuPair<WriteFShuffle, JFPU01, 1>;
@@ -199,11 +227,13 @@ defm : JWriteResFpuPair<WriteCvtF2F, JFPU1, 3>; // Float -> Float size conve
def : WriteRes<WriteFVarBlend, [JFPU01]> {
let Latency = 2;
- let ResourceCycles = [2];
+ let ResourceCycles = [4];
+ let NumMicroOps = 3;
}
def : WriteRes<WriteFVarBlendLd, [JLAGU, JFPU01]> {
let Latency = 7;
- let ResourceCycles = [1, 2];
+ let ResourceCycles = [1, 4];
+ let NumMicroOps = 3;
}
// Vector integer operations.
@@ -217,21 +247,20 @@ defm : JWriteResFpuPair<WriteShuffle256, JFPU01, 1>;
def : WriteRes<WriteVarBlend, [JFPU01]> {
let Latency = 2;
- let ResourceCycles = [2];
+ let ResourceCycles = [4];
+ let NumMicroOps = 3;
}
def : WriteRes<WriteVarBlendLd, [JLAGU, JFPU01]> {
let Latency = 7;
- let ResourceCycles = [1, 2];
+ let ResourceCycles = [1, 4];
+ let NumMicroOps = 3;
}
// FIXME: why do we need to define AVX2 resource on CPU that doesn't have AVX2?
-def : WriteRes<WriteVarVecShift, [JFPU01]> {
- let Latency = 1;
- let ResourceCycles = [1];
-}
+def : WriteRes<WriteVarVecShift, [JFPU01]> {}
def : WriteRes<WriteVarVecShiftLd, [JLAGU, JFPU01]> {
let Latency = 6;
- let ResourceCycles = [1, 1];
+ let ResourceCycles = [1, 2];
}
def : WriteRes<WriteMPSAD, [JFPU0]> {
@@ -249,43 +278,49 @@ def : WriteRes<WriteMPSADLd, [JLAGU, JFPU0]> {
// FIXME: approximate latencies + pipe dependencies
////////////////////////////////////////////////////////////////////////////////
-def : WriteRes<WritePCmpIStrM, [JFPU01]> {
- let Latency = 7;
- let ResourceCycles = [2];
+def : WriteRes<WritePCmpIStrM, [JFPU1,JFPU0]> {
+ let Latency = 8;
+ let ResourceCycles = [2, 2];
+ let NumMicroOps = 3;
}
-def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU01]> {
- let Latency = 12;
- let ResourceCycles = [1, 2];
+def : WriteRes<WritePCmpIStrMLd, [JLAGU, JFPU1, JFPU0]> {
+ let Latency = 13;
+ let ResourceCycles = [1, 2, 2];
+ let NumMicroOps = 3;
}
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [JFPU01]> {
- let Latency = 13;
- let ResourceCycles = [5];
+def : WriteRes<WritePCmpEStrM, [JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
+ let Latency = 14;
+ let ResourceCycles = [5, 5, 5, 5, 5];
+ let NumMicroOps = 9;
}
-def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU01]> {
- let Latency = 18;
- let ResourceCycles = [1, 5];
+def : WriteRes<WritePCmpEStrMLd, [JLAGU, JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
+ let Latency = 19;
+ let ResourceCycles = [1, 5, 5, 5, 5, 5];
+ let NumMicroOps = 9;
}
// Packed Compare Implicit Length Strings, Return Index
-def : WriteRes<WritePCmpIStrI, [JFPU01]> {
- let Latency = 6;
- let ResourceCycles = [2];
+def : WriteRes<WritePCmpIStrI, [JFPU1, JFPU0]> {
+ let Latency = 7;
+ let ResourceCycles = [2, 2];
}
-def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU01]> {
- let Latency = 11;
- let ResourceCycles = [1, 2];
+def : WriteRes<WritePCmpIStrILd, [JLAGU, JFPU1, JFPU0]> {
+ let Latency = 12;
+ let ResourceCycles = [1, 2, 2];
}
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [JFPU01]> {
- let Latency = 13;
- let ResourceCycles = [5];
+def : WriteRes<WritePCmpEStrI, [JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
+ let Latency = 14;
+ let ResourceCycles = [5, 5, 5, 5, 5];
+ let NumMicroOps = 9;
}
-def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU01]> {
- let Latency = 18;
- let ResourceCycles = [1, 5];
+def : WriteRes<WritePCmpEStrILd, [JLAGU, JFPU1, JLAGU, JFPU01,JFPU1, JFPU0]> {
+ let Latency = 19;
+ let ResourceCycles = [1, 5, 5, 5, 5, 5];
+ let NumMicroOps = 9;
}
////////////////////////////////////////////////////////////////////////////////
@@ -371,6 +406,38 @@ def : WriteRes<WriteFence, [JSAGU]>;
def : WriteRes<WriteNop, []>;
////////////////////////////////////////////////////////////////////////////////
+// SSE4.1 instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def WriteDPPS: SchedWriteRes<[JFPU0, JFPU1]> {
+ let Latency = 11;
+ let ResourceCycles = [3,3];
+ let NumMicroOps = 5;
+}
+def : InstRW<[WriteDPPS], (instregex "(V)?DPPSrri")>;
+
+def WriteDPPSLd: SchedWriteRes<[JLAGU, JFPU0, JFPU1]> {
+ let Latency = 16;
+ let ResourceCycles = [1,3,3];
+ let NumMicroOps = 6;
+}
+def : InstRW<[WriteDPPSLd], (instregex "(V)?DPPSrmi")>;
+
+def WriteDPPD: SchedWriteRes<[JFPU0, JFPU1]> {
+ let Latency = 9;
+ let ResourceCycles = [3,3];
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteDPPD], (instregex "(V)?DPPDrri")>;
+
+def WriteDPPDLd: SchedWriteRes<[JLAGU, JFPU0, JFPU1]> {
+ let Latency = 14;
+ let ResourceCycles = [1,3,3];
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteDPPDLd], (instregex "(V)?DPPDrmi")>;
+
+////////////////////////////////////////////////////////////////////////////////
// SSE4A instructions.
////////////////////////////////////////////////////////////////////////////////
@@ -387,9 +454,73 @@ def WriteINSERTQ: SchedWriteRes<[JFPU01]> {
def : InstRW<[WriteINSERTQ], (instregex "INSERTQ")>;
////////////////////////////////////////////////////////////////////////////////
+// F16C instructions.
+////////////////////////////////////////////////////////////////////////////////
+
+def WriteCVT3: SchedWriteRes<[JFPU1]> {
+ let Latency = 3;
+}
+def : InstRW<[WriteCVT3], (instregex "VCVTPS2PHrr")>;
+def : InstRW<[WriteCVT3], (instregex "VCVTPH2PSrr")>;
+
+def WriteCVT3St: SchedWriteRes<[JFPU1, JSAGU]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVT3St], (instregex "VCVTPS2PHmr")>;
+
+def WriteCVT3Ld: SchedWriteRes<[JLAGU, JFPU1]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 1];
+}
+def : InstRW<[WriteCVT3Ld], (instregex "VCVTPH2PSrm")>;
+
+def WriteCVTPS2PHY: SchedWriteRes<[JFPU1, JFPU01]> {
+ let Latency = 6;
+ let ResourceCycles = [2,2];
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteCVTPS2PHY], (instregex "VCVTPS2PHYrr")>;
+
+def WriteCVTPS2PHYSt: SchedWriteRes<[JFPU1, JFPU01, JSAGU]> {
+ let Latency = 11;
+ let ResourceCycles = [2,2,1];
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteCVTPS2PHYSt], (instregex "VCVTPS2PHYmr")>;
+
+def WriteCVTPH2PSY: SchedWriteRes<[JFPU1]> {
+ let Latency = 3;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteCVTPH2PSY], (instregex "VCVTPH2PSYrr")>;
+
+def WriteCVTPH2PSYLd: SchedWriteRes<[JLAGU, JFPU1]> {
+ let Latency = 8;
+ let ResourceCycles = [1,2];
+ let NumMicroOps = 2;
+}
+def : InstRW<[WriteCVTPH2PSYLd], (instregex "VCVTPH2PSYrm")>;
+
+////////////////////////////////////////////////////////////////////////////////
// AVX instructions.
////////////////////////////////////////////////////////////////////////////////
+def WriteVDPPSY: SchedWriteRes<[JFPU1, JFPU0]> {
+ let Latency = 12;
+ let ResourceCycles = [6, 6];
+ let NumMicroOps = 10;
+}
+def : InstRW<[WriteVDPPSY], (instregex "VDPPSYrr")>;
+
+def WriteVDPPSYLd: SchedWriteRes<[JLAGU, JFPU1, JFPU0]> {
+ let Latency = 17;
+ let ResourceCycles = [1, 6, 6];
+ let NumMicroOps = 11;
+}
+def : InstRW<[WriteVDPPSYLd, ReadAfterLd], (instregex "VDPPSYrm")>;
+
def WriteFAddY: SchedWriteRes<[JFPU0]> {
let Latency = 3;
let ResourceCycles = [2];
@@ -438,6 +569,152 @@ def WriteVMULYPSLd: SchedWriteRes<[JLAGU, JFPU1]> {
}
def : InstRW<[WriteVMULYPSLd, ReadAfterLd], (instregex "VMULPSYrm", "VRCPPSYm", "VRSQRTPSYm")>;
+def WriteVCVTY: SchedWriteRes<[JSTC]> {
+ let Latency = 3;
+ let ResourceCycles = [2];
+}
+def : InstRW<[WriteVCVTY], (instregex "VCVTDQ2P(S|D)Yrr")>;
+def : InstRW<[WriteVCVTY], (instregex "VROUNDYP(S|D)r")>;
+def : InstRW<[WriteVCVTY], (instregex "VCVTPS2DQYrr")>;
+def : InstRW<[WriteVCVTY], (instregex "VCVTTPS2DQYrr")>;
+
+def WriteVCVTYLd: SchedWriteRes<[JLAGU, JSTC]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTDQ2P(S|D)Yrm")>;
+def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VROUNDYP(S|D)m")>;
+def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTPS2DQYrm")>;
+def : InstRW<[WriteVCVTYLd, ReadAfterLd], (instregex "VCVTTPS2DQYrm")>;
+
+def WriteVMONTPSt: SchedWriteRes<[JSTC, JLAGU]> {
+ let Latency = 3;
+ let ResourceCycles = [2,1];
+}
+def : InstRW<[WriteVMONTPSt], (instregex "VMOVNTP(S|D)Ymr")>;
+def : InstRW<[WriteVMONTPSt], (instregex "VMOVNTDQYmr")>;
+
+def WriteVCVTPDY: SchedWriteRes<[JSTC, JFPU01]> {
+ let Latency = 6;
+ let ResourceCycles = [2, 4];
+}
+def : InstRW<[WriteVCVTPDY], (instregex "VCVTPD2(DQ|PS)Yrr")>;
+def : InstRW<[WriteVCVTPDY], (instregex "VCVTTPD2DQYrr")>;
+
+def WriteVCVTPDYLd: SchedWriteRes<[JLAGU, JSTC, JFPU01]> {
+ let Latency = 11;
+ let ResourceCycles = [1, 2, 4];
+}
+def : InstRW<[WriteVCVTPDYLd, ReadAfterLd], (instregex "VCVTPD2(DQ|PS)Yrm")>;
+def : InstRW<[WriteVCVTPDYLd, ReadAfterLd], (instregex "VCVTTPD2DQYrm")>;
+
+def WriteVBlendVPY: SchedWriteRes<[JFPU01]> {
+ let Latency = 3;
+ let ResourceCycles = [6];
+}
+def : InstRW<[WriteVBlendVPY], (instregex "VBLENDVP(S|D)Yrr", "VPERMILP(D|S)Yrr")>;
+
+def WriteVBlendVPYLd: SchedWriteRes<[JLAGU, JFPU01]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 6];
+}
+def : InstRW<[WriteVBlendVPYLd, ReadAfterLd], (instregex "VBLENDVP(S|D)Yrm")>;
+
+def WriteVBROADCASTYLd: SchedWriteRes<[JLAGU, JFPU01]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 4];
+}
+def : InstRW<[WriteVBROADCASTYLd, ReadAfterLd], (instregex "VBROADCASTS(S|D)Yrm")>;
+
+def WriteFPAY22: SchedWriteRes<[JFPU0]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[WriteFPAY22], (instregex "VCMPP(S|D)Yrri", "VM(AX|IN)P(D|S)Yrr")>;
+
+def WriteFPAY22Ld: SchedWriteRes<[JLAGU, JFPU0]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteFPAY22Ld, ReadAfterLd], (instregex "VCMPP(S|D)Yrmi", "VM(AX|IN)P(D|S)Yrm")>;
+
+def WriteVHAddSubY: SchedWriteRes<[JFPU0]> {
+ let Latency = 3;
+ let ResourceCycles = [2];
+}
+def : InstRW<[WriteVHAddSubY], (instregex "VH(ADD|SUB)P(D|S)Yrr")>;
+
+def WriteVHAddSubYLd: SchedWriteRes<[JLAGU, JFPU0]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteVHAddSubYLd], (instregex "VH(ADD|SUB)P(D|S)Yrm")>;
+
+def WriteVMaskMovLd: SchedWriteRes<[JLAGU,JFPU01]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[WriteVMaskMovLd], (instregex "VMASKMOVP(D|S)rm")>;
+
+def WriteVMaskMovYLd: SchedWriteRes<[JLAGU,JFPU01]> {
+ let Latency = 6;
+ let ResourceCycles = [1, 4];
+}
+def : InstRW<[WriteVMaskMovYLd], (instregex "VMASKMOVP(D|S)Yrm")>;
+
+def WriteVMaskMovSt: SchedWriteRes<[JFPU01,JSAGU]> {
+ let Latency = 6;
+ let ResourceCycles = [4, 1];
+}
+def : InstRW<[WriteVMaskMovSt], (instregex "VMASKMOVP(D|S)mr")>;
+
+def WriteVMaskMovYSt: SchedWriteRes<[JFPU01,JSAGU]> {
+ let Latency = 6;
+ let ResourceCycles = [4, 1];
+}
+def : InstRW<[WriteVMaskMovYSt], (instregex "VMASKMOVP(D|S)Ymr")>;
+
+// TODO: In fact we have latency '2+i'. The +i represents an additional 1 cycle transfer
+// operation which moves the floating point result to the integer unit. During this
+// additional cycle the floating point unit execution resources are not occupied
+// and ALU0 in the integer unit is occupied instead.
+def WriteVMOVMSK: SchedWriteRes<[JFPU0]> {
+ let Latency = 3;
+}
+def : InstRW<[WriteVMOVMSK], (instregex "VMOVMSKP(D|S)(Y)?rr")>;
+
+// TODO: In fact we have latency '3+i'. The +i represents an additional 1 cycle transfer
+// operation which moves the floating point result to the integer unit. During this
+// additional cycle the floating point unit execution resources are not occupied
+// and ALU0 in the integer unit is occupied instead.
+def WriteVTESTY: SchedWriteRes<[JFPU01, JFPU0]> {
+ let Latency = 4;
+ let ResourceCycles = [2, 2];
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteVTESTY], (instregex "VTESTP(S|D)Yrr")>;
+def : InstRW<[WriteVTESTY], (instregex "VPTESTYrr")>;
+
+def WriteVTESTYLd: SchedWriteRes<[JLAGU, JFPU01, JFPU0]> {
+ let Latency = 9;
+ let ResourceCycles = [1, 2, 2];
+ let NumMicroOps = 3;
+}
+def : InstRW<[WriteVTESTYLd], (instregex "VTESTP(S|D)Yrm")>;
+def : InstRW<[WriteVTESTYLd], (instregex "VPTESTYrm")>;
+
+def WriteVTEST: SchedWriteRes<[JFPU0]> {
+ let Latency = 3;
+}
+def : InstRW<[WriteVTEST], (instregex "VTESTP(S|D)rr")>;
+def : InstRW<[WriteVTEST], (instregex "VPTESTrr")>;
+
+def WriteVTESTLd: SchedWriteRes<[JLAGU, JFPU0]> {
+ let Latency = 8;
+}
+def : InstRW<[WriteVTESTLd], (instregex "VTESTP(S|D)rm")>;
+def : InstRW<[WriteVTESTLd], (instregex "VPTESTrm")>;
+
def WriteVSQRTYPD: SchedWriteRes<[JFPU1]> {
let Latency = 54;
let ResourceCycles = [54];
@@ -462,5 +739,16 @@ def WriteVSQRTYPSLd: SchedWriteRes<[JLAGU, JFPU1]> {
}
def : InstRW<[WriteVSQRTYPSLd], (instregex "VSQRTPSYm")>;
+def WriteJVZEROALL: SchedWriteRes<[]> {
+ let Latency = 90;
+ let NumMicroOps = 73;
+}
+def : InstRW<[WriteJVZEROALL], (instregex "VZEROALL")>;
+
+def WriteJVZEROUPPER: SchedWriteRes<[]> {
+ let Latency = 46;
+ let NumMicroOps = 37;
+}
+def : InstRW<[WriteJVZEROUPPER], (instregex "VZEROUPPER")>;
} // SchedModel
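The Jaguar (BtVer2) additions above model a memory form as a separate SchedWriteRes that adds the load pipe (JLAGU) and the 5-cycle load latency, and they list ReadAfterLd next to the write so the register sources of a folded load pick up the ReadAdvance defined for it. A sketch of that register/folded-load split, using a hypothetical opcode name:

// Register form: two cycles on the FP pipe, 3-cycle latency.
def WriteExampleOp : SchedWriteRes<[JFPU0]> {
  let Latency = 3;
  let ResourceCycles = [2];
}
def : InstRW<[WriteExampleOp], (instregex "EXAMPLErr")>;

// Folded-load form: the same FP work plus one JLAGU cycle and the load latency.
def WriteExampleOpLd : SchedWriteRes<[JLAGU, JFPU0]> {
  let Latency = 8;
  let ResourceCycles = [1, 2];
}
def : InstRW<[WriteExampleOpLd, ReadAfterLd], (instregex "EXAMPLErm")>;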
diff --git a/lib/Target/X86/X86ScheduleSLM.td b/lib/Target/X86/X86ScheduleSLM.td
index 03ed2db2350d..35ec7488db72 100644
--- a/lib/Target/X86/X86ScheduleSLM.td
+++ b/lib/Target/X86/X86ScheduleSLM.td
@@ -32,7 +32,6 @@ def SLMModel : SchedMachineModel {
let SchedModel = SLMModel in {
// Silvermont has 5 reservation stations for micro-ops
-
def IEC_RSV0 : ProcResource<1>;
def IEC_RSV1 : ProcResource<1>;
def FPC_RSV0 : ProcResource<1> { let BufferSize = 1; }
@@ -78,6 +77,9 @@ def : WriteRes<WriteLoad, [MEC_RSV]> { let Latency = 3; }
def : WriteRes<WriteMove, [IEC_RSV01]>;
def : WriteRes<WriteZero, []>;
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
defm : SMWriteResPair<WriteALU, IEC_RSV01, 1>;
defm : SMWriteResPair<WriteIMul, IEC_RSV1, 3>;
defm : SMWriteResPair<WriteShift, IEC_RSV0, 1>;
@@ -249,7 +251,7 @@ def : WriteRes<WriteMicrocoded, [FPC_RSV0]> { let Latency = 100; }
def : WriteRes<WriteFence, [MEC_RSV]>;
def : WriteRes<WriteNop, []>;
-// AVX is not supported on that architecture, but we should define the basic
+// AVX/FMA is not supported on that architecture, but we should define the basic
// scheduling resources anyway.
def : WriteRes<WriteIMulH, [FPC_RSV0]>;
defm : SMWriteResPair<WriteVarBlend, FPC_RSV0, 1>;
@@ -257,4 +259,5 @@ defm : SMWriteResPair<WriteFVarBlend, FPC_RSV0, 1>;
defm : SMWriteResPair<WriteFShuffle256, FPC_RSV0, 1>;
defm : SMWriteResPair<WriteShuffle256, FPC_RSV0, 1>;
defm : SMWriteResPair<WriteVarVecShift, FPC_RSV0, 1>;
+defm : SMWriteResPair<WriteFMA, FPC_RSV0, 1>;
} // SchedModel
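The SLM change also leans on the WriteResPair convention shared by the in-tree models: a single defm such as SMWriteResPair<WriteFMA, FPC_RSV0, 1> is expected to define both the register variant and the folded-load variant of a SchedWrite. A sketch of that shape, assuming it mirrors the ZnWriteResPair multiclass spelled out in the Znver1 file below (the real SMWriteResPair is defined earlier in X86ScheduleSLM.td and may differ in detail):

multiclass ExampleWriteResPair<X86FoldableSchedWrite SchedRW,
                               ProcResourceKind ExePort, int Lat> {
  // Register variant executes on the given reservation station.
  def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
  // Folded-load variant adds the memory cluster and the 3-cycle load latency.
  def : WriteRes<SchedRW.Folded, [MEC_RSV, ExePort]> {
    let Latency = !add(Lat, 3);
  }
}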
diff --git a/lib/Target/X86/X86ScheduleZnver1.td b/lib/Target/X86/X86ScheduleZnver1.td
index d5b4cfe2ddee..a4e5327213c2 100644
--- a/lib/Target/X86/X86ScheduleZnver1.td
+++ b/lib/Target/X86/X86ScheduleZnver1.td
@@ -92,7 +92,7 @@ def ZnDivider : ProcResource<1>;
def : ReadAdvance<ReadAfterLd, 4>;
// (a folded load is an instruction that loads and does some operation)
-// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops
+// Ex: ADDPD xmm,[mem]-> This instruction has two micro-ops
// Instructions with folded loads are usually micro-fused, so they only appear
// as two micro-ops.
// a. load and
@@ -104,9 +104,10 @@ multiclass ZnWriteResPair<X86FoldableSchedWrite SchedRW,
// Register variant takes 1-cycle on Execution Port.
def : WriteRes<SchedRW, [ExePort]> { let Latency = Lat; }
- // Memory variant also uses a cycle on ZnAGU
+ // Memory variant also uses a cycle on ZnAGU
// adds 4 cycles to the latency.
def : WriteRes<SchedRW.Folded, [ZnAGU, ExePort]> {
+ let NumMicroOps = 2;
let Latency = !add(Lat, 4);
}
}
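For concreteness, a line such as defm : ZnWriteResPair<WriteALU, ZnALU, 1>; from the list that follows expands, per the multiclass above, into two records roughly equivalent to:

def : WriteRes<WriteALU, [ZnALU]> { let Latency = 1; }
def : WriteRes<WriteALULd, [ZnAGU, ZnALU]> {
  let NumMicroOps = 2;
  let Latency = 5;  // !add(1, 4): base latency plus 4 cycles for the load
}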
@@ -125,7 +126,7 @@ multiclass ZnWriteResFpuPair<X86FoldableSchedWrite SchedRW,
}
}
-// WriteRMW is set for instructions with Memory write
+// WriteRMW is set for instructions with Memory write
// operation in codegen
def : WriteRes<WriteRMW, [ZnAGU]>;
@@ -139,6 +140,9 @@ defm : ZnWriteResPair<WriteALU, ZnALU, 1>;
defm : ZnWriteResPair<WriteShift, ZnALU, 1>;
defm : ZnWriteResPair<WriteJump, ZnALU, 1>;
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
// IDIV
def : WriteRes<WriteIDiv, [ZnALU2, ZnDivider]> {
let Latency = 41;
@@ -174,6 +178,7 @@ defm : ZnWriteResFpuPair<WriteCvtF2I, ZnFPU3, 5>;
defm : ZnWriteResFpuPair<WriteFDiv, ZnFPU3, 15>;
defm : ZnWriteResFpuPair<WriteFShuffle, ZnFPU12, 1>;
defm : ZnWriteResFpuPair<WriteFMul, ZnFPU0, 5>;
+defm : ZnWriteResFpuPair<WriteFMA, ZnFPU03, 5>;
defm : ZnWriteResFpuPair<WriteFRcp, ZnFPU01, 5>;
defm : ZnWriteResFpuPair<WriteFRsqrt, ZnFPU01, 5>;
defm : ZnWriteResFpuPair<WriteFSqrt, ZnFPU3, 20>;
@@ -220,4 +225,1550 @@ let Latency = 100 in {
def : WriteRes<WritePCmpIStrI, []>;
def : WriteRes<WritePCmpIStrILd, []>;
}
+
+//=== Regex-based itineraries ===//
+// Notation:
+// - r: register.
+// - m: memory.
+// - i: immediate.
+// - mm: 64-bit mmx register.
+// - x: 128-bit xmm register.
+// - (x)mm: mmx or xmm register.
+// - y: 256-bit ymm register.
+// - v: any vector register.
+
+//=== Integer Instructions ===//
+//-- Move instructions --//
+// MOV.
+// r16,m.
+def : InstRW<[WriteALULd, ReadAfterLd], (instregex "MOV16rm")>;
+
+// MOVSX, MOVZX.
+// r,m.
+def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
+
+// CMOVcc.
+// r,r.
+def : InstRW<[WriteALU],
+ (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[WriteALULd, ReadAfterLd],
+ (instregex "CMOV(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)(16|32|64)rm")>;
+
+// XCHG.
+// r,r.
+def ZnWriteXCHG : SchedWriteRes<[ZnALU]> {
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+
+def : InstRW<[ZnWriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
+
+// r,m.
+def ZnWriteXCHGrm : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>;
+
+def : InstRW<[WriteMicrocoded], (instregex "XLAT")>;
+
+// POP16.
+// r.
+def ZnWritePop16r : SchedWriteRes<[ZnAGU]>{
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWritePop16r], (instregex "POP16rmm")>;
+def : InstRW<[WriteMicrocoded], (instregex "POPF(16|32)")>;
+def : InstRW<[WriteMicrocoded], (instregex "POPA(16|32)")>;
+
+
+// PUSH.
+// r. Has default values.
+// m.
+def ZnWritePUSH : SchedWriteRes<[ZnAGU]>{
+ let Latency = 4;
+}
+def : InstRW<[ZnWritePUSH], (instregex "PUSH(16|32)rmm")>;
+
+// PUSHF.
+def : InstRW<[WriteMicrocoded], (instregex "PUSHF(16|32)")>;
+
+// PUSHA.
+def ZnWritePushA : SchedWriteRes<[ZnAGU]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWritePushA], (instregex "PUSHA(16|32)")>;
+
+// LAHF.
+def : InstRW<[WriteMicrocoded], (instregex "LAHF")>;
+
+// SAHF.
+def ZnWriteSAHF : SchedWriteRes<[ZnALU]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteSAHF], (instregex "SAHF")>;
+
+// BSWAP.
+def ZnWriteBSwap : SchedWriteRes<[ZnALU]> {
+ let ResourceCycles = [4];
+}
+def : InstRW<[ZnWriteBSwap], (instregex "BSWAP")>;
+
+// MOVBE.
+// r,m.
+def ZnWriteMOVBE : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let Latency = 5;
+}
+def : InstRW<[ZnWriteMOVBE, ReadAfterLd], (instregex "MOVBE(16|32|64)rm")>;
+
+// m,r.
+def : InstRW<[ZnWriteMOVBE], (instregex "MOVBE(16|32|64)mr")>;
+
+//-- Arithmetic instructions --//
+
+// ADD SUB.
+// m,r/i.
+def : InstRW<[WriteALULd], (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
+ "(ADD|SUB)(8|16|32|64)mi8",
+ "(ADD|SUB)64mi32")>;
+
+// ADC SBB.
+// r,r/i.
+def : InstRW<[WriteALU], (instregex "(ADC|SBB)(8|16|32|64)r(r|i)",
+ "(ADC|SBB)(16|32|64)ri8",
+ "(ADC|SBB)64ri32",
+ "(ADC|SBB)(8|16|32|64)rr_REV")>;
+
+// r,m.
+def : InstRW<[WriteALULd, ReadAfterLd],
+ (instregex "(ADC|SBB)(8|16|32|64)rm")>;
+
+// m,r/i.
+def : InstRW<[WriteALULd],
+ (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
+ "(ADC|SBB)(16|32|64)mi8",
+ "(ADC|SBB)64mi32")>;
+
+// INC DEC NOT NEG.
+// m.
+def : InstRW<[WriteALULd],
+ (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m",
+ "(INC|DEC)64(16|32)m")>;
+
+// MUL IMUL.
+// r16.
+def ZnWriteMul16 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
+ let Latency = 3;
+}
+def : InstRW<[ZnWriteMul16], (instregex "IMUL16r", "MUL16r")>;
+
+// m16.
+def ZnWriteMul16Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWriteMul16Ld, ReadAfterLd], (instregex "IMUL16m", "MUL16m")>;
+
+// r32.
+def ZnWriteMul32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
+ let Latency = 3;
+}
+def : InstRW<[ZnWriteMul32], (instregex "IMUL32r", "MUL32r")>;
+
+// m32.
+def ZnWriteMul32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWriteMul32Ld, ReadAfterLd], (instregex "IMUL32m", "MUL32m")>;
+
+// r64.
+def ZnWriteMul64 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteMul64], (instregex "IMUL64r", "MUL64r")>;
+
+// m64.
+def ZnWriteMul64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteMul64Ld, ReadAfterLd], (instregex "IMUL64m", "MUL64m")>;
+
+// r16,r16.
+def ZnWriteMul16rri : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
+ let Latency = 3;
+}
+def : InstRW<[ZnWriteMul16rri], (instregex "IMUL16rri", "IMUL16rri8")>;
+
+// r16,m16.
+def ZnWriteMul16rmi : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWriteMul16rmi, ReadAfterLd], (instregex "IMUL16rmi", "IMUL16rmi8")>;
+
+// MULX.
+// r32,r32,r32.
+def ZnWriteMulX32 : SchedWriteRes<[ZnALU1, ZnMultiplier]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteMulX32], (instregex "MULX32rr")>;
+
+// r32,r32,m32.
+def ZnWriteMulX32Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 8;
+ let ResourceCycles = [1, 2, 2];
+}
+def : InstRW<[ZnWriteMulX32Ld, ReadAfterLd], (instregex "MULX32rm")>;
+
+// r64,r64,r64.
+def ZnWriteMulX64 : SchedWriteRes<[ZnALU1]> {
+ let Latency = 3;
+}
+def : InstRW<[ZnWriteMulX64], (instregex "MULX64rr")>;
+
+// r64,r64,m64.
+def ZnWriteMulX64Ld : SchedWriteRes<[ZnAGU, ZnALU1, ZnMultiplier]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWriteMulX64Ld, ReadAfterLd], (instregex "MULX64rm")>;
+
+// DIV, IDIV.
+// r8.
+def ZnWriteDiv8 : SchedWriteRes<[ZnALU2, ZnDivider]> {
+ let Latency = 15;
+}
+def : InstRW<[ZnWriteDiv8], (instregex "DIV8r", "IDIV8r")>;
+
+// r16.
+def ZnWriteDiv16 : SchedWriteRes<[ZnALU2, ZnDivider]> {
+ let Latency = 17;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteDiv16], (instregex "DIV16r", "IDIV16r")>;
+
+// r32.
+def ZnWriteDiv32 : SchedWriteRes<[ZnALU2, ZnDivider]> {
+ let Latency = 25;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteDiv32], (instregex "DIV32r", "IDIV32r")>;
+
+// r64.
+def ZnWriteDiv64 : SchedWriteRes<[ZnALU2, ZnDivider]> {
+ let Latency = 41;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteDiv64], (instregex "DIV64r", "IDIV64r")>;
+
+//-- Control transfer instructions --//
+
+// J(E|R)CXZ.
+def ZnWriteJCXZ : SchedWriteRes<[ZnALU03]>;
+def : InstRW<[ZnWriteJCXZ], (instregex "JCXZ", "JECXZ_(32|64)", "JRCXZ")>;
+
+// INTO
+def : InstRW<[WriteMicrocoded], (instregex "INTO")>;
+
+// LOOP.
+def ZnWriteLOOP : SchedWriteRes<[ZnALU03]>;
+def : InstRW<[ZnWriteLOOP], (instregex "LOOP")>;
+
+// LOOP(N)E, LOOP(N)Z
+def ZnWriteLOOPE : SchedWriteRes<[ZnALU03]>;
+def : InstRW<[ZnWriteLOOPE], (instregex "LOOPE", "LOOPNE",
+ "LOOPZ", "LOOPNZ")>;
+
+// CALL.
+// r.
+def ZnWriteCALLr : SchedWriteRes<[ZnAGU, ZnALU03]>;
+def : InstRW<[ZnWriteCALLr], (instregex "CALL(16|32)r")>;
+
+def : InstRW<[WriteMicrocoded], (instregex "CALL(16|32)m")>;
+
+// RET.
+def ZnWriteRET : SchedWriteRes<[ZnALU03]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)",
+ "IRET(D|Q)", "RETF")>;
+
+//-- Logic instructions --//
+
+// AND OR XOR.
+// m,r/i.
+def : InstRW<[WriteALULd],
+ (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
+ "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
+
+// ANDN.
+// r,r.
+def : InstRW<[WriteALU], (instregex "ANDN(32|64)rr")>;
+// r,m.
+def : InstRW<[WriteALULd, ReadAfterLd], (instregex "ANDN(32|64)rm")>;
+
+// Define ALU latency variants
+def ZnWriteALULat2 : SchedWriteRes<[ZnALU]> {
+ let Latency = 2;
+}
+def ZnWriteALULat2Ld : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let Latency = 6;
+}
+
+def ZnWriteALULat3 : SchedWriteRes<[ZnALU]> {
+ let Latency = 3;
+}
+def ZnWriteALULat3Ld : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let Latency = 7;
+}
+
+// BSF BSR.
+// r,r.
+def : InstRW<[ZnWriteALULat3], (instregex "BS(R|F)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[ZnWriteALULat3Ld, ReadAfterLd], (instregex "BS(R|F)(16|32|64)rm")>;
+
+// BT.
+// r,r/i.
+def : InstRW<[WriteShift], (instregex "BT(16|32|64)r(r|i8)")>;
+
+def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mr")>;
+def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
+
+// BTR BTS BTC.
+// r,r,i.
+def ZnWriteBTRSC : SchedWriteRes<[ZnALU]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteBTRSC], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
+
+
+// m,r,i.
+def ZnWriteBTRSCm : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+// m,r,i.
+def : InstRW<[ZnWriteBTRSCm], (instregex "BT(R|S|C)(16|32|64)m(r|i8)")>;
+
+// BLSI BLSMSK BLSR.
+// r,r.
+def : InstRW<[ZnWriteALULat2], (instregex "BLS(I|MSK|R)(32|64)rr")>;
+// r,m.
+def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "BLS(I|MSK|R)(32|64)rm")>;
+
+// BEXTR.
+// r,r,r.
+def : InstRW<[WriteALU], (instregex "BEXTR(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BEXTR(32|64)rm")>;
+
+// BZHI.
+// r,r,r.
+def : InstRW<[WriteALU], (instregex "BZHI(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteALULd, ReadAfterLd], (instregex "BZHI(32|64)rm")>;
+
+// CLD STD.
+def : InstRW<[WriteALU], (instregex "STD", "CLD")>;
+
+// PDEP PEXT.
+// r,r,r.
+def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
+// r,m,r.
+def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
+
+// ROR ROL.
+def : InstRW<[WriteShift], (instregex "RO(R|L)(8|16|32|64)r1")>;
+
+// RCR RCL.
+// r,1.
+def : InstRW<[WriteShift], (instregex "RC(R|L)(8|16|32|64)r1")>;
+
+// m,1.
+def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m1")>;
+
+// i.
+def : InstRW<[WriteShift], (instregex "RC(R|L)(8|16|32|64)r(i|CL)")>;
+
+// m,i.
+def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(i|CL)")>;
+
+// SHR SHL SAR.
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
+
+// SHRD SHLD.
+// r,r
+def : InstRW<[WriteShift], (instregex "SH(R|L)D(16|32|64)rri8")>;
+
+// m,r
+def : InstRW<[WriteShiftLd], (instregex "SH(R|L)D(16|32|64)mri8")>;
+
+// r,r,cl.
+def : InstRW<[WriteMicrocoded], (instregex "SHLD(16|32|64)rrCL")>;
+
+// r,r,cl.
+def : InstRW<[WriteMicrocoded], (instregex "SHRD(16|32|64)rrCL")>;
+
+// m,r,cl.
+def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)mrCL")>;
+
+// SETcc.
+// r.
+def : InstRW<[WriteShift],
+ (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)r")>;
+// m.
+def : InstRW<[WriteShift],
+ (instregex "SET(O|NO|B|AE|E|NE|BE|A|S|NS|P|NP|L|GE|LE|G)m")>;
+
+// LZCNT TZCNT.
+// r,r.
+def : InstRW<[ZnWriteALULat2], (instregex "(LZCNT|TZCNT)(16|32|64)rr")>;
+// r,m.
+def : InstRW<[ZnWriteALULat2Ld, ReadAfterLd], (instregex "(LZCNT|TZCNT)(16|32|64)rm")>;
+
+//-- Misc instructions --//
+// CMPXCHG.
+def ZnWriteCMPXCHG : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let Latency = 8;
+ let NumMicroOps = 5;
+}
+def : InstRW<[ZnWriteCMPXCHG], (instregex "CMPXCHG(8|16|32|64)rm")>;
+
+// CMPXCHG8B.
+def ZnWriteCMPXCHG8B : SchedWriteRes<[ZnAGU, ZnALU]> {
+ let NumMicroOps = 18;
+}
+def : InstRW<[ZnWriteCMPXCHG8B], (instregex "CMPXCHG8B")>;
+
+def : InstRW<[WriteMicrocoded], (instregex "CMPXCHG16B")>;
+
+// LEAVE
+def ZnWriteLEAVE : SchedWriteRes<[ZnALU, ZnAGU]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteLEAVE], (instregex "LEAVE")>;
+
+// PAUSE.
+def : InstRW<[WriteMicrocoded], (instregex "PAUSE")>;
+
+// RDTSC.
+def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>;
+
+// RDPMC.
+def : InstRW<[WriteMicrocoded], (instregex "RDPMC")>;
+
+// RDRAND.
+def : InstRW<[WriteMicrocoded], (instregex "RDRAND(16|32|64)r")>;
+
+// XGETBV.
+def : InstRW<[WriteMicrocoded], (instregex "XGETBV")>;
+
+//-- String instructions --//
+// CMPS.
+def : InstRW<[WriteMicrocoded], (instregex "CMPS(B|L|Q|W)")>;
+
+// LODSB/W.
+def : InstRW<[WriteMicrocoded], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[WriteMicrocoded], (instregex "LODS(L|Q)")>;
+
+// MOVS.
+def : InstRW<[WriteMicrocoded], (instregex "MOVS(B|L|Q|W)")>;
+
+// SCAS.
+def : InstRW<[WriteMicrocoded], (instregex "SCAS(B|W|L|Q)")>;
+
+// STOS
+def : InstRW<[WriteMicrocoded], (instregex "STOS(B|L|Q|W)")>;
+
+// XADD.
+def : InstRW<[WriteMicrocoded], (instregex "XADD(8|16|32|64)rm")>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+def ZnWriteFLDr : SchedWriteRes<[ZnFPU13]> ;
+
+def ZnWriteSTr: SchedWriteRes<[ZnFPU23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// LD_F.
+// r.
+def : InstRW<[ZnWriteFLDr], (instregex "LD_Frr")>;
+
+// m.
+def ZnWriteLD_F80m : SchedWriteRes<[ZnAGU, ZnFPU13]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteLD_F80m], (instregex "LD_F80m")>;
+
+// FBLD.
+def : InstRW<[WriteMicrocoded], (instregex "FBLDm")>;
+
+// FST(P).
+// r.
+def : InstRW<[ZnWriteSTr], (instregex "ST_(F|FP)rr")>;
+
+// m80.
+def ZnWriteST_FP80m : SchedWriteRes<[ZnAGU, ZnFPU23]> {
+ let Latency = 5;
}
+def : InstRW<[ZnWriteST_FP80m], (instregex "ST_FP80m")>;
+
+// FBSTP.
+// m80.
+def : InstRW<[WriteMicrocoded], (instregex "FBSTPm")>;
+
+def ZnWriteFXCH : SchedWriteRes<[ZnFPU]>;
+
+// FXCHG.
+def : InstRW<[ZnWriteFXCH], (instregex "XCH_F")>;
+
+// FILD.
+def ZnWriteFILD : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteFILD], (instregex "ILD_F(16|32|64)m")>;
+
+// FIST(P) FISTTP.
+def ZnWriteFIST : SchedWriteRes<[ZnAGU, ZnFPU23]> {
+ let Latency = 12;
+}
+def : InstRW<[ZnWriteFIST], (instregex "IS(T|TT)_(F|FP)(16|32|64)m")>;
+
+def ZnWriteFPU13 : SchedWriteRes<[ZnAGU, ZnFPU13]> {
+ let Latency = 8;
+}
+
+def ZnWriteFPU3 : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 11;
+}
+
+// FLDZ.
+def : InstRW<[ZnWriteFPU13], (instregex "LD_F0")>;
+
+// FLD1.
+def : InstRW<[ZnWriteFPU3], (instregex "LD_F1")>;
+
+// FLDPI FLDL2E etc.
+def : InstRW<[ZnWriteFPU3], (instregex "FLDPI", "FLDL2(T|E)", "FLDL(G|N)2")>;
+
+def : InstRW<[WriteMicrocoded], (instregex "CMOV(B|BE|E|P|NB|NBE|NE|NP)_F")>;
+
+// FNSTSW.
+// AX.
+def : InstRW<[WriteMicrocoded], (instregex "FNSTSW16r")>;
+
+// m16.
+def : InstRW<[WriteMicrocoded], (instregex "FNSTSWm")>;
+
+// FLDCW.
+def : InstRW<[WriteMicrocoded], (instregex "FLDCW16m")>;
+
+// FNSTCW.
+def : InstRW<[WriteMicrocoded], (instregex "FNSTCW16m")>;
+
+// FINCSTP FDECSTP.
+def : InstRW<[ZnWriteFPU3], (instregex "FINCSTP", "FDECSTP")>;
+
+// FFREE.
+def : InstRW<[ZnWriteFPU3], (instregex "FFREE")>;
+
+// FNSAVE.
+def : InstRW<[WriteMicrocoded], (instregex "FSAVEm")>;
+
+// FRSTOR.
+def : InstRW<[WriteMicrocoded], (instregex "FRSTORm")>;
+
+//-- Arithmetic instructions --//
+
+def ZnWriteFPU3Lat2 : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 2;
+}
+
+def ZnWriteFPU3Lat2Ld : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 9;
+}
+
+def ZnWriteFPU3Lat1 : SchedWriteRes<[ZnFPU3]> ;
+
+def ZnWriteFPU0Lat1 : SchedWriteRes<[ZnFPU0]> ;
+
+def ZnWriteFPU0Lat1Ld : SchedWriteRes<[ZnAGU, ZnFPU0]> {
+ let Latency = 8;
+}
+
+// FABS.
+def : InstRW<[ZnWriteFPU3Lat2], (instregex "ABS_F")>;
+
+// FCHS.
+def : InstRW<[ZnWriteFPU3Lat1], (instregex "CHS_F")>;
+
+// FCOM(P) FUCOM(P).
+// r.
+def : InstRW<[ZnWriteFPU0Lat1], (instregex "COM_FST0r", "COMP_FST0r", "UCOM_Fr",
+ "UCOM_FPr")>;
+// m.
+def : InstRW<[ZnWriteFPU0Lat1Ld], (instregex "FCOM(32|64)m", "FCOMP(32|64)m")>;
+
+// FCOMPP FUCOMPP.
+// r.
+def : InstRW<[ZnWriteFPU0Lat1], (instregex "FCOMPP", "UCOM_FPPr")>;
+
+def ZnWriteFPU02 : SchedWriteRes<[ZnAGU, ZnFPU02]> {
+ let Latency = 9;
+}
+
+// FCOMI(P) FUCOMI(P).
+// m.
+def : InstRW<[ZnWriteFPU02], (instregex "COM_FIr", "COM_FIPr", "UCOM_FIr",
+ "UCOM_FIPr")>;
+
+def ZnWriteFPU03 : SchedWriteRes<[ZnAGU, ZnFPU03]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,3];
+}
+
+// FICOM(P).
+def : InstRW<[ZnWriteFPU03], (instregex "FICOM(16|32)m", "FICOMP(16|32)m")>;
+
+// FTST.
+def : InstRW<[ZnWriteFPU0Lat1], (instregex "TST_F")>;
+
+// FXAM.
+def : InstRW<[ZnWriteFPU3Lat1], (instregex "FXAM")>;
+
+// FPREM.
+def : InstRW<[WriteMicrocoded], (instregex "FPREM")>;
+
+// FPREM1.
+def : InstRW<[WriteMicrocoded], (instregex "FPREM1")>;
+
+// FRNDINT.
+def : InstRW<[WriteMicrocoded], (instregex "FRNDINT")>;
+
+// FSCALE.
+def : InstRW<[WriteMicrocoded], (instregex "FSCALE")>;
+
+// FXTRACT.
+def : InstRW<[WriteMicrocoded], (instregex "FXTRACT")>;
+
+// FNOP.
+def : InstRW<[ZnWriteFPU0Lat1], (instregex "FNOP")>;
+
+// WAIT.
+def : InstRW<[ZnWriteFPU0Lat1], (instregex "WAIT")>;
+
+// FNCLEX.
+def : InstRW<[WriteMicrocoded], (instregex "FNCLEX")>;
+
+// FNINIT.
+def : InstRW<[WriteMicrocoded], (instregex "FNINIT")>;
+
+//=== Integer MMX and XMM Instructions ===//
+//-- Move instructions --//
+
+// Moves from GPR to FPR incurs a penalty
+def ZnWriteFPU2 : SchedWriteRes<[ZnFPU2]> {
+ let Latency = 3;
+}
+
+// Move to ALU doesn't incur penalty
+def ZnWriteToALU2 : SchedWriteRes<[ZnFPU2]> {
+ let Latency = 2;
+}
+
+def ZnWriteFPU : SchedWriteRes<[ZnFPU]>;
+def ZnWriteFPUY : SchedWriteRes<[ZnFPU]> {
+ let NumMicroOps = 2;
+ let Latency=2;
+}
+
+// MOVD.
+// r32/64 <- (x)mm.
+def : InstRW<[ZnWriteToALU2], (instregex "MMX_MOVD64grr", "MMX_MOVD64from64rr",
+ "VMOVPDI2DIrr", "MOVPDI2DIrr")>;
+
+// (x)mm <- r32/64.
+def : InstRW<[ZnWriteFPU2], (instregex "MMX_MOVD64rr", "MMX_MOVD64to64rr",
+ "VMOVDI2PDIrr", "MOVDI2PDIrr")>;
+
+// MOVQ.
+// r64 <- (x)mm.
+def : InstRW<[ZnWriteToALU2], (instregex "VMOVPQIto64rr")>;
+
+// (x)mm <- r64.
+def : InstRW<[ZnWriteFPU2], (instregex "VMOV64toPQIrr", "VMOVZQI2PQIrr")>;
+
+// (x)mm <- (x)mm.
+def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVQ64rr")>;
+
+// (V)MOVDQA/U.
+// x <- x.
+def : InstRW<[ZnWriteFPU], (instregex "MOVDQ(A|U)rr", "VMOVDQ(A|U)rr",
+ "MOVDQ(A|U)rr_REV", "VMOVDQ(A|U)rr_REV")>;
+
+// y <- y.
+def : InstRW<[ZnWriteFPUY], (instregex "VMOVDQ(A|U)Yrr", "VMOVDQ(A|U)Yrr_REV")>;
+
+// MOVDQ2Q.
+def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVDQ2Qrr")>;
+
+// MOVQ2DQ.
+def : InstRW<[ZnWriteFPU], (instregex "MMX_MOVQ2DQrr")>;
+
+// PACKSSWB/DW.
+// mm <- mm.
+def ZnWriteFPU12 : SchedWriteRes<[ZnFPU12]> ;
+def ZnWriteFPU12Y : SchedWriteRes<[ZnFPU12]> {
+ let NumMicroOps = 2;
+}
+def ZnWriteFPU12m : SchedWriteRes<[ZnAGU, ZnFPU12]> ;
+
+def : InstRW<[ZnWriteFPU12], (instregex "MMX_PACKSSDWirr",
+ "MMX_PACKSSWBirr", "MMX_PACKUSWBirr")>;
+def : InstRW<[ZnWriteFPU12m], (instregex "MMX_PACKSSDWirm",
+ "MMX_PACKSSWBirm", "MMX_PACKUSWBirm")>;
+
+// VPMOVSX/ZX BW BD BQ DW DQ.
+// y <- x.
+def : InstRW<[ZnWriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BQ|DW|DQ)Yrr")>;
+
+def ZnWriteFPU013 : SchedWriteRes<[ZnFPU013]> ;
+def ZnWriteFPU013Y : SchedWriteRes<[ZnFPU013]> {
+ let Latency = 2;
+}
+def ZnWriteFPU013m : SchedWriteRes<[ZnAGU, ZnFPU013]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def ZnWriteFPU013Ld : SchedWriteRes<[ZnAGU, ZnFPU013]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def ZnWriteFPU013LdY : SchedWriteRes<[ZnAGU, ZnFPU013]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+
+// PBLENDW.
+// x,x,i / v,v,v,i
+def : InstRW<[ZnWriteFPU013], (instregex "(V?)PBLENDWrri")>;
+// ymm
+def : InstRW<[ZnWriteFPU013Y], (instregex "(V?)PBLENDWYrri")>;
+
+// x,m,i / v,v,m,i
+def : InstRW<[ZnWriteFPU013Ld], (instregex "(V?)PBLENDWrmi")>;
+// y,m,i
+def : InstRW<[ZnWriteFPU013LdY], (instregex "(V?)PBLENDWYrmi")>;
+
+def ZnWriteFPU01 : SchedWriteRes<[ZnFPU01]> ;
+def ZnWriteFPU01Y : SchedWriteRes<[ZnFPU01]> {
+ let NumMicroOps = 2;
+}
+
+// VPBLENDD.
+// v,v,v,i.
+def : InstRW<[ZnWriteFPU01], (instregex "VPBLENDDrri")>;
+// ymm
+def : InstRW<[ZnWriteFPU01Y], (instregex "VPBLENDDYrri")>;
+
+// v,v,m,i
+def ZnWriteFPU01Op2 : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let NumMicroOps = 2;
+ let Latency = 8;
+ let ResourceCycles = [1, 2];
+}
+def ZnWriteFPU01Op2Y : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let NumMicroOps = 2;
+ let Latency = 9;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[ZnWriteFPU01Op2], (instregex "VPBLENDDrmi")>;
+def : InstRW<[ZnWriteFPU01Op2Y], (instregex "VPBLENDDYrmi")>;
+
+// MASKMOVQ.
+def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>;
+
+// MASKMOVDQU.
+def : InstRW<[WriteMicrocoded], (instregex "(V?)MASKMOVDQU(64)?")>;
+
+// VPMASKMOVQ.
+// ymm
+def : InstRW<[ZnWriteFPU01Op2],(instregex "VPMASKMOVQrm")>;
+def : InstRW<[ZnWriteFPU01Op2Y],(instregex "VPMASKMOVQYrm")>;
+
+def : InstRW<[WriteMicrocoded],
+ (instregex "VPMASKMOVD(Y?)rm")>;
+// m, v,v.
+def : InstRW<[WriteMicrocoded], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
+
+// PMOVMSKB.
+def ZnWritePMOVMSKB : SchedWriteRes<[ZnFPU2]> {
+ let NumMicroOps = 2;
+}
+def ZnWritePMOVMSKBY : SchedWriteRes<[ZnFPU2]> {
+ let Latency = 2;
+}
+def : InstRW<[ZnWritePMOVMSKB], (instregex "(V|MMX_)?PMOVMSKBrr")>;
+def : InstRW<[ZnWritePMOVMSKBY], (instregex "(V|MMX_)?PMOVMSKBYrr")>;
+
+// PEXTR B/W/D/Q.
+// r32,x,i.
+def ZnWritePEXTRr : SchedWriteRes<[ZnFPU12, ZnFPU2]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWritePEXTRr], (instregex "PEXTR(B|W|D|Q)rr", "MMX_PEXTRWirri")>;
+
+def ZnWritePEXTRm : SchedWriteRes<[ZnAGU, ZnFPU12, ZnFPU2]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2, 3];
+}
+// m8,x,i.
+def : InstRW<[ZnWritePEXTRm], (instregex "PEXTR(B|W|D|Q)mr")>;
+
+// VPBROADCAST B/W.
+// x, m8/16.
+def ZnWriteVPBROADCAST128Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteVPBROADCAST128Ld],
+ (instregex "VPBROADCAST(B|W)rm")>;
+
+// y, m8/16
+def ZnWriteVPBROADCAST256Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteVPBROADCAST256Ld],
+ (instregex "VPBROADCAST(B|W)Yrm")>;
+
+// VPGATHER.
+def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+// PHADD|PHSUB (S) W/D.
+def : InstRW<[WriteMicrocoded], (instregex "MMX_PHADD(W?)r(r|m)64",
+ "MMX_PHADDSWr(r|m)64",
+ "MMX_PHSUB(W|D)r(r|m)64",
+ "MMX_PHSUBSWrr64",
+ "(V?)PH(ADD|SUB)(W|D)(Y?)r(r|m)",
+ "(V?)PH(ADD|SUB)SWr(r|m)(256)?")>;
+
+
+// PCMPGTQ.
+def ZnWritePCMPGTQr : SchedWriteRes<[ZnFPU03]>;
+def : InstRW<[ZnWritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
+
+// x <- x,m.
+def ZnWritePCMPGTQm : SchedWriteRes<[ZnAGU, ZnFPU03]> {
+ let Latency = 8;
+}
+// ymm.
+def ZnWritePCMPGTQYm : SchedWriteRes<[ZnAGU, ZnFPU03]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,2];
+}
+def : InstRW<[ZnWritePCMPGTQm], (instregex "(V?)PCMPGTQrm")>;
+def : InstRW<[ZnWritePCMPGTQYm], (instregex "(V?)PCMPGTQYrm")>;
+
+// PMULLD.
+// x,x.
+def ZnWritePMULLDr : SchedWriteRes<[ZnFPU0]> {
+ let Latency = 4;
+}
+// ymm.
+def ZnWritePMULLDYr : SchedWriteRes<[ZnFPU0]> {
+ let Latency = 5;
+ let ResourceCycles = [2];
+}
+def : InstRW<[ZnWritePMULLDr], (instregex "(V?)PMULLDrr")>;
+def : InstRW<[ZnWritePMULLDYr], (instregex "(V?)PMULLDYrr")>;
+
+// x,m.
+def ZnWritePMULLDm : SchedWriteRes<[ZnAGU, ZnFPU0]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+// y,m.
+def ZnWritePMULLDYm : SchedWriteRes<[ZnAGU, ZnFPU0]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWritePMULLDm], (instregex "(V?)PMULLDrm")>;
+def : InstRW<[ZnWritePMULLDYm], (instregex "(V?)PMULLDYrm")>;
+
+//-- Logic instructions --//
+
+// PTEST.
+// v,v.
+def ZnWritePTESTr : SchedWriteRes<[ZnFPU12]> {
+ let ResourceCycles = [2];
+}
+def : InstRW<[ZnWritePTESTr], (instregex "(V?)PTEST(Y?)rr")>;
+
+// v,m.
+def ZnWritePTESTm : SchedWriteRes<[ZnAGU, ZnFPU12]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWritePTESTm], (instregex "(V?)PTEST(Y?)rm")>;
+
+// PSLL,PSRL,PSRA W/D/Q.
+// x,x / v,v,x.
+def ZnWritePShift : SchedWriteRes<[ZnFPU2]> ;
+def ZnWritePShiftY : SchedWriteRes<[ZnFPU2]> {
+ let Latency = 2;
+}
+def ZnWritePShiftLd : SchedWriteRes<[ZnAGU,ZnFPU2]> {
+ let Latency = 8;
+}
+def ZnWritePShiftYLd : SchedWriteRes<[ZnAGU, ZnFPU2]> {
+ let Latency = 9;
+}
+def : InstRW<[ZnWritePShift], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)rr")>;
+def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)Yrr")>;
+
+def : InstRW<[ZnWritePShiftLd], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)rm")>;
+def : InstRW<[ZnWritePShiftYLd], (instregex "(V?)PS(LL|RL|RA)(W|D|Q)Yrm")>;
+
+// PSLL,PSRL DQ.
+def : InstRW<[ZnWritePShift], (instregex "(V?)PS(R|L)LDQri")>;
+def : InstRW<[ZnWritePShiftY], (instregex "(V?)PS(R|L)LDQYri")>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+//-- Move instructions --//
+
+// MOVMSKP S/D.
+// r32 <- x,y.
+def ZnWriteMOVMSKPr : SchedWriteRes<[ZnFPU2]> ;
+def : InstRW<[ZnWriteMOVMSKPr], (instregex "(V?)MOVMSKP(S|D)(Y?)rr")>;
+
+// VPERM2F128.
+def : InstRW<[WriteMicrocoded], (instregex "VPERM2F128rr")>;
+def : InstRW<[WriteMicrocoded], (instregex "VPERM2F128rm")>;
+
+// BLENDVP S/D.
+def ZnWriteFPU01Lat3 : SchedWriteRes<[ZnFPU013]> {
+ let Latency = 3;
+}
+def ZnWriteFPU01Lat3Ld : SchedWriteRes<[ZnAGU, ZnFPU013]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteFPU01Lat3], (instregex "BLENDVP(S|D)rr0")>;
+def : InstRW<[ZnWriteFPU01Lat3Ld, ReadAfterLd], (instregex "BLENDVP(S|D)rm0")>;
+
+def ZnWriteBROADCAST : SchedWriteRes<[ZnAGU, ZnFPU13]> {
+ let NumMicroOps = 2;
+ let Latency = 8;
+}
+// VBROADCASTF128.
+def : InstRW<[ZnWriteBROADCAST], (instregex "VBROADCASTF128")>;
+
+// EXTRACTPS.
+// r32,x,i.
+def ZnWriteEXTRACTPSr : SchedWriteRes<[ZnFPU12, ZnFPU2]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+
+def ZnWriteEXTRACTPSm : SchedWriteRes<[ZnAGU,ZnFPU12, ZnFPU2]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [5, 1, 2];
+}
+// m32,x,i.
+def : InstRW<[ZnWriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+
+// VEXTRACTF128.
+// x,y,i.
+def : InstRW<[ZnWriteFPU013], (instregex "VEXTRACTF128rr")>;
+
+// m128,y,i.
+def : InstRW<[ZnWriteFPU013m], (instregex "VEXTRACTF128mr")>;
+
+def ZnWriteVINSERT128r: SchedWriteRes<[ZnFPU013]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def ZnWriteVINSERT128Ld: SchedWriteRes<[ZnAGU,ZnFPU013]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+// VINSERTF128.
+// y,y,x,i.
+def : InstRW<[ZnWriteVINSERT128r], (instregex "VINSERTF128rr")>;
+def : InstRW<[ZnWriteVINSERT128Ld], (instregex "VINSERTF128rm")>;
+
+// VMASKMOVP S/D.
+// x,x,m.
+def ZnWriteVMASKMOVPLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let Latency = 8;
+}
+// y,y,m.
+def ZnWriteVMASKMOVPLdY : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def ZnWriteVMASKMOVPm : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let Latency = 4;
+}
+def : InstRW<[ZnWriteVMASKMOVPLd], (instregex "VMASKMOVP(S|D)rm")>;
+def : InstRW<[ZnWriteVMASKMOVPLdY], (instregex "VMASKMOVP(S|D)Yrm")>;
+def : InstRW<[ZnWriteVMASKMOVPm], (instregex "VMASKMOVP(S|D)mr")>;
+
+// m256,y,y.
+def ZnWriteVMASKMOVPYmr : SchedWriteRes<[ZnAGU,ZnFPU01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteVMASKMOVPYmr], (instregex "VMASKMOVP(S|D)Ymr")>;
+
+// VGATHERDPS.
+// x.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPSrm")>;
+// y.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPSYrm")>;
+
+// VGATHERQPS.
+// x.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPSrm")>;
+
+// y.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPSYrm")>;
+
+// VGATHERDPD.
+// x.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPDrm")>;
+
+// y.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHERDPDYrm")>;
+
+// VGATHERQPD.
+// x.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPDrm")>;
+
+// y.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHERQPDYrm")>;
+
+//-- Conversion instructions --//
+def ZnWriteCVTPD2PSr: SchedWriteRes<[ZnFPU3]> {
+ let Latency = 4;
+}
+// CVTPD2PS.
+// x,x.
+def : InstRW<[ZnWriteCVTPD2PSr], (instregex "(V?)CVTPD2PSrr")>;
+
+def ZnWriteCVTPD2PSLd: SchedWriteRes<[ZnAGU,ZnFPU03]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,2];
+}
+// x,m128.
+def : InstRW<[ZnWriteCVTPD2PSLd], (instregex "(V?)CVTPD2PS(X?)rm")>;
+
+// x,y.
+def ZnWriteCVTPD2PSYr : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 5;
+}
+def : InstRW<[ZnWriteCVTPD2PSYr], (instregex "(V?)CVTPD2PSYrr")>;
+
+// x,m256.
+def ZnWriteCVTPD2PSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 11;
+}
+def : InstRW<[ZnWriteCVTPD2PSYLd], (instregex "(V?)CVTPD2PSYrm")>;
+
+// CVTSD2SS.
+// x,x.
+// Same as WriteCVTPD2PSr
+def : InstRW<[ZnWriteCVTPD2PSr], (instregex "(Int_)?(V)?CVTSD2SSrr")>;
+
+// x,m64.
+def : InstRW<[ZnWriteCVTPD2PSLd], (instregex "(Int_)?(V)?CVTSD2SSrm")>;
+
+// CVTPS2PD.
+// x,x.
+def ZnWriteCVTPS2PDr : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 3;
+}
+def : InstRW<[ZnWriteCVTPS2PDr], (instregex "(V?)CVTPS2PDrr")>;
+
+// x,m64.
+// y,m128.
+def ZnWriteCVTPS2PDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteCVTPS2PDLd], (instregex "(V?)CVTPS2PD(Y?)rm")>;
+
+// y,x.
+def ZnWriteVCVTPS2PDY : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 3;
+}
+def : InstRW<[ZnWriteVCVTPS2PDY], (instregex "VCVTPS2PDYrr")>;
+
+// CVTSS2SD.
+// x,x.
+def ZnWriteCVTSS2SDr : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 4;
+}
+def : InstRW<[ZnWriteCVTSS2SDr], (instregex "(Int_)?(V?)CVTSS2SDrr")>;
+
+// x,m32.
+def ZnWriteCVTSS2SDLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[ZnWriteCVTSS2SDLd], (instregex "(Int_)?(V?)CVTSS2SDrm")>;
+
+def ZnWriteCVTDQ2PDr: SchedWriteRes<[ZnFPU12,ZnFPU3]> {
+ let Latency = 5;
+}
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>;
+
+// Same as xmm
+// y,x.
+def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "VCVTDQ2PDYrr")>;
+
+def ZnWriteCVTPD2DQr: SchedWriteRes<[ZnFPU12, ZnFPU3]> {
+ let Latency = 5;
+}
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[ZnWriteCVTDQ2PDr], (instregex "(V?)CVT(T?)PD2DQrr")>;
+
+def ZnWriteCVTPD2DQLd: SchedWriteRes<[ZnAGU,ZnFPU12,ZnFPU3]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+// x,m128.
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// same as xmm handling
+// x,y.
+def : InstRW<[ZnWriteCVTPD2DQr], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQYrm")>;
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQ(64)?rm")>;
+
+def ZnWriteCVTPS2PIr: SchedWriteRes<[ZnFPU3]> {
+ let Latency = 4;
+}
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[ZnWriteCVTPS2PDr], (instregex "MMX_CVT(T?)PI2PDirr")>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[ZnWriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+def ZnWriteCVSTSI2SSr: SchedWriteRes<[ZnFPU3]> {
+ let Latency = 5;
+}
+// CVSTSI2SS.
+// x,r32.
+def : InstRW<[ZnWriteCVSTSI2SSr], (instregex "(Int_)?(V?)CVT(T?)SI2SS(64)?rr")>;
+
+// same as CVTPD2DQr
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[ZnWriteCVTPD2DQr], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rr")>;
+// same as CVTPD2DQm
+// r32,m32.
+def : InstRW<[ZnWriteCVTPD2DQLd], (instregex "(Int_)?(V?)CVT(T?)SS2SI(64)?rm")>;
+
+def ZnWriteCVSTSI2SDr: SchedWriteRes<[ZnFPU013, ZnFPU3]> {
+ let Latency = 5;
+}
+// CVTSI2SD.
+// x,r32/64.
+def : InstRW<[ZnWriteCVSTSI2SDr], (instregex "(Int_)?(V?)CVTSI2SS(64)?rr")>;
+
+
+def ZnWriteCVSTSI2SIr: SchedWriteRes<[ZnFPU3, ZnFPU2]> {
+ let Latency = 5;
+}
+def ZnWriteCVSTSI2SILd: SchedWriteRes<[ZnAGU, ZnFPU3, ZnFPU2]> {
+ let Latency = 12;
+}
+// CVTSD2SI.
+// r32/64
+def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(Int_)?CVT(T?)SD2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(Int_)?CVT(T?)SD2SI(64)?rm")>;
+
+
+def ZnWriteVCVSTSI2SIr: SchedWriteRes<[ZnFPU3]> {
+ let Latency = 5;
+}
+def ZnWriteVCVSTSI2SILd: SchedWriteRes<[ZnFPU3, ZnAGU]> {
+ let Latency = 12;
+}
+// VCVTSD2SI.
+// r32/64
+def : InstRW<[ZnWriteCVSTSI2SIr], (instregex "(Int_)?VCVT(T?)SD2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[ZnWriteCVSTSI2SILd], (instregex "(Int_)?VCVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : InstRW<[WriteMicrocoded], (instregex "VCVTPS2PH(Y?)rr")>;
+// m,v,i.
+def : InstRW<[WriteMicrocoded], (instregex "VCVTPS2PH(Y?)mr")>;
+
+// VCVTPH2PS.
+// v,x.
+def : InstRW<[WriteMicrocoded], (instregex "VCVTPH2PS(Y?)rr")>;
+// v,m.
+def : InstRW<[WriteMicrocoded], (instregex "VCVTPH2PS(Y?)rm")>;
+
+//-- SSE4A instructions --//
+// EXTRQ
+def ZnWriteEXTRQ: SchedWriteRes<[ZnFPU12, ZnFPU2]> {
+ let Latency = 2;
+}
+def : InstRW<[ZnWriteEXTRQ], (instregex "EXTRQ")>;
+
+// INSERTQ
+def ZnWriteINSERTQ: SchedWriteRes<[ZnFPU03,ZnFPU1]> {
+ let Latency = 4;
+}
+def : InstRW<[ZnWriteINSERTQ], (instregex "INSERTQ")>;
+
+// MOVNTSS/MOVNTSD
+def ZnWriteMOVNT: SchedWriteRes<[ZnAGU,ZnFPU2]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWriteMOVNT], (instregex "MOVNTS(S|D)")>;
+
+//-- SHA instructions --//
+// SHA256MSG2
+def : InstRW<[WriteMicrocoded], (instregex "SHA256MSG2(Y?)r(r|m)")>;
+
+// SHA1MSG1, SHA256MSG1
+// x,x.
+def ZnWriteSHA1MSG1r : SchedWriteRes<[ZnFPU12]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[ZnWriteSHA1MSG1r], (instregex "SHA(1|256)MSG1rr")>;
+// x,m.
+def ZnWriteSHA1MSG1Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> {
+ let Latency = 9;
+ let ResourceCycles = [1,2];
+}
+def : InstRW<[ZnWriteSHA1MSG1Ld], (instregex "SHA(1|256)MSG1rm")>;
+
+// SHA1MSG2
+// x,x.
+def ZnWriteSHA1MSG2r : SchedWriteRes<[ZnFPU12]> ;
+def : InstRW<[ZnWriteSHA1MSG2r], (instregex "SHA1MSG2rr")>;
+// x,m.
+def ZnWriteSHA1MSG2Ld : SchedWriteRes<[ZnAGU, ZnFPU12]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWriteSHA1MSG2Ld], (instregex "SHA1MSG2rm")>;
+
+// SHA1NEXTE
+// x,x.
+def ZnWriteSHA1NEXTEr : SchedWriteRes<[ZnFPU1]> ;
+def : InstRW<[ZnWriteSHA1NEXTEr], (instregex "SHA1NEXTErr")>;
+// x,m.
+def ZnWriteSHA1NEXTELd : SchedWriteRes<[ZnAGU, ZnFPU1]> {
+ let Latency = 8;
+}
+def : InstRW<[ZnWriteSHA1NEXTELd], (instregex "SHA1NEXTErm")>;
+
+// SHA1RNDS4
+// x,x.
+def ZnWriteSHA1RNDS4r : SchedWriteRes<[ZnFPU1]> {
+ let Latency = 6;
+}
+def : InstRW<[ZnWriteSHA1RNDS4r], (instregex "SHA1RNDS4rr")>;
+// x,m.
+def ZnWriteSHA1RNDS4Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> {
+ let Latency = 13;
+}
+def : InstRW<[ZnWriteSHA1RNDS4Ld], (instregex "SHA1RNDS4rm")>;
+
+// SHA256RNDS2
+// x,x.
+def ZnWriteSHA256RNDS2r : SchedWriteRes<[ZnFPU1]> {
+ let Latency = 4;
+}
+def : InstRW<[ZnWriteSHA256RNDS2r], (instregex "SHA256RNDS2rr")>;
+// x,m.
+def ZnWriteSHA256RNDS2Ld : SchedWriteRes<[ZnAGU, ZnFPU1]> {
+ let Latency = 11;
+}
+def : InstRW<[ZnWriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+def : InstRW<[WriteMicrocoded], (instregex "(V?)H(ADD|SUB)P(S|D)(Y?)r(r|m)")>;
+
+// MULL SS/SD PS/PD.
+// x,x / v,v,v.
+def ZnWriteMULr : SchedWriteRes<[ZnFPU01]> {
+ let Latency = 3;
+}
+// ymm.
+def ZnWriteMULYr : SchedWriteRes<[ZnFPU01]> {
+ let Latency = 4;
+}
+def : InstRW<[ZnWriteMULr], (instregex "(V?)MUL(P|S)(S|D)rr")>;
+def : InstRW<[ZnWriteMULYr], (instregex "(V?)MUL(P|S)(S|D)Yrr")>;
+
+// x,m / v,v,m.
+def ZnWriteMULLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteMULLd], (instregex "(V?)MUL(P|S)(S|D)rm")>;
+
+// ymm
+def ZnWriteMULYLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteMULYLd], (instregex "(V?)MUL(P|S)(S|D)Yrm")>;
+
+// VDIVPS.
+// y,y,y.
+def ZnWriteVDIVPSYr : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 12;
+ let ResourceCycles = [12];
+}
+def : InstRW<[ZnWriteVDIVPSYr], (instregex "VDIVPSYrr")>;
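+// ResourceCycles = [12] keeps ZnFPU3 busy for the full latency, i.e. the
+// 256-bit divide is modeled as unpipelined with a reciprocal throughput of
+// roughly 12 cycles; the memory form below adds the AGU and load latency.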
+
+// y,y,m256.
+def ZnWriteVDIVPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 19;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 19];
+}
+def : InstRW<[ZnWriteVDIVPSYLd], (instregex "VDIVPSYrm")>;
+
+// VDIVPD.
+// y,y,y.
+def ZnWriteVDIVPDY : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 15;
+ let ResourceCycles = [15];
+}
+def : InstRW<[ZnWriteVDIVPDY], (instregex "VDIVPDYrr")>;
+
+// y,y,m256.
+def ZnWriteVDIVPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 22;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,22];
+}
+def : InstRW<[ZnWriteVDIVPDYLd], (instregex "VDIVPDYrm")>;
+
+// VRCPPS.
+// y,y.
+def ZnWriteVRCPPSr : SchedWriteRes<[ZnFPU01]> {
+ let Latency = 5;
+}
+def : InstRW<[ZnWriteVRCPPSr], (instregex "VRCPPSYr(_Int)?")>;
+
+// y,m256.
+def ZnWriteVRCPPSLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let Latency = 12;
+ let NumMicroOps = 3;
+}
+def : InstRW<[ZnWriteVRCPPSLd], (instregex "VRCPPSYm(_Int)?")>;
+
+// ROUND SS/SD PS/PD.
+// v,v,i.
+def ZnWriteROUNDr : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 4;
+}
+def : InstRW<[ZnWriteROUNDr], (instregex "(V?)ROUND(Y?)(S|P)(S|D)r(_Int)?")>;
+
+// VFMADD.
+// v,v,v.
+def ZnWriteFMADDr : SchedWriteRes<[ZnFPU03]> {
+ let Latency = 5;
+}
+def : InstRW<[ZnWriteFMADDr],
+ (instregex
+ "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)P(S|D)(213|132|231)(Y)?r",
+ "VF(N?)M(ADD|SUB)(132|231|213)S(S|D)r",
+ "VF(N?)M(ADD|SUB)S(S|D)4rr(_REV|_Int)?",
+ "VF(N?)M(ADD|SUB)P(S|D)4rr(Y)?(_REV)?")>;
+
+// v,v,m.
+def ZnWriteFMADDm : SchedWriteRes<[ZnAGU, ZnFPU03]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteFMADDm],
+ (instregex
+ "VF(N?)M(ADD|SUB|ADDSUB|SUBADD)(213|132|231)P(S|D)(Y)?m",
+ "VF(N?)M(ADD|SUB)(132|231|213)S(S|D)m",
+ "VF(N?)M(ADD|SUB)S(S|D)4(rm|mr)(_Int)?",
+ "VF(N?)M(ADD|SUB)P(S|D)4(rm|mr)(Y)?")>;
+
+// v,m,i.
+def ZnWriteROUNDm : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteROUNDm], (instregex "(V?)ROUND(Y?)(S|P)(S|D)m(_Int)?")>;
+
+// DPPS.
+// x,x,i / v,v,v,i.
+def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPS(Y?)rri")>;
+
+// x,m,i / v,v,m,i.
+def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPS(Y?)rmi")>;
+
+// DPPD.
+// x,x,i.
+def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPDrri")>;
+
+// x,m,i.
+def : InstRW<[WriteMicrocoded], (instregex "(V?)DPPDrmi")>;
+
+// VSQRTPS.
+// y,y.
+def ZnWriteVSQRTPSYr : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 28;
+ let ResourceCycles = [28];
+}
+def : InstRW<[ZnWriteVSQRTPSYr], (instregex "VSQRTPSYr")>;
+
+// y,m256.
+def ZnWriteVSQRTPSYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 35;
+ let ResourceCycles = [1,35];
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteVSQRTPSYLd], (instregex "VSQRTPSYm")>;
+
+// VSQRTPD.
+// y,y.
+def ZnWriteVSQRTPDYr : SchedWriteRes<[ZnFPU3]> {
+ let Latency = 40;
+ let ResourceCycles = [40];
+}
+def : InstRW<[ZnWriteVSQRTPDYr], (instregex "VSQRTPDYr")>;
+
+// y,m256.
+def ZnWriteVSQRTPDYLd : SchedWriteRes<[ZnAGU, ZnFPU3]> {
+ let Latency = 47;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,47];
+}
+def : InstRW<[ZnWriteVSQRTPDYLd], (instregex "VSQRTPDYm")>;
+
+// RSQRTSS
+// x,x.
+def ZnWriteRSQRTSSr : SchedWriteRes<[ZnFPU02]> {
+ let Latency = 5;
+}
+def : InstRW<[ZnWriteRSQRTSSr], (instregex "(V?)RSQRTSS(Y?)r(_Int)?")>;
+
+// RSQRTPS
+// x,x.
+def ZnWriteRSQRTPSr : SchedWriteRes<[ZnFPU01]> {
+ let Latency = 5;
+}
+def : InstRW<[ZnWriteRSQRTPSr], (instregex "(V?)RSQRTPS(Y?)r(_Int)?")>;
+
+// RSQRTSSm
+// x,m128.
+def ZnWriteRSQRTSSLd: SchedWriteRes<[ZnAGU, ZnFPU02]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,2];
+}
+def : InstRW<[ZnWriteRSQRTSSLd], (instregex "(V?)RSQRTSSm(_Int)?")>;
+
+// RSQRTPSm
+def ZnWriteRSQRTPSLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteRSQRTPSLd], (instregex "(V?)RSQRTPSm(_Int)?")>;
+
+// RSQRTPS 256.
+// y,y.
+def ZnWriteRSQRTPSYr : SchedWriteRes<[ZnFPU01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : InstRW<[ZnWriteRSQRTPSYr], (instregex "VRSQRTPSYr(_Int)?")>;
+
+// y,m256.
+def ZnWriteRSQRTPSYLd : SchedWriteRes<[ZnAGU, ZnFPU01]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+def : InstRW<[ZnWriteRSQRTPSYLd], (instregex "VRSQRTPSYm(_Int)?")>;
+
+//-- Logic instructions --//
+
+// AND, ANDN, OR, XOR PS/PD.
+// x,x / v,v,v.
+def : InstRW<[WriteVecLogic], (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rr")>;
+// x,m / v,v,m.
+def : InstRW<[WriteVecLogicLd],
+ (instregex "(V?)(AND|ANDN|OR|XOR)P(S|D)(Y?)rm")>;
+
+//-- Other instructions --//
+
+// VZEROUPPER.
+def : InstRW<[WriteMicrocoded], (instregex "VZEROUPPER")>;
+
+// VZEROALL.
+def : InstRW<[WriteMicrocoded], (instregex "VZEROALL")>;
+
+// LDMXCSR.
+def : InstRW<[WriteMicrocoded], (instregex "(V)?LDMXCSR")>;
+
+// STMXCSR.
+def : InstRW<[WriteMicrocoded], (instregex "(V)?STMXCSR")>;
+
+} // SchedModel
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index c67aa04aebea..1e04997ad294 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -17,8 +17,8 @@
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
-#include "llvm/Target/TargetLowering.h"
using namespace llvm;
@@ -247,7 +247,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
Repeats.AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
if (Repeats.BytesLeft() > 0 &&
- DAG.getMachineFunction().getFunction()->optForMinSize()) {
+ DAG.getMachineFunction().getFunction().optForMinSize()) {
// When agressively optimizing for size, avoid generating the code to
// handle BytesLeft.
Repeats.AVT = MVT::i8;
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 2cebb76022ef..c7ddf93f8e85 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -12,10 +12,8 @@
//
//===----------------------------------------------------------------------===//
-#include "X86ShuffleDecodeConstantPool.h"
#include "Utils/X86ShuffleDecode.h"
#include "llvm/ADT/APInt.h"
-#include "llvm/CodeGen/MachineValueType.h"
#include "llvm/IR/Constants.h"
//===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 24845beac22d..8b08766b6171 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -13,21 +13,15 @@
#include "X86.h"
-#ifdef LLVM_BUILD_GLOBAL_ISEL
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
#include "X86RegisterBankInfo.h"
-#endif
#include "X86Subtarget.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Triple.h"
-#ifdef LLVM_BUILD_GLOBAL_ISEL
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
-#include "llvm/CodeGen/GlobalISel/Legalizer.h"
-#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
-#endif
#include "llvm/IR/Attributes.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Function.h"
@@ -39,8 +33,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
-#include <cassert>
-#include <string>
#if defined(_MSC_VER)
#include <intrin.h>
@@ -151,7 +143,12 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
if (TM.shouldAssumeDSOLocal(M, GV))
return X86II::MO_NO_FLAG;
- assert(!isTargetCOFF());
+ if (isTargetCOFF()) {
+ assert(GV->hasDLLImportStorageClass() &&
+ "shouldAssumeDSOLocal gave inconsistent answer");
+ return X86II::MO_DLLIMPORT;
+ }
+
const Function *F = dyn_cast_or_null<Function>(GV);
if (isTargetELF()) {
@@ -160,6 +157,8 @@ X86Subtarget::classifyGlobalFunctionReference(const GlobalValue *GV,
// In Regcall calling convention those registers are used for passing
// parameters. Thus we need to prevent lazy binding in Regcall.
return X86II::MO_GOTPCREL;
+ if (F && F->hasFnAttribute(Attribute::NonLazyBind) && is64Bit())
+ return X86II::MO_GOTPCREL;
return X86II::MO_PLT;
}
@@ -189,9 +188,12 @@ const char *X86Subtarget::getBZeroEntry() const {
}
bool X86Subtarget::hasSinCos() const {
- return getTargetTriple().isMacOSX() &&
- !getTargetTriple().isMacOSXVersionLT(10, 9) &&
- is64Bit();
+ if (getTargetTriple().isMacOSX()) {
+ return !getTargetTriple().isMacOSXVersionLT(10, 9) && is64Bit();
+ } else if (getTargetTriple().isOSFuchsia()) {
+ return true;
+ }
+ return false;
}
/// Return true if the subtarget allows calls to immediate address.
@@ -263,6 +265,17 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
else if (isTargetDarwin() || isTargetLinux() || isTargetSolaris() ||
isTargetKFreeBSD() || In64BitMode)
stackAlignment = 16;
+
+ // Some CPUs have more overhead for gather. The specified overhead is relative
+ // to the Load operation. "2" is the number provided by Intel architects. This
+ // parameter is used for cost estimation of Gather Op and comparison with
+ // other alternatives.
+ // TODO: Remove the explicit hasAVX512()?, That would mean we would only
+ // enable gather with a -march.
+ if (hasAVX512() || (hasAVX2() && hasFastGather()))
+ GatherOverhead = 2;
+ if (hasAVX512())
+ ScatterOverhead = 2;
}
void X86Subtarget::initializeEnvironment() {
@@ -274,12 +287,15 @@ void X86Subtarget::initializeEnvironment() {
HasPOPCNT = false;
HasSSE4A = false;
HasAES = false;
+ HasVAES = false;
HasFXSR = false;
HasXSAVE = false;
HasXSAVEOPT = false;
HasXSAVEC = false;
HasXSAVES = false;
HasPCLMUL = false;
+ HasVPCLMULQDQ = false;
+ HasGFNI = false;
HasFMA = false;
HasFMA4 = false;
HasXOP = false;
@@ -293,6 +309,7 @@ void X86Subtarget::initializeEnvironment() {
HasBMI = false;
HasBMI2 = false;
HasVBMI = false;
+ HasVBMI2 = false;
HasIFMA = false;
HasRTM = false;
HasERI = false;
@@ -304,6 +321,8 @@ void X86Subtarget::initializeEnvironment() {
HasVLX = false;
HasADX = false;
HasPKU = false;
+ HasVNNI = false;
+ HasBITALG = false;
HasSHA = false;
HasPRFCHW = false;
HasRDSEED = false;
@@ -311,10 +330,11 @@ void X86Subtarget::initializeEnvironment() {
HasMWAITX = false;
HasCLZERO = false;
HasMPX = false;
+ HasSHSTK = false;
+ HasIBT = false;
HasSGX = false;
HasCLFLUSHOPT = false;
HasCLWB = false;
- IsBTMemSlow = false;
IsPMULLDSlow = false;
IsSHLDSlow = false;
IsUAMem16Slow = false;
@@ -323,15 +343,17 @@ void X86Subtarget::initializeEnvironment() {
HasCmpxchg16b = false;
UseLeaForSP = false;
HasFastPartialYMMorZMMWrite = false;
+ HasFastGather = false;
HasFastScalarFSQRT = false;
HasFastVectorFSQRT = false;
HasFastLZCNT = false;
HasFastSHLDRotate = false;
+ HasMacroFusion = false;
HasERMSB = false;
HasSlowDivide32 = false;
HasSlowDivide64 = false;
PadShortFunctions = false;
- CallRegIndirect = false;
+ SlowTwoMemOps = false;
LEAUsesAG = false;
SlowLEA = false;
Slow3OpsLEA = false;
@@ -340,6 +362,9 @@ void X86Subtarget::initializeEnvironment() {
// FIXME: this is a known good value for Yonah. How about others?
MaxInlineSizeThreshold = 128;
UseSoftFloat = false;
+ X86ProcFamily = Others;
+ GatherOverhead = 1024;
+ ScatterOverhead = 1024;
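+ // 1024 is a deliberately large default so gather/scatter are costed as very
+ // expensive unless initSubtargetFeatures() lowers the overhead to 2 for CPUs
+ // with reasonably fast hardware gather/scatter.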
}
X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
@@ -349,35 +374,6 @@ X86Subtarget &X86Subtarget::initializeSubtargetDependencies(StringRef CPU,
return *this;
}
-#ifdef LLVM_BUILD_GLOBAL_ISEL
-namespace {
-
-struct X86GISelActualAccessor : public GISelAccessor {
- std::unique_ptr<CallLowering> CallLoweringInfo;
- std::unique_ptr<LegalizerInfo> Legalizer;
- std::unique_ptr<RegisterBankInfo> RegBankInfo;
- std::unique_ptr<InstructionSelector> InstSelector;
-
- const CallLowering *getCallLowering() const override {
- return CallLoweringInfo.get();
- }
-
- const InstructionSelector *getInstructionSelector() const override {
- return InstSelector.get();
- }
-
- const LegalizerInfo *getLegalizerInfo() const override {
- return Legalizer.get();
- }
-
- const RegisterBankInfo *getRegBankInfo() const override {
- return RegBankInfo.get();
- }
-};
-
-} // end anonymous namespace
-#endif
-
X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM,
unsigned StackAlignOverride)
@@ -402,39 +398,29 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
setPICStyle(PICStyles::StubPIC);
else if (isTargetELF())
setPICStyle(PICStyles::GOT);
-#ifndef LLVM_BUILD_GLOBAL_ISEL
- GISelAccessor *GISel = new GISelAccessor();
-#else
- X86GISelActualAccessor *GISel = new X86GISelActualAccessor();
- GISel->CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering()));
- GISel->Legalizer.reset(new X86LegalizerInfo(*this, TM));
+ CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering()));
+ Legalizer.reset(new X86LegalizerInfo(*this, TM));
auto *RBI = new X86RegisterBankInfo(*getRegisterInfo());
- GISel->RegBankInfo.reset(RBI);
- GISel->InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI));
-#endif
- setGISelAccessor(*GISel);
+ RegBankInfo.reset(RBI);
+ InstSelector.reset(createX86InstructionSelector(TM, *this, *RBI));
}
const CallLowering *X86Subtarget::getCallLowering() const {
- assert(GISel && "Access to GlobalISel APIs not set");
- return GISel->getCallLowering();
+ return CallLoweringInfo.get();
}
const InstructionSelector *X86Subtarget::getInstructionSelector() const {
- assert(GISel && "Access to GlobalISel APIs not set");
- return GISel->getInstructionSelector();
+ return InstSelector.get();
}
const LegalizerInfo *X86Subtarget::getLegalizerInfo() const {
- assert(GISel && "Access to GlobalISel APIs not set");
- return GISel->getLegalizerInfo();
+ return Legalizer.get();
}
const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
- assert(GISel && "Access to GlobalISel APIs not set");
- return GISel->getRegBankInfo();
+ return RegBankInfo.get();
}
bool X86Subtarget::enableEarlyIfConversion() const {
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index 427a0001bef9..be4d46c470de 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -20,11 +20,14 @@
#include "X86SelectionDAGInfo.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
#include <memory>
#define GET_SUBTARGETINFO_HEADER
@@ -48,6 +51,21 @@ enum Style {
} // end namespace PICStyles
class X86Subtarget final : public X86GenSubtargetInfo {
+public:
+ enum X86ProcFamilyEnum {
+ Others,
+ IntelAtom,
+ IntelSLM,
+ IntelGLM,
+ IntelHaswell,
+ IntelBroadwell,
+ IntelSkylake,
+ IntelKNL,
+ IntelSKX,
+ IntelCannonlake,
+ IntelIcelake,
+ };
+
protected:
enum X86SSEEnum {
NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
@@ -57,10 +75,6 @@ protected:
NoThreeDNow, MMX, ThreeDNow, ThreeDNowA
};
- enum X86ProcFamilyEnum {
- Others, IntelAtom, IntelSLM, IntelGLM
- };
-
/// X86 processor family: Intel Atom, and others
X86ProcFamilyEnum X86ProcFamily;
@@ -93,6 +107,7 @@ protected:
/// Target has AES instructions
bool HasAES;
+ bool HasVAES;
/// Target has FXSAVE/FXRESTOR instructions
bool HasFXSR;
@@ -111,6 +126,10 @@ protected:
/// Target has carry-less multiplication
bool HasPCLMUL;
+ bool HasVPCLMULQDQ;
+
+ /// Target has Galois Field Arithmetic instructions
+ bool HasGFNI;
/// Target has 3-operand fused multiply-add
bool HasFMA;
@@ -151,6 +170,9 @@ protected:
/// Processor has VBMI instructions.
bool HasVBMI;
+ /// Processor has VBMI2 instructions.
+ bool HasVBMI2;
+
/// Processor has Integer Fused Multiply Add
bool HasIFMA;
@@ -181,9 +203,6 @@ protected:
/// Processor has Prefetch with intent to Write instruction
bool HasPFPREFETCHWT1;
- /// True if BT (bit test) of memory instructions are slow.
- bool IsBTMemSlow;
-
/// True if SHLD instructions are slow.
bool IsSHLDSlow;
@@ -213,6 +232,10 @@ protected:
/// of a YMM or ZMM register without clearing the upper part.
bool HasFastPartialYMMorZMMWrite;
+ /// True if gather is reasonably fast. This is true for Skylake client and
+ /// all AVX-512 CPUs.
+ bool HasFastGather;
+
/// True if hardware SQRTSS instruction is at least as fast (latency) as
/// RSQRTSS followed by a Newton-Raphson iteration.
bool HasFastScalarFSQRT;
@@ -235,6 +258,9 @@ protected:
/// True if SHLD based rotate is fast.
bool HasFastSHLDRotate;
+ /// True if the processor supports macrofusion.
+ bool HasMacroFusion;
+
/// True if the processor has enhanced REP MOVSB/STOSB.
bool HasERMSB;
@@ -242,9 +268,9 @@ protected:
/// a stall when returning too early.
bool PadShortFunctions;
- /// True if the Calls with memory reference should be converted
- /// to a register-based indirect call.
- bool CallRegIndirect;
+ /// True if two memory operand instructions should use a temporary register
+ /// instead.
+ bool SlowTwoMemOps;
/// True if the LEA instruction inputs have to be ready at address generation
/// (AG) time.
@@ -285,9 +311,23 @@ protected:
/// Processor has PKU extenstions
bool HasPKU;
+ /// Processor has AVX-512 Vector Neural Network Instructions
+ bool HasVNNI;
+
+ /// Processor has AVX-512 Bit Algorithms instructions
+ bool HasBITALG;
+
/// Processor supports MPX - Memory Protection Extensions
bool HasMPX;
+ /// Processor supports CET SHSTK - Control-Flow Enforcement Technology
+ /// using Shadow Stack
+ bool HasSHSTK;
+
+ /// Processor supports CET IBT - Control-Flow Enforcement Technology
+ /// using Indirect Branch Tracking
+ bool HasIBT;
+
/// Processor has Software Guard Extensions
bool HasSGX;
@@ -314,10 +354,11 @@ protected:
/// Instruction itineraries for scheduling
InstrItineraryData InstrItins;
- /// Gather the accessor points to GlobalISel-related APIs.
- /// This is used to avoid ifndefs spreading around while GISel is
- /// an optional library.
- std::unique_ptr<GISelAccessor> GISel;
+ /// GlobalISel related APIs.
+ std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<LegalizerInfo> Legalizer;
+ std::unique_ptr<RegisterBankInfo> RegBankInfo;
+ std::unique_ptr<InstructionSelector> InstSelector;
private:
/// Override the stack alignment.
@@ -332,6 +373,10 @@ private:
/// True if compiling for 16-bit, false for 32-bit or 64-bit.
bool In16BitMode;
+ /// Contains the overhead of gather/scatter instructions
+ int GatherOverhead;
+ int ScatterOverhead;
+
X86SelectionDAGInfo TSInfo;
// Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
// X86TargetLowering needs.
@@ -346,9 +391,6 @@ public:
X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
const X86TargetMachine &TM, unsigned StackAlignOverride);
- /// This object will take onwership of \p GISelAccessor.
- void setGISelAccessor(GISelAccessor &GISel) { this->GISel.reset(&GISel); }
-
const X86TargetLowering *getTargetLowering() const override {
return &TLInfo;
}
@@ -441,15 +483,18 @@ public:
bool has3DNowA() const { return X863DNowLevel >= ThreeDNowA; }
bool hasPOPCNT() const { return HasPOPCNT; }
bool hasAES() const { return HasAES; }
+ bool hasVAES() const { return HasVAES; }
bool hasFXSR() const { return HasFXSR; }
bool hasXSAVE() const { return HasXSAVE; }
bool hasXSAVEOPT() const { return HasXSAVEOPT; }
bool hasXSAVEC() const { return HasXSAVEC; }
bool hasXSAVES() const { return HasXSAVES; }
bool hasPCLMUL() const { return HasPCLMUL; }
+ bool hasVPCLMULQDQ() const { return HasVPCLMULQDQ; }
+ bool hasGFNI() const { return HasGFNI; }
// Prefer FMA4 to FMA - its better for commutation/memory folding and
// has equal or better performance on all supported targets.
- bool hasFMA() const { return (HasFMA || hasAVX512()) && !HasFMA4; }
+ bool hasFMA() const { return HasFMA; }
bool hasFMA4() const { return HasFMA4; }
bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
bool hasXOP() const { return HasXOP; }
@@ -463,6 +508,7 @@ public:
bool hasBMI() const { return HasBMI; }
bool hasBMI2() const { return HasBMI2; }
bool hasVBMI() const { return HasVBMI; }
+ bool hasVBMI2() const { return HasVBMI2; }
bool hasIFMA() const { return HasIFMA; }
bool hasRTM() const { return HasRTM; }
bool hasADX() const { return HasADX; }
@@ -472,26 +518,29 @@ public:
bool hasLAHFSAHF() const { return HasLAHFSAHF; }
bool hasMWAITX() const { return HasMWAITX; }
bool hasCLZERO() const { return HasCLZERO; }
- bool isBTMemSlow() const { return IsBTMemSlow; }
bool isSHLDSlow() const { return IsSHLDSlow; }
bool isPMULLDSlow() const { return IsPMULLDSlow; }
bool isUnalignedMem16Slow() const { return IsUAMem16Slow; }
bool isUnalignedMem32Slow() const { return IsUAMem32Slow; }
+ int getGatherOverhead() const { return GatherOverhead; }
+ int getScatterOverhead() const { return ScatterOverhead; }
bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
bool hasCmpxchg16b() const { return HasCmpxchg16b; }
bool useLeaForSP() const { return UseLeaForSP; }
bool hasFastPartialYMMorZMMWrite() const {
return HasFastPartialYMMorZMMWrite;
}
+ bool hasFastGather() const { return HasFastGather; }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
bool hasFastLZCNT() const { return HasFastLZCNT; }
bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
+ bool hasMacroFusion() const { return HasMacroFusion; }
bool hasERMSB() const { return HasERMSB; }
bool hasSlowDivide32() const { return HasSlowDivide32; }
bool hasSlowDivide64() const { return HasSlowDivide64; }
bool padShortFunctions() const { return PadShortFunctions; }
- bool callRegIndirect() const { return CallRegIndirect; }
+ bool slowTwoMemOps() const { return SlowTwoMemOps; }
bool LEAusesAG() const { return LEAUsesAG; }
bool slowLEA() const { return SlowLEA; }
bool slow3OpsLEA() const { return Slow3OpsLEA; }
@@ -504,11 +553,19 @@ public:
bool hasBWI() const { return HasBWI; }
bool hasVLX() const { return HasVLX; }
bool hasPKU() const { return HasPKU; }
+ bool hasVNNI() const { return HasVNNI; }
+ bool hasBITALG() const { return HasBITALG; }
bool hasMPX() const { return HasMPX; }
+ bool hasSHSTK() const { return HasSHSTK; }
+ bool hasIBT() const { return HasIBT; }
bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
+ bool hasCLWB() const { return HasCLWB; }
bool isXRaySupported() const override { return is64Bit(); }
+ X86ProcFamilyEnum getProcFamily() const { return X86ProcFamily; }
+
+ /// TODO: to be removed later and replaced with suitable properties
bool isAtom() const { return X86ProcFamily == IntelAtom; }
bool isSLM() const { return X86ProcFamily == IntelSLM; }
bool useSoftFloat() const { return UseSoftFloat; }
@@ -568,13 +625,9 @@ public:
bool isOSWindows() const { return TargetTriple.isOSWindows(); }
- bool isTargetWin64() const {
- return In64BitMode && TargetTriple.isOSWindows();
- }
+ bool isTargetWin64() const { return In64BitMode && isOSWindows(); }
- bool isTargetWin32() const {
- return !In64BitMode && (isTargetCygMing() || isTargetKnownWindowsMSVC());
- }
+ bool isTargetWin32() const { return !In64BitMode && isOSWindows(); }
bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
@@ -590,6 +643,7 @@ public:
// On Win64, all these conventions just use the default convention.
case CallingConv::C:
case CallingConv::Fast:
+ case CallingConv::Swift:
case CallingConv::X86_FastCall:
case CallingConv::X86_StdCall:
case CallingConv::X86_ThisCall:
@@ -655,6 +709,8 @@ public:
AntiDepBreakMode getAntiDepBreakMode() const override {
return TargetSubtargetInfo::ANTIDEP_CRITICAL;
}
+
+ bool enableAdvancedRASplitCost() const override { return true; }
};
} // end namespace llvm
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 08c2cdaefe71..ea8c9862230e 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -11,13 +11,13 @@
//
//===----------------------------------------------------------------------===//
+#include "X86TargetMachine.h"
#include "MCTargetDesc/X86MCTargetDesc.h"
#include "X86.h"
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
#include "X86MacroFusion.h"
#include "X86Subtarget.h"
-#include "X86TargetMachine.h"
#include "X86TargetObjectFile.h"
#include "X86TargetTransformInfo.h"
#include "llvm/ADT/Optional.h"
@@ -34,6 +34,7 @@
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DataLayout.h"
@@ -43,7 +44,6 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
#include <memory>
#include <string>
@@ -58,7 +58,10 @@ namespace llvm {
void initializeWinEHStatePassPass(PassRegistry &);
void initializeFixupLEAPassPass(PassRegistry &);
+void initializeX86CallFrameOptimizationPass(PassRegistry &);
+void initializeX86CmovConverterPassPass(PassRegistry &);
void initializeX86ExecutionDepsFixPass(PassRegistry &);
+void initializeX86DomainReassignmentPass(PassRegistry &);
} // end namespace llvm
@@ -73,7 +76,10 @@ extern "C" void LLVMInitializeX86Target() {
initializeFixupBWInstPassPass(PR);
initializeEvexToVexInstPassPass(PR);
initializeFixupLEAPassPass(PR);
+ initializeX86CallFrameOptimizationPass(PR);
+ initializeX86CmovConverterPassPass(PR);
initializeX86ExecutionDepsFixPass(PR);
+ initializeX86DomainReassignmentPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -181,15 +187,27 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
return *RM;
}
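+// The JIT may place code far from the rest of the process image on x86-64,
+// so it conservatively defaults to the large code model there; everything
+// else defaults to the small model.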
+static CodeModel::Model getEffectiveCodeModel(Optional<CodeModel::Model> CM,
+ bool JIT, bool Is64Bit) {
+ if (CM)
+ return *CM;
+ if (JIT)
+ return Is64Bit ? CodeModel::Large : CodeModel::Small;
+ return CodeModel::Small;
+}
+
/// Create an X86 target.
///
X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
StringRef CPU, StringRef FS,
const TargetOptions &Options,
Optional<Reloc::Model> RM,
- CodeModel::Model CM, CodeGenOpt::Level OL)
- : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
- getEffectiveRelocModel(TT, RM), CM, OL),
+ Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT)
+ : LLVMTargetMachine(
+ T, computeDataLayout(TT), TT, CPU, FS, Options,
+ getEffectiveRelocModel(TT, RM),
+ getEffectiveCodeModel(CM, JIT, TT.getArch() == Triple::x86_64), OL),
TLOF(createTLOF(getTargetTriple())) {
// Windows stack unwinder gets confused when execution flow "falls through"
// after a call to 'noreturn' function.
@@ -294,14 +312,13 @@ public:
void addIRPasses() override;
bool addInstSelector() override;
-#ifdef LLVM_BUILD_GLOBAL_ISEL
bool addIRTranslator() override;
bool addLegalizeMachineIR() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
-#endif
bool addILPOpts() override;
bool addPreISel() override;
+ void addMachineSSAOptimization() override;
void addPreRegAlloc() override;
void addPostRegAlloc() override;
void addPreEmitPass() override;
@@ -349,7 +366,6 @@ bool X86PassConfig::addInstSelector() {
return false;
}
-#ifdef LLVM_BUILD_GLOBAL_ISEL
bool X86PassConfig::addIRTranslator() {
addPass(new IRTranslator());
return false;
@@ -369,7 +385,6 @@ bool X86PassConfig::addGlobalInstructionSelect() {
addPass(new InstructionSelect());
return false;
}
-#endif
bool X86PassConfig::addILPOpts() {
addPass(&EarlyIfConverterID);
@@ -397,6 +412,10 @@ void X86PassConfig::addPreRegAlloc() {
addPass(createX86WinAllocaExpander());
}
+void X86PassConfig::addMachineSSAOptimization() {
+ addPass(createX86DomainReassignmentPass());
+ TargetPassConfig::addMachineSSAOptimization();
+}
void X86PassConfig::addPostRegAlloc() {
addPass(createX86FloatingPointStackifierPass());
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index c16207973b39..952bd1321ff9 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -35,13 +35,14 @@ class X86TargetMachine final : public LLVMTargetMachine {
public:
X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, const TargetOptions &Options,
- Optional<Reloc::Model> RM, CodeModel::Model CM,
- CodeGenOpt::Level OL);
+ Optional<Reloc::Model> RM, Optional<CodeModel::Model> CM,
+ CodeGenOpt::Level OL, bool JIT);
~X86TargetMachine() override;
const X86Subtarget *getSubtargetImpl(const Function &F) const override;
- // The no argument getSubtargetImpl, while it exists on some targets, is
- // deprecated and should not be used.
+ // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget,
+ // subtargets are per-function entities based on the target-specific
+ // attributes of each function.
const X86Subtarget *getSubtargetImpl() const = delete;
TargetIRAnalysis getTargetIRAnalysis() override;
diff --git a/lib/Target/X86/X86TargetObjectFile.cpp b/lib/Target/X86/X86TargetObjectFile.cpp
index 8627c06d4431..fb35a6b2ec1a 100644
--- a/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/lib/Target/X86/X86TargetObjectFile.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCContext.h"
@@ -18,7 +19,6 @@
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCValue.h"
-#include "llvm/Target/TargetLowering.h"
using namespace llvm;
using namespace dwarf;
diff --git a/lib/Target/X86/X86TargetObjectFile.h b/lib/Target/X86/X86TargetObjectFile.h
index f6aa570b6332..76e9cd5db2a0 100644
--- a/lib/Target/X86/X86TargetObjectFile.h
+++ b/lib/Target/X86/X86TargetObjectFile.h
@@ -10,8 +10,8 @@
#ifndef LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
+#include "llvm/CodeGen/TargetLoweringObjectFile.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index c9924f264939..223eed3048db 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -42,10 +42,10 @@
#include "X86TargetTransformInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/CostTable.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Target/CostTable.h"
-#include "llvm/Target/TargetLowering.h"
using namespace llvm;
@@ -66,6 +66,57 @@ X86TTIImpl::getPopcntSupport(unsigned TyWidth) {
return ST->hasPOPCNT() ? TTI::PSK_FastHardware : TTI::PSK_Software;
}
+llvm::Optional<unsigned> X86TTIImpl::getCacheSize(
+ TargetTransformInfo::CacheLevel Level) const {
+ switch (Level) {
+ case TargetTransformInfo::CacheLevel::L1D:
+ // - Penryn
+ // - Nehalem
+ // - Westmere
+ // - Sandy Bridge
+ // - Ivy Bridge
+ // - Haswell
+ // - Broadwell
+ // - Skylake
+ // - Kabylake
+ return 32 * 1024; // 32 KByte
+ case TargetTransformInfo::CacheLevel::L2D:
+ // - Penryn
+ // - Nehalem
+ // - Westmere
+ // - Sandy Bridge
+ // - Ivy Bridge
+ // - Haswell
+ // - Broadwell
+ // - Skylake
+ // - Kabylake
+ return 256 * 1024; // 256 KByte
+ }
+
+ llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
+}
+
+llvm::Optional<unsigned> X86TTIImpl::getCacheAssociativity(
+ TargetTransformInfo::CacheLevel Level) const {
+ // - Penryn
+ // - Nehalem
+ // - Westmere
+ // - Sandy Bridge
+ // - Ivy Bridge
+ // - Haswell
+ // - Broadwell
+ // - Skylake
+ // - Kabylake
+ switch (Level) {
+ case TargetTransformInfo::CacheLevel::L1D:
+ LLVM_FALLTHROUGH;
+ case TargetTransformInfo::CacheLevel::L2D:
+ return 8;
+ }
+
+ llvm_unreachable("Unknown TargetTransformInfo::CacheLevel");
+}
+
unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
if (Vector && !ST->hasSSE1())
return 0;
@@ -144,9 +195,9 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FSUB, MVT::v2f64, 2 }, // subpd
// v2i64/v4i64 mul is custom lowered as a series of long:
// multiplies(3), shifts(3) and adds(2)
- // slm muldq version throughput is 2 and addq throughput 4
+ // slm muldq version throughput is 2 and addq throughput 4
// thus: 3X2 (muldq throughput) + 3X1 (shift throuput) +
- // 3X4 (addq throughput) = 17
+ // 3X4 (addq throughput) = 17
{ ISD::MUL, MVT::v2i64, 17 },
// slm addq\subq throughput is 4
{ ISD::ADD, MVT::v2i64, 4 },
@@ -838,11 +889,22 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
{ TTI::SK_Alternate, MVT::v32i8, 1 }, // vpblendvb
+ { TTI::SK_PermuteSingleSrc, MVT::v4f64, 1 }, // vpermpd
+ { TTI::SK_PermuteSingleSrc, MVT::v8f32, 1 }, // vpermps
{ TTI::SK_PermuteSingleSrc, MVT::v4i64, 1 }, // vpermq
{ TTI::SK_PermuteSingleSrc, MVT::v8i32, 1 }, // vpermd
- { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
+ { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2*vpshufb
// + vpblendvb
- { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 } // vperm2i128 + 2 * vpshufb
+ { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vperm2i128 + 2*vpshufb
+ // + vpblendvb
+
+ { TTI::SK_PermuteTwoSrc, MVT::v4f64, 3 }, // 2*vpermpd + vblendpd
+ { TTI::SK_PermuteTwoSrc, MVT::v8f32, 3 }, // 2*vpermps + vblendps
+ { TTI::SK_PermuteTwoSrc, MVT::v4i64, 3 }, // 2*vpermq + vpblendd
+ { TTI::SK_PermuteTwoSrc, MVT::v8i32, 3 }, // 2*vpermd + vpblendd
+ { TTI::SK_PermuteTwoSrc, MVT::v16i16, 7 }, // 2*vperm2i128 + 4*vpshufb
+ // + vpblendvb
+ { TTI::SK_PermuteTwoSrc, MVT::v32i8, 7 }, // 2*vperm2i128 + 4*vpshufb
// + vpblendvb
};
@@ -850,6 +912,28 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
if (const auto *Entry = CostTableLookup(AVX2ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
+ static const CostTblEntry XOPShuffleTbl[] = {
+ { TTI::SK_PermuteSingleSrc, MVT::v4f64, 2 }, // vperm2f128 + vpermil2pd
+ { TTI::SK_PermuteSingleSrc, MVT::v8f32, 2 }, // vperm2f128 + vpermil2ps
+ { TTI::SK_PermuteSingleSrc, MVT::v4i64, 2 }, // vperm2f128 + vpermil2pd
+ { TTI::SK_PermuteSingleSrc, MVT::v8i32, 2 }, // vperm2f128 + vpermil2ps
+ { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vextractf128 + 2*vpperm
+ // + vinsertf128
+ { TTI::SK_PermuteSingleSrc, MVT::v32i8, 4 }, // vextractf128 + 2*vpperm
+ // + vinsertf128
+
+ { TTI::SK_PermuteTwoSrc, MVT::v16i16, 9 }, // 2*vextractf128 + 6*vpperm
+ // + vinsertf128
+ { TTI::SK_PermuteTwoSrc, MVT::v8i16, 1 }, // vpperm
+ { TTI::SK_PermuteTwoSrc, MVT::v32i8, 9 }, // 2*vextractf128 + 6*vpperm
+ // + vinsertf128
+ { TTI::SK_PermuteTwoSrc, MVT::v16i8, 1 }, // vpperm
+ };
+
+ if (ST->hasXOP())
+ if (const auto *Entry = CostTableLookup(XOPShuffleTbl, Kind, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry AVX1ShuffleTbl[] = {
{ TTI::SK_Broadcast, MVT::v4f64, 2 }, // vperm2f128 + vpermilpd
{ TTI::SK_Broadcast, MVT::v8f32, 2 }, // vperm2f128 + vpermilps
@@ -872,7 +956,25 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Alternate, MVT::v8i32, 1 }, // vblendps
{ TTI::SK_Alternate, MVT::v8f32, 1 }, // vblendps
{ TTI::SK_Alternate, MVT::v16i16, 3 }, // vpand + vpandn + vpor
- { TTI::SK_Alternate, MVT::v32i8, 3 } // vpand + vpandn + vpor
+ { TTI::SK_Alternate, MVT::v32i8, 3 }, // vpand + vpandn + vpor
+
+ { TTI::SK_PermuteSingleSrc, MVT::v4f64, 3 }, // 2*vperm2f128 + vshufpd
+ { TTI::SK_PermuteSingleSrc, MVT::v4i64, 3 }, // 2*vperm2f128 + vshufpd
+ { TTI::SK_PermuteSingleSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
+ { TTI::SK_PermuteSingleSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
+ { TTI::SK_PermuteSingleSrc, MVT::v16i16, 8 }, // vextractf128 + 4*pshufb
+ // + 2*por + vinsertf128
+ { TTI::SK_PermuteSingleSrc, MVT::v32i8, 8 }, // vextractf128 + 4*pshufb
+ // + 2*por + vinsertf128
+
+ { TTI::SK_PermuteTwoSrc, MVT::v4f64, 4 }, // 2*vperm2f128 + 2*vshufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v8f32, 4 }, // 2*vperm2f128 + 2*vshufps
+ { TTI::SK_PermuteTwoSrc, MVT::v4i64, 4 }, // 2*vperm2f128 + 2*vshufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v8i32, 4 }, // 2*vperm2f128 + 2*vshufps
+ { TTI::SK_PermuteTwoSrc, MVT::v16i16, 15 }, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
+ { TTI::SK_PermuteTwoSrc, MVT::v32i8, 15 }, // 2*vextractf128 + 8*pshufb
+ // + 4*por + vinsertf128
};
if (ST->hasAVX())
@@ -899,11 +1001,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Reverse, MVT::v8i16, 1 }, // pshufb
{ TTI::SK_Reverse, MVT::v16i8, 1 }, // pshufb
- { TTI::SK_Alternate, MVT::v8i16, 3 }, // pshufb + pshufb + por
- { TTI::SK_Alternate, MVT::v16i8, 3 }, // pshufb + pshufb + por
+ { TTI::SK_Alternate, MVT::v8i16, 3 }, // 2*pshufb + por
+ { TTI::SK_Alternate, MVT::v16i8, 3 }, // 2*pshufb + por
{ TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
- { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 } // pshufb
+ { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }, // pshufb
+
+ { TTI::SK_PermuteTwoSrc, MVT::v8i16, 3 }, // 2*pshufb + por
+ { TTI::SK_PermuteTwoSrc, MVT::v16i8, 3 }, // 2*pshufb + por
};
if (ST->hasSSSE3())
@@ -914,13 +1019,13 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Broadcast, MVT::v2f64, 1 }, // shufpd
{ TTI::SK_Broadcast, MVT::v2i64, 1 }, // pshufd
{ TTI::SK_Broadcast, MVT::v4i32, 1 }, // pshufd
- { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
+ { TTI::SK_Broadcast, MVT::v8i16, 2 }, // pshuflw + pshufd
{ TTI::SK_Broadcast, MVT::v16i8, 3 }, // unpck + pshuflw + pshufd
{ TTI::SK_Reverse, MVT::v2f64, 1 }, // shufpd
{ TTI::SK_Reverse, MVT::v2i64, 1 }, // pshufd
{ TTI::SK_Reverse, MVT::v4i32, 1 }, // pshufd
- { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
+ { TTI::SK_Reverse, MVT::v8i16, 3 }, // pshuflw + pshufhw + pshufd
{ TTI::SK_Reverse, MVT::v16i8, 9 }, // 2*pshuflw + 2*pshufhw
// + 2*pshufd + 2*unpck + packus
@@ -930,8 +1035,19 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{ TTI::SK_Alternate, MVT::v8i16, 3 }, // pand + pandn + por
{ TTI::SK_Alternate, MVT::v16i8, 3 }, // pand + pandn + por
- { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
- { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 } // pshufd
+ { TTI::SK_PermuteSingleSrc, MVT::v2f64, 1 }, // shufpd
+ { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
+ { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }, // pshufd
+ { TTI::SK_PermuteSingleSrc, MVT::v8i16, 5 }, // 2*pshuflw + 2*pshufhw
+ // + pshufd/unpck
+ { TTI::SK_PermuteSingleSrc, MVT::v16i8, 10 }, // 2*pshuflw + 2*pshufhw
+ // + 2*pshufd + 2*unpck + 2*packus
+
+ { TTI::SK_PermuteTwoSrc, MVT::v2f64, 1 }, // shufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v2i64, 1 }, // shufpd
+ { TTI::SK_PermuteTwoSrc, MVT::v4i32, 2 }, // 2*{unpck,movsd,pshufd}
+ { TTI::SK_PermuteTwoSrc, MVT::v8i16, 8 }, // blend+permute
+ { TTI::SK_PermuteTwoSrc, MVT::v16i8, 13 }, // blend+permute
};
if (ST->hasSSE2())
@@ -939,9 +1055,11 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
return LT.first * Entry->Cost;
static const CostTblEntry SSE1ShuffleTbl[] = {
- { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
- { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
- { TTI::SK_Alternate, MVT::v4f32, 2 } // 2*shufps
+ { TTI::SK_Broadcast, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_Reverse, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_Alternate, MVT::v4f32, 2 }, // 2*shufps
+ { TTI::SK_PermuteSingleSrc, MVT::v4f32, 1 }, // shufps
+ { TTI::SK_PermuteTwoSrc, MVT::v4f32, 2 }, // 2*shufps
};
if (ST->hasSSE1())
@@ -1052,7 +1170,11 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 },
{ ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 },
};
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
@@ -1315,7 +1437,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return Entry->Cost;
}
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
}
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
@@ -1805,8 +1927,8 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
-int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
- bool IsPairwise) {
+int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
+ bool IsPairwise) {
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@@ -1874,7 +1996,153 @@ int X86TTIImpl::getReductionCost(unsigned Opcode, Type *ValTy,
return LT.first * Entry->Cost;
}
- return BaseT::getReductionCost(Opcode, ValTy, IsPairwise);
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
+}
+
+int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
+ bool IsPairwise, bool IsUnsigned) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD;
+ if (ValTy->isIntOrIntVectorTy()) {
+ ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
+ } else {
+ assert(ValTy->isFPOrFPVectorTy() &&
+ "Expected float point or integer vector type.");
+ ISD = ISD::FMINNUM;
+ }
+
+ // We use the Intel Architecture Code Analyzer (IACA) to measure the
+ // throughput and use that as the cost.
+
+ static const CostTblEntry SSE42CostTblPairWise[] = {
+ {ISD::FMINNUM, MVT::v2f64, 3},
+ {ISD::FMINNUM, MVT::v4f32, 2},
+ {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
+ {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
+ {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
+ {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
+ {ISD::SMIN, MVT::v8i16, 2},
+ {ISD::UMIN, MVT::v8i16, 2},
+ };
+
+ static const CostTblEntry AVX1CostTblPairWise[] = {
+ {ISD::FMINNUM, MVT::v4f32, 1},
+ {ISD::FMINNUM, MVT::v4f64, 1},
+ {ISD::FMINNUM, MVT::v8f32, 2},
+ {ISD::SMIN, MVT::v2i64, 3},
+ {ISD::UMIN, MVT::v2i64, 3},
+ {ISD::SMIN, MVT::v4i32, 1},
+ {ISD::UMIN, MVT::v4i32, 1},
+ {ISD::SMIN, MVT::v8i16, 1},
+ {ISD::UMIN, MVT::v8i16, 1},
+ {ISD::SMIN, MVT::v8i32, 3},
+ {ISD::UMIN, MVT::v8i32, 3},
+ };
+
+ static const CostTblEntry AVX2CostTblPairWise[] = {
+ {ISD::SMIN, MVT::v4i64, 2},
+ {ISD::UMIN, MVT::v4i64, 2},
+ {ISD::SMIN, MVT::v8i32, 1},
+ {ISD::UMIN, MVT::v8i32, 1},
+ {ISD::SMIN, MVT::v16i16, 1},
+ {ISD::UMIN, MVT::v16i16, 1},
+ {ISD::SMIN, MVT::v32i8, 2},
+ {ISD::UMIN, MVT::v32i8, 2},
+ };
+
+ static const CostTblEntry AVX512CostTblPairWise[] = {
+ {ISD::FMINNUM, MVT::v8f64, 1},
+ {ISD::FMINNUM, MVT::v16f32, 2},
+ {ISD::SMIN, MVT::v8i64, 2},
+ {ISD::UMIN, MVT::v8i64, 2},
+ {ISD::SMIN, MVT::v16i32, 1},
+ {ISD::UMIN, MVT::v16i32, 1},
+ };
+
+ static const CostTblEntry SSE42CostTblNoPairWise[] = {
+ {ISD::FMINNUM, MVT::v2f64, 3},
+ {ISD::FMINNUM, MVT::v4f32, 3},
+ {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
+ {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
+ {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
+ {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
+ {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
+ {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
+ };
+
+ static const CostTblEntry AVX1CostTblNoPairWise[] = {
+ {ISD::FMINNUM, MVT::v4f32, 1},
+ {ISD::FMINNUM, MVT::v4f64, 1},
+ {ISD::FMINNUM, MVT::v8f32, 1},
+ {ISD::SMIN, MVT::v2i64, 3},
+ {ISD::UMIN, MVT::v2i64, 3},
+ {ISD::SMIN, MVT::v4i32, 1},
+ {ISD::UMIN, MVT::v4i32, 1},
+ {ISD::SMIN, MVT::v8i16, 1},
+ {ISD::UMIN, MVT::v8i16, 1},
+ {ISD::SMIN, MVT::v8i32, 2},
+ {ISD::UMIN, MVT::v8i32, 2},
+ };
+
+ static const CostTblEntry AVX2CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v4i64, 1},
+ {ISD::UMIN, MVT::v4i64, 1},
+ {ISD::SMIN, MVT::v8i32, 1},
+ {ISD::UMIN, MVT::v8i32, 1},
+ {ISD::SMIN, MVT::v16i16, 1},
+ {ISD::UMIN, MVT::v16i16, 1},
+ {ISD::SMIN, MVT::v32i8, 1},
+ {ISD::UMIN, MVT::v32i8, 1},
+ };
+
+ static const CostTblEntry AVX512CostTblNoPairWise[] = {
+ {ISD::FMINNUM, MVT::v8f64, 1},
+ {ISD::FMINNUM, MVT::v16f32, 2},
+ {ISD::SMIN, MVT::v8i64, 1},
+ {ISD::UMIN, MVT::v8i64, 1},
+ {ISD::SMIN, MVT::v16i32, 1},
+ {ISD::UMIN, MVT::v16i32, 1},
+ };
+
+ if (IsPairwise) {
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+ } else {
+ if (ST->hasAVX512())
+ if (const auto *Entry =
+ CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
}
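getMinMaxReductionCost keys its cost tables on one representative ISD opcode per element kind; only the MIN forms appear above, presumably because the matching MAX forms are expected to cost the same. A small standalone sketch of just that opcode-selection step, with stand-in enums rather than the real ISD namespace:

// Illustrative sketch only -- not part of this patch.
#include <cstdio>

enum class Op { SMIN, UMIN, FMINNUM };

Op pickReductionOpcode(bool IsFloat, bool IsUnsigned) {
  if (!IsFloat)
    return IsUnsigned ? Op::UMIN : Op::SMIN;
  return Op::FMINNUM; // FP min/max reductions share one table key
}

int main() {
  std::printf("%d\n", static_cast<int>(pickReductionOpcode(false, true)));  // UMIN
  std::printf("%d\n", static_cast<int>(pickReductionOpcode(true, false)));  // FMINNUM
}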
/// \brief Calculate the cost of materializing a 64-bit value. This helper
@@ -2046,6 +2314,21 @@ int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
return X86TTIImpl::getIntImmCost(Imm, Ty);
}
+unsigned X86TTIImpl::getUserCost(const User *U,
+ ArrayRef<const Value *> Operands) {
+ if (isa<StoreInst>(U)) {
+ Value *Ptr = U->getOperand(1);
+ // A store instruction with an index and a scale costs 2 uops.
+ // Check the preceding GEP to identify non-const indices.
+ if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
+ return TTI::TCC_Basic * 2;
+ }
+ return TTI::TCC_Basic;
+ }
+ return BaseT::getUserCost(U, Operands);
+}
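The new getUserCost override charges a store 2 * TCC_Basic whenever its address comes from a GEP with any non-constant index, i.e. the address needs an index register and scale. A standalone model of that decision on plain bools instead of llvm::Value operands:

// Illustrative sketch only -- not part of this patch.
#include <cstdio>
#include <vector>

constexpr int TCC_Basic = 1;

// One bool per GEP index: true if the index is a compile-time constant.
int storeCost(const std::vector<bool> &IndexIsConstant) {
  for (bool IsConst : IndexIsConstant)
    if (!IsConst)
      return TCC_Basic * 2; // indexed/scaled addressing: 2 uops
  return TCC_Basic;         // simple or constant-offset address: 1 uop
}

int main() {
  std::printf("%d\n", storeCost({true, true}));   // e.g. a[1].f  -> 1
  std::printf("%d\n", storeCost({true, false}));  // e.g. a[i].f  -> 2
}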
+
// Return an average cost of Gather / Scatter instruction, maybe improved later
int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
unsigned Alignment, unsigned AddressSpace) {
@@ -2085,8 +2368,9 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
// Trying to reduce IndexSize to 32 bits for vectors of 16 elements or more.
// By default the IndexSize is equal to the pointer size.
- unsigned IndexSize = (VF >= 16) ? getIndexSizeInBits(Ptr, DL) :
- DL.getPointerSizeInBits();
+ unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
+ ? getIndexSizeInBits(Ptr, DL)
+ : DL.getPointerSizeInBits();
Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
IndexSize), VF);
@@ -2102,7 +2386,9 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
// The gather / scatter cost is given by Intel architects. It is a rough
// number since we are looking at one instruction at a time.
- const int GSOverhead = 2;
+ const int GSOverhead = (Opcode == Instruction::Load)
+ ? ST->getGatherOverhead()
+ : ST->getScatterOverhead();
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
Alignment, AddressSpace);
}
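The tail of getGSVectorCost is a fixed per-instruction overhead (now taken from the subtarget instead of the old constant 2) plus one scalar memory-op cost per vector lane. The numbers below are made up for the example, not measured values:

// Illustrative sketch only -- not part of this patch.
#include <cstdio>

int gatherScatterCost(int GSOverhead, unsigned VF, int ScalarMemOpCost) {
  return GSOverhead + static_cast<int>(VF) * ScalarMemOpCost;
}

int main() {
  // e.g. an 8-lane gather with overhead 2 and scalar load cost 1 -> 10.
  std::printf("%d\n", gatherScatterCost(/*GSOverhead=*/2, /*VF=*/8,
                                        /*ScalarMemOpCost=*/1));
}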
@@ -2173,7 +2459,7 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
// the mask vector will add more instructions. Right now we give the scalar
// cost of vector-4 for KNL. TODO: Check, maybe the gather/scatter instruction
// is better in the VariableMask case.
- if (VF == 2 || (VF == 4 && !ST->hasVLX()))
+ if (ST->hasAVX512() && (VF == 2 || (VF == 4 && !ST->hasVLX())))
Scalarize = true;
if (Scalarize)
@@ -2183,7 +2469,21 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
return getGSVectorCost(Opcode, SrcVTy, Ptr, Alignment, AddressSpace);
}
+bool X86TTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+ TargetTransformInfo::LSRCost &C2) {
+ // X86 specific here are "instruction number 1st priority".
+ return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost,
+ C1.NumIVMuls, C1.NumBaseAdds,
+ C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+ std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost,
+ C2.NumIVMuls, C2.NumBaseAdds,
+ C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+}
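isLSRCostLess relies on std::tie to compare the LSR cost components lexicographically, with Insns placed first so instruction count dominates every other field. A standalone reproduction with a trimmed-down cost struct:

// Illustrative sketch only -- not part of this patch.
#include <cstdio>
#include <tuple>

struct Cost { unsigned Insns, NumRegs, SetupCost; };

bool lessByInsnsFirst(const Cost &A, const Cost &B) {
  return std::tie(A.Insns, A.NumRegs, A.SetupCost) <
         std::tie(B.Insns, B.NumRegs, B.SetupCost);
}

int main() {
  Cost A{3, 10, 0}, B{4, 1, 0};
  // A wins despite using far more registers, because it needs fewer
  // instructions and Insns is the first tie-breaking field.
  std::printf("%d\n", lessByInsnsFirst(A, B)); // prints 1
}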
+
bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy) {
+ // The backend can't handle a single element vector.
+ if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
+ return false;
Type *ScalarTy = DataTy->getScalarType();
int DataWidth = isa<PointerType>(ScalarTy) ?
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
@@ -2207,20 +2507,40 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
// the vector type.
// The Scalarizer asks again about legality. It sends a vector type.
// In this case we can reject non-power-of-2 vectors.
- if (isa<VectorType>(DataTy) && !isPowerOf2_32(DataTy->getVectorNumElements()))
- return false;
+ // We also reject single element vectors as the type legalizer can't
+ // scalarize them.
+ if (isa<VectorType>(DataTy)) {
+ unsigned NumElts = DataTy->getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ return false;
+ }
Type *ScalarTy = DataTy->getScalarType();
int DataWidth = isa<PointerType>(ScalarTy) ?
DL.getPointerSizeInBits() : ScalarTy->getPrimitiveSizeInBits();
- // AVX-512 allows gather and scatter
- return (DataWidth == 32 || DataWidth == 64) && ST->hasAVX512();
+ // Some CPUs have better gather performance than others.
+ // TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
+ // enable gathers when a suitable -march is given.
+ return (DataWidth == 32 || DataWidth == 64) &&
+ (ST->hasAVX512() || (ST->hasFastGather() && ST->hasAVX2()));
}
bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+ // AVX2 doesn't support scatter
+ if (!ST->hasAVX512())
+ return false;
return isLegalMaskedGather(DataType);
}
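The gather legality checks above reduce to a shape test plus a subtarget test: the element count must be a power of two greater than one, the element width must be 32 or 64 bits, and the CPU must have AVX-512 or the fast-gather tuning on AVX2. A sketch of that shape on plain integers, with the subtarget condition collapsed into one bool:

// Illustrative sketch only -- not part of this patch.
#include <cstdio>

bool isPow2(unsigned N) { return N != 0 && (N & (N - 1)) == 0; }

bool gatherShapeIsLegal(unsigned NumElts, unsigned EltBits,
                        bool SubtargetHasGather) {
  if (NumElts == 1 || !isPow2(NumElts))
    return false; // the type legalizer can't handle these
  return (EltBits == 32 || EltBits == 64) && SubtargetHasGather;
}

int main() {
  std::printf("%d\n", gatherShapeIsLegal(8, 32, true));  // 1
  std::printf("%d\n", gatherShapeIsLegal(3, 32, true));  // 0: not a power of two
  std::printf("%d\n", gatherShapeIsLegal(8, 16, true));  // 0: element too narrow
}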
+bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
+ EVT VT = TLI->getValueType(DL, DataType);
+ return TLI->isOperationLegal(IsSigned ? ISD::SDIVREM : ISD::UDIVREM, VT);
+}
+
+bool X86TTIImpl::isFCmpOrdCheaperThanFCmpZero(Type *Ty) {
+ return false;
+}
+
bool X86TTIImpl::areInlineCompatible(const Function *Caller,
const Function *Callee) const {
const TargetMachine &TM = getTLI()->getTargetMachine();
@@ -2237,10 +2557,35 @@ bool X86TTIImpl::areInlineCompatible(const Function *Caller,
return (CallerBits & CalleeBits) == CalleeBits;
}
-bool X86TTIImpl::expandMemCmp(Instruction *I, unsigned &MaxLoadSize) {
- // TODO: We can increase these based on available vector ops.
- MaxLoadSize = ST->is64Bit() ? 8 : 4;
- return true;
+const X86TTIImpl::TTI::MemCmpExpansionOptions *
+X86TTIImpl::enableMemCmpExpansion(bool IsZeroCmp) const {
+ // Only enable vector loads for equality comparison.
+ // Right now the vector version is not as fast, see #33329.
+ static const auto ThreeWayOptions = [this]() {
+ TTI::MemCmpExpansionOptions Options;
+ if (ST->is64Bit()) {
+ Options.LoadSizes.push_back(8);
+ }
+ Options.LoadSizes.push_back(4);
+ Options.LoadSizes.push_back(2);
+ Options.LoadSizes.push_back(1);
+ return Options;
+ }();
+ static const auto EqZeroOptions = [this]() {
+ TTI::MemCmpExpansionOptions Options;
+ // TODO: enable AVX512 when the DAG is ready.
+ // if (ST->hasAVX512()) Options.LoadSizes.push_back(64);
+ if (ST->hasAVX2()) Options.LoadSizes.push_back(32);
+ if (ST->hasSSE2()) Options.LoadSizes.push_back(16);
+ if (ST->is64Bit()) {
+ Options.LoadSizes.push_back(8);
+ }
+ Options.LoadSizes.push_back(4);
+ Options.LoadSizes.push_back(2);
+ Options.LoadSizes.push_back(1);
+ return Options;
+ }();
+ return IsZeroCmp ? &EqZeroOptions : &ThreeWayOptions;
}
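The LoadSizes lists are built largest-first; the actual expansion happens in the generic MemCmpExpansion pass, but the idea is that any memcmp length can be covered greedily by the allowed load widths, e.g. 15 bytes with {8, 4, 2, 1} becomes one 8-byte, one 4-byte, one 2-byte and one 1-byte load. A simplified, standalone sketch of that greedy covering (the real pass may do smarter things such as overlapping loads):

// Illustrative sketch only -- not part of this patch.
#include <cstdio>
#include <vector>

std::vector<unsigned> coverLength(unsigned Len,
                                  const std::vector<unsigned> &LoadSizes) {
  std::vector<unsigned> Loads;
  for (unsigned Size : LoadSizes)   // assumed sorted largest-first
    while (Len >= Size) {
      Loads.push_back(Size);
      Len -= Size;
    }
  return Loads;
}

int main() {
  for (unsigned Size : coverLength(15, {8, 4, 2, 1}))
    std::printf("%u ", Size);       // prints: 8 4 2 1
  std::printf("\n");
}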
bool X86TTIImpl::enableInterleavedAccessVectorization() {
@@ -2288,7 +2633,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
unsigned VF = VecTy->getVectorNumElements() / Factor;
Type *ScalarTy = VecTy->getVectorElementType();
-
+
// Calculate the number of memory operations (NumOfMemOps), required
// for load/store the VecTy.
unsigned VecTySize = DL.getTypeStoreSize(VecTy);
@@ -2300,7 +2645,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
LegalVT.getVectorNumElements());
unsigned MemOpCost =
getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
-
+
VectorType *VT = VectorType::get(ScalarTy, VF);
EVT ETy = TLI->getValueType(DL, VT);
if (!ETy.isSimple())
@@ -2315,31 +2660,40 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
// The cost of the loads/stores is accounted for separately.
//
static const CostTblEntry AVX2InterleavedLoadTbl[] = {
+ { 2, MVT::v4i64, 6 }, //(load 8i64 and) deinterleave into 2 x 4i64
+ { 2, MVT::v4f64, 6 }, //(load 8f64 and) deinterleave into 2 x 4f64
+
{ 3, MVT::v2i8, 10 }, //(load 6i8 and) deinterleave into 3 x 2i8
{ 3, MVT::v4i8, 4 }, //(load 12i8 and) deinterleave into 3 x 4i8
{ 3, MVT::v8i8, 9 }, //(load 24i8 and) deinterleave into 3 x 8i8
- { 3, MVT::v16i8, 18}, //(load 48i8 and) deinterleave into 3 x 16i8
- { 3, MVT::v32i8, 42 }, //(load 96i8 and) deinterleave into 3 x 32i8
-
+ { 3, MVT::v16i8, 11 }, //(load 48i8 and) deinterleave into 3 x 16i8
+ { 3, MVT::v32i8, 13 }, //(load 96i8 and) deinterleave into 3 x 32i8
+ { 3, MVT::v8f32, 17 }, //(load 24f32 and) deinterleave into 3 x 8f32
+
{ 4, MVT::v2i8, 12 }, //(load 8i8 and) deinterleave into 4 x 2i8
{ 4, MVT::v4i8, 4 }, //(load 16i8 and) deinterleave into 4 x 4i8
{ 4, MVT::v8i8, 20 }, //(load 32i8 and) deinterleave into 4 x 8i8
{ 4, MVT::v16i8, 39 }, //(load 64i8 and) deinterleave into 4 x 16i8
- { 4, MVT::v32i8, 80 } //(load 128i8 and) deinterleave into 4 x 32i8
+ { 4, MVT::v32i8, 80 }, //(load 128i8 and) deinterleave into 4 x 32i8
+
+ { 8, MVT::v8f32, 40 } //(load 64f32 and) deinterleave into 8 x 8f32
};
static const CostTblEntry AVX2InterleavedStoreTbl[] = {
+ { 2, MVT::v4i64, 6 }, //interleave into 2 x 4i64 into 8i64 (and store)
+ { 2, MVT::v4f64, 6 }, //interleave into 2 x 4f64 into 8f64 (and store)
+
{ 3, MVT::v2i8, 7 }, //interleave 3 x 2i8 into 6i8 (and store)
{ 3, MVT::v4i8, 8 }, //interleave 3 x 4i8 into 12i8 (and store)
{ 3, MVT::v8i8, 11 }, //interleave 3 x 8i8 into 24i8 (and store)
- { 3, MVT::v16i8, 17 }, //interleave 3 x 16i8 into 48i8 (and store)
- { 3, MVT::v32i8, 32 }, //interleave 3 x 32i8 into 96i8 (and store)
+ { 3, MVT::v16i8, 11 }, //interleave 3 x 16i8 into 48i8 (and store)
+ { 3, MVT::v32i8, 13 }, //interleave 3 x 32i8 into 96i8 (and store)
{ 4, MVT::v2i8, 12 }, //interleave 4 x 2i8 into 8i8 (and store)
{ 4, MVT::v4i8, 9 }, //interleave 4 x 4i8 into 16i8 (and store)
- { 4, MVT::v8i8, 16 }, //interleave 4 x 8i8 into 32i8 (and store)
- { 4, MVT::v16i8, 20 }, //interleave 4 x 16i8 into 64i8 (and store)
- { 4, MVT::v32i8, 40 } //interleave 4 x 32i8 into 128i8 (and store)
+ { 4, MVT::v8i8, 10 }, //interleave 4 x 8i8 into 32i8 (and store)
+ { 4, MVT::v16i8, 10 }, //interleave 4 x 16i8 into 64i8 (and store)
+ { 4, MVT::v32i8, 12 } //interleave 4 x 32i8 into 128i8 (and store)
};
if (Opcode == Instruction::Load) {
@@ -2349,7 +2703,7 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
} else {
assert(Opcode == Instruction::Store &&
"Expected Store Instruction at this point");
- if (const auto *Entry =
+ if (const auto *Entry =
CostTableLookup(AVX2InterleavedStoreTbl, Factor, ETy.getSimpleVT()))
return NumOfMemOps * MemOpCost + Entry->Cost;
}
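The interleaved-access costs are "memory ops plus one flat shuffle-sequence cost": the number of memory operations comes from how many legal registers the whole wide vector needs, and the table entry is added once on top. A sketch of that final arithmetic with made-up numbers standing in for the values computed earlier in the function:

// Illustrative sketch only -- not part of this patch.
#include <cstdio>

int interleavedCost(unsigned NumOfMemOps, int MemOpCost, int ShuffleSeqCost) {
  return static_cast<int>(NumOfMemOps) * MemOpCost + ShuffleSeqCost;
}

int main() {
  // e.g. a stride-3 load of 3 x v8f32: say 3 legal 256-bit loads of cost 1
  // each, plus the 17-instruction deinterleave sequence from the table.
  std::printf("%d\n", interleavedCost(3, 1, 17)); // 20
}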
@@ -2385,7 +2739,27 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned MemOpCost =
getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+ unsigned VF = VecTy->getVectorNumElements() / Factor;
+ MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
+
if (Opcode == Instruction::Load) {
+ // The tables (AVX512InterleavedLoadTbl and AVX512InterleavedStoreTbl)
+ // contain the cost of the optimized shuffle sequence that the
+ // X86InterleavedAccess pass will generate.
+ // The cost of loads and stores is computed separately from the table.
+
+ // X86InterleavedAccess supports only the following interleaved-access groups.
+ static const CostTblEntry AVX512InterleavedLoadTbl[] = {
+ {3, MVT::v16i8, 12}, //(load 48i8 and) deinterleave into 3 x 16i8
+ {3, MVT::v32i8, 14}, //(load 96i8 and) deinterleave into 3 x 32i8
+ {3, MVT::v64i8, 22}, //(load 192i8 and) deinterleave into 3 x 64i8
+ };
+
+ if (const auto *Entry =
+ CostTableLookup(AVX512InterleavedLoadTbl, Factor, VT))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ // If an entry does not exist, fall back to the default implementation.
+
// Kind of shuffle depends on number of loaded values.
// If we load the entire data in one register, we can use a 1-src shuffle.
// Otherwise, we'll merge 2 sources in each operation.
@@ -2428,6 +2802,22 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
// Store.
assert(Opcode == Instruction::Store &&
"Expected Store Instruction at this point");
+ // X86InterleavedAccess supports only the following interleaved-access groups.
+ static const CostTblEntry AVX512InterleavedStoreTbl[] = {
+ {3, MVT::v16i8, 12}, // interleave 3 x 16i8 into 48i8 (and store)
+ {3, MVT::v32i8, 14}, // interleave 3 x 32i8 into 96i8 (and store)
+ {3, MVT::v64i8, 26}, // interleave 3 x 64i8 into 192i8 (and store)
+
+ {4, MVT::v8i8, 10}, // interleave 4 x 8i8 into 32i8 (and store)
+ {4, MVT::v16i8, 11}, // interleave 4 x 16i8 into 64i8 (and store)
+ {4, MVT::v32i8, 14}, // interleave 4 x 32i8 into 128i8 (and store)
+ {4, MVT::v64i8, 24} // interleave 4 x 64i8 into 256i8 (and store)
+ };
+
+ if (const auto *Entry =
+ CostTableLookup(AVX512InterleavedStoreTbl, Factor, VT))
+ return NumOfMemOps * MemOpCost + Entry->Cost;
+ // If an entry does not exist, fall back to the default implementation.
// There are no strided stores for now, and a store can't be folded into a
// shuffle.
@@ -2449,27 +2839,22 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
ArrayRef<unsigned> Indices,
unsigned Alignment,
unsigned AddressSpace) {
- auto isSupportedOnAVX512 = [](Type *VecTy, bool &RequiresBW) {
- RequiresBW = false;
+ auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
Type *EltTy = VecTy->getVectorElementType();
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
EltTy->isIntegerTy(32) || EltTy->isPointerTy())
return true;
- if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8)) {
- RequiresBW = true;
- return true;
- }
+ if (EltTy->isIntegerTy(16) || EltTy->isIntegerTy(8))
+ return HasBW;
return false;
};
- bool RequiresBW;
- bool HasAVX512Solution = isSupportedOnAVX512(VecTy, RequiresBW);
- if (ST->hasAVX512() && HasAVX512Solution && (!RequiresBW || ST->hasBWI()))
+ if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
if (ST->hasAVX2())
return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
-
+
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace);
}
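The dispatcher now gates the AVX-512 path on a simpler element-type predicate: 32/64-bit (and FP/pointer) elements always have an AVX-512 interleaving lowering, while 8/16-bit elements additionally require AVX-512BW. A standalone restatement of isSupportedOnAVX512 with plain parameters:

// Illustrative sketch only -- not part of this patch.
#include <cstdio>

bool supportedOnAVX512(unsigned EltBits, bool HasBWI) {
  if (EltBits == 32 || EltBits == 64)
    return true;           // f32/f64/i32/i64/pointer-sized elements
  if (EltBits == 16 || EltBits == 8)
    return HasBWI;         // byte/word elements need the BWI extension
  return false;
}

int main() {
  std::printf("%d\n", supportedOnAVX512(32, false)); // 1
  std::printf("%d\n", supportedOnAVX512(8, false));  // 0
  std::printf("%d\n", supportedOnAVX512(8, true));   // 1
}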
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index ad0a0a211301..6f01a6fd11df 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -21,7 +21,7 @@
#include "X86TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/Target/TargetLowering.h"
+#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
@@ -47,6 +47,14 @@ public:
/// @}
+ /// \name Cache TTI Implementation
+ /// @{
+ llvm::Optional<unsigned> getCacheSize(
+ TargetTransformInfo::CacheLevel Level) const;
+ llvm::Optional<unsigned> getCacheAssociativity(
+ TargetTransformInfo::CacheLevel Level) const;
+ /// @}
+
/// \name Vector TTI Implementations
/// @{
@@ -85,7 +93,11 @@ public:
ArrayRef<Value *> Args, FastMathFlags FMF,
unsigned VF = 1);
- int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
+ int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
+ bool IsPairwiseForm);
+
+ int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm,
+ bool IsUnsigned);
int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
unsigned Factor, ArrayRef<unsigned> Indices,
@@ -101,16 +113,23 @@ public:
int getIntImmCost(const APInt &Imm, Type *Ty);
+ unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+
int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
Type *Ty);
+ bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+ TargetTransformInfo::LSRCost &C2);
bool isLegalMaskedLoad(Type *DataType);
bool isLegalMaskedStore(Type *DataType);
bool isLegalMaskedGather(Type *DataType);
bool isLegalMaskedScatter(Type *DataType);
+ bool hasDivRemOp(Type *DataType, bool IsSigned);
+ bool isFCmpOrdCheaperThanFCmpZero(Type *Ty);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
- bool expandMemCmp(Instruction *I, unsigned &MaxLoadSize);
+ const TTI::MemCmpExpansionOptions *enableMemCmpExpansion(
+ bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index d17dfac6a997..224262830b12 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -1,4 +1,4 @@
-//===-- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter -----------===//
+//===- X86VZeroUpper.cpp - AVX vzeroupper instruction inserter ------------===//
//
// The LLVM Compiler Infrastructure
//
@@ -17,14 +17,25 @@
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include <cassert>
+
using namespace llvm;
#define DEBUG_TYPE "x86-vzeroupper"
@@ -35,23 +46,25 @@ namespace {
class VZeroUpperInserter : public MachineFunctionPass {
public:
-
VZeroUpperInserter() : MachineFunctionPass(ID) {}
+
bool runOnMachineFunction(MachineFunction &MF) override;
+
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
}
+
StringRef getPassName() const override { return "X86 vzeroupper inserter"; }
private:
-
void processBasicBlock(MachineBasicBlock &MBB);
void insertVZeroUpper(MachineBasicBlock::iterator I,
MachineBasicBlock &MBB);
void addDirtySuccessor(MachineBasicBlock &MBB);
- typedef enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY } BlockExitState;
+ using BlockExitState = enum { PASS_THROUGH, EXITS_CLEAN, EXITS_DIRTY };
+
static const char* getBlockExitStateName(BlockExitState ST);
// Core algorithm state:
@@ -73,13 +86,15 @@ namespace {
// to be guarded until we discover a predecessor that
// is DIRTY_OUT.
struct BlockState {
- BlockState() : ExitState(PASS_THROUGH), AddedToDirtySuccessors(false) {}
- BlockExitState ExitState;
- bool AddedToDirtySuccessors;
+ BlockExitState ExitState = PASS_THROUGH;
+ bool AddedToDirtySuccessors = false;
MachineBasicBlock::iterator FirstUnguardedCall;
+
+ BlockState() = default;
};
- typedef SmallVector<BlockState, 8> BlockStateMap;
- typedef SmallVector<MachineBasicBlock*, 8> DirtySuccessorsWorkList;
+
+ using BlockStateMap = SmallVector<BlockState, 8>;
+ using DirtySuccessorsWorkList = SmallVector<MachineBasicBlock *, 8>;
BlockStateMap BlockStates;
DirtySuccessorsWorkList DirtySuccessors;
@@ -90,8 +105,9 @@ namespace {
static char ID;
};
- char VZeroUpperInserter::ID = 0;
-}
+} // end anonymous namespace
+
+char VZeroUpperInserter::ID = 0;
FunctionPass *llvm::createX86IssueVZeroUpperPass() {
return new VZeroUpperInserter();
@@ -116,9 +132,8 @@ static bool isYmmOrZmmReg(unsigned Reg) {
}
static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) {
- for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
- E = MRI.livein_end(); I != E; ++I)
- if (isYmmOrZmmReg(I->first))
+ for (std::pair<unsigned, unsigned> LI : MRI.liveins())
+ if (isYmmOrZmmReg(LI.first))
return true;
return false;
@@ -220,7 +235,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
// If the call has no RegMask, skip it as well. It usually happens on
// helper function calls (such as '_chkstk', '_ftol2') where standard
// calling convention is not used (RegMask is not used to mark register
- // clobbered and register usage (def/imp-def/use) is well-defined and
+ // clobbered and register usage (def/implicit-def/use) is well-defined and
// explicitly specified.
if (IsCall && !callHasRegMask(MI))
continue;
@@ -270,7 +285,7 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
EverMadeChange = false;
- IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;
+ IsX86INTR = MF.getFunction().getCallingConv() == CallingConv::X86_INTR;
bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI);
diff --git a/lib/Target/X86/X86WinAllocaExpander.cpp b/lib/Target/X86/X86WinAllocaExpander.cpp
index fc08f1582ad7..1046696587d9 100644
--- a/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -25,9 +25,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
using namespace llvm;
@@ -279,9 +279,9 @@ bool X86WinAllocaExpander::runOnMachineFunction(MachineFunction &MF) {
SlotSize = TRI->getSlotSize();
StackProbeSize = 4096;
- if (MF.getFunction()->hasFnAttribute("stack-probe-size")) {
+ if (MF.getFunction().hasFnAttribute("stack-probe-size")) {
MF.getFunction()
- ->getFnAttribute("stack-probe-size")
+ .getFnAttribute("stack-probe-size")
.getValueAsString()
.getAsInteger(0, StackProbeSize);
}
diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp
index 0c3b34341476..0472a85f50da 100644
--- a/lib/Target/X86/X86WinEHState.cpp
+++ b/lib/Target/X86/X86WinEHState.cpp
@@ -401,6 +401,8 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) {
Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape(
ParentFunc->getName()),
TheModule);
+ if (auto *C = ParentFunc->getComdat())
+ Trampoline->setComdat(C);
BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline);
IRBuilder<> Builder(EntryBB);
Value *LSDA = emitEHLSDA(Builder, ParentFunc);