author     Dimitry Andric <dim@FreeBSD.org>    2020-01-17 20:45:01 +0000
committer  Dimitry Andric <dim@FreeBSD.org>    2020-01-17 20:45:01 +0000
commit     706b4fc47bbc608932d3b491ae19a3b9cde9497b (patch)
tree       4adf86a776049cbf7f69a1929c4babcbbef925eb /llvm/lib/Target/X86
parent     7cc9cf2bf09f069cb2dd947ead05d0b54301fb71 (diff)
Diffstat (limited to 'llvm/lib/Target/X86')
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp | 224
-rw-r--r--  llvm/lib/Target/X86/AsmParser/X86Operand.h | 27
-rw-r--r--  llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp | 1670
-rw-r--r--  llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp | 1938
-rw-r--r--  llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h | 69
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp | 7
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h | 6
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp | 463
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 294
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp | 8
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h | 6
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp | 1057
-rw-r--r--  llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp | 14
-rw-r--r--  llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86.h | 12
-rw-r--r--  llvm/lib/Target/X86/X86.td | 133
-rw-r--r--  llvm/lib/Target/X86/X86AsmPrinter.cpp | 50
-rw-r--r--  llvm/lib/Target/X86/X86AsmPrinter.h | 6
-rw-r--r--  llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp | 9
-rw-r--r--  llvm/lib/Target/X86/X86CallFrameOptimization.cpp | 12
-rw-r--r--  llvm/lib/Target/X86/X86CallLowering.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86CallingConv.td | 35
-rw-r--r--  llvm/lib/Target/X86/X86CmovConversion.cpp | 7
-rw-r--r--  llvm/lib/Target/X86/X86CondBrFolding.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86DomainReassignment.cpp | 6
-rwxr-xr-x  llvm/lib/Target/X86/X86EvexToVex.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86ExpandPseudo.cpp | 10
-rw-r--r--  llvm/lib/Target/X86/X86FastISel.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86FixupBWInsts.cpp | 24
-rw-r--r--  llvm/lib/Target/X86/X86FixupLEAs.cpp | 7
-rw-r--r--  llvm/lib/Target/X86/X86FixupSetCC.cpp | 50
-rw-r--r--  llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 92
-rw-r--r--  llvm/lib/Target/X86/X86FloatingPoint.cpp | 48
-rw-r--r--  llvm/lib/Target/X86/X86FrameLowering.cpp | 32
-rw-r--r--  llvm/lib/Target/X86/X86ISelDAGToDAG.cpp | 179
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.cpp | 4018
-rw-r--r--  llvm/lib/Target/X86/X86ISelLowering.h | 65
-rw-r--r--  llvm/lib/Target/X86/X86IndirectBranchTracking.cpp | 6
-rw-r--r--  llvm/lib/Target/X86/X86InstrAVX512.td | 705
-rw-r--r--  llvm/lib/Target/X86/X86InstrControl.td | 23
-rw-r--r--  llvm/lib/Target/X86/X86InstrFMA.td | 35
-rw-r--r--  llvm/lib/Target/X86/X86InstrFPStack.td | 141
-rw-r--r--  llvm/lib/Target/X86/X86InstrFormats.td | 6
-rw-r--r--  llvm/lib/Target/X86/X86InstrFragmentsSIMD.td | 96
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.cpp | 175
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.h | 14
-rw-r--r--  llvm/lib/Target/X86/X86InstrInfo.td | 61
-rw-r--r--  llvm/lib/Target/X86/X86InstrMMX.td | 10
-rw-r--r--  llvm/lib/Target/X86/X86InstrSSE.td | 560
-rw-r--r--  llvm/lib/Target/X86/X86InstrTSX.td | 2
-rw-r--r--  llvm/lib/Target/X86/X86InstructionSelector.cpp | 87
-rw-r--r--  llvm/lib/Target/X86/X86IntrinsicsInfo.h | 1
-rw-r--r--  llvm/lib/Target/X86/X86LegalizerInfo.cpp | 8
-rw-r--r--  llvm/lib/Target/X86/X86MCInstLower.cpp | 166
-rw-r--r--  llvm/lib/Target/X86/X86MacroFusion.cpp | 183
-rw-r--r--  llvm/lib/Target/X86/X86OptimizeLEAs.cpp | 24
-rw-r--r--  llvm/lib/Target/X86/X86PadShortFunction.cpp | 20
-rw-r--r--  llvm/lib/Target/X86/X86PfmCounters.td | 16
-rw-r--r--  llvm/lib/Target/X86/X86RegisterBankInfo.cpp | 5
-rw-r--r--  llvm/lib/Target/X86/X86RegisterBankInfo.h | 4
-rw-r--r--  llvm/lib/Target/X86/X86RegisterInfo.cpp | 34
-rw-r--r--  llvm/lib/Target/X86/X86RegisterInfo.td | 5
-rw-r--r--  llvm/lib/Target/X86/X86RetpolineThunks.cpp | 12
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleAtom.td | 3
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleSLM.td | 30
-rw-r--r--  llvm/lib/Target/X86/X86ScheduleZnver2.td | 1548
-rw-r--r--  llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp | 12
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.cpp | 14
-rw-r--r--  llvm/lib/Target/X86/X86Subtarget.h | 36
-rw-r--r--  llvm/lib/Target/X86/X86TargetMachine.cpp | 20
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 146
-rw-r--r--  llvm/lib/Target/X86/X86TargetTransformInfo.h | 22
-rw-r--r--  llvm/lib/Target/X86/X86VZeroUpper.cpp | 2
-rw-r--r--  llvm/lib/Target/X86/X86WinAllocaExpander.cpp | 16
-rw-r--r--  llvm/lib/Target/X86/X86WinEHState.cpp | 10
75 files changed, 9120 insertions, 5726 deletions
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 25be79ec2b1e..d37d812df485 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -134,7 +134,6 @@ private:
IOK_LENGTH,
IOK_SIZE,
IOK_TYPE,
- IOK_OFFSET
};
class InfixCalculator {
@@ -326,6 +325,7 @@ private:
IES_RSHIFT,
IES_PLUS,
IES_MINUS,
+ IES_OFFSET,
IES_NOT,
IES_MULTIPLY,
IES_DIVIDE,
@@ -350,16 +350,30 @@ private:
InlineAsmIdentifierInfo Info;
short BracCount;
bool MemExpr;
+ bool OffsetOperator;
+ SMLoc OffsetOperatorLoc;
+
+ bool setSymRef(const MCExpr *Val, StringRef ID, StringRef &ErrMsg) {
+ if (Sym) {
+ ErrMsg = "cannot use more than one symbol in memory operand";
+ return true;
+ }
+ Sym = Val;
+ SymName = ID;
+ return false;
+ }
public:
IntelExprStateMachine()
: State(IES_INIT), PrevState(IES_ERROR), BaseReg(0), IndexReg(0),
TmpReg(0), Scale(0), Imm(0), Sym(nullptr), BracCount(0),
- MemExpr(false) {}
+ MemExpr(false), OffsetOperator(false) {}
void addImm(int64_t imm) { Imm += imm; }
short getBracCount() { return BracCount; }
bool isMemExpr() { return MemExpr; }
+ bool isOffsetOperator() { return OffsetOperator; }
+ SMLoc getOffsetLoc() { return OffsetOperatorLoc; }
unsigned getBaseReg() { return BaseReg; }
unsigned getIndexReg() { return IndexReg; }
unsigned getScale() { return Scale; }
@@ -456,6 +470,7 @@ private:
case IES_INTEGER:
case IES_RPAREN:
case IES_REGISTER:
+ case IES_OFFSET:
State = IES_PLUS;
IC.pushOperator(IC_PLUS);
if (CurrState == IES_REGISTER && PrevState != IES_MULTIPLY) {
@@ -500,10 +515,12 @@ private:
case IES_INTEGER:
case IES_REGISTER:
case IES_INIT:
+ case IES_OFFSET:
State = IES_MINUS;
// push minus operator if it is not a negate operator
if (CurrState == IES_REGISTER || CurrState == IES_RPAREN ||
- CurrState == IES_INTEGER || CurrState == IES_RBRAC)
+ CurrState == IES_INTEGER || CurrState == IES_RBRAC ||
+ CurrState == IES_OFFSET)
IC.pushOperator(IC_MINUS);
else if (PrevState == IES_REGISTER && CurrState == IES_MULTIPLY) {
// We have negate operator for Scale: it's illegal
@@ -556,7 +573,6 @@ private:
}
PrevState = CurrState;
}
-
bool onRegister(unsigned Reg, StringRef &ErrMsg) {
IntelExprState CurrState = State;
switch (State) {
@@ -604,7 +620,6 @@ private:
if (auto *CE = dyn_cast<MCConstantExpr>(SymRef))
return onInteger(CE->getValue(), ErrMsg);
PrevState = State;
- bool HasSymbol = Sym != nullptr;
switch (State) {
default:
State = IES_ERROR;
@@ -614,18 +629,16 @@ private:
case IES_NOT:
case IES_INIT:
case IES_LBRAC:
+ if (setSymRef(SymRef, SymRefName, ErrMsg))
+ return true;
MemExpr = true;
State = IES_INTEGER;
- Sym = SymRef;
- SymName = SymRefName;
IC.pushOperand(IC_IMM);
if (ParsingInlineAsm)
Info = IDInfo;
break;
}
- if (HasSymbol)
- ErrMsg = "cannot use more than one symbol in memory operand";
- return HasSymbol;
+ return false;
}
bool onInteger(int64_t TmpInt, StringRef &ErrMsg) {
IntelExprState CurrState = State;
@@ -738,6 +751,7 @@ private:
State = IES_ERROR;
break;
case IES_INTEGER:
+ case IES_OFFSET:
case IES_REGISTER:
case IES_RPAREN:
if (BracCount-- != 1)
@@ -792,6 +806,7 @@ private:
State = IES_ERROR;
break;
case IES_INTEGER:
+ case IES_OFFSET:
case IES_REGISTER:
case IES_RPAREN:
State = IES_RPAREN;
@@ -799,6 +814,32 @@ private:
break;
}
}
+ bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
+ const InlineAsmIdentifierInfo &IDInfo, bool ParsingInlineAsm,
+ StringRef &ErrMsg) {
+ PrevState = State;
+ switch (State) {
+ default:
+ ErrMsg = "unexpected offset operator expression";
+ return true;
+ case IES_PLUS:
+ case IES_INIT:
+ case IES_LBRAC:
+ if (setSymRef(Val, ID, ErrMsg))
+ return true;
+ OffsetOperator = true;
+ OffsetOperatorLoc = OffsetLoc;
+ State = IES_OFFSET;
+ // As we cannot yet resolve the actual value (offset), we retain
+ // the requested semantics by pushing a '0' to the operands stack
+ IC.pushOperand(IC_IMM);
+ if (ParsingInlineAsm) {
+ Info = IDInfo;
+ }
+ break;
+ }
+ return false;
+ }
};
bool Error(SMLoc L, const Twine &Msg, SMRange Range = None,
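The comment inside onOffset() above captures the key idea of the hunk: the address that "offset" names cannot be resolved while the expression is folded, so a zero placeholder joins the constant folding and the symbol itself is recorded separately via setSymRef(). A self-contained sketch of that technique; the struct and names below are illustrative only and not taken from the patch:

#include <cassert>
#include <string>

struct FoldedOperand {
  std::string Sym;    // recorded once, like setSymRef() in the state machine
  long long Imm = 0;  // constants fold here, with 0 standing in for the symbol
};

int main() {
  FoldedOperand Op;
  Op.Sym = "table";   // "offset table"
  Op.Imm += 0;        // placeholder pushed for the unresolved address
  Op.Imm += 8;        // "+ 8" folds as usual
  Op.Imm -= 4;        // "- 4"
  assert(Op.Sym == "table" && Op.Imm == 4); // later emitted as table+4
  return 0;
}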
@@ -830,18 +871,21 @@ private:
std::unique_ptr<X86Operand> ParseOperand();
std::unique_ptr<X86Operand> ParseATTOperand();
std::unique_ptr<X86Operand> ParseIntelOperand();
- std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
+ bool ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
+ InlineAsmIdentifierInfo &Info, SMLoc &End);
bool ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End);
unsigned IdentifyIntelInlineAsmOperator(StringRef Name);
unsigned ParseIntelInlineAsmOperator(unsigned OpKind);
std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start);
- bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM);
+ bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM,
+ bool &ParseError, SMLoc &End);
void RewriteIntelExpression(IntelExprStateMachine &SM, SMLoc Start,
SMLoc End);
bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
bool ParseIntelInlineAsmIdentifier(const MCExpr *&Val, StringRef &Identifier,
InlineAsmIdentifierInfo &Info,
- bool IsUnevaluatedOperand, SMLoc &End);
+ bool IsUnevaluatedOperand, SMLoc &End,
+ bool IsParsingOffsetOperator = false);
std::unique_ptr<X86Operand> ParseMemOperand(unsigned SegReg,
const MCExpr *&Disp,
@@ -1112,9 +1156,10 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
if (RegNo == 0)
RegNo = MatchRegisterName(Tok.getString().lower());
- // The "flags" register cannot be referenced directly.
+ // The "flags" and "mxcsr" registers cannot be referenced directly.
// Treat it as an identifier instead.
- if (isParsingInlineAsm() && isParsingIntelSyntax() && RegNo == X86::EFLAGS)
+ if (isParsingInlineAsm() && isParsingIntelSyntax() &&
+ (RegNo == X86::EFLAGS || RegNo == X86::MXCSR))
RegNo = 0;
if (!is64BitMode()) {
@@ -1408,26 +1453,44 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
// Some binary bitwise operators have a named synonymous
// Query a candidate string for being such a named operator
// and if so - invoke the appropriate handler
-bool X86AsmParser::ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM) {
+bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
+ IntelExprStateMachine &SM,
+ bool &ParseError, SMLoc &End) {
// A named operator should be either lower or upper case, but not a mix
if (Name.compare(Name.lower()) && Name.compare(Name.upper()))
return false;
- if (Name.equals_lower("not"))
+ if (Name.equals_lower("not")) {
SM.onNot();
- else if (Name.equals_lower("or"))
+ } else if (Name.equals_lower("or")) {
SM.onOr();
- else if (Name.equals_lower("shl"))
+ } else if (Name.equals_lower("shl")) {
SM.onLShift();
- else if (Name.equals_lower("shr"))
+ } else if (Name.equals_lower("shr")) {
SM.onRShift();
- else if (Name.equals_lower("xor"))
+ } else if (Name.equals_lower("xor")) {
SM.onXor();
- else if (Name.equals_lower("and"))
+ } else if (Name.equals_lower("and")) {
SM.onAnd();
- else if (Name.equals_lower("mod"))
+ } else if (Name.equals_lower("mod")) {
SM.onMod();
- else
+ } else if (Name.equals_lower("offset")) {
+ SMLoc OffsetLoc = getTok().getLoc();
+ const MCExpr *Val = nullptr;
+ StringRef ID;
+ InlineAsmIdentifierInfo Info;
+ ParseError = ParseIntelOffsetOperator(Val, ID, Info, End);
+ if (ParseError)
+ return true;
+ StringRef ErrMsg;
+ ParseError =
+ SM.onOffset(Val, OffsetLoc, ID, Info, isParsingInlineAsm(), ErrMsg);
+ if (ParseError)
+ return Error(SMLoc::getFromPointer(Name.data()), ErrMsg);
+ } else {
return false;
+ }
+ if (!Name.equals_lower("offset"))
+ End = consumeToken();
return true;
}
@@ -1470,8 +1533,12 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
break;
}
// Operator synonymous ("not", "or" etc.)
- if ((UpdateLocLex = ParseIntelNamedOperator(Identifier, SM)))
+ bool ParseError = false;
+ if (ParseIntelNamedOperator(Identifier, SM, ParseError, End)) {
+ if (ParseError)
+ return true;
break;
+ }
// Symbol reference, when parsing assembly content
InlineAsmIdentifierInfo Info;
const MCExpr *Val;
@@ -1485,9 +1552,6 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
}
// MS InlineAsm operators (TYPE/LENGTH/SIZE)
if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) {
- if (OpKind == IOK_OFFSET)
- return Error(IdentLoc, "Dealing OFFSET operator as part of"
- "a compound immediate expression is yet to be supported");
if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
if (SM.onInteger(Val, ErrMsg))
return Error(IdentLoc, ErrMsg);
@@ -1589,9 +1653,9 @@ void X86AsmParser::RewriteIntelExpression(IntelExprStateMachine &SM,
SMLoc Loc = Start;
unsigned ExprLen = End.getPointer() - Start.getPointer();
// Skip everything before a symbol displacement (if we have one)
- if (SM.getSym()) {
+ if (SM.getSym() && !SM.isOffsetOperator()) {
StringRef SymName = SM.getSymName();
- if (unsigned Len = SymName.data() - Start.getPointer())
+ if (unsigned Len = SymName.data() - Start.getPointer())
InstInfo->AsmRewrites->emplace_back(AOK_Skip, Start, Len);
Loc = SMLoc::getFromPointer(SymName.data() + SymName.size());
ExprLen = End.getPointer() - (SymName.data() + SymName.size());
@@ -1606,21 +1670,23 @@ void X86AsmParser::RewriteIntelExpression(IntelExprStateMachine &SM,
// Build an Intel Expression rewrite
StringRef BaseRegStr;
StringRef IndexRegStr;
+ StringRef OffsetNameStr;
if (SM.getBaseReg())
BaseRegStr = X86IntelInstPrinter::getRegisterName(SM.getBaseReg());
if (SM.getIndexReg())
IndexRegStr = X86IntelInstPrinter::getRegisterName(SM.getIndexReg());
+ if (SM.isOffsetOperator())
+ OffsetNameStr = SM.getSymName();
// Emit it
- IntelExpr Expr(BaseRegStr, IndexRegStr, SM.getScale(), SM.getImm(), SM.isMemExpr());
+ IntelExpr Expr(BaseRegStr, IndexRegStr, SM.getScale(), OffsetNameStr,
+ SM.getImm(), SM.isMemExpr());
InstInfo->AsmRewrites->emplace_back(Loc, ExprLen, Expr);
}
// Inline assembly may use variable names with namespace alias qualifiers.
-bool X86AsmParser::ParseIntelInlineAsmIdentifier(const MCExpr *&Val,
- StringRef &Identifier,
- InlineAsmIdentifierInfo &Info,
- bool IsUnevaluatedOperand,
- SMLoc &End) {
+bool X86AsmParser::ParseIntelInlineAsmIdentifier(
+ const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info,
+ bool IsUnevaluatedOperand, SMLoc &End, bool IsParsingOffsetOperator) {
MCAsmParser &Parser = getParser();
assert(isParsingInlineAsm() && "Expected to be parsing inline assembly.");
Val = nullptr;
@@ -1653,9 +1719,13 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier(const MCExpr *&Val,
SemaCallback->LookupInlineAsmLabel(Identifier, getSourceManager(),
Loc, false);
assert(InternalName.size() && "We should have an internal name here.");
- // Push a rewrite for replacing the identifier name with the internal name.
- InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(),
- InternalName);
+ // Push a rewrite for replacing the identifier name with the internal name,
+ // unless we are parsing the operand of an offset operator
+ if (!IsParsingOffsetOperator)
+ InstInfo->AsmRewrites->emplace_back(AOK_Label, Loc, Identifier.size(),
+ InternalName);
+ else
+ Identifier = InternalName;
} else if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
return false;
// Create the symbol reference.
@@ -1738,39 +1808,25 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End)
return false;
}
-/// Parse the 'offset' operator. This operator is used to specify the
-/// location rather then the content of a variable.
-std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
- MCAsmParser &Parser = getParser();
- const AsmToken &Tok = Parser.getTok();
- SMLoc OffsetOfLoc = Tok.getLoc();
- Parser.Lex(); // Eat offset.
-
- const MCExpr *Val;
- InlineAsmIdentifierInfo Info;
- SMLoc Start = Tok.getLoc(), End;
- StringRef Identifier = Tok.getString();
- if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info,
- /*Unevaluated=*/false, End))
- return nullptr;
-
- void *Decl = nullptr;
- // FIXME: MS evaluates "offset <Constant>" to the underlying integral
- if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
- return ErrorOperand(Start, "offset operator cannot yet handle constants");
- else if (Info.isKind(InlineAsmIdentifierInfo::IK_Var))
- Decl = Info.Var.Decl;
- // Don't emit the offset operator.
- InstInfo->AsmRewrites->emplace_back(AOK_Skip, OffsetOfLoc, 7);
-
- // The offset operator will have an 'r' constraint, thus we need to create
- // register operand to ensure proper matching. Just pick a GPR based on
- // the size of a pointer.
- bool Parse32 = is32BitMode() || Code16GCC;
- unsigned RegNo = is64BitMode() ? X86::RBX : (Parse32 ? X86::EBX : X86::BX);
-
- return X86Operand::CreateReg(RegNo, Start, End, /*GetAddress=*/true,
- OffsetOfLoc, Identifier, Decl);
+/// Parse the 'offset' operator.
+/// This operator is used to specify the location of a given operand
+bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
+ InlineAsmIdentifierInfo &Info,
+ SMLoc &End) {
+ // Eat offset, mark start of identifier.
+ SMLoc Start = Lex().getLoc();
+ ID = getTok().getString();
+ if (!isParsingInlineAsm()) {
+ if ((getTok().isNot(AsmToken::Identifier) &&
+ getTok().isNot(AsmToken::String)) ||
+ getParser().parsePrimaryExpr(Val, End))
+ return Error(Start, "unexpected token!");
+ } else if (ParseIntelInlineAsmIdentifier(Val, ID, Info, false, End, true)) {
+ return Error(Start, "unable to lookup expression");
+ } else if (Info.isKind(InlineAsmIdentifierInfo::IK_EnumVal)) {
+ return Error(Start, "offset operator cannot yet handle constants");
+ }
+ return false;
}
// Query a candidate string for being an Intel assembly operator
@@ -1780,7 +1836,6 @@ unsigned X86AsmParser::IdentifyIntelInlineAsmOperator(StringRef Name) {
.Cases("TYPE","type",IOK_TYPE)
.Cases("SIZE","size",IOK_SIZE)
.Cases("LENGTH","length",IOK_LENGTH)
- .Cases("OFFSET","offset",IOK_OFFSET)
.Default(IOK_INVALID);
}
@@ -1850,13 +1905,6 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
const AsmToken &Tok = Parser.getTok();
SMLoc Start, End;
- // FIXME: Offset operator
- // Should be handled as part of immediate expression, as other operators
- // Currently, only supported as a stand-alone operand
- if (isParsingInlineAsm())
- if (IdentifyIntelInlineAsmOperator(Tok.getString()) == IOK_OFFSET)
- return ParseIntelOffsetOfOperator();
-
// Parse optional Size directive.
unsigned Size;
if (ParseIntelMemoryOperandSize(Size))
@@ -1904,8 +1952,19 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
// RegNo != 0 specifies a valid segment register,
// and we are parsing a segment override
- if (!SM.isMemExpr() && !RegNo)
+ if (!SM.isMemExpr() && !RegNo) {
+ if (isParsingInlineAsm() && SM.isOffsetOperator()) {
+ const InlineAsmIdentifierInfo Info = SM.getIdentifierInfo();
+ if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
+ // Disp includes the address of a variable; make sure this is recorded
+ // for later handling.
+ return X86Operand::CreateImm(Disp, Start, End, SM.getSymName(),
+ Info.Var.Decl, Info.Var.IsGlobalLV);
+ }
+ }
+
return X86Operand::CreateImm(Disp, Start, End);
+ }
StringRef ErrMsg;
unsigned BaseReg = SM.getBaseReg();
@@ -3131,6 +3190,7 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
case X86::VCVTTSS2SI64Zrm: case X86::VCVTTSS2SI64Zrm_Int:
if (ForcedVEXEncoding != VEXEncoding_EVEX)
return Match_Unsupported;
+ break;
}
return Match_Success;
@@ -3879,7 +3939,7 @@ bool X86AsmParser::parseDirectiveSEHPushFrame(SMLoc Loc) {
}
// Force static initialization.
-extern "C" void LLVMInitializeX86AsmParser() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86AsmParser() {
RegisterMCAsmParser<X86AsmParser> X(getTheX86_32Target());
RegisterMCAsmParser<X86AsmParser> Y(getTheX86_64Target());
}
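Taken together, the X86AsmParser.cpp changes retire the stand-alone ParseIntelOffsetOfOperator() path and fold "offset" into the expression state machine, so the operator can now take part in compound immediate expressions instead of being rejected with the removed "yet to be supported" diagnostic. A minimal illustration of the user-visible effect, assuming MS-style inline assembly (MSVC, or clang with -fms-extensions, targeting 32-bit x86); this snippet is not taken from the patch or its tests:

static int table[4];

void load_entry_address() {
  __asm {
    // 'offset' composed with arithmetic in a single immediate expression:
    // loads the address of table[1] (offset table plus 4 bytes).
    mov eax, offset table + 4
  }
}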
diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h
index 3a76d023e640..d831a63b04ee 100644
--- a/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -36,6 +36,7 @@ struct X86Operand final : public MCParsedAsmOperand {
StringRef SymName;
void *OpDecl;
bool AddressOf;
+ bool CallOperand;
struct TokOp {
const char *Data;
@@ -52,6 +53,7 @@ struct X86Operand final : public MCParsedAsmOperand {
struct ImmOp {
const MCExpr *Val;
+ bool LocalRef;
};
struct MemOp {
@@ -77,7 +79,7 @@ struct X86Operand final : public MCParsedAsmOperand {
};
X86Operand(KindTy K, SMLoc Start, SMLoc End)
- : Kind(K), StartLoc(Start), EndLoc(End) {}
+ : Kind(K), StartLoc(Start), EndLoc(End), CallOperand(false) {}
StringRef getSymName() override { return SymName; }
void *getOpDecl() override { return OpDecl; }
@@ -104,8 +106,8 @@ struct X86Operand final : public MCParsedAsmOperand {
} else if (Val->getKind() == MCExpr::SymbolRef) {
if (auto *SRE = dyn_cast<MCSymbolRefExpr>(Val)) {
const MCSymbol &Sym = SRE->getSymbol();
- if (auto SymName = Sym.getName().data())
- OS << VName << SymName;
+ if (const char *SymNameStr = Sym.getName().data())
+ OS << VName << SymNameStr;
}
}
};
@@ -278,13 +280,9 @@ struct X86Operand final : public MCParsedAsmOperand {
return isImmUnsignedi8Value(CE->getValue());
}
- bool isOffsetOf() const override {
- return OffsetOfLoc.getPointer();
- }
+ bool isOffsetOfLocal() const override { return isImm() && Imm.LocalRef; }
- bool needAddressOf() const override {
- return AddressOf;
- }
+ bool needAddressOf() const override { return AddressOf; }
bool isMem() const override { return Kind == Memory; }
bool isMemUnsized() const {
@@ -613,9 +611,16 @@ struct X86Operand final : public MCParsedAsmOperand {
}
static std::unique_ptr<X86Operand> CreateImm(const MCExpr *Val,
- SMLoc StartLoc, SMLoc EndLoc) {
+ SMLoc StartLoc, SMLoc EndLoc,
+ StringRef SymName = StringRef(),
+ void *OpDecl = nullptr,
+ bool GlobalRef = true) {
auto Res = std::make_unique<X86Operand>(Immediate, StartLoc, EndLoc);
- Res->Imm.Val = Val;
+ Res->Imm.Val = Val;
+ Res->Imm.LocalRef = !GlobalRef;
+ Res->SymName = SymName;
+ Res->OpDecl = OpDecl;
+ Res->AddressOf = true;
return Res;
}
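For reference, a hedged sketch of the two ways the extended CreateImm() is now called; Val, Start, End and Decl stand for values produced by the parser and are not defined here. The defaulted parameters keep existing callers unchanged, while the inline-asm "offset" path (see ParseIntelOperand above) threads through the symbol name and front-end declaration:

// Plain immediate, exactly as before -- Imm.LocalRef stays false:
auto Plain = X86Operand::CreateImm(Val, Start, End);

// "offset <var>" from MS inline asm -- GlobalRef=false marks a local
// reference, so isOffsetOfLocal() will return true for this operand:
auto Addr = X86Operand::CreateImm(Val, Start, End, /*SymName=*/"var",
                                  /*OpDecl=*/Decl, /*GlobalRef=*/false);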
diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index 9a635bbe5f85..ea8c606d1564 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -84,6 +84,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -92,24 +93,1552 @@ using namespace llvm::X86Disassembler;
#define DEBUG_TYPE "x86-disassembler"
-void llvm::X86Disassembler::Debug(const char *file, unsigned line,
- const char *s) {
- dbgs() << file << ":" << line << ": " << s;
+#define debug(s) LLVM_DEBUG(dbgs() << __LINE__ << ": " << s);
+
+// Specifies whether a ModR/M byte is needed and (if so) which
+// instruction each possible value of the ModR/M byte corresponds to. Once
+// this information is known, we have narrowed down to a single instruction.
+struct ModRMDecision {
+ uint8_t modrm_type;
+ uint16_t instructionIDs;
+};
+
+// Specifies which set of ModR/M->instruction tables to look at
+// given a particular opcode.
+struct OpcodeDecision {
+ ModRMDecision modRMDecisions[256];
+};
+
+// Specifies which opcode->instruction tables to look at given
+// a particular context (set of attributes). Since there are many possible
+// contexts, the decoder first uses CONTEXTS_SYM to determine which context
+// applies given a specific set of attributes. Hence there are only IC_max
+// entries in this table, rather than 2^(ATTR_max).
+struct ContextDecision {
+ OpcodeDecision opcodeDecisions[IC_max];
+};
+
+#include "X86GenDisassemblerTables.inc"
+
+static InstrUID decode(OpcodeType type, InstructionContext insnContext,
+ uint8_t opcode, uint8_t modRM) {
+ const struct ModRMDecision *dec;
+
+ switch (type) {
+ case ONEBYTE:
+ dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case TWOBYTE:
+ dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_38:
+ dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEBYTE_3A:
+ dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOP8_MAP:
+ dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOP9_MAP:
+ dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case XOPA_MAP:
+ dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ case THREEDNOW_MAP:
+ dec =
+ &THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
+ break;
+ }
+
+ switch (dec->modrm_type) {
+ default:
+ llvm_unreachable("Corrupt table! Unknown modrm_type");
+ return 0;
+ case MODRM_ONEENTRY:
+ return modRMTable[dec->instructionIDs];
+ case MODRM_SPLITRM:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs + 1];
+ return modRMTable[dec->instructionIDs];
+ case MODRM_SPLITREG:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3) + 8];
+ return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3)];
+ case MODRM_SPLITMISC:
+ if (modFromModRM(modRM) == 0x3)
+ return modRMTable[dec->instructionIDs + (modRM & 0x3f) + 8];
+ return modRMTable[dec->instructionIDs + ((modRM & 0x38) >> 3)];
+ case MODRM_FULL:
+ return modRMTable[dec->instructionIDs + modRM];
+ }
}
-StringRef llvm::X86Disassembler::GetInstrName(unsigned Opcode,
- const void *mii) {
- const MCInstrInfo *MII = static_cast<const MCInstrInfo *>(mii);
- return MII->getName(Opcode);
+static bool peek(struct InternalInstruction *insn, uint8_t &byte) {
+ uint64_t offset = insn->readerCursor - insn->startLocation;
+ if (offset >= insn->bytes.size())
+ return true;
+ byte = insn->bytes[offset];
+ return false;
}
-#define debug(s) LLVM_DEBUG(Debug(__FILE__, __LINE__, s));
+template <typename T> static bool consume(InternalInstruction *insn, T &ptr) {
+ auto r = insn->bytes;
+ uint64_t offset = insn->readerCursor - insn->startLocation;
+ if (offset + sizeof(T) > r.size())
+ return true;
+ T ret = 0;
+ for (unsigned i = 0; i < sizeof(T); ++i)
+ ret |= (uint64_t)r[offset + i] << (i * 8);
+ ptr = ret;
+ insn->readerCursor += sizeof(T);
+ return false;
+}
+
+static bool isREX(struct InternalInstruction *insn, uint8_t prefix) {
+ return insn->mode == MODE_64BIT && prefix >= 0x40 && prefix <= 0x4f;
+}
+
+// Consumes all of an instruction's prefix bytes, and marks the
+// instruction as having them. Also sets the instruction's default operand,
+// address, and other relevant data sizes to report operands correctly.
+//
+// insn must not be empty.
+static int readPrefixes(struct InternalInstruction *insn) {
+ bool isPrefix = true;
+ uint8_t byte = 0;
+ uint8_t nextByte;
+
+ LLVM_DEBUG(dbgs() << "readPrefixes()");
+
+ while (isPrefix) {
+ // If we fail reading prefixes, just stop here and let the opcode reader
+ // deal with it.
+ if (consume(insn, byte))
+ break;
+
+ // If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
+ // break and let it be disassembled as a normal "instruction".
+ if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) // LOCK
+ break;
+
+ if ((byte == 0xf2 || byte == 0xf3) && !peek(insn, nextByte)) {
+ // If the byte is 0xf2 or 0xf3, and any of the following conditions are
+ // met:
+ // - it is followed by a LOCK (0xf0) prefix
+ // - it is followed by an xchg instruction
+ // then it should be disassembled as a xacquire/xrelease not repne/rep.
+ if (((nextByte == 0xf0) ||
+ ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) {
+ insn->xAcquireRelease = true;
+ if (!(byte == 0xf3 && nextByte == 0x90)) // PAUSE instruction support
+ break;
+ }
+ // Also if the byte is 0xf3, and the following condition is met:
+ // - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
+ // "mov mem, imm" (opcode 0xc6/0xc7) instructions.
+ // then it should be disassembled as an xrelease not rep.
+ if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 ||
+ nextByte == 0xc6 || nextByte == 0xc7)) {
+ insn->xAcquireRelease = true;
+ break;
+ }
+ if (isREX(insn, nextByte)) {
+ uint8_t nnextByte;
+ // Go to REX prefix after the current one
+ if (consume(insn, nnextByte))
+ return -1;
+ // We should be able to read next byte after REX prefix
+ if (peek(insn, nnextByte))
+ return -1;
+ --insn->readerCursor;
+ }
+ }
+
+ switch (byte) {
+ case 0xf0: // LOCK
+ insn->hasLockPrefix = true;
+ break;
+ case 0xf2: // REPNE/REPNZ
+ case 0xf3: { // REP or REPE/REPZ
+ uint8_t nextByte;
+ if (peek(insn, nextByte))
+ break;
+ // TODO:
+ // 1. There could be several 0x66
+ // 2. if (nextByte == 0x66) and nextNextByte != 0x0f then
+ // it's not mandatory prefix
+ // 3. if (nextByte >= 0x40 && nextByte <= 0x4f) it's REX and we need
+ // 0x0f exactly after it to be mandatory prefix
+ if (isREX(insn, nextByte) || nextByte == 0x0f || nextByte == 0x66)
+ // The last of 0xf2 /0xf3 is mandatory prefix
+ insn->mandatoryPrefix = byte;
+ insn->repeatPrefix = byte;
+ break;
+ }
+ case 0x2e: // CS segment override -OR- Branch not taken
+ insn->segmentOverride = SEG_OVERRIDE_CS;
+ break;
+ case 0x36: // SS segment override -OR- Branch taken
+ insn->segmentOverride = SEG_OVERRIDE_SS;
+ break;
+ case 0x3e: // DS segment override
+ insn->segmentOverride = SEG_OVERRIDE_DS;
+ break;
+ case 0x26: // ES segment override
+ insn->segmentOverride = SEG_OVERRIDE_ES;
+ break;
+ case 0x64: // FS segment override
+ insn->segmentOverride = SEG_OVERRIDE_FS;
+ break;
+ case 0x65: // GS segment override
+ insn->segmentOverride = SEG_OVERRIDE_GS;
+ break;
+ case 0x66: { // Operand-size override
+ uint8_t nextByte;
+ insn->hasOpSize = true;
+ if (peek(insn, nextByte))
+ break;
+ // 0x66 can't overwrite existing mandatory prefix and should be ignored
+ if (!insn->mandatoryPrefix && (nextByte == 0x0f || isREX(insn, nextByte)))
+ insn->mandatoryPrefix = byte;
+ break;
+ }
+ case 0x67: // Address-size override
+ insn->hasAdSize = true;
+ break;
+ default: // Not a prefix byte
+ isPrefix = false;
+ break;
+ }
+
+ if (isPrefix)
+ LLVM_DEBUG(dbgs() << format("Found prefix 0x%hhx", byte));
+ }
+
+ insn->vectorExtensionType = TYPE_NO_VEX_XOP;
+
+ if (byte == 0x62) {
+ uint8_t byte1, byte2;
+ if (consume(insn, byte1)) {
+ LLVM_DEBUG(dbgs() << "Couldn't read second byte of EVEX prefix");
+ return -1;
+ }
+
+ if (peek(insn, byte2)) {
+ LLVM_DEBUG(dbgs() << "Couldn't read third byte of EVEX prefix");
+ return -1;
+ }
+
+ if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
+ ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) {
+ insn->vectorExtensionType = TYPE_EVEX;
+ } else {
+ --insn->readerCursor; // unconsume byte1
+ --insn->readerCursor; // unconsume byte
+ }
+
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ insn->vectorExtensionPrefix[0] = byte;
+ insn->vectorExtensionPrefix[1] = byte1;
+ if (consume(insn, insn->vectorExtensionPrefix[2])) {
+ LLVM_DEBUG(dbgs() << "Couldn't read third byte of EVEX prefix");
+ return -1;
+ }
+ if (consume(insn, insn->vectorExtensionPrefix[3])) {
+ LLVM_DEBUG(dbgs() << "Couldn't read fourth byte of EVEX prefix");
+ return -1;
+ }
+
+ // We simulate the REX prefix for simplicity's sake
+ if (insn->mode == MODE_64BIT) {
+ insn->rexPrefix = 0x40 |
+ (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3) |
+ (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2) |
+ (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1) |
+ (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0);
+ }
+
+ LLVM_DEBUG(
+ dbgs() << format(
+ "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]));
+ }
+ } else if (byte == 0xc4) {
+ uint8_t byte1;
+ if (peek(insn, byte1)) {
+ LLVM_DEBUG(dbgs() << "Couldn't read second byte of VEX");
+ return -1;
+ }
+
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0)
+ insn->vectorExtensionType = TYPE_VEX_3B;
+ else
+ --insn->readerCursor;
+
+ if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consume(insn, insn->vectorExtensionPrefix[1]);
+ consume(insn, insn->vectorExtensionPrefix[2]);
+
+ // We simulate the REX prefix for simplicity's sake
+
+ if (insn->mode == MODE_64BIT)
+ insn->rexPrefix = 0x40 |
+ (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3) |
+ (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2) |
+ (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1) |
+ (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0);
+
+ LLVM_DEBUG(dbgs() << format("Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0],
+ insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2]));
+ }
+ } else if (byte == 0xc5) {
+ uint8_t byte1;
+ if (peek(insn, byte1)) {
+ LLVM_DEBUG(dbgs() << "Couldn't read second byte of VEX");
+ return -1;
+ }
+
+ if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0)
+ insn->vectorExtensionType = TYPE_VEX_2B;
+ else
+ --insn->readerCursor;
+
+ if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consume(insn, insn->vectorExtensionPrefix[1]);
+
+ if (insn->mode == MODE_64BIT)
+ insn->rexPrefix =
+ 0x40 | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
+
+ switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+ default:
+ break;
+ case VEX_PREFIX_66:
+ insn->hasOpSize = true;
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << format("Found VEX prefix 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0],
+ insn->vectorExtensionPrefix[1]));
+ }
+ } else if (byte == 0x8f) {
+ uint8_t byte1;
+ if (peek(insn, byte1)) {
+ LLVM_DEBUG(dbgs() << "Couldn't read second byte of XOP");
+ return -1;
+ }
+
+ if ((byte1 & 0x38) != 0x0) // 0 in these 3 bits is a POP instruction.
+ insn->vectorExtensionType = TYPE_XOP;
+ else
+ --insn->readerCursor;
+
+ if (insn->vectorExtensionType == TYPE_XOP) {
+ insn->vectorExtensionPrefix[0] = byte;
+ consume(insn, insn->vectorExtensionPrefix[1]);
+ consume(insn, insn->vectorExtensionPrefix[2]);
+
+ // We simulate the REX prefix for simplicity's sake
+
+ if (insn->mode == MODE_64BIT)
+ insn->rexPrefix = 0x40 |
+ (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3) |
+ (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2) |
+ (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1) |
+ (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
+
+ switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+ default:
+ break;
+ case VEX_PREFIX_66:
+ insn->hasOpSize = true;
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << format("Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
+ insn->vectorExtensionPrefix[0],
+ insn->vectorExtensionPrefix[1],
+ insn->vectorExtensionPrefix[2]));
+ }
+ } else if (isREX(insn, byte)) {
+ if (peek(insn, nextByte))
+ return -1;
+ insn->rexPrefix = byte;
+ LLVM_DEBUG(dbgs() << format("Found REX prefix 0x%hhx", byte));
+ } else
+ --insn->readerCursor;
+
+ if (insn->mode == MODE_16BIT) {
+ insn->registerSize = (insn->hasOpSize ? 4 : 2);
+ insn->addressSize = (insn->hasAdSize ? 4 : 2);
+ insn->displacementSize = (insn->hasAdSize ? 4 : 2);
+ insn->immediateSize = (insn->hasOpSize ? 4 : 2);
+ } else if (insn->mode == MODE_32BIT) {
+ insn->registerSize = (insn->hasOpSize ? 2 : 4);
+ insn->addressSize = (insn->hasAdSize ? 2 : 4);
+ insn->displacementSize = (insn->hasAdSize ? 2 : 4);
+ insn->immediateSize = (insn->hasOpSize ? 2 : 4);
+ } else if (insn->mode == MODE_64BIT) {
+ if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
+ insn->registerSize = 8;
+ insn->addressSize = (insn->hasAdSize ? 4 : 8);
+ insn->displacementSize = 4;
+ insn->immediateSize = 4;
+ } else {
+ insn->registerSize = (insn->hasOpSize ? 2 : 4);
+ insn->addressSize = (insn->hasAdSize ? 4 : 8);
+ insn->displacementSize = (insn->hasOpSize ? 2 : 4);
+ insn->immediateSize = (insn->hasOpSize ? 2 : 4);
+ }
+ }
+
+ return 0;
+}
+
+// Consumes the SIB byte to determine addressing information.
+static int readSIB(struct InternalInstruction *insn) {
+ SIBBase sibBaseBase = SIB_BASE_NONE;
+ uint8_t index, base;
+
+ LLVM_DEBUG(dbgs() << "readSIB()");
+ switch (insn->addressSize) {
+ case 2:
+ default:
+ llvm_unreachable("SIB-based addressing doesn't work in 16-bit mode");
+ case 4:
+ insn->sibIndexBase = SIB_INDEX_EAX;
+ sibBaseBase = SIB_BASE_EAX;
+ break;
+ case 8:
+ insn->sibIndexBase = SIB_INDEX_RAX;
+ sibBaseBase = SIB_BASE_RAX;
+ break;
+ }
+
+ if (consume(insn, insn->sib))
+ return -1;
+
+ index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
+
+ if (index == 0x4) {
+ insn->sibIndex = SIB_INDEX_NONE;
+ } else {
+ insn->sibIndex = (SIBIndex)(insn->sibIndexBase + index);
+ }
+
+ insn->sibScale = 1 << scaleFromSIB(insn->sib);
+
+ base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
+
+ switch (base) {
+ case 0x5:
+ case 0xd:
+ switch (modFromModRM(insn->modRM)) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = SIB_BASE_NONE;
+ break;
+ case 0x1:
+ insn->eaDisplacement = EA_DISP_8;
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ case 0x2:
+ insn->eaDisplacement = EA_DISP_32;
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ default:
+ llvm_unreachable("Cannot have Mod = 0b11 and a SIB byte");
+ }
+ break;
+ default:
+ insn->sibBase = (SIBBase)(sibBaseBase + base);
+ break;
+ }
+
+ return 0;
+}
+
+static int readDisplacement(struct InternalInstruction *insn) {
+ int8_t d8;
+ int16_t d16;
+ int32_t d32;
+ LLVM_DEBUG(dbgs() << "readDisplacement()");
+
+ insn->displacementOffset = insn->readerCursor - insn->startLocation;
+ switch (insn->eaDisplacement) {
+ case EA_DISP_NONE:
+ break;
+ case EA_DISP_8:
+ if (consume(insn, d8))
+ return -1;
+ insn->displacement = d8;
+ break;
+ case EA_DISP_16:
+ if (consume(insn, d16))
+ return -1;
+ insn->displacement = d16;
+ break;
+ case EA_DISP_32:
+ if (consume(insn, d32))
+ return -1;
+ insn->displacement = d32;
+ break;
+ }
+
+ return 0;
+}
+
+// Consumes all addressing information (ModR/M byte, SIB byte, and displacement).
+static int readModRM(struct InternalInstruction *insn) {
+ uint8_t mod, rm, reg, evexrm;
+ LLVM_DEBUG(dbgs() << "readModRM()");
+
+ if (insn->consumedModRM)
+ return 0;
+
+ if (consume(insn, insn->modRM))
+ return -1;
+ insn->consumedModRM = true;
+
+ mod = modFromModRM(insn->modRM);
+ rm = rmFromModRM(insn->modRM);
+ reg = regFromModRM(insn->modRM);
+
+ // This goes by insn->registerSize to pick the correct register, which messes
+ // up if we're using (say) XMM or 8-bit register operands. That gets fixed in
+ // fixupReg().
+ switch (insn->registerSize) {
+ case 2:
+ insn->regBase = MODRM_REG_AX;
+ insn->eaRegBase = EA_REG_AX;
+ break;
+ case 4:
+ insn->regBase = MODRM_REG_EAX;
+ insn->eaRegBase = EA_REG_EAX;
+ break;
+ case 8:
+ insn->regBase = MODRM_REG_RAX;
+ insn->eaRegBase = EA_REG_RAX;
+ break;
+ }
+
+ reg |= rFromREX(insn->rexPrefix) << 3;
+ rm |= bFromREX(insn->rexPrefix) << 3;
+
+ evexrm = 0;
+ if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT) {
+ reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ evexrm = xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
+ }
+
+ insn->reg = (Reg)(insn->regBase + reg);
+
+ switch (insn->addressSize) {
+ case 2: {
+ EABase eaBaseBase = EA_BASE_BX_SI;
+
+ switch (mod) {
+ case 0x0:
+ if (rm == 0x6) {
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ } else {
+ insn->eaBase = (EABase)(eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_NONE;
+ }
+ break;
+ case 0x1:
+ insn->eaBase = (EABase)(eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_8;
+ insn->displacementSize = 1;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x2:
+ insn->eaBase = (EABase)(eaBaseBase + rm);
+ insn->eaDisplacement = EA_DISP_16;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ case 0x3:
+ insn->eaBase = (EABase)(insn->eaRegBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ }
+ case 4:
+ case 8: {
+ EABase eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
+
+ switch (mod) {
+ case 0x0:
+ insn->eaDisplacement = EA_DISP_NONE; // readSIB may override this
+ // In determining whether RIP-relative mode is used (rm=5),
+ // or whether a SIB byte is present (rm=4),
+ // the extension bits (REX.b and EVEX.x) are ignored.
+ switch (rm & 7) {
+ case 0x4: // SIB byte is present
+ insn->eaBase = (insn->addressSize == 4 ? EA_BASE_sib : EA_BASE_sib64);
+ if (readSIB(insn) || readDisplacement(insn))
+ return -1;
+ break;
+ case 0x5: // RIP-relative
+ insn->eaBase = EA_BASE_NONE;
+ insn->eaDisplacement = EA_DISP_32;
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(eaBaseBase + rm);
+ break;
+ }
+ break;
+ case 0x1:
+ insn->displacementSize = 1;
+ LLVM_FALLTHROUGH;
+ case 0x2:
+ insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
+ switch (rm & 7) {
+ case 0x4: // SIB byte is present
+ insn->eaBase = EA_BASE_sib;
+ if (readSIB(insn) || readDisplacement(insn))
+ return -1;
+ break;
+ default:
+ insn->eaBase = (EABase)(eaBaseBase + rm);
+ if (readDisplacement(insn))
+ return -1;
+ break;
+ }
+ break;
+ case 0x3:
+ insn->eaDisplacement = EA_DISP_NONE;
+ insn->eaBase = (EABase)(insn->eaRegBase + rm + evexrm);
+ break;
+ }
+ break;
+ }
+ } // switch (insn->addressSize)
+
+ return 0;
+}
+
+#define GENERIC_FIXUP_FUNC(name, base, prefix, mask) \
+ static uint16_t name(struct InternalInstruction *insn, OperandType type, \
+ uint8_t index, uint8_t *valid) { \
+ *valid = 1; \
+ switch (type) { \
+ default: \
+ debug("Unhandled register type"); \
+ *valid = 0; \
+ return 0; \
+ case TYPE_Rv: \
+ return base + index; \
+ case TYPE_R8: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
+ if (insn->rexPrefix && index >= 4 && index <= 7) { \
+ return prefix##_SPL + (index - 4); \
+ } else { \
+ return prefix##_AL + index; \
+ } \
+ case TYPE_R16: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
+ return prefix##_AX + index; \
+ case TYPE_R32: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
+ return prefix##_EAX + index; \
+ case TYPE_R64: \
+ index &= mask; \
+ if (index > 0xf) \
+ *valid = 0; \
+ return prefix##_RAX + index; \
+ case TYPE_ZMM: \
+ return prefix##_ZMM0 + index; \
+ case TYPE_YMM: \
+ return prefix##_YMM0 + index; \
+ case TYPE_XMM: \
+ return prefix##_XMM0 + index; \
+ case TYPE_VK: \
+ index &= 0xf; \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_K0 + index; \
+ case TYPE_VK_PAIR: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_K0_K1 + (index / 2); \
+ case TYPE_MM64: \
+ return prefix##_MM0 + (index & 0x7); \
+ case TYPE_SEGMENTREG: \
+ if ((index & 7) > 5) \
+ *valid = 0; \
+ return prefix##_ES + (index & 7); \
+ case TYPE_DEBUGREG: \
+ return prefix##_DR0 + index; \
+ case TYPE_CONTROLREG: \
+ return prefix##_CR0 + index; \
+ case TYPE_BNDR: \
+ if (index > 3) \
+ *valid = 0; \
+ return prefix##_BND0 + index; \
+ case TYPE_MVSIBX: \
+ return prefix##_XMM0 + index; \
+ case TYPE_MVSIBY: \
+ return prefix##_YMM0 + index; \
+ case TYPE_MVSIBZ: \
+ return prefix##_ZMM0 + index; \
+ } \
+ }
+
+// Consult an operand type to determine the meaning of the reg or R/M field. If
+// the operand is an XMM operand, for example, an operand would be XMM0 instead
+// of AX, which readModRM() would otherwise misinterpret it as.
+//
+// @param insn - The instruction containing the operand.
+// @param type - The operand type.
+// @param index - The existing value of the field as reported by readModRM().
+// @param valid - The address of a uint8_t. The target is set to 1 if the
+// field is valid for the register class; 0 if not.
+// @return - The proper value.
+GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG, 0x1f)
+GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG, 0xf)
+
+// Consult an operand specifier to determine which of the fixup*Value functions
+// to use in correcting readModRM()'s interpretation.
+//
+// @param insn - See fixup*Value().
+// @param op - The operand specifier.
+// @return - 0 if fixup was successful; -1 if the register returned was
+// invalid for its class.
+static int fixupReg(struct InternalInstruction *insn,
+ const struct OperandSpecifier *op) {
+ uint8_t valid;
+ LLVM_DEBUG(dbgs() << "fixupReg()");
+
+ switch ((OperandEncoding)op->encoding) {
+ default:
+ debug("Expected a REG or R/M encoding in fixupReg");
+ return -1;
+ case ENCODING_VVVV:
+ insn->vvvv =
+ (Reg)fixupRegValue(insn, (OperandType)op->type, insn->vvvv, &valid);
+ if (!valid)
+ return -1;
+ break;
+ case ENCODING_REG:
+ insn->reg = (Reg)fixupRegValue(insn, (OperandType)op->type,
+ insn->reg - insn->regBase, &valid);
+ if (!valid)
+ return -1;
+ break;
+ CASE_ENCODING_RM:
+ if (insn->eaBase >= insn->eaRegBase) {
+ insn->eaBase = (EABase)fixupRMValue(
+ insn, (OperandType)op->type, insn->eaBase - insn->eaRegBase, &valid);
+ if (!valid)
+ return -1;
+ }
+ break;
+ }
+
+ return 0;
+}
+
+// Read the opcode (except the ModR/M byte in the case of extended or escape
+// opcodes).
+static bool readOpcode(struct InternalInstruction *insn) {
+ uint8_t current;
+ LLVM_DEBUG(dbgs() << "readOpcode()");
+
+ insn->opcodeType = ONEBYTE;
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
+ default:
+ LLVM_DEBUG(
+ dbgs() << format("Unhandled mm field for instruction (0x%hhx)",
+ mmFromEVEX2of4(insn->vectorExtensionPrefix[1])));
+ return true;
+ case VEX_LOB_0F:
+ insn->opcodeType = TWOBYTE;
+ return consume(insn, insn->opcode);
+ case VEX_LOB_0F38:
+ insn->opcodeType = THREEBYTE_38;
+ return consume(insn, insn->opcode);
+ case VEX_LOB_0F3A:
+ insn->opcodeType = THREEBYTE_3A;
+ return consume(insn, insn->opcode);
+ }
+ } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
+ default:
+ LLVM_DEBUG(
+ dbgs() << format("Unhandled m-mmmm field for instruction (0x%hhx)",
+ mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])));
+ return true;
+ case VEX_LOB_0F:
+ insn->opcodeType = TWOBYTE;
+ return consume(insn, insn->opcode);
+ case VEX_LOB_0F38:
+ insn->opcodeType = THREEBYTE_38;
+ return consume(insn, insn->opcode);
+ case VEX_LOB_0F3A:
+ insn->opcodeType = THREEBYTE_3A;
+ return consume(insn, insn->opcode);
+ }
+ } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ insn->opcodeType = TWOBYTE;
+ return consume(insn, insn->opcode);
+ } else if (insn->vectorExtensionType == TYPE_XOP) {
+ switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
+ default:
+ LLVM_DEBUG(
+ dbgs() << format("Unhandled m-mmmm field for instruction (0x%hhx)",
+ mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])));
+ return true;
+ case XOP_MAP_SELECT_8:
+ insn->opcodeType = XOP8_MAP;
+ return consume(insn, insn->opcode);
+ case XOP_MAP_SELECT_9:
+ insn->opcodeType = XOP9_MAP;
+ return consume(insn, insn->opcode);
+ case XOP_MAP_SELECT_A:
+ insn->opcodeType = XOPA_MAP;
+ return consume(insn, insn->opcode);
+ }
+ }
+
+ if (consume(insn, current))
+ return true;
+
+ if (current == 0x0f) {
+ LLVM_DEBUG(
+ dbgs() << format("Found a two-byte escape prefix (0x%hhx)", current));
+ if (consume(insn, current))
+ return true;
+
+ if (current == 0x38) {
+ LLVM_DEBUG(dbgs() << format("Found a three-byte escape prefix (0x%hhx)",
+ current));
+ if (consume(insn, current))
+ return true;
+
+ insn->opcodeType = THREEBYTE_38;
+ } else if (current == 0x3a) {
+ LLVM_DEBUG(dbgs() << format("Found a three-byte escape prefix (0x%hhx)",
+ current));
+ if (consume(insn, current))
+ return true;
+
+ insn->opcodeType = THREEBYTE_3A;
+ } else if (current == 0x0f) {
+ LLVM_DEBUG(
+ dbgs() << format("Found a 3dnow escape prefix (0x%hhx)", current));
+
+ // Consume operands before the opcode to comply with the 3DNow encoding
+ if (readModRM(insn))
+ return true;
+
+ if (consume(insn, current))
+ return true;
+
+ insn->opcodeType = THREEDNOW_MAP;
+ } else {
+ LLVM_DEBUG(dbgs() << "Didn't find a three-byte escape prefix");
+ insn->opcodeType = TWOBYTE;
+ }
+ } else if (insn->mandatoryPrefix)
+ // The opcode with mandatory prefix must start with opcode escape.
+ // If not it's legacy repeat prefix
+ insn->mandatoryPrefix = 0;
+
+ // At this point we have consumed the full opcode.
+ // Anything we consume from here on must be unconsumed.
+ insn->opcode = current;
+
+ return false;
+}
+
+// Determine whether equiv is the 16-bit equivalent of orig (32-bit or 64-bit).
+static bool is16BitEquivalent(const char *orig, const char *equiv) {
+ for (int i = 0;; i++) {
+ if (orig[i] == '\0' && equiv[i] == '\0')
+ return true;
+ if (orig[i] == '\0' || equiv[i] == '\0')
+ return false;
+ if (orig[i] != equiv[i]) {
+ if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
+ continue;
+ if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
+ continue;
+ if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
+ continue;
+ return false;
+ }
+ }
+}
+
+// Determine whether this instruction is a 64-bit instruction.
+static bool is64Bit(const char *name) {
+ for (int i = 0;; ++i) {
+ if (name[i] == '\0')
+ return false;
+ if (name[i] == '6' && name[i + 1] == '4')
+ return true;
+ }
+}
+
+// Determine the ID of an instruction, consuming the ModR/M byte as appropriate
+// for extended and escape opcodes, and using a supplied attribute mask.
+static int getInstructionIDWithAttrMask(uint16_t *instructionID,
+ struct InternalInstruction *insn,
+ uint16_t attrMask) {
+ auto insnCtx = InstructionContext(x86DisassemblerContexts[attrMask]);
+ const ContextDecision *decision;
+ switch (insn->opcodeType) {
+ case ONEBYTE:
+ decision = &ONEBYTE_SYM;
+ break;
+ case TWOBYTE:
+ decision = &TWOBYTE_SYM;
+ break;
+ case THREEBYTE_38:
+ decision = &THREEBYTE38_SYM;
+ break;
+ case THREEBYTE_3A:
+ decision = &THREEBYTE3A_SYM;
+ break;
+ case XOP8_MAP:
+ decision = &XOP8_MAP_SYM;
+ break;
+ case XOP9_MAP:
+ decision = &XOP9_MAP_SYM;
+ break;
+ case XOPA_MAP:
+ decision = &XOPA_MAP_SYM;
+ break;
+ case THREEDNOW_MAP:
+ decision = &THREEDNOW_MAP_SYM;
+ break;
+ }
+
+ if (decision->opcodeDecisions[insnCtx]
+ .modRMDecisions[insn->opcode]
+ .modrm_type != MODRM_ONEENTRY) {
+ if (readModRM(insn))
+ return -1;
+ *instructionID =
+ decode(insn->opcodeType, insnCtx, insn->opcode, insn->modRM);
+ } else {
+ *instructionID = decode(insn->opcodeType, insnCtx, insn->opcode, 0);
+ }
+
+ return 0;
+}
+
+// Determine the ID of an instruction, consuming the ModR/M byte as appropriate
+// for extended and escape opcodes. Determines the attributes and context for
+// the instruction before doing so.
+static int getInstructionID(struct InternalInstruction *insn,
+ const MCInstrInfo *mii) {
+ uint16_t attrMask;
+ uint16_t instructionID;
+
+ LLVM_DEBUG(dbgs() << "getID()");
+
+ attrMask = ATTR_NONE;
+
+ if (insn->mode == MODE_64BIT)
+ attrMask |= ATTR_64BIT;
+
+ if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+ attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? ATTR_EVEX : ATTR_VEX;
+
+ if (insn->vectorExtensionType == TYPE_EVEX) {
+ switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (zFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXKZ;
+ if (bFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXB;
+ if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXK;
+ if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_VEXL;
+ if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ attrMask |= ATTR_EVEXL2;
+ } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
+ switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX3of3(insn->vectorExtensionPrefix[2]))
+ attrMask |= ATTR_VEXL;
+ } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
+ switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromVEX2of2(insn->vectorExtensionPrefix[1]))
+ attrMask |= ATTR_VEXL;
+ } else if (insn->vectorExtensionType == TYPE_XOP) {
+ switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
+ case VEX_PREFIX_66:
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case VEX_PREFIX_F3:
+ attrMask |= ATTR_XS;
+ break;
+ case VEX_PREFIX_F2:
+ attrMask |= ATTR_XD;
+ break;
+ }
+
+ if (lFromXOP3of3(insn->vectorExtensionPrefix[2]))
+ attrMask |= ATTR_VEXL;
+ } else {
+ return -1;
+ }
+ } else if (!insn->mandatoryPrefix) {
+ // If we don't have mandatory prefix we should use legacy prefixes here
+ if (insn->hasOpSize && (insn->mode != MODE_16BIT))
+ attrMask |= ATTR_OPSIZE;
+ if (insn->hasAdSize)
+ attrMask |= ATTR_ADSIZE;
+ if (insn->opcodeType == ONEBYTE) {
+ if (insn->repeatPrefix == 0xf3 && (insn->opcode == 0x90))
+ // Special support for PAUSE
+ attrMask |= ATTR_XS;
+ } else {
+ if (insn->repeatPrefix == 0xf2)
+ attrMask |= ATTR_XD;
+ else if (insn->repeatPrefix == 0xf3)
+ attrMask |= ATTR_XS;
+ }
+ } else {
+ switch (insn->mandatoryPrefix) {
+ case 0xf2:
+ attrMask |= ATTR_XD;
+ break;
+ case 0xf3:
+ attrMask |= ATTR_XS;
+ break;
+ case 0x66:
+ if (insn->mode != MODE_16BIT)
+ attrMask |= ATTR_OPSIZE;
+ break;
+ case 0x67:
+ attrMask |= ATTR_ADSIZE;
+ break;
+ }
+ }
+
+ if (insn->rexPrefix & 0x08) {
+ attrMask |= ATTR_REXW;
+ attrMask &= ~ATTR_ADSIZE;
+ }
+
+ if (insn->mode == MODE_16BIT) {
+ // JCXZ/JECXZ need special handling for 16-bit mode because the meaning
+ // of the AdSize prefix is inverted w.r.t. 32-bit mode.
+ if (insn->opcodeType == ONEBYTE && insn->opcode == 0xE3)
+ attrMask ^= ATTR_ADSIZE;
+ // If we're in 16-bit mode and this is one of the relative jumps and opsize
+ // prefix isn't present, we need to force the opsize attribute since the
+ // prefix is inverted relative to 32-bit mode.
+ if (!insn->hasOpSize && insn->opcodeType == ONEBYTE &&
+ (insn->opcode == 0xE8 || insn->opcode == 0xE9))
+ attrMask |= ATTR_OPSIZE;
+
+ if (!insn->hasOpSize && insn->opcodeType == TWOBYTE &&
+ insn->opcode >= 0x80 && insn->opcode <= 0x8F)
+ attrMask |= ATTR_OPSIZE;
+ }
+
+
+ if (getInstructionIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+
+ // The following clauses compensate for limitations of the tables.
+
+ if (insn->mode != MODE_64BIT &&
+ insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
+ // The tables can't distinguish between cases where the W-bit is used to
+ // select register size and cases where it's a required part of the opcode.
+ if ((insn->vectorExtensionType == TYPE_EVEX &&
+ wFromEVEX3of4(insn->vectorExtensionPrefix[2])) ||
+ (insn->vectorExtensionType == TYPE_VEX_3B &&
+ wFromVEX3of3(insn->vectorExtensionPrefix[2])) ||
+ (insn->vectorExtensionType == TYPE_XOP &&
+ wFromXOP3of3(insn->vectorExtensionPrefix[2]))) {
+
+ uint16_t instructionIDWithREXW;
+ if (getInstructionIDWithAttrMask(&instructionIDWithREXW, insn,
+ attrMask | ATTR_REXW)) {
+ insn->instructionID = instructionID;
+ insn->spec = &INSTRUCTIONS_SYM[instructionID];
+ return 0;
+ }
+
+ auto SpecName = mii->getName(instructionIDWithREXW);
+ // If not a 64-bit instruction. Switch the opcode.
+ if (!is64Bit(SpecName.data())) {
+ insn->instructionID = instructionIDWithREXW;
+ insn->spec = &INSTRUCTIONS_SYM[instructionIDWithREXW];
+ return 0;
+ }
+ }
+ }
+
+ // Absolute moves, umonitor, and movdir64b need special handling.
+ // -For 16-bit mode, because the meaning of the AdSize and OpSize prefixes is
+ // inverted w.r.t. 32-bit mode.
+ // -For 32-bit mode we need to ensure the ADSIZE prefix is observed in
+ // any position.
+ if ((insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) ||
+ (insn->opcodeType == TWOBYTE && (insn->opcode == 0xAE)) ||
+ (insn->opcodeType == THREEBYTE_38 && insn->opcode == 0xF8)) {
+ // Make sure we observed the prefixes in any position.
+ if (insn->hasAdSize)
+ attrMask |= ATTR_ADSIZE;
+ if (insn->hasOpSize)
+ attrMask |= ATTR_OPSIZE;
+
+ // In 16-bit, invert the attributes.
+ if (insn->mode == MODE_16BIT) {
+ attrMask ^= ATTR_ADSIZE;
+
+ // The OpSize attribute is only valid with the absolute moves.
+ if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0))
+ attrMask ^= ATTR_OPSIZE;
+ }
+
+ if (getInstructionIDWithAttrMask(&instructionID, insn, attrMask))
+ return -1;
+
+ insn->instructionID = instructionID;
+ insn->spec = &INSTRUCTIONS_SYM[instructionID];
+ return 0;
+ }
+
+ if ((insn->mode == MODE_16BIT || insn->hasOpSize) &&
+ !(attrMask & ATTR_OPSIZE)) {
+ // The instruction tables make no distinction between instructions that
+ // allow OpSize anywhere (i.e., 16-bit operations) and those that need it in
+ // a particular spot (e.g., many MMX operations). In general we're
+ // conservative, but in the specific case where OpSize is present but not in
+ // the right place we check whether there's a 16-bit operation.
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithOpsize;
+ llvm::StringRef specName, specWithOpSizeName;
+
+ spec = &INSTRUCTIONS_SYM[instructionID];
+
+ if (getInstructionIDWithAttrMask(&instructionIDWithOpsize, insn,
+ attrMask | ATTR_OPSIZE)) {
+ // ModRM required with OpSize but not present. Give up and return the
+ // version without OpSize set.
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specName = mii->getName(instructionID);
+ specWithOpSizeName = mii->getName(instructionIDWithOpsize);
+
+ if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) &&
+ (insn->mode == MODE_16BIT) ^ insn->hasOpSize) {
+ insn->instructionID = instructionIDWithOpsize;
+ insn->spec = &INSTRUCTIONS_SYM[instructionIDWithOpsize];
+ } else {
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ }
+ return 0;
+ }
+
+ if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
+ insn->rexPrefix & 0x01) {
+ // NOOP shouldn't decode as NOOP if REX.b is set. Instead it should decode
+ // as XCHG %r8, %eax.
+ const struct InstructionSpecifier *spec;
+ uint16_t instructionIDWithNewOpcode;
+ const struct InstructionSpecifier *specWithNewOpcode;
+
+ spec = &INSTRUCTIONS_SYM[instructionID];
+
+ // Borrow opcode from one of the other XCHGar opcodes
+ insn->opcode = 0x91;
+
+ if (getInstructionIDWithAttrMask(&instructionIDWithNewOpcode, insn,
+ attrMask)) {
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionID;
+ insn->spec = spec;
+ return 0;
+ }
+
+ specWithNewOpcode = &INSTRUCTIONS_SYM[instructionIDWithNewOpcode];
+
+ // Change back
+ insn->opcode = 0x90;
+
+ insn->instructionID = instructionIDWithNewOpcode;
+ insn->spec = specWithNewOpcode;
+
+ return 0;
+ }
+
+ insn->instructionID = instructionID;
+ insn->spec = &INSTRUCTIONS_SYM[insn->instructionID];
+
+ return 0;
+}
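
The attribute-mask handling above is easier to follow with a small standalone sketch. The ATTR_* constants, Mode enum, and MiniInsn struct below are invented stand-ins rather than the generated tables; the snippet only illustrates how the 16-bit quirks (JCXZ/JECXZ AdSize inversion, forced OpSize on the relative jumps) adjust the mask before the table lookup.

    #include <cstdint>
    #include <cstdio>

    // Hypothetical stand-ins for the generated ATTR_* bits and decoder state.
    enum : uint16_t { ATTR_NONE = 0, ATTR_OPSIZE = 1, ATTR_ADSIZE = 2 };
    enum Mode { MODE_16BIT, MODE_32BIT, MODE_64BIT };

    struct MiniInsn {
      Mode mode;
      bool hasOpSize, hasAdSize;
      uint8_t opcode; // one-byte opcode map assumed
    };

    // JCXZ/JECXZ (0xE3) inverts AdSize in 16-bit mode; CALL/JMP rel
    // (0xE8/0xE9) force OpSize there when the prefix is absent.
    static uint16_t attrMaskFor(const MiniInsn &I) {
      uint16_t M = ATTR_NONE;
      if (I.hasOpSize && I.mode != MODE_16BIT)
        M |= ATTR_OPSIZE;
      if (I.hasAdSize)
        M |= ATTR_ADSIZE;
      if (I.mode == MODE_16BIT) {
        if (I.opcode == 0xE3)
          M ^= ATTR_ADSIZE;
        if (!I.hasOpSize && (I.opcode == 0xE8 || I.opcode == 0xE9))
          M |= ATTR_OPSIZE;
      }
      return M;
    }

    int main() {
      MiniInsn Jmp = {MODE_16BIT, false, false, 0xE9};
      std::printf("16-bit jmp rel: mask = 0x%x\n", attrMaskFor(Jmp)); // OpSize forced
      return 0;
    }
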
+
+// Read an operand from the opcode field of an instruction and interpret it
+// appropriately given the operand width. Handles AddRegFrm instructions.
+//
+// @param insn - the instruction whose opcode field is to be read.
+// @param size - The width (in bytes) of the register being specified.
+// 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
+// RAX.
+// @return - 0 on success; nonzero otherwise.
+static int readOpcodeRegister(struct InternalInstruction *insn, uint8_t size) {
+ LLVM_DEBUG(dbgs() << "readOpcodeRegister()");
+
+ if (size == 0)
+ size = insn->registerSize;
+
+ switch (size) {
+ case 1:
+ insn->opcodeRegister = (Reg)(
+ MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
+ if (insn->rexPrefix && insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
+ insn->opcodeRegister < MODRM_REG_AL + 0x8) {
+ insn->opcodeRegister =
+ (Reg)(MODRM_REG_SPL + (insn->opcodeRegister - MODRM_REG_AL - 4));
+ }
+
+ break;
+ case 2:
+ insn->opcodeRegister = (Reg)(
+ MODRM_REG_AX + ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
+ break;
+ case 4:
+ insn->opcodeRegister =
+ (Reg)(MODRM_REG_EAX +
+ ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
+ break;
+ case 8:
+ insn->opcodeRegister =
+ (Reg)(MODRM_REG_RAX +
+ ((bFromREX(insn->rexPrefix) << 3) | (insn->opcode & 7)));
+ break;
+ }
+
+ return 0;
+}
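
As an illustration of readOpcodeRegister(): for AddRegFrm instructions the register number is simply the low three opcode bits extended by REX.B. The register-name table below is an assumption added only for printing; the decoder itself works with enum values, not strings.

    #include <cstdint>
    #include <cstdio>

    // Name table is illustrative only.
    static const char *GPRNames[16] = {"rax","rcx","rdx","rbx","rsp","rbp","rsi","rdi",
                                       "r8","r9","r10","r11","r12","r13","r14","r15"};

    int main() {
      uint8_t opcode = 0x91; // one of the XCHG rAX family (AddRegFrm)
      uint8_t rex = 0x41;    // REX.B set
      unsigned regNo = ((rex & 0x1) << 3) | (opcode & 0x7);
      std::printf("opcode 0x%02x + REX.B selects register #%u (%s family)\n",
                  opcode, regNo, GPRNames[regNo]);
      return 0;
    }
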
+
+// Consume an immediate operand from an instruction, given the desired operand
+// size.
+//
+// @param insn - The instruction whose operand is to be read.
+// @param size - The width (in bytes) of the operand.
+// @return - 0 if the immediate was successfully consumed; nonzero
+// otherwise.
+static int readImmediate(struct InternalInstruction *insn, uint8_t size) {
+ uint8_t imm8;
+ uint16_t imm16;
+ uint32_t imm32;
+ uint64_t imm64;
+
+ LLVM_DEBUG(dbgs() << "readImmediate()");
+
+ assert(insn->numImmediatesConsumed < 2 && "Already consumed two immediates");
+
+ insn->immediateSize = size;
+ insn->immediateOffset = insn->readerCursor - insn->startLocation;
+
+ switch (size) {
+ case 1:
+ if (consume(insn, imm8))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm8;
+ break;
+ case 2:
+ if (consume(insn, imm16))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm16;
+ break;
+ case 4:
+ if (consume(insn, imm32))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm32;
+ break;
+ case 8:
+ if (consume(insn, imm64))
+ return -1;
+ insn->immediates[insn->numImmediatesConsumed] = imm64;
+ break;
+ default:
+ llvm_unreachable("invalid size");
+ }
+
+ insn->numImmediatesConsumed++;
+
+ return 0;
+}
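
The consume() calls above read little-endian values from the instruction bytes. A minimal sketch of what that amounts to for a 4-byte immediate (the readLE32 helper is hypothetical, not part of the decoder):

    #include <cstdint>
    #include <cstdio>

    static uint32_t readLE32(const uint8_t *p) {
      return (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
             ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    }

    int main() {
      // e.g. the immediate bytes of "mov eax, 0x12345678": 78 56 34 12
      const uint8_t imm[] = {0x78, 0x56, 0x34, 0x12};
      std::printf("imm32 = 0x%08x\n", (unsigned)readLE32(imm)); // 0x12345678
      return 0;
    }
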
+
+// Consume vvvv from an instruction if it has a VEX, EVEX, or XOP prefix.
+static int readVVVV(struct InternalInstruction *insn) {
+ LLVM_DEBUG(dbgs() << "readVVVV()");
+
+ int vvvv;
+ if (insn->vectorExtensionType == TYPE_EVEX)
+ vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 |
+ vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]));
+ else if (insn->vectorExtensionType == TYPE_VEX_3B)
+ vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
+ else if (insn->vectorExtensionType == TYPE_VEX_2B)
+ vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
+ else if (insn->vectorExtensionType == TYPE_XOP)
+ vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
+ else
+ return -1;
+
+ if (insn->mode != MODE_64BIT)
+ vvvv &= 0xf; // Can only clear bit 4. Bit 3 must be cleared later.
+
+ insn->vvvv = static_cast<Reg>(vvvv);
+ return 0;
+}
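
For reference, the vvvv field is stored inverted in the VEX/EVEX/XOP prefixes, which is what the vvvvFrom* helpers undo. A self-contained sketch for the two-byte VEX form; the vvvvFromVex2 helper below is an illustrative stand-in, not the decoder's macro:

    #include <cstdint>
    #include <cstdio>

    // In the two-byte VEX form (C5 xx) the second byte is R.vvvv.L.pp, and
    // vvvv is stored one's-complemented.
    static unsigned vvvvFromVex2(uint8_t byte1) { return (~(byte1 >> 3)) & 0xf; }

    int main() {
      // vpaddd xmm1, xmm2, xmm3 is C5 E9 FE CB; vvvv names xmm2.
      std::printf("vvvv = %u\n", vvvvFromVex2(0xE9)); // prints 2
      return 0;
    }
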
+
+// Read the writemask register (EVEX.aaa) from an instruction's EVEX prefix.
+//
+// @param insn - The instruction whose EVEX prefix is to be read.
+// @return - 0 on success; nonzero otherwise.
+static int readMaskRegister(struct InternalInstruction *insn) {
+ LLVM_DEBUG(dbgs() << "readMaskRegister()");
+
+ if (insn->vectorExtensionType != TYPE_EVEX)
+ return -1;
+
+ insn->writemask =
+ static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
+ return 0;
+}
+
+// Consults the specifier for an instruction and consumes all
+// operands for that instruction, interpreting them as it goes.
+static int readOperands(struct InternalInstruction *insn) {
+ int hasVVVV, needVVVV;
+ int sawRegImm = 0;
+
+ LLVM_DEBUG(dbgs() << "readOperands()");
+
+ // If a non-zero vvvv is specified, make sure one of the operands uses it.
+ hasVVVV = !readVVVV(insn);
+ needVVVV = hasVVVV && (insn->vvvv != 0);
+
+ for (const auto &Op : x86OperandSets[insn->spec->operands]) {
+ switch (Op.encoding) {
+ case ENCODING_NONE:
+ case ENCODING_SI:
+ case ENCODING_DI:
+ break;
+ CASE_ENCODING_VSIB:
+ // VSIB can use the V2 bit so check only the other bits.
+ if (needVVVV)
+ needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0);
+ if (readModRM(insn))
+ return -1;
+
+ // Reject if SIB wasn't used.
+ if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64)
+ return -1;
+
+ // If sibIndex was set to SIB_INDEX_NONE, index offset is 4.
+ if (insn->sibIndex == SIB_INDEX_NONE)
+ insn->sibIndex = (SIBIndex)(insn->sibIndexBase + 4);
+
+ // If EVEX.v2 is set this is one of the 16-31 registers.
+ if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT &&
+ v2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
+ insn->sibIndex = (SIBIndex)(insn->sibIndex + 16);
+
+ // Adjust the index register to the correct size.
+ switch ((OperandType)Op.type) {
+ default:
+ debug("Unhandled VSIB index type");
+ return -1;
+ case TYPE_MVSIBX:
+ insn->sibIndex =
+ (SIBIndex)(SIB_INDEX_XMM0 + (insn->sibIndex - insn->sibIndexBase));
+ break;
+ case TYPE_MVSIBY:
+ insn->sibIndex =
+ (SIBIndex)(SIB_INDEX_YMM0 + (insn->sibIndex - insn->sibIndexBase));
+ break;
+ case TYPE_MVSIBZ:
+ insn->sibIndex =
+ (SIBIndex)(SIB_INDEX_ZMM0 + (insn->sibIndex - insn->sibIndexBase));
+ break;
+ }
+
+ // Apply the AVX512 compressed displacement scaling factor.
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+ insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB);
+ break;
+ case ENCODING_REG:
+ CASE_ENCODING_RM:
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ // Apply the AVX512 compressed displacement scaling factor.
+ if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
+ insn->displacement *= 1 << (Op.encoding - ENCODING_RM);
+ break;
+ case ENCODING_IB:
+ if (sawRegImm) {
+ // Saw a register immediate so don't read again and instead split the
+ // previous immediate. FIXME: This is a hack.
+ insn->immediates[insn->numImmediatesConsumed] =
+ insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
+ ++insn->numImmediatesConsumed;
+ break;
+ }
+ if (readImmediate(insn, 1))
+ return -1;
+ if (Op.type == TYPE_XMM || Op.type == TYPE_YMM)
+ sawRegImm = 1;
+ break;
+ case ENCODING_IW:
+ if (readImmediate(insn, 2))
+ return -1;
+ break;
+ case ENCODING_ID:
+ if (readImmediate(insn, 4))
+ return -1;
+ break;
+ case ENCODING_IO:
+ if (readImmediate(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Iv:
+ if (readImmediate(insn, insn->immediateSize))
+ return -1;
+ break;
+ case ENCODING_Ia:
+ if (readImmediate(insn, insn->addressSize))
+ return -1;
+ break;
+ case ENCODING_IRC:
+ insn->RC = (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 1) |
+ lFromEVEX4of4(insn->vectorExtensionPrefix[3]);
+ break;
+ case ENCODING_RB:
+ if (readOpcodeRegister(insn, 1))
+ return -1;
+ break;
+ case ENCODING_RW:
+ if (readOpcodeRegister(insn, 2))
+ return -1;
+ break;
+ case ENCODING_RD:
+ if (readOpcodeRegister(insn, 4))
+ return -1;
+ break;
+ case ENCODING_RO:
+ if (readOpcodeRegister(insn, 8))
+ return -1;
+ break;
+ case ENCODING_Rv:
+ if (readOpcodeRegister(insn, 0))
+ return -1;
+ break;
+ case ENCODING_CC:
+ insn->immediates[1] = insn->opcode & 0xf;
+ break;
+ case ENCODING_FP:
+ break;
+ case ENCODING_VVVV:
+ needVVVV = 0; // Mark that we have found a VVVV operand.
+ if (!hasVVVV)
+ return -1;
+ if (insn->mode != MODE_64BIT)
+ insn->vvvv = static_cast<Reg>(insn->vvvv & 0x7);
+ if (fixupReg(insn, &Op))
+ return -1;
+ break;
+ case ENCODING_WRITEMASK:
+ if (readMaskRegister(insn))
+ return -1;
+ break;
+ case ENCODING_DUP:
+ break;
+ default:
+ LLVM_DEBUG(dbgs() << "Encountered an operand with an unknown encoding.");
+ return -1;
+ }
+ }
+
+ // If we didn't find an ENCODING_VVVV operand but a non-zero vvvv was present, fail.
+ if (needVVVV)
+ return -1;
+
+ return 0;
+}
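
The displacement scaling at the end of the RM/VSIB cases implements the AVX-512 compressed displacement (disp8*N) scheme: the stored 8-bit displacement is multiplied by a power-of-two scale encoded alongside the operand. A minimal sketch, using an example scale rather than a value taken from the real tables:

    #include <cstdint>
    #include <cstdio>

    int main() {
      int8_t disp8 = 0x40;   // byte stored in the instruction
      unsigned scale = 64;   // e.g. a full 64-byte (ZMM) memory operand
      int32_t effective = (int32_t)disp8 * (int32_t)scale;
      std::printf("disp8*N: 0x%x * %u = 0x%x\n", disp8, scale, effective); // 0x1000
      return 0;
    }
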
namespace llvm {
-// Fill-ins to make the compiler happy. These constants are never actually
-// assigned; they are just filler to make an automatically-generated switch
-// statement work.
+// Fill-ins to make the compiler happy. These constants are never actually
+// assigned; they are just filler to make an automatically-generated switch
+// statement work.
namespace X86 {
enum {
BX_SI = 500,
@@ -140,7 +1669,6 @@ public:
public:
DecodeStatus getInstruction(MCInst &instr, uint64_t &size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &vStream,
raw_ostream &cStream) const override;
private:
@@ -169,91 +1697,51 @@ X86GenericDisassembler::X86GenericDisassembler(
llvm_unreachable("Invalid CPU mode");
}
-namespace {
-struct Region {
- ArrayRef<uint8_t> Bytes;
- uint64_t Base;
- Region(ArrayRef<uint8_t> Bytes, uint64_t Base) : Bytes(Bytes), Base(Base) {}
-};
-} // end anonymous namespace
-
-/// A callback function that wraps the readByte method from Region.
-///
-/// @param Arg - The generic callback parameter. In this case, this should
-/// be a pointer to a Region.
-/// @param Byte - A pointer to the byte to be read.
-/// @param Address - The address to be read.
-static int regionReader(const void *Arg, uint8_t *Byte, uint64_t Address) {
- auto *R = static_cast<const Region *>(Arg);
- ArrayRef<uint8_t> Bytes = R->Bytes;
- unsigned Index = Address - R->Base;
- if (Bytes.size() <= Index)
- return -1;
- *Byte = Bytes[Index];
- return 0;
-}
-
-/// logger - a callback function that wraps the operator<< method from
-/// raw_ostream.
-///
-/// @param arg - The generic callback parameter. This should be a pointe
-/// to a raw_ostream.
-/// @param log - A string to be logged. logger() adds a newline.
-static void logger(void* arg, const char* log) {
- if (!arg)
- return;
-
- raw_ostream &vStream = *(static_cast<raw_ostream*>(arg));
- vStream << log << "\n";
-}
-
-//
-// Public interface for the disassembler
-//
-
MCDisassembler::DecodeStatus X86GenericDisassembler::getInstruction(
MCInst &Instr, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &VStream, raw_ostream &CStream) const {
+ raw_ostream &CStream) const {
CommentStream = &CStream;
- InternalInstruction InternalInstr;
-
- dlog_t LoggerFn = logger;
- if (&VStream == &nulls())
- LoggerFn = nullptr; // Disable logging completely if it's going to nulls().
-
- Region R(Bytes, Address);
-
- int Ret = decodeInstruction(&InternalInstr, regionReader, (const void *)&R,
- LoggerFn, (void *)&VStream,
- (const void *)MII.get(), Address, fMode);
-
- if (Ret) {
- Size = InternalInstr.readerCursor - Address;
+ InternalInstruction Insn;
+ memset(&Insn, 0, sizeof(InternalInstruction));
+ Insn.bytes = Bytes;
+ Insn.startLocation = Address;
+ Insn.readerCursor = Address;
+ Insn.mode = fMode;
+
+ if (Bytes.empty() || readPrefixes(&Insn) || readOpcode(&Insn) ||
+ getInstructionID(&Insn, MII.get()) || Insn.instructionID == 0 ||
+ readOperands(&Insn)) {
+ Size = Insn.readerCursor - Address;
return Fail;
- } else {
- Size = InternalInstr.length;
- bool Ret = translateInstruction(Instr, InternalInstr, this);
- if (!Ret) {
- unsigned Flags = X86::IP_NO_PREFIX;
- if (InternalInstr.hasAdSize)
- Flags |= X86::IP_HAS_AD_SIZE;
- if (!InternalInstr.mandatoryPrefix) {
- if (InternalInstr.hasOpSize)
- Flags |= X86::IP_HAS_OP_SIZE;
- if (InternalInstr.repeatPrefix == 0xf2)
- Flags |= X86::IP_HAS_REPEAT_NE;
- else if (InternalInstr.repeatPrefix == 0xf3 &&
- // It should not be 'pause' f3 90
- InternalInstr.opcode != 0x90)
- Flags |= X86::IP_HAS_REPEAT;
- if (InternalInstr.hasLockPrefix)
- Flags |= X86::IP_HAS_LOCK;
- }
- Instr.setFlags(Flags);
+ }
+
+ Insn.operands = x86OperandSets[Insn.spec->operands];
+ Insn.length = Insn.readerCursor - Insn.startLocation;
+ Size = Insn.length;
+ if (Size > 15)
+ LLVM_DEBUG(dbgs() << "Instruction exceeds 15-byte limit");
+
+ bool Ret = translateInstruction(Instr, Insn, this);
+ if (!Ret) {
+ unsigned Flags = X86::IP_NO_PREFIX;
+ if (Insn.hasAdSize)
+ Flags |= X86::IP_HAS_AD_SIZE;
+ if (!Insn.mandatoryPrefix) {
+ if (Insn.hasOpSize)
+ Flags |= X86::IP_HAS_OP_SIZE;
+ if (Insn.repeatPrefix == 0xf2)
+ Flags |= X86::IP_HAS_REPEAT_NE;
+ else if (Insn.repeatPrefix == 0xf3 &&
+ // It should not be 'pause' f3 90
+ Insn.opcode != 0x90)
+ Flags |= X86::IP_HAS_REPEAT;
+ if (Insn.hasLockPrefix)
+ Flags |= X86::IP_HAS_LOCK;
}
- return (!Ret) ? Success : Fail;
+ Instr.setFlags(Flags);
}
+ return (!Ret) ? Success : Fail;
}
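
A hedged sketch of how a client typically drives a getInstruction-style API: the size is reported even on failure so the caller can resynchronize. decodeOne() below is a stand-in written for this illustration, not the LLVM MCDisassembler interface.

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Result { bool ok; uint64_t size; };

    // Pretend every 0x90 byte is a one-byte NOP and everything else fails.
    static Result decodeOne(const std::vector<uint8_t> &bytes, uint64_t pos) {
      if (pos < bytes.size() && bytes[pos] == 0x90) return {true, 1};
      return {false, 1};
    }

    int main() {
      std::vector<uint8_t> code = {0x90, 0xCC, 0x90};
      for (uint64_t pos = 0; pos < code.size();) {
        Result r = decodeOne(code, pos);
        std::printf("%#llx: %s (%llu byte)\n", (unsigned long long)pos,
                    r.ok ? "nop" : "<unknown>", (unsigned long long)r.size);
        pos += r.size ? r.size : 1; // never loop forever on a bad byte
      }
      return 0;
    }
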
//
@@ -844,7 +2332,7 @@ static MCDisassembler *createX86Disassembler(const Target &T,
return new X86GenericDisassembler(STI, Ctx, std::move(MII));
}
-extern "C" void LLVMInitializeX86Disassembler() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Disassembler() {
// Register the disassembler.
TargetRegistry::RegisterMCDisassembler(getTheX86_32Target(),
createX86Disassembler);
diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
deleted file mode 100644
index e287f6625115..000000000000
--- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ /dev/null
@@ -1,1938 +0,0 @@
-//===-- X86DisassemblerDecoder.cpp - Disassembler decoder -----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is part of the X86 Disassembler.
-// It contains the implementation of the instruction decoder.
-// Documentation for the disassembler can be found in X86Disassembler.h.
-//
-//===----------------------------------------------------------------------===//
-
-#include "X86DisassemblerDecoder.h"
-#include "llvm/ADT/StringRef.h"
-
-#include <cstdarg> /* for va_*() */
-#include <cstdio> /* for vsnprintf() */
-#include <cstdlib> /* for exit() */
-#include <cstring> /* for memset() */
-
-using namespace llvm::X86Disassembler;
-
-/// Specifies whether a ModR/M byte is needed and (if so) which
-/// instruction each possible value of the ModR/M byte corresponds to. Once
-/// this information is known, we have narrowed down to a single instruction.
-struct ModRMDecision {
- uint8_t modrm_type;
- uint16_t instructionIDs;
-};
-
-/// Specifies which set of ModR/M->instruction tables to look at
-/// given a particular opcode.
-struct OpcodeDecision {
- ModRMDecision modRMDecisions[256];
-};
-
-/// Specifies which opcode->instruction tables to look at given
-/// a particular context (set of attributes). Since there are many possible
-/// contexts, the decoder first uses CONTEXTS_SYM to determine which context
-/// applies given a specific set of attributes. Hence there are only IC_max
-/// entries in this table, rather than 2^(ATTR_max).
-struct ContextDecision {
- OpcodeDecision opcodeDecisions[IC_max];
-};
-
-#include "X86GenDisassemblerTables.inc"
-
-#ifndef NDEBUG
-#define debug(s) do { Debug(__FILE__, __LINE__, s); } while (0)
-#else
-#define debug(s) do { } while (0)
-#endif
-
-/*
- * contextForAttrs - Client for the instruction context table. Takes a set of
- * attributes and returns the appropriate decode context.
- *
- * @param attrMask - Attributes, from the enumeration attributeBits.
- * @return - The InstructionContext to use when looking up an
- * an instruction with these attributes.
- */
-static InstructionContext contextForAttrs(uint16_t attrMask) {
- return static_cast<InstructionContext>(CONTEXTS_SYM[attrMask]);
-}
-
-/*
- * modRMRequired - Reads the appropriate instruction table to determine whether
- * the ModR/M byte is required to decode a particular instruction.
- *
- * @param type - The opcode type (i.e., how many bytes it has).
- * @param insnContext - The context for the instruction, as returned by
- * contextForAttrs.
- * @param opcode - The last byte of the instruction's opcode, not counting
- * ModR/M extensions and escapes.
- * @return - true if the ModR/M byte is required, false otherwise.
- */
-static int modRMRequired(OpcodeType type,
- InstructionContext insnContext,
- uint16_t opcode) {
- const struct ContextDecision* decision = nullptr;
-
- switch (type) {
- case ONEBYTE:
- decision = &ONEBYTE_SYM;
- break;
- case TWOBYTE:
- decision = &TWOBYTE_SYM;
- break;
- case THREEBYTE_38:
- decision = &THREEBYTE38_SYM;
- break;
- case THREEBYTE_3A:
- decision = &THREEBYTE3A_SYM;
- break;
- case XOP8_MAP:
- decision = &XOP8_MAP_SYM;
- break;
- case XOP9_MAP:
- decision = &XOP9_MAP_SYM;
- break;
- case XOPA_MAP:
- decision = &XOPA_MAP_SYM;
- break;
- case THREEDNOW_MAP:
- decision = &THREEDNOW_MAP_SYM;
- break;
- }
-
- return decision->opcodeDecisions[insnContext].modRMDecisions[opcode].
- modrm_type != MODRM_ONEENTRY;
-}
-
-/*
- * decode - Reads the appropriate instruction table to obtain the unique ID of
- * an instruction.
- *
- * @param type - See modRMRequired().
- * @param insnContext - See modRMRequired().
- * @param opcode - See modRMRequired().
- * @param modRM - The ModR/M byte if required, or any value if not.
- * @return - The UID of the instruction, or 0 on failure.
- */
-static InstrUID decode(OpcodeType type,
- InstructionContext insnContext,
- uint8_t opcode,
- uint8_t modRM) {
- const struct ModRMDecision* dec = nullptr;
-
- switch (type) {
- case ONEBYTE:
- dec = &ONEBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
- break;
- case TWOBYTE:
- dec = &TWOBYTE_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
- break;
- case THREEBYTE_38:
- dec = &THREEBYTE38_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
- break;
- case THREEBYTE_3A:
- dec = &THREEBYTE3A_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
- break;
- case XOP8_MAP:
- dec = &XOP8_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
- break;
- case XOP9_MAP:
- dec = &XOP9_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
- break;
- case XOPA_MAP:
- dec = &XOPA_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
- break;
- case THREEDNOW_MAP:
- dec = &THREEDNOW_MAP_SYM.opcodeDecisions[insnContext].modRMDecisions[opcode];
- break;
- }
-
- switch (dec->modrm_type) {
- default:
- debug("Corrupt table! Unknown modrm_type");
- return 0;
- case MODRM_ONEENTRY:
- return modRMTable[dec->instructionIDs];
- case MODRM_SPLITRM:
- if (modFromModRM(modRM) == 0x3)
- return modRMTable[dec->instructionIDs+1];
- return modRMTable[dec->instructionIDs];
- case MODRM_SPLITREG:
- if (modFromModRM(modRM) == 0x3)
- return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)+8];
- return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
- case MODRM_SPLITMISC:
- if (modFromModRM(modRM) == 0x3)
- return modRMTable[dec->instructionIDs+(modRM & 0x3f)+8];
- return modRMTable[dec->instructionIDs+((modRM & 0x38) >> 3)];
- case MODRM_FULL:
- return modRMTable[dec->instructionIDs+modRM];
- }
-}
-
-/*
- * specifierForUID - Given a UID, returns the name and operand specification for
- * that instruction.
- *
- * @param uid - The unique ID for the instruction. This should be returned by
- * decode(); specifierForUID will not check bounds.
- * @return - A pointer to the specification for that instruction.
- */
-static const struct InstructionSpecifier *specifierForUID(InstrUID uid) {
- return &INSTRUCTIONS_SYM[uid];
-}
-
-/*
- * consumeByte - Uses the reader function provided by the user to consume one
- * byte from the instruction's memory and advance the cursor.
- *
- * @param insn - The instruction with the reader function to use. The cursor
- * for this instruction is advanced.
- * @param byte - A pointer to a pre-allocated memory buffer to be populated
- * with the data read.
- * @return - 0 if the read was successful; nonzero otherwise.
- */
-static int consumeByte(struct InternalInstruction* insn, uint8_t* byte) {
- int ret = insn->reader(insn->readerArg, byte, insn->readerCursor);
-
- if (!ret)
- ++(insn->readerCursor);
-
- return ret;
-}
-
-/*
- * lookAtByte - Like consumeByte, but does not advance the cursor.
- *
- * @param insn - See consumeByte().
- * @param byte - See consumeByte().
- * @return - See consumeByte().
- */
-static int lookAtByte(struct InternalInstruction* insn, uint8_t* byte) {
- return insn->reader(insn->readerArg, byte, insn->readerCursor);
-}
-
-static void unconsumeByte(struct InternalInstruction* insn) {
- insn->readerCursor--;
-}
-
-#define CONSUME_FUNC(name, type) \
- static int name(struct InternalInstruction* insn, type* ptr) { \
- type combined = 0; \
- unsigned offset; \
- for (offset = 0; offset < sizeof(type); ++offset) { \
- uint8_t byte; \
- int ret = insn->reader(insn->readerArg, \
- &byte, \
- insn->readerCursor + offset); \
- if (ret) \
- return ret; \
- combined = combined | ((uint64_t)byte << (offset * 8)); \
- } \
- *ptr = combined; \
- insn->readerCursor += sizeof(type); \
- return 0; \
- }
-
-/*
- * consume* - Use the reader function provided by the user to consume data
- * values of various sizes from the instruction's memory and advance the
- * cursor appropriately. These readers perform endian conversion.
- *
- * @param insn - See consumeByte().
- * @param ptr - A pointer to a pre-allocated memory of appropriate size to
- * be populated with the data read.
- * @return - See consumeByte().
- */
-CONSUME_FUNC(consumeInt8, int8_t)
-CONSUME_FUNC(consumeInt16, int16_t)
-CONSUME_FUNC(consumeInt32, int32_t)
-CONSUME_FUNC(consumeUInt16, uint16_t)
-CONSUME_FUNC(consumeUInt32, uint32_t)
-CONSUME_FUNC(consumeUInt64, uint64_t)
-
-/*
- * dbgprintf - Uses the logging function provided by the user to log a single
- * message, typically without a carriage-return.
- *
- * @param insn - The instruction containing the logging function.
- * @param format - See printf().
- * @param ... - See printf().
- */
-static void dbgprintf(struct InternalInstruction* insn,
- const char* format,
- ...) {
- char buffer[256];
- va_list ap;
-
- if (!insn->dlog)
- return;
-
- va_start(ap, format);
- (void)vsnprintf(buffer, sizeof(buffer), format, ap);
- va_end(ap);
-
- insn->dlog(insn->dlogArg, buffer);
-}
-
-static bool isREX(struct InternalInstruction *insn, uint8_t prefix) {
- if (insn->mode == MODE_64BIT)
- return prefix >= 0x40 && prefix <= 0x4f;
- return false;
-}
-
-/*
- * setPrefixPresent - Marks that a particular prefix is present as mandatory
- *
- * @param insn - The instruction to be marked as having the prefix.
- * @param prefix - The prefix that is present.
- */
-static void setPrefixPresent(struct InternalInstruction *insn, uint8_t prefix) {
- uint8_t nextByte;
- switch (prefix) {
- case 0xf0:
- insn->hasLockPrefix = true;
- break;
- case 0xf2:
- case 0xf3:
- if (lookAtByte(insn, &nextByte))
- break;
- // TODO:
- // 1. There could be several 0x66
- // 2. if (nextByte == 0x66) and nextNextByte != 0x0f then
- // it's not mandatory prefix
- // 3. if (nextByte >= 0x40 && nextByte <= 0x4f) it's REX and we need
- // 0x0f exactly after it to be mandatory prefix
- if (isREX(insn, nextByte) || nextByte == 0x0f || nextByte == 0x66)
- // The last of 0xf2 /0xf3 is mandatory prefix
- insn->mandatoryPrefix = prefix;
- insn->repeatPrefix = prefix;
- break;
- case 0x66:
- if (lookAtByte(insn, &nextByte))
- break;
- // 0x66 can't overwrite existing mandatory prefix and should be ignored
- if (!insn->mandatoryPrefix && (nextByte == 0x0f || isREX(insn, nextByte)))
- insn->mandatoryPrefix = prefix;
- break;
- }
-}
-
-/*
- * readPrefixes - Consumes all of an instruction's prefix bytes, and marks the
- * instruction as having them. Also sets the instruction's default operand,
- * address, and other relevant data sizes to report operands correctly.
- *
- * @param insn - The instruction whose prefixes are to be read.
- * @return - 0 if the instruction could be read until the end of the prefix
- * bytes, and no prefixes conflicted; nonzero otherwise.
- */
-static int readPrefixes(struct InternalInstruction* insn) {
- bool isPrefix = true;
- uint8_t byte = 0;
- uint8_t nextByte;
-
- dbgprintf(insn, "readPrefixes()");
-
- while (isPrefix) {
- /* If we fail reading prefixes, just stop here and let the opcode reader deal with it */
- if (consumeByte(insn, &byte))
- break;
-
- /*
- * If the byte is a LOCK/REP/REPNE prefix and not a part of the opcode, then
- * break and let it be disassembled as a normal "instruction".
- */
- if (insn->readerCursor - 1 == insn->startLocation && byte == 0xf0) // LOCK
- break;
-
- if ((byte == 0xf2 || byte == 0xf3) && !lookAtByte(insn, &nextByte)) {
- /*
- * If the byte is 0xf2 or 0xf3, and any of the following conditions are
- * met:
- * - it is followed by a LOCK (0xf0) prefix
- * - it is followed by an xchg instruction
- * then it should be disassembled as a xacquire/xrelease not repne/rep.
- */
- if (((nextByte == 0xf0) ||
- ((nextByte & 0xfe) == 0x86 || (nextByte & 0xf8) == 0x90))) {
- insn->xAcquireRelease = true;
- if (!(byte == 0xf3 && nextByte == 0x90)) // PAUSE instruction support
- break;
- }
- /*
- * Also if the byte is 0xf3, and the following condition is met:
- * - it is followed by a "mov mem, reg" (opcode 0x88/0x89) or
- * "mov mem, imm" (opcode 0xc6/0xc7) instructions.
- * then it should be disassembled as an xrelease not rep.
- */
- if (byte == 0xf3 && (nextByte == 0x88 || nextByte == 0x89 ||
- nextByte == 0xc6 || nextByte == 0xc7)) {
- insn->xAcquireRelease = true;
- break;
- }
- if (isREX(insn, nextByte)) {
- uint8_t nnextByte;
- // Go to REX prefix after the current one
- if (consumeByte(insn, &nnextByte))
- return -1;
- // We should be able to read next byte after REX prefix
- if (lookAtByte(insn, &nnextByte))
- return -1;
- unconsumeByte(insn);
- }
- }
-
- switch (byte) {
- case 0xf0: /* LOCK */
- case 0xf2: /* REPNE/REPNZ */
- case 0xf3: /* REP or REPE/REPZ */
- setPrefixPresent(insn, byte);
- break;
- case 0x2e: /* CS segment override -OR- Branch not taken */
- case 0x36: /* SS segment override -OR- Branch taken */
- case 0x3e: /* DS segment override */
- case 0x26: /* ES segment override */
- case 0x64: /* FS segment override */
- case 0x65: /* GS segment override */
- switch (byte) {
- case 0x2e:
- insn->segmentOverride = SEG_OVERRIDE_CS;
- break;
- case 0x36:
- insn->segmentOverride = SEG_OVERRIDE_SS;
- break;
- case 0x3e:
- insn->segmentOverride = SEG_OVERRIDE_DS;
- break;
- case 0x26:
- insn->segmentOverride = SEG_OVERRIDE_ES;
- break;
- case 0x64:
- insn->segmentOverride = SEG_OVERRIDE_FS;
- break;
- case 0x65:
- insn->segmentOverride = SEG_OVERRIDE_GS;
- break;
- default:
- debug("Unhandled override");
- return -1;
- }
- setPrefixPresent(insn, byte);
- break;
- case 0x66: /* Operand-size override */
- insn->hasOpSize = true;
- setPrefixPresent(insn, byte);
- break;
- case 0x67: /* Address-size override */
- insn->hasAdSize = true;
- setPrefixPresent(insn, byte);
- break;
- default: /* Not a prefix byte */
- isPrefix = false;
- break;
- }
-
- if (isPrefix)
- dbgprintf(insn, "Found prefix 0x%hhx", byte);
- }
-
- insn->vectorExtensionType = TYPE_NO_VEX_XOP;
-
- if (byte == 0x62) {
- uint8_t byte1, byte2;
-
- if (consumeByte(insn, &byte1)) {
- dbgprintf(insn, "Couldn't read second byte of EVEX prefix");
- return -1;
- }
-
- if (lookAtByte(insn, &byte2)) {
- dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
- return -1;
- }
-
- if ((insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0) &&
- ((~byte1 & 0xc) == 0xc) && ((byte2 & 0x4) == 0x4)) {
- insn->vectorExtensionType = TYPE_EVEX;
- } else {
- unconsumeByte(insn); /* unconsume byte1 */
- unconsumeByte(insn); /* unconsume byte */
- }
-
- if (insn->vectorExtensionType == TYPE_EVEX) {
- insn->vectorExtensionPrefix[0] = byte;
- insn->vectorExtensionPrefix[1] = byte1;
- if (consumeByte(insn, &insn->vectorExtensionPrefix[2])) {
- dbgprintf(insn, "Couldn't read third byte of EVEX prefix");
- return -1;
- }
- if (consumeByte(insn, &insn->vectorExtensionPrefix[3])) {
- dbgprintf(insn, "Couldn't read fourth byte of EVEX prefix");
- return -1;
- }
-
- /* We simulate the REX prefix for simplicity's sake */
- if (insn->mode == MODE_64BIT) {
- insn->rexPrefix = 0x40
- | (wFromEVEX3of4(insn->vectorExtensionPrefix[2]) << 3)
- | (rFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 2)
- | (xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 1)
- | (bFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 0);
- }
-
- dbgprintf(insn, "Found EVEX prefix 0x%hhx 0x%hhx 0x%hhx 0x%hhx",
- insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
- insn->vectorExtensionPrefix[2], insn->vectorExtensionPrefix[3]);
- }
- } else if (byte == 0xc4) {
- uint8_t byte1;
-
- if (lookAtByte(insn, &byte1)) {
- dbgprintf(insn, "Couldn't read second byte of VEX");
- return -1;
- }
-
- if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0)
- insn->vectorExtensionType = TYPE_VEX_3B;
- else
- unconsumeByte(insn);
-
- if (insn->vectorExtensionType == TYPE_VEX_3B) {
- insn->vectorExtensionPrefix[0] = byte;
- consumeByte(insn, &insn->vectorExtensionPrefix[1]);
- consumeByte(insn, &insn->vectorExtensionPrefix[2]);
-
- /* We simulate the REX prefix for simplicity's sake */
-
- if (insn->mode == MODE_64BIT)
- insn->rexPrefix = 0x40
- | (wFromVEX3of3(insn->vectorExtensionPrefix[2]) << 3)
- | (rFromVEX2of3(insn->vectorExtensionPrefix[1]) << 2)
- | (xFromVEX2of3(insn->vectorExtensionPrefix[1]) << 1)
- | (bFromVEX2of3(insn->vectorExtensionPrefix[1]) << 0);
-
- dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx 0x%hhx",
- insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
- insn->vectorExtensionPrefix[2]);
- }
- } else if (byte == 0xc5) {
- uint8_t byte1;
-
- if (lookAtByte(insn, &byte1)) {
- dbgprintf(insn, "Couldn't read second byte of VEX");
- return -1;
- }
-
- if (insn->mode == MODE_64BIT || (byte1 & 0xc0) == 0xc0)
- insn->vectorExtensionType = TYPE_VEX_2B;
- else
- unconsumeByte(insn);
-
- if (insn->vectorExtensionType == TYPE_VEX_2B) {
- insn->vectorExtensionPrefix[0] = byte;
- consumeByte(insn, &insn->vectorExtensionPrefix[1]);
-
- if (insn->mode == MODE_64BIT)
- insn->rexPrefix = 0x40
- | (rFromVEX2of2(insn->vectorExtensionPrefix[1]) << 2);
-
- switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
- default:
- break;
- case VEX_PREFIX_66:
- insn->hasOpSize = true;
- break;
- }
-
- dbgprintf(insn, "Found VEX prefix 0x%hhx 0x%hhx",
- insn->vectorExtensionPrefix[0],
- insn->vectorExtensionPrefix[1]);
- }
- } else if (byte == 0x8f) {
- uint8_t byte1;
-
- if (lookAtByte(insn, &byte1)) {
- dbgprintf(insn, "Couldn't read second byte of XOP");
- return -1;
- }
-
- if ((byte1 & 0x38) != 0x0) /* 0 in these 3 bits is a POP instruction. */
- insn->vectorExtensionType = TYPE_XOP;
- else
- unconsumeByte(insn);
-
- if (insn->vectorExtensionType == TYPE_XOP) {
- insn->vectorExtensionPrefix[0] = byte;
- consumeByte(insn, &insn->vectorExtensionPrefix[1]);
- consumeByte(insn, &insn->vectorExtensionPrefix[2]);
-
- /* We simulate the REX prefix for simplicity's sake */
-
- if (insn->mode == MODE_64BIT)
- insn->rexPrefix = 0x40
- | (wFromXOP3of3(insn->vectorExtensionPrefix[2]) << 3)
- | (rFromXOP2of3(insn->vectorExtensionPrefix[1]) << 2)
- | (xFromXOP2of3(insn->vectorExtensionPrefix[1]) << 1)
- | (bFromXOP2of3(insn->vectorExtensionPrefix[1]) << 0);
-
- switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
- default:
- break;
- case VEX_PREFIX_66:
- insn->hasOpSize = true;
- break;
- }
-
- dbgprintf(insn, "Found XOP prefix 0x%hhx 0x%hhx 0x%hhx",
- insn->vectorExtensionPrefix[0], insn->vectorExtensionPrefix[1],
- insn->vectorExtensionPrefix[2]);
- }
- } else if (isREX(insn, byte)) {
- if (lookAtByte(insn, &nextByte))
- return -1;
- insn->rexPrefix = byte;
- dbgprintf(insn, "Found REX prefix 0x%hhx", byte);
- } else
- unconsumeByte(insn);
-
- if (insn->mode == MODE_16BIT) {
- insn->registerSize = (insn->hasOpSize ? 4 : 2);
- insn->addressSize = (insn->hasAdSize ? 4 : 2);
- insn->displacementSize = (insn->hasAdSize ? 4 : 2);
- insn->immediateSize = (insn->hasOpSize ? 4 : 2);
- } else if (insn->mode == MODE_32BIT) {
- insn->registerSize = (insn->hasOpSize ? 2 : 4);
- insn->addressSize = (insn->hasAdSize ? 2 : 4);
- insn->displacementSize = (insn->hasAdSize ? 2 : 4);
- insn->immediateSize = (insn->hasOpSize ? 2 : 4);
- } else if (insn->mode == MODE_64BIT) {
- if (insn->rexPrefix && wFromREX(insn->rexPrefix)) {
- insn->registerSize = 8;
- insn->addressSize = (insn->hasAdSize ? 4 : 8);
- insn->displacementSize = 4;
- insn->immediateSize = 4;
- } else {
- insn->registerSize = (insn->hasOpSize ? 2 : 4);
- insn->addressSize = (insn->hasAdSize ? 4 : 8);
- insn->displacementSize = (insn->hasOpSize ? 2 : 4);
- insn->immediateSize = (insn->hasOpSize ? 2 : 4);
- }
- }
-
- return 0;
-}
-
-static int readModRM(struct InternalInstruction* insn);
-
-/*
- * readOpcode - Reads the opcode (excepting the ModR/M byte in the case of
- * extended or escape opcodes).
- *
- * @param insn - The instruction whose opcode is to be read.
- * @return - 0 if the opcode could be read successfully; nonzero otherwise.
- */
-static int readOpcode(struct InternalInstruction* insn) {
- /* Determine the length of the primary opcode */
-
- uint8_t current;
-
- dbgprintf(insn, "readOpcode()");
-
- insn->opcodeType = ONEBYTE;
-
- if (insn->vectorExtensionType == TYPE_EVEX) {
- switch (mmFromEVEX2of4(insn->vectorExtensionPrefix[1])) {
- default:
- dbgprintf(insn, "Unhandled mm field for instruction (0x%hhx)",
- mmFromEVEX2of4(insn->vectorExtensionPrefix[1]));
- return -1;
- case VEX_LOB_0F:
- insn->opcodeType = TWOBYTE;
- return consumeByte(insn, &insn->opcode);
- case VEX_LOB_0F38:
- insn->opcodeType = THREEBYTE_38;
- return consumeByte(insn, &insn->opcode);
- case VEX_LOB_0F3A:
- insn->opcodeType = THREEBYTE_3A;
- return consumeByte(insn, &insn->opcode);
- }
- } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
- switch (mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1])) {
- default:
- dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
- mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
- return -1;
- case VEX_LOB_0F:
- insn->opcodeType = TWOBYTE;
- return consumeByte(insn, &insn->opcode);
- case VEX_LOB_0F38:
- insn->opcodeType = THREEBYTE_38;
- return consumeByte(insn, &insn->opcode);
- case VEX_LOB_0F3A:
- insn->opcodeType = THREEBYTE_3A;
- return consumeByte(insn, &insn->opcode);
- }
- } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
- insn->opcodeType = TWOBYTE;
- return consumeByte(insn, &insn->opcode);
- } else if (insn->vectorExtensionType == TYPE_XOP) {
- switch (mmmmmFromXOP2of3(insn->vectorExtensionPrefix[1])) {
- default:
- dbgprintf(insn, "Unhandled m-mmmm field for instruction (0x%hhx)",
- mmmmmFromVEX2of3(insn->vectorExtensionPrefix[1]));
- return -1;
- case XOP_MAP_SELECT_8:
- insn->opcodeType = XOP8_MAP;
- return consumeByte(insn, &insn->opcode);
- case XOP_MAP_SELECT_9:
- insn->opcodeType = XOP9_MAP;
- return consumeByte(insn, &insn->opcode);
- case XOP_MAP_SELECT_A:
- insn->opcodeType = XOPA_MAP;
- return consumeByte(insn, &insn->opcode);
- }
- }
-
- if (consumeByte(insn, &current))
- return -1;
-
- if (current == 0x0f) {
- dbgprintf(insn, "Found a two-byte escape prefix (0x%hhx)", current);
-
- if (consumeByte(insn, &current))
- return -1;
-
- if (current == 0x38) {
- dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
-
- if (consumeByte(insn, &current))
- return -1;
-
- insn->opcodeType = THREEBYTE_38;
- } else if (current == 0x3a) {
- dbgprintf(insn, "Found a three-byte escape prefix (0x%hhx)", current);
-
- if (consumeByte(insn, &current))
- return -1;
-
- insn->opcodeType = THREEBYTE_3A;
- } else if (current == 0x0f) {
- dbgprintf(insn, "Found a 3dnow escape prefix (0x%hhx)", current);
-
- // Consume operands before the opcode to comply with the 3DNow encoding
- if (readModRM(insn))
- return -1;
-
- if (consumeByte(insn, &current))
- return -1;
-
- insn->opcodeType = THREEDNOW_MAP;
- } else {
- dbgprintf(insn, "Didn't find a three-byte escape prefix");
-
- insn->opcodeType = TWOBYTE;
- }
- } else if (insn->mandatoryPrefix)
- // The opcode with mandatory prefix must start with opcode escape.
- // If not it's legacy repeat prefix
- insn->mandatoryPrefix = 0;
-
- /*
- * At this point we have consumed the full opcode.
- * Anything we consume from here on must be unconsumed.
- */
-
- insn->opcode = current;
-
- return 0;
-}
-
-/*
- * getIDWithAttrMask - Determines the ID of an instruction, consuming
- * the ModR/M byte as appropriate for extended and escape opcodes,
- * and using a supplied attribute mask.
- *
- * @param instructionID - A pointer whose target is filled in with the ID of the
- * instruction.
- * @param insn - The instruction whose ID is to be determined.
- * @param attrMask - The attribute mask to search.
- * @return - 0 if the ModR/M could be read when needed or was not
- * needed; nonzero otherwise.
- */
-static int getIDWithAttrMask(uint16_t* instructionID,
- struct InternalInstruction* insn,
- uint16_t attrMask) {
- bool hasModRMExtension;
-
- InstructionContext instructionClass = contextForAttrs(attrMask);
-
- hasModRMExtension = modRMRequired(insn->opcodeType,
- instructionClass,
- insn->opcode);
-
- if (hasModRMExtension) {
- if (readModRM(insn))
- return -1;
-
- *instructionID = decode(insn->opcodeType,
- instructionClass,
- insn->opcode,
- insn->modRM);
- } else {
- *instructionID = decode(insn->opcodeType,
- instructionClass,
- insn->opcode,
- 0);
- }
-
- return 0;
-}
-
-/*
- * is16BitEquivalent - Determines whether two instruction names refer to
- * equivalent instructions but one is 16-bit whereas the other is not.
- *
- * @param orig - The instruction that is not 16-bit
- * @param equiv - The instruction that is 16-bit
- */
-static bool is16BitEquivalent(const char *orig, const char *equiv) {
- off_t i;
-
- for (i = 0;; i++) {
- if (orig[i] == '\0' && equiv[i] == '\0')
- return true;
- if (orig[i] == '\0' || equiv[i] == '\0')
- return false;
- if (orig[i] != equiv[i]) {
- if ((orig[i] == 'Q' || orig[i] == 'L') && equiv[i] == 'W')
- continue;
- if ((orig[i] == '6' || orig[i] == '3') && equiv[i] == '1')
- continue;
- if ((orig[i] == '4' || orig[i] == '2') && equiv[i] == '6')
- continue;
- return false;
- }
- }
-}
-
-/*
- * is64Bit - Determines whether this instruction is a 64-bit instruction.
- *
- * @param name - The instruction that is not 16-bit
- */
-static bool is64Bit(const char *name) {
- off_t i;
-
- for (i = 0;; ++i) {
- if (name[i] == '\0')
- return false;
- if (name[i] == '6' && name[i+1] == '4')
- return true;
- }
-}
-
-/*
- * getID - Determines the ID of an instruction, consuming the ModR/M byte as
- * appropriate for extended and escape opcodes. Determines the attributes and
- * context for the instruction before doing so.
- *
- * @param insn - The instruction whose ID is to be determined.
- * @return - 0 if the ModR/M could be read when needed or was not needed;
- * nonzero otherwise.
- */
-static int getID(struct InternalInstruction* insn, const void *miiArg) {
- uint16_t attrMask;
- uint16_t instructionID;
-
- dbgprintf(insn, "getID()");
-
- attrMask = ATTR_NONE;
-
- if (insn->mode == MODE_64BIT)
- attrMask |= ATTR_64BIT;
-
- if (insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
- attrMask |= (insn->vectorExtensionType == TYPE_EVEX) ? ATTR_EVEX : ATTR_VEX;
-
- if (insn->vectorExtensionType == TYPE_EVEX) {
- switch (ppFromEVEX3of4(insn->vectorExtensionPrefix[2])) {
- case VEX_PREFIX_66:
- attrMask |= ATTR_OPSIZE;
- break;
- case VEX_PREFIX_F3:
- attrMask |= ATTR_XS;
- break;
- case VEX_PREFIX_F2:
- attrMask |= ATTR_XD;
- break;
- }
-
- if (zFromEVEX4of4(insn->vectorExtensionPrefix[3]))
- attrMask |= ATTR_EVEXKZ;
- if (bFromEVEX4of4(insn->vectorExtensionPrefix[3]))
- attrMask |= ATTR_EVEXB;
- if (aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]))
- attrMask |= ATTR_EVEXK;
- if (lFromEVEX4of4(insn->vectorExtensionPrefix[3]))
- attrMask |= ATTR_VEXL;
- if (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
- attrMask |= ATTR_EVEXL2;
- } else if (insn->vectorExtensionType == TYPE_VEX_3B) {
- switch (ppFromVEX3of3(insn->vectorExtensionPrefix[2])) {
- case VEX_PREFIX_66:
- attrMask |= ATTR_OPSIZE;
- break;
- case VEX_PREFIX_F3:
- attrMask |= ATTR_XS;
- break;
- case VEX_PREFIX_F2:
- attrMask |= ATTR_XD;
- break;
- }
-
- if (lFromVEX3of3(insn->vectorExtensionPrefix[2]))
- attrMask |= ATTR_VEXL;
- } else if (insn->vectorExtensionType == TYPE_VEX_2B) {
- switch (ppFromVEX2of2(insn->vectorExtensionPrefix[1])) {
- case VEX_PREFIX_66:
- attrMask |= ATTR_OPSIZE;
- break;
- case VEX_PREFIX_F3:
- attrMask |= ATTR_XS;
- break;
- case VEX_PREFIX_F2:
- attrMask |= ATTR_XD;
- break;
- }
-
- if (lFromVEX2of2(insn->vectorExtensionPrefix[1]))
- attrMask |= ATTR_VEXL;
- } else if (insn->vectorExtensionType == TYPE_XOP) {
- switch (ppFromXOP3of3(insn->vectorExtensionPrefix[2])) {
- case VEX_PREFIX_66:
- attrMask |= ATTR_OPSIZE;
- break;
- case VEX_PREFIX_F3:
- attrMask |= ATTR_XS;
- break;
- case VEX_PREFIX_F2:
- attrMask |= ATTR_XD;
- break;
- }
-
- if (lFromXOP3of3(insn->vectorExtensionPrefix[2]))
- attrMask |= ATTR_VEXL;
- } else {
- return -1;
- }
- } else if (!insn->mandatoryPrefix) {
- // If we don't have mandatory prefix we should use legacy prefixes here
- if (insn->hasOpSize && (insn->mode != MODE_16BIT))
- attrMask |= ATTR_OPSIZE;
- if (insn->hasAdSize)
- attrMask |= ATTR_ADSIZE;
- if (insn->opcodeType == ONEBYTE) {
- if (insn->repeatPrefix == 0xf3 && (insn->opcode == 0x90))
- // Special support for PAUSE
- attrMask |= ATTR_XS;
- } else {
- if (insn->repeatPrefix == 0xf2)
- attrMask |= ATTR_XD;
- else if (insn->repeatPrefix == 0xf3)
- attrMask |= ATTR_XS;
- }
- } else {
- switch (insn->mandatoryPrefix) {
- case 0xf2:
- attrMask |= ATTR_XD;
- break;
- case 0xf3:
- attrMask |= ATTR_XS;
- break;
- case 0x66:
- if (insn->mode != MODE_16BIT)
- attrMask |= ATTR_OPSIZE;
- break;
- case 0x67:
- attrMask |= ATTR_ADSIZE;
- break;
- }
-
- }
-
- if (insn->rexPrefix & 0x08) {
- attrMask |= ATTR_REXW;
- attrMask &= ~ATTR_ADSIZE;
- }
-
- /*
- * JCXZ/JECXZ need special handling for 16-bit mode because the meaning
- * of the AdSize prefix is inverted w.r.t. 32-bit mode.
- */
- if (insn->mode == MODE_16BIT && insn->opcodeType == ONEBYTE &&
- insn->opcode == 0xE3)
- attrMask ^= ATTR_ADSIZE;
-
- // If we're in 16-bit mode and this is one of the relative jumps and opsize
- // prefix isn't present, we need to force the opsize attribute since the
- // prefix is inverted relative to 32-bit mode.
- if (insn->mode == MODE_16BIT && !insn->hasOpSize &&
- insn->opcodeType == ONEBYTE &&
- (insn->opcode == 0xE8 || insn->opcode == 0xE9))
- attrMask |= ATTR_OPSIZE;
-
- if (insn->mode == MODE_16BIT && !insn->hasOpSize &&
- insn->opcodeType == TWOBYTE &&
- insn->opcode >= 0x80 && insn->opcode <= 0x8F)
- attrMask |= ATTR_OPSIZE;
-
- if (getIDWithAttrMask(&instructionID, insn, attrMask))
- return -1;
-
- /* The following clauses compensate for limitations of the tables. */
-
- if (insn->mode != MODE_64BIT &&
- insn->vectorExtensionType != TYPE_NO_VEX_XOP) {
- /*
- * The tables can't distinquish between cases where the W-bit is used to
- * select register size and cases where its a required part of the opcode.
- */
- if ((insn->vectorExtensionType == TYPE_EVEX &&
- wFromEVEX3of4(insn->vectorExtensionPrefix[2])) ||
- (insn->vectorExtensionType == TYPE_VEX_3B &&
- wFromVEX3of3(insn->vectorExtensionPrefix[2])) ||
- (insn->vectorExtensionType == TYPE_XOP &&
- wFromXOP3of3(insn->vectorExtensionPrefix[2]))) {
-
- uint16_t instructionIDWithREXW;
- if (getIDWithAttrMask(&instructionIDWithREXW,
- insn, attrMask | ATTR_REXW)) {
- insn->instructionID = instructionID;
- insn->spec = specifierForUID(instructionID);
- return 0;
- }
-
- auto SpecName = GetInstrName(instructionIDWithREXW, miiArg);
- // If not a 64-bit instruction. Switch the opcode.
- if (!is64Bit(SpecName.data())) {
- insn->instructionID = instructionIDWithREXW;
- insn->spec = specifierForUID(instructionIDWithREXW);
- return 0;
- }
- }
- }
-
- /*
- * Absolute moves, umonitor, and movdir64b need special handling.
- * -For 16-bit mode because the meaning of the AdSize and OpSize prefixes are
- * inverted w.r.t.
- * -For 32-bit mode we need to ensure the ADSIZE prefix is observed in
- * any position.
- */
- if ((insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0)) ||
- (insn->opcodeType == TWOBYTE && (insn->opcode == 0xAE)) ||
- (insn->opcodeType == THREEBYTE_38 && insn->opcode == 0xF8)) {
- /* Make sure we observed the prefixes in any position. */
- if (insn->hasAdSize)
- attrMask |= ATTR_ADSIZE;
- if (insn->hasOpSize)
- attrMask |= ATTR_OPSIZE;
-
- /* In 16-bit, invert the attributes. */
- if (insn->mode == MODE_16BIT) {
- attrMask ^= ATTR_ADSIZE;
-
- /* The OpSize attribute is only valid with the absolute moves. */
- if (insn->opcodeType == ONEBYTE && ((insn->opcode & 0xFC) == 0xA0))
- attrMask ^= ATTR_OPSIZE;
- }
-
- if (getIDWithAttrMask(&instructionID, insn, attrMask))
- return -1;
-
- insn->instructionID = instructionID;
- insn->spec = specifierForUID(instructionID);
- return 0;
- }
-
- if ((insn->mode == MODE_16BIT || insn->hasOpSize) &&
- !(attrMask & ATTR_OPSIZE)) {
- /*
- * The instruction tables make no distinction between instructions that
- * allow OpSize anywhere (i.e., 16-bit operations) and that need it in a
- * particular spot (i.e., many MMX operations). In general we're
- * conservative, but in the specific case where OpSize is present but not
- * in the right place we check if there's a 16-bit operation.
- */
-
- const struct InstructionSpecifier *spec;
- uint16_t instructionIDWithOpsize;
- llvm::StringRef specName, specWithOpSizeName;
-
- spec = specifierForUID(instructionID);
-
- if (getIDWithAttrMask(&instructionIDWithOpsize,
- insn,
- attrMask | ATTR_OPSIZE)) {
- /*
- * ModRM required with OpSize but not present; give up and return version
- * without OpSize set
- */
-
- insn->instructionID = instructionID;
- insn->spec = spec;
- return 0;
- }
-
- specName = GetInstrName(instructionID, miiArg);
- specWithOpSizeName = GetInstrName(instructionIDWithOpsize, miiArg);
-
- if (is16BitEquivalent(specName.data(), specWithOpSizeName.data()) &&
- (insn->mode == MODE_16BIT) ^ insn->hasOpSize) {
- insn->instructionID = instructionIDWithOpsize;
- insn->spec = specifierForUID(instructionIDWithOpsize);
- } else {
- insn->instructionID = instructionID;
- insn->spec = spec;
- }
- return 0;
- }
-
- if (insn->opcodeType == ONEBYTE && insn->opcode == 0x90 &&
- insn->rexPrefix & 0x01) {
- /*
- * NOOP shouldn't decode as NOOP if REX.b is set. Instead
- * it should decode as XCHG %r8, %eax.
- */
-
- const struct InstructionSpecifier *spec;
- uint16_t instructionIDWithNewOpcode;
- const struct InstructionSpecifier *specWithNewOpcode;
-
- spec = specifierForUID(instructionID);
-
- /* Borrow opcode from one of the other XCHGar opcodes */
- insn->opcode = 0x91;
-
- if (getIDWithAttrMask(&instructionIDWithNewOpcode,
- insn,
- attrMask)) {
- insn->opcode = 0x90;
-
- insn->instructionID = instructionID;
- insn->spec = spec;
- return 0;
- }
-
- specWithNewOpcode = specifierForUID(instructionIDWithNewOpcode);
-
- /* Change back */
- insn->opcode = 0x90;
-
- insn->instructionID = instructionIDWithNewOpcode;
- insn->spec = specWithNewOpcode;
-
- return 0;
- }
-
- insn->instructionID = instructionID;
- insn->spec = specifierForUID(insn->instructionID);
-
- return 0;
-}
-
-/*
- * readSIB - Consumes the SIB byte to determine addressing information for an
- * instruction.
- *
- * @param insn - The instruction whose SIB byte is to be read.
- * @return - 0 if the SIB byte was successfully read; nonzero otherwise.
- */
-static int readSIB(struct InternalInstruction* insn) {
- SIBBase sibBaseBase = SIB_BASE_NONE;
- uint8_t index, base;
-
- dbgprintf(insn, "readSIB()");
-
- if (insn->consumedSIB)
- return 0;
-
- insn->consumedSIB = true;
-
- switch (insn->addressSize) {
- case 2:
- dbgprintf(insn, "SIB-based addressing doesn't work in 16-bit mode");
- return -1;
- case 4:
- insn->sibIndexBase = SIB_INDEX_EAX;
- sibBaseBase = SIB_BASE_EAX;
- break;
- case 8:
- insn->sibIndexBase = SIB_INDEX_RAX;
- sibBaseBase = SIB_BASE_RAX;
- break;
- }
-
- if (consumeByte(insn, &insn->sib))
- return -1;
-
- index = indexFromSIB(insn->sib) | (xFromREX(insn->rexPrefix) << 3);
-
- if (index == 0x4) {
- insn->sibIndex = SIB_INDEX_NONE;
- } else {
- insn->sibIndex = (SIBIndex)(insn->sibIndexBase + index);
- }
-
- insn->sibScale = 1 << scaleFromSIB(insn->sib);
-
- base = baseFromSIB(insn->sib) | (bFromREX(insn->rexPrefix) << 3);
-
- switch (base) {
- case 0x5:
- case 0xd:
- switch (modFromModRM(insn->modRM)) {
- case 0x0:
- insn->eaDisplacement = EA_DISP_32;
- insn->sibBase = SIB_BASE_NONE;
- break;
- case 0x1:
- insn->eaDisplacement = EA_DISP_8;
- insn->sibBase = (SIBBase)(sibBaseBase + base);
- break;
- case 0x2:
- insn->eaDisplacement = EA_DISP_32;
- insn->sibBase = (SIBBase)(sibBaseBase + base);
- break;
- case 0x3:
- debug("Cannot have Mod = 0b11 and a SIB byte");
- return -1;
- }
- break;
- default:
- insn->sibBase = (SIBBase)(sibBaseBase + base);
- break;
- }
-
- return 0;
-}
-
-/*
- * readDisplacement - Consumes the displacement of an instruction.
- *
- * @param insn - The instruction whose displacement is to be read.
- * @return - 0 if the displacement byte was successfully read; nonzero
- * otherwise.
- */
-static int readDisplacement(struct InternalInstruction* insn) {
- int8_t d8;
- int16_t d16;
- int32_t d32;
-
- dbgprintf(insn, "readDisplacement()");
-
- if (insn->consumedDisplacement)
- return 0;
-
- insn->consumedDisplacement = true;
- insn->displacementOffset = insn->readerCursor - insn->startLocation;
-
- switch (insn->eaDisplacement) {
- case EA_DISP_NONE:
- insn->consumedDisplacement = false;
- break;
- case EA_DISP_8:
- if (consumeInt8(insn, &d8))
- return -1;
- insn->displacement = d8;
- break;
- case EA_DISP_16:
- if (consumeInt16(insn, &d16))
- return -1;
- insn->displacement = d16;
- break;
- case EA_DISP_32:
- if (consumeInt32(insn, &d32))
- return -1;
- insn->displacement = d32;
- break;
- }
-
- insn->consumedDisplacement = true;
- return 0;
-}
-
-/*
- * readModRM - Consumes all addressing information (ModR/M byte, SIB byte, and
- * displacement) for an instruction and interprets it.
- *
- * @param insn - The instruction whose addressing information is to be read.
- * @return - 0 if the information was successfully read; nonzero otherwise.
- */
-static int readModRM(struct InternalInstruction* insn) {
- uint8_t mod, rm, reg, evexrm;
-
- dbgprintf(insn, "readModRM()");
-
- if (insn->consumedModRM)
- return 0;
-
- if (consumeByte(insn, &insn->modRM))
- return -1;
- insn->consumedModRM = true;
-
- mod = modFromModRM(insn->modRM);
- rm = rmFromModRM(insn->modRM);
- reg = regFromModRM(insn->modRM);
-
- /*
- * This goes by insn->registerSize to pick the correct register, which messes
- * up if we're using (say) XMM or 8-bit register operands. That gets fixed in
- * fixupReg().
- */
- switch (insn->registerSize) {
- case 2:
- insn->regBase = MODRM_REG_AX;
- insn->eaRegBase = EA_REG_AX;
- break;
- case 4:
- insn->regBase = MODRM_REG_EAX;
- insn->eaRegBase = EA_REG_EAX;
- break;
- case 8:
- insn->regBase = MODRM_REG_RAX;
- insn->eaRegBase = EA_REG_RAX;
- break;
- }
-
- reg |= rFromREX(insn->rexPrefix) << 3;
- rm |= bFromREX(insn->rexPrefix) << 3;
-
- evexrm = 0;
- if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT) {
- reg |= r2FromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
- evexrm = xFromEVEX2of4(insn->vectorExtensionPrefix[1]) << 4;
- }
-
- insn->reg = (Reg)(insn->regBase + reg);
-
- switch (insn->addressSize) {
- case 2: {
- EABase eaBaseBase = EA_BASE_BX_SI;
-
- switch (mod) {
- case 0x0:
- if (rm == 0x6) {
- insn->eaBase = EA_BASE_NONE;
- insn->eaDisplacement = EA_DISP_16;
- if (readDisplacement(insn))
- return -1;
- } else {
- insn->eaBase = (EABase)(eaBaseBase + rm);
- insn->eaDisplacement = EA_DISP_NONE;
- }
- break;
- case 0x1:
- insn->eaBase = (EABase)(eaBaseBase + rm);
- insn->eaDisplacement = EA_DISP_8;
- insn->displacementSize = 1;
- if (readDisplacement(insn))
- return -1;
- break;
- case 0x2:
- insn->eaBase = (EABase)(eaBaseBase + rm);
- insn->eaDisplacement = EA_DISP_16;
- if (readDisplacement(insn))
- return -1;
- break;
- case 0x3:
- insn->eaBase = (EABase)(insn->eaRegBase + rm);
- if (readDisplacement(insn))
- return -1;
- break;
- }
- break;
- }
- case 4:
- case 8: {
- EABase eaBaseBase = (insn->addressSize == 4 ? EA_BASE_EAX : EA_BASE_RAX);
-
- switch (mod) {
- case 0x0:
- insn->eaDisplacement = EA_DISP_NONE; /* readSIB may override this */
- // In determining whether RIP-relative mode is used (rm=5),
- // or whether a SIB byte is present (rm=4),
- // the extension bits (REX.b and EVEX.x) are ignored.
- switch (rm & 7) {
- case 0x4: // SIB byte is present
- insn->eaBase = (insn->addressSize == 4 ?
- EA_BASE_sib : EA_BASE_sib64);
- if (readSIB(insn) || readDisplacement(insn))
- return -1;
- break;
- case 0x5: // RIP-relative
- insn->eaBase = EA_BASE_NONE;
- insn->eaDisplacement = EA_DISP_32;
- if (readDisplacement(insn))
- return -1;
- break;
- default:
- insn->eaBase = (EABase)(eaBaseBase + rm);
- break;
- }
- break;
- case 0x1:
- insn->displacementSize = 1;
- LLVM_FALLTHROUGH;
- case 0x2:
- insn->eaDisplacement = (mod == 0x1 ? EA_DISP_8 : EA_DISP_32);
- switch (rm & 7) {
- case 0x4: // SIB byte is present
- insn->eaBase = EA_BASE_sib;
- if (readSIB(insn) || readDisplacement(insn))
- return -1;
- break;
- default:
- insn->eaBase = (EABase)(eaBaseBase + rm);
- if (readDisplacement(insn))
- return -1;
- break;
- }
- break;
- case 0x3:
- insn->eaDisplacement = EA_DISP_NONE;
- insn->eaBase = (EABase)(insn->eaRegBase + rm + evexrm);
- break;
- }
- break;
- }
- } /* switch (insn->addressSize) */
-
- return 0;
-}
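
The mod/reg/rm split used by readModRM(), in both the removed version above and its replacement, can be summarized with a short standalone sketch; the example byte is arbitrary:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t modrm = 0x44;               // 01 000 100
      unsigned mod = (modrm >> 6) & 0x3;  // 1 -> an 8-bit displacement follows
      unsigned reg = (modrm >> 3) & 0x7;  // 0
      unsigned rm  = modrm & 0x7;         // 4 -> a SIB byte follows
      std::printf("mod=%u reg=%u rm=%u\n", mod, reg, rm);
      return 0;
    }
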
-
-#define GENERIC_FIXUP_FUNC(name, base, prefix, mask) \
- static uint16_t name(struct InternalInstruction *insn, \
- OperandType type, \
- uint8_t index, \
- uint8_t *valid) { \
- *valid = 1; \
- switch (type) { \
- default: \
- debug("Unhandled register type"); \
- *valid = 0; \
- return 0; \
- case TYPE_Rv: \
- return base + index; \
- case TYPE_R8: \
- index &= mask; \
- if (index > 0xf) \
- *valid = 0; \
- if (insn->rexPrefix && \
- index >= 4 && index <= 7) { \
- return prefix##_SPL + (index - 4); \
- } else { \
- return prefix##_AL + index; \
- } \
- case TYPE_R16: \
- index &= mask; \
- if (index > 0xf) \
- *valid = 0; \
- return prefix##_AX + index; \
- case TYPE_R32: \
- index &= mask; \
- if (index > 0xf) \
- *valid = 0; \
- return prefix##_EAX + index; \
- case TYPE_R64: \
- index &= mask; \
- if (index > 0xf) \
- *valid = 0; \
- return prefix##_RAX + index; \
- case TYPE_ZMM: \
- return prefix##_ZMM0 + index; \
- case TYPE_YMM: \
- return prefix##_YMM0 + index; \
- case TYPE_XMM: \
- return prefix##_XMM0 + index; \
- case TYPE_VK: \
- index &= 0xf; \
- if (index > 7) \
- *valid = 0; \
- return prefix##_K0 + index; \
- case TYPE_VK_PAIR: \
- if (index > 7) \
- *valid = 0; \
- return prefix##_K0_K1 + (index / 2); \
- case TYPE_MM64: \
- return prefix##_MM0 + (index & 0x7); \
- case TYPE_SEGMENTREG: \
- if ((index & 7) > 5) \
- *valid = 0; \
- return prefix##_ES + (index & 7); \
- case TYPE_DEBUGREG: \
- return prefix##_DR0 + index; \
- case TYPE_CONTROLREG: \
- return prefix##_CR0 + index; \
- case TYPE_BNDR: \
- if (index > 3) \
- *valid = 0; \
- return prefix##_BND0 + index; \
- case TYPE_MVSIBX: \
- return prefix##_XMM0 + index; \
- case TYPE_MVSIBY: \
- return prefix##_YMM0 + index; \
- case TYPE_MVSIBZ: \
- return prefix##_ZMM0 + index; \
- } \
- }
-
-/*
- * fixup*Value - Consults an operand type to determine the meaning of the
- * reg or R/M field. If the operand is an XMM operand, for example, the
- * correct register is XMM0 rather than the AX that readModRM() would
- * otherwise report.
- *
- * @param insn - The instruction containing the operand.
- * @param type - The operand type.
- * @param index - The existing value of the field as reported by readModRM().
- * @param valid - The address of a uint8_t. The target is set to 1 if the
- * field is valid for the register class; 0 if not.
- * @return - The proper value.
- */
-GENERIC_FIXUP_FUNC(fixupRegValue, insn->regBase, MODRM_REG, 0x1f)
-GENERIC_FIXUP_FUNC(fixupRMValue, insn->eaRegBase, EA_REG, 0xf)
-
-/*
- * fixupReg - Consults an operand specifier to determine which of the
- * fixup*Value functions to use in correcting readModRM()'s interpretation.
- *
- * @param insn - See fixup*Value().
- * @param op - The operand specifier.
- * @return - 0 if fixup was successful; -1 if the register returned was
- * invalid for its class.
- */
-static int fixupReg(struct InternalInstruction *insn,
- const struct OperandSpecifier *op) {
- uint8_t valid;
-
- dbgprintf(insn, "fixupReg()");
-
- switch ((OperandEncoding)op->encoding) {
- default:
- debug("Expected a REG or R/M encoding in fixupReg");
- return -1;
- case ENCODING_VVVV:
- insn->vvvv = (Reg)fixupRegValue(insn,
- (OperandType)op->type,
- insn->vvvv,
- &valid);
- if (!valid)
- return -1;
- break;
- case ENCODING_REG:
- insn->reg = (Reg)fixupRegValue(insn,
- (OperandType)op->type,
- insn->reg - insn->regBase,
- &valid);
- if (!valid)
- return -1;
- break;
- CASE_ENCODING_RM:
- if (insn->eaBase >= insn->eaRegBase) {
- insn->eaBase = (EABase)fixupRMValue(insn,
- (OperandType)op->type,
- insn->eaBase - insn->eaRegBase,
- &valid);
- if (!valid)
- return -1;
- }
- break;
- }
-
- return 0;
-}
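
// Toy model of the TYPE_R8 rule in GENERIC_FIXUP_FUNC/fixupReg() above, with
// a hypothetical Reg8 enum (not the decoder's real tables): when any REX
// prefix is present, encodings 4-7 select SPL/BPL/SIL/DIL instead of the
// legacy AH/CH/DH/BH.
#include <cstdint>
#include <cstdio>

enum Reg8 { AL, CL, DL, BL, AH, CH, DH, BH, SPL, BPL, SIL, DIL };

static Reg8 fixupR8(uint8_t Index, bool HasREX) {
  if (HasREX && Index >= 4 && Index <= 7)
    return static_cast<Reg8>(SPL + (Index - 4));
  return static_cast<Reg8>(AL + Index);
}

int main() {
  std::printf("index 4 without REX -> %d (AH)\n", fixupR8(4, false));
  std::printf("index 4 with REX    -> %d (SPL)\n", fixupR8(4, true));
  return 0;
}
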
-
-/*
- * readOpcodeRegister - Reads an operand from the opcode field of an
- * instruction and interprets it appropriately given the operand width.
- * Handles AddRegFrm instructions.
- *
- * @param insn - the instruction whose opcode field is to be read.
- * @param size - The width (in bytes) of the register being specified.
- * 1 means AL and friends, 2 means AX, 4 means EAX, and 8 means
- * RAX.
- * @return - 0 on success; nonzero otherwise.
- */
-static int readOpcodeRegister(struct InternalInstruction* insn, uint8_t size) {
- dbgprintf(insn, "readOpcodeRegister()");
-
- if (size == 0)
- size = insn->registerSize;
-
- switch (size) {
- case 1:
- insn->opcodeRegister = (Reg)(MODRM_REG_AL + ((bFromREX(insn->rexPrefix) << 3)
- | (insn->opcode & 7)));
- if (insn->rexPrefix &&
- insn->opcodeRegister >= MODRM_REG_AL + 0x4 &&
- insn->opcodeRegister < MODRM_REG_AL + 0x8) {
- insn->opcodeRegister = (Reg)(MODRM_REG_SPL
- + (insn->opcodeRegister - MODRM_REG_AL - 4));
- }
-
- break;
- case 2:
- insn->opcodeRegister = (Reg)(MODRM_REG_AX
- + ((bFromREX(insn->rexPrefix) << 3)
- | (insn->opcode & 7)));
- break;
- case 4:
- insn->opcodeRegister = (Reg)(MODRM_REG_EAX
- + ((bFromREX(insn->rexPrefix) << 3)
- | (insn->opcode & 7)));
- break;
- case 8:
- insn->opcodeRegister = (Reg)(MODRM_REG_RAX
- + ((bFromREX(insn->rexPrefix) << 3)
- | (insn->opcode & 7)));
- break;
- }
-
- return 0;
-}
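
// Standalone sketch of the opcode-embedded register decoding above (example
// encoding assumed, not taken from this patch): REX.B contributes bit 3 on
// top of the low three opcode bits, so 41 58 decodes as "pop r8".
#include <cstdint>
#include <cstdio>

int main() {
  const uint8_t Rex = 0x41;    // REX.B set
  const uint8_t Opcode = 0x58; // POP rAX..rDI family (0x58+rd)
  unsigned RegIndex = ((Rex & 1u) << 3) | (Opcode & 0x7u);
  std::printf("opcode register index = %u\n", RegIndex); // 8, i.e. r8
  return 0;
}
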
-
-/*
- * readImmediate - Consumes an immediate operand from an instruction, given the
- * desired operand size.
- *
- * @param insn - The instruction whose operand is to be read.
- * @param size - The width (in bytes) of the operand.
- * @return - 0 if the immediate was successfully consumed; nonzero
- * otherwise.
- */
-static int readImmediate(struct InternalInstruction* insn, uint8_t size) {
- uint8_t imm8;
- uint16_t imm16;
- uint32_t imm32;
- uint64_t imm64;
-
- dbgprintf(insn, "readImmediate()");
-
- if (insn->numImmediatesConsumed == 2) {
- debug("Already consumed two immediates");
- return -1;
- }
-
- if (size == 0)
- size = insn->immediateSize;
- else
- insn->immediateSize = size;
- insn->immediateOffset = insn->readerCursor - insn->startLocation;
-
- switch (size) {
- case 1:
- if (consumeByte(insn, &imm8))
- return -1;
- insn->immediates[insn->numImmediatesConsumed] = imm8;
- break;
- case 2:
- if (consumeUInt16(insn, &imm16))
- return -1;
- insn->immediates[insn->numImmediatesConsumed] = imm16;
- break;
- case 4:
- if (consumeUInt32(insn, &imm32))
- return -1;
- insn->immediates[insn->numImmediatesConsumed] = imm32;
- break;
- case 8:
- if (consumeUInt64(insn, &imm64))
- return -1;
- insn->immediates[insn->numImmediatesConsumed] = imm64;
- break;
- }
-
- insn->numImmediatesConsumed++;
-
- return 0;
-}
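
// Minimal sketch of the little-endian immediate read that readImmediate()
// performs for size == 4 (own buffer handling, not the decoder's consume*
// helpers).
#include <cstdint>
#include <cstdio>

static uint32_t readU32LE(const uint8_t *P) {
  return static_cast<uint32_t>(P[0]) | (static_cast<uint32_t>(P[1]) << 8) |
         (static_cast<uint32_t>(P[2]) << 16) |
         (static_cast<uint32_t>(P[3]) << 24);
}

int main() {
  // Immediate bytes of "mov eax, 0x12345678" (b8 78 56 34 12).
  const uint8_t Imm[] = {0x78, 0x56, 0x34, 0x12};
  std::printf("imm32 = 0x%x\n", readU32LE(Imm)); // 0x12345678
  return 0;
}
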
-
-/*
- * readVVVV - Consumes vvvv from an instruction if it has a VEX prefix.
- *
- * @param insn - The instruction whose operand is to be read.
- * @return - 0 if the vvvv was successfully consumed; nonzero
- * otherwise.
- */
-static int readVVVV(struct InternalInstruction* insn) {
- dbgprintf(insn, "readVVVV()");
-
- int vvvv;
- if (insn->vectorExtensionType == TYPE_EVEX)
- vvvv = (v2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 4 |
- vvvvFromEVEX3of4(insn->vectorExtensionPrefix[2]));
- else if (insn->vectorExtensionType == TYPE_VEX_3B)
- vvvv = vvvvFromVEX3of3(insn->vectorExtensionPrefix[2]);
- else if (insn->vectorExtensionType == TYPE_VEX_2B)
- vvvv = vvvvFromVEX2of2(insn->vectorExtensionPrefix[1]);
- else if (insn->vectorExtensionType == TYPE_XOP)
- vvvv = vvvvFromXOP3of3(insn->vectorExtensionPrefix[2]);
- else
- return -1;
-
- if (insn->mode != MODE_64BIT)
- vvvv &= 0xf; // Can only clear bit 4. Bit 3 must be cleared later.
-
- insn->vvvv = static_cast<Reg>(vvvv);
- return 0;
-}
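
// Sketch of the VEX.vvvv extraction that readVVVV() relies on: the register
// is stored one's-complemented in bits 6:3 of the last VEX payload byte.
// Assumes the 2-byte VEX form c5 <R.vvvv.L.pp> <opcode>.
#include <cstdint>
#include <cstdio>

int main() {
  // "vpxor xmm1, xmm2, xmm3" encodes as c5 e9 ef cb; byte 0xe9 carries vvvv.
  const uint8_t VexByte1 = 0xe9;
  unsigned Vvvv = ((VexByte1 >> 3) & 0xf) ^ 0xf; // take bits 6:3, undo the inversion
  std::printf("vvvv = %u (xmm%u)\n", Vvvv, Vvvv); // 2, i.e. xmm2
  return 0;
}
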
-
-/*
- * readMaskRegister - Reads a mask register from the opcode field of an
- * instruction.
- *
- * @param insn - The instruction whose opcode field is to be read.
- * @return - 0 on success; nonzero otherwise.
- */
-static int readMaskRegister(struct InternalInstruction* insn) {
- dbgprintf(insn, "readMaskRegister()");
-
- if (insn->vectorExtensionType != TYPE_EVEX)
- return -1;
-
- insn->writemask =
- static_cast<Reg>(aaaFromEVEX4of4(insn->vectorExtensionPrefix[3]));
- return 0;
-}
-
-/*
- * readOperands - Consults the specifier for an instruction and consumes all
- * operands for that instruction, interpreting them as it goes.
- *
- * @param insn - The instruction whose operands are to be read and interpreted.
- * @return - 0 if all operands could be read; nonzero otherwise.
- */
-static int readOperands(struct InternalInstruction* insn) {
- int hasVVVV, needVVVV;
- int sawRegImm = 0;
-
- dbgprintf(insn, "readOperands()");
-
- /* If non-zero vvvv specified, need to make sure one of the operands
- uses it. */
- hasVVVV = !readVVVV(insn);
- needVVVV = hasVVVV && (insn->vvvv != 0);
-
- for (const auto &Op : x86OperandSets[insn->spec->operands]) {
- switch (Op.encoding) {
- case ENCODING_NONE:
- case ENCODING_SI:
- case ENCODING_DI:
- break;
- CASE_ENCODING_VSIB:
- // VSIB can use the V2 bit so check only the other bits.
- if (needVVVV)
- needVVVV = hasVVVV & ((insn->vvvv & 0xf) != 0);
- if (readModRM(insn))
- return -1;
-
- // Reject if SIB wasn't used.
- if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64)
- return -1;
-
- // If sibIndex was set to SIB_INDEX_NONE, index offset is 4.
- if (insn->sibIndex == SIB_INDEX_NONE)
- insn->sibIndex = (SIBIndex)(insn->sibIndexBase + 4);
-
- // If EVEX.v2 is set this is one of the 16-31 registers.
- if (insn->vectorExtensionType == TYPE_EVEX && insn->mode == MODE_64BIT &&
- v2FromEVEX4of4(insn->vectorExtensionPrefix[3]))
- insn->sibIndex = (SIBIndex)(insn->sibIndex + 16);
-
- // Adjust the index register to the correct size.
- switch ((OperandType)Op.type) {
- default:
- debug("Unhandled VSIB index type");
- return -1;
- case TYPE_MVSIBX:
- insn->sibIndex = (SIBIndex)(SIB_INDEX_XMM0 +
- (insn->sibIndex - insn->sibIndexBase));
- break;
- case TYPE_MVSIBY:
- insn->sibIndex = (SIBIndex)(SIB_INDEX_YMM0 +
- (insn->sibIndex - insn->sibIndexBase));
- break;
- case TYPE_MVSIBZ:
- insn->sibIndex = (SIBIndex)(SIB_INDEX_ZMM0 +
- (insn->sibIndex - insn->sibIndexBase));
- break;
- }
-
- // Apply the AVX512 compressed displacement scaling factor.
- if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
- insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB);
- break;
- case ENCODING_REG:
- CASE_ENCODING_RM:
- if (readModRM(insn))
- return -1;
- if (fixupReg(insn, &Op))
- return -1;
- // Apply the AVX512 compressed displacement scaling factor.
- if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
- insn->displacement *= 1 << (Op.encoding - ENCODING_RM);
- break;
- case ENCODING_IB:
- if (sawRegImm) {
- /* Saw a register immediate so don't read again and instead split the
- previous immediate. FIXME: This is a hack. */
- insn->immediates[insn->numImmediatesConsumed] =
- insn->immediates[insn->numImmediatesConsumed - 1] & 0xf;
- ++insn->numImmediatesConsumed;
- break;
- }
- if (readImmediate(insn, 1))
- return -1;
- if (Op.type == TYPE_XMM || Op.type == TYPE_YMM)
- sawRegImm = 1;
- break;
- case ENCODING_IW:
- if (readImmediate(insn, 2))
- return -1;
- break;
- case ENCODING_ID:
- if (readImmediate(insn, 4))
- return -1;
- break;
- case ENCODING_IO:
- if (readImmediate(insn, 8))
- return -1;
- break;
- case ENCODING_Iv:
- if (readImmediate(insn, insn->immediateSize))
- return -1;
- break;
- case ENCODING_Ia:
- if (readImmediate(insn, insn->addressSize))
- return -1;
- break;
- case ENCODING_IRC:
- insn->RC = (l2FromEVEX4of4(insn->vectorExtensionPrefix[3]) << 1) |
- lFromEVEX4of4(insn->vectorExtensionPrefix[3]);
- break;
- case ENCODING_RB:
- if (readOpcodeRegister(insn, 1))
- return -1;
- break;
- case ENCODING_RW:
- if (readOpcodeRegister(insn, 2))
- return -1;
- break;
- case ENCODING_RD:
- if (readOpcodeRegister(insn, 4))
- return -1;
- break;
- case ENCODING_RO:
- if (readOpcodeRegister(insn, 8))
- return -1;
- break;
- case ENCODING_Rv:
- if (readOpcodeRegister(insn, 0))
- return -1;
- break;
- case ENCODING_CC:
- insn->immediates[1] = insn->opcode & 0xf;
- break;
- case ENCODING_FP:
- break;
- case ENCODING_VVVV:
- needVVVV = 0; /* Mark that we have found a VVVV operand. */
- if (!hasVVVV)
- return -1;
- if (insn->mode != MODE_64BIT)
- insn->vvvv = static_cast<Reg>(insn->vvvv & 0x7);
- if (fixupReg(insn, &Op))
- return -1;
- break;
- case ENCODING_WRITEMASK:
- if (readMaskRegister(insn))
- return -1;
- break;
- case ENCODING_DUP:
- break;
- default:
- dbgprintf(insn, "Encountered an operand with an unknown encoding.");
- return -1;
- }
- }
-
- /* If we didn't find ENCODING_VVVV operand, but non-zero vvvv present, fail */
- if (needVVVV) return -1;
-
- return 0;
-}
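
// Sketch of the AVX-512 disp8*N ("compressed displacement") scaling applied
// in readOperands() above: an 8-bit displacement in an EVEX memory operand is
// a count of N-byte chunks, so the byte offset is disp8 * N. The scale of 64
// is an assumed example (a full 512-bit memory access).
#include <cstdint>
#include <cstdio>

int main() {
  const int8_t Disp8 = 0x01; // encoded displacement byte
  const int32_t N = 64;      // assumed compressed-displacement scale
  int32_t ByteOffset = static_cast<int32_t>(Disp8) * N;
  std::printf("effective displacement = %d bytes\n", ByteOffset); // 64
  return 0;
}
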
-
-/*
- * decodeInstruction - Reads and interprets a full instruction provided by the
- * user.
- *
- * @param insn - A pointer to the instruction to be populated. Must be
- * pre-allocated.
- * @param reader - The function to be used to read the instruction's bytes.
- * @param readerArg - A generic argument to be passed to the reader to store
- * any internal state.
- * @param logger - If non-NULL, the function to be used to write log messages
- * and warnings.
- * @param loggerArg - A generic argument to be passed to the logger to store
- * any internal state.
- * @param startLoc - The address (in the reader's address space) of the first
- * byte in the instruction.
- * @param mode - The mode (real mode, IA-32e, or IA-32e in 64-bit mode) to
- * decode the instruction in.
- * @return - 0 if the instruction's memory could be read; nonzero if
- * not.
- */
-int llvm::X86Disassembler::decodeInstruction(
- struct InternalInstruction *insn, byteReader_t reader,
- const void *readerArg, dlog_t logger, void *loggerArg, const void *miiArg,
- uint64_t startLoc, DisassemblerMode mode) {
- memset(insn, 0, sizeof(struct InternalInstruction));
-
- insn->reader = reader;
- insn->readerArg = readerArg;
- insn->dlog = logger;
- insn->dlogArg = loggerArg;
- insn->startLocation = startLoc;
- insn->readerCursor = startLoc;
- insn->mode = mode;
- insn->numImmediatesConsumed = 0;
-
- if (readPrefixes(insn) ||
- readOpcode(insn) ||
- getID(insn, miiArg) ||
- insn->instructionID == 0 ||
- readOperands(insn))
- return -1;
-
- insn->operands = x86OperandSets[insn->spec->operands];
-
- insn->length = insn->readerCursor - insn->startLocation;
-
- dbgprintf(insn, "Read from 0x%llx to 0x%llx: length %zu",
- startLoc, insn->readerCursor, insn->length);
-
- if (insn->length > 15)
- dbgprintf(insn, "Instruction exceeds 15-byte limit");
-
- return 0;
-}
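
// Sketch of a byteReader_t callback such as a consumer of the (removed)
// callback-based decodeInstruction() would supply; "Buffer" is a hypothetical
// consumer-side type, not something defined in this patch.
#include <cstdint>

struct Buffer {
  const uint8_t *Data;
  uint64_t Base; // address of Data[0] in the instruction's address space
  uint64_t Size;
};

// Matches: int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address)
static int readByte(const void *Arg, uint8_t *Byte, uint64_t Address) {
  const Buffer *B = static_cast<const Buffer *>(Arg);
  if (Address < B->Base || Address - B->Base >= B->Size)
    return -1; // out of range: the decoder treats this as a failed read
  *Byte = B->Data[Address - B->Base];
  return 0;
}

int main() {
  const uint8_t Code[] = {0x90}; // nop
  Buffer B{Code, 0x1000, sizeof(Code)};
  uint8_t Byte = 0;
  return readByte(&B, &Byte, 0x1000); // 0 on success, Byte == 0x90
}
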
diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 7c0a42c019e3..147fe46d81b9 100644
--- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -19,6 +19,9 @@
#include "llvm/Support/X86DisassemblerDecoderCommon.h"
namespace llvm {
+
+class MCInstrInfo;
+
namespace X86Disassembler {
// Accessor functions for various fields of an Intel instruction
@@ -446,12 +449,12 @@ enum SIBBase {
};
/// Possible displacement types for effective-address computations.
-typedef enum {
+enum EADisplacement {
EA_DISP_NONE,
EA_DISP_8,
EA_DISP_16,
EA_DISP_32
-} EADisplacement;
+};
/// All possible values of the reg field in the ModR/M byte.
enum Reg {
@@ -502,25 +505,6 @@ enum VectorExtensionType {
TYPE_XOP = 0x4
};
-/// Type for the byte reader that the consumer must provide to
-/// the decoder. Reads a single byte from the instruction's address space.
-/// \param arg A baton that the consumer can associate with any internal
-/// state that it needs.
-/// \param byte A pointer to a single byte in memory that should be set to
-/// contain the value at address.
-/// \param address The address in the instruction's address space that should
-/// be read from.
-/// \return -1 if the byte cannot be read for any reason; 0 otherwise.
-typedef int (*byteReader_t)(const void *arg, uint8_t *byte, uint64_t address);
-
-/// Type for the logging function that the consumer can provide to
-/// get debugging output from the decoder.
-/// \param arg A baton that the consumer can associate with any internal
-/// state that it needs.
-/// \param log A string that contains the message. Will be reused after
-/// the logger returns.
-typedef void (*dlog_t)(void *arg, const char *log);
-
/// The specification for how to extract and interpret a full instruction and
/// its operands.
struct InstructionSpecifier {
@@ -529,18 +513,11 @@ struct InstructionSpecifier {
/// The x86 internal instruction, which is produced by the decoder.
struct InternalInstruction {
- // Reader interface (C)
- byteReader_t reader;
// Opaque value passed to the reader
- const void* readerArg;
+ llvm::ArrayRef<uint8_t> bytes;
// The address of the next byte to read via the reader
uint64_t readerCursor;
- // Logger interface (C)
- dlog_t dlog;
- // Opaque value passed to the logger
- void* dlogArg;
-
// General instruction information
// The mode to disassemble for (64-bit, protected, real)
@@ -616,11 +593,9 @@ struct InternalInstruction {
uint8_t modRM;
// The SIB byte, used for more complex 32- or 64-bit memory operands
- bool consumedSIB;
uint8_t sib;
// The displacement, used for memory operands
- bool consumedDisplacement;
int32_t displacement;
// Immediates. There can be two in some cases
@@ -657,38 +632,6 @@ struct InternalInstruction {
ArrayRef<OperandSpecifier> operands;
};
-/// Decode one instruction and store the decoding results in
-/// a buffer provided by the consumer.
-/// \param insn The buffer to store the instruction in. Allocated by the
-/// consumer.
-/// \param reader The byteReader_t for the bytes to be read.
-/// \param readerArg An argument to pass to the reader for storing context
-/// specific to the consumer. May be NULL.
-/// \param logger The dlog_t to be used in printing status messages from the
-/// disassembler. May be NULL.
-/// \param loggerArg An argument to pass to the logger for storing context
-/// specific to the logger. May be NULL.
-/// \param startLoc The address (in the reader's address space) of the first
-/// byte in the instruction.
-/// \param mode The mode (16-bit, 32-bit, 64-bit) to decode in.
-/// \return Nonzero if there was an error during decode, 0 otherwise.
-int decodeInstruction(InternalInstruction *insn,
- byteReader_t reader,
- const void *readerArg,
- dlog_t logger,
- void *loggerArg,
- const void *miiArg,
- uint64_t startLoc,
- DisassemblerMode mode);
-
-/// Print a message to debugs()
-/// \param file The name of the file printing the debug message.
-/// \param line The line number that printed the debug message.
-/// \param s The message to print.
-void Debug(const char *file, unsigned line, const char *s);
-
-StringRef GetInstrName(unsigned Opcode, const void *mii);
-
} // namespace X86Disassembler
} // namespace llvm
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index ed2ee55ff2a5..675a9c377b12 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -38,8 +38,9 @@ void X86ATTInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
OS << markup("<reg:") << '%' << getRegisterName(RegNo) << markup(">");
}
-void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
- StringRef Annot, const MCSubtargetInfo &STI) {
+void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address,
+ StringRef Annot, const MCSubtargetInfo &STI,
+ raw_ostream &OS) {
// If verbose assembly is enabled, we can print some informative comments.
if (CommentStream)
HasCustomInstComment = EmitAnyX86InstComments(MI, *CommentStream, MII);
@@ -69,7 +70,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
// Try to print any aliases first.
else if (!printAliasInstr(MI, OS) &&
!printVecCompareInstr(MI, OS))
- printInstruction(MI, OS);
+ printInstruction(MI, Address, OS);
// Next always print the annotation.
printAnnotation(OS, Annot);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
index 747ddd30a2d9..3d5d384dc4a0 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
@@ -24,8 +24,8 @@ public:
: X86InstPrinterCommon(MAI, MII, MRI), HasCustomInstComment(false) {}
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
- void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
- const MCSubtargetInfo &STI) override;
+ void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
+ const MCSubtargetInfo &STI, raw_ostream &OS) override;
bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS);
// Autogenerated by tblgen, returns true if we successfully printed an
@@ -35,7 +35,7 @@ public:
unsigned PrintMethodIdx, raw_ostream &O);
// Autogenerated by tblgen.
- void printInstruction(const MCInst *MI, raw_ostream &OS);
+ void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &OS);
static const char *getRegisterName(unsigned RegNo);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS) override;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index f08fcb575bf0..dffda5217675 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -12,55 +12,95 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCObjectStreamer.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
-static unsigned getFixupKindSize(unsigned Kind) {
- switch (Kind) {
- default:
- llvm_unreachable("invalid fixup kind!");
- case FK_NONE:
- return 0;
- case FK_PCRel_1:
- case FK_SecRel_1:
- case FK_Data_1:
- return 1;
- case FK_PCRel_2:
- case FK_SecRel_2:
- case FK_Data_2:
- return 2;
- case FK_PCRel_4:
- case X86::reloc_riprel_4byte:
- case X86::reloc_riprel_4byte_relax:
- case X86::reloc_riprel_4byte_relax_rex:
- case X86::reloc_riprel_4byte_movq_load:
- case X86::reloc_signed_4byte:
- case X86::reloc_signed_4byte_relax:
- case X86::reloc_global_offset_table:
- case X86::reloc_branch_4byte_pcrel:
- case FK_SecRel_4:
- case FK_Data_4:
- return 4;
- case FK_PCRel_8:
- case FK_SecRel_8:
- case FK_Data_8:
- case X86::reloc_global_offset_table8:
- return 8;
+namespace {
+/// A wrapper for holding a mask of the values from X86::AlignBranchBoundaryKind
+class X86AlignBranchKind {
+private:
+ uint8_t AlignBranchKind = 0;
+
+public:
+ void operator=(const std::string &Val) {
+ if (Val.empty())
+ return;
+ SmallVector<StringRef, 6> BranchTypes;
+ StringRef(Val).split(BranchTypes, '+', -1, false);
+ for (auto BranchType : BranchTypes) {
+ if (BranchType == "fused")
+ addKind(X86::AlignBranchFused);
+ else if (BranchType == "jcc")
+ addKind(X86::AlignBranchJcc);
+ else if (BranchType == "jmp")
+ addKind(X86::AlignBranchJmp);
+ else if (BranchType == "call")
+ addKind(X86::AlignBranchCall);
+ else if (BranchType == "ret")
+ addKind(X86::AlignBranchRet);
+ else if (BranchType == "indirect")
+ addKind(X86::AlignBranchIndirect);
+ else {
+ report_fatal_error(
+ "'-x86-align-branch 'The branches's type is combination of jcc, "
+ "fused, jmp, call, ret, indirect.(plus separated)",
+ false);
+ }
+ }
}
-}
-namespace {
+ operator uint8_t() const { return AlignBranchKind; }
+ void addKind(X86::AlignBranchBoundaryKind Value) { AlignBranchKind |= Value; }
+};
+
+X86AlignBranchKind X86AlignBranchKindLoc;
+
+cl::opt<unsigned> X86AlignBranchBoundary(
+ "x86-align-branch-boundary", cl::init(0),
+ cl::desc(
+ "Control how the assembler should align branches with NOP. If the "
+ "boundary's size is not 0, it should be a power of 2 and no less "
+ "than 32. Branches will be aligned to prevent from being across or "
+ "against the boundary of specified size. The default value 0 does not "
+ "align branches."));
+
+cl::opt<X86AlignBranchKind, true, cl::parser<std::string>> X86AlignBranch(
+ "x86-align-branch",
+ cl::desc("Specify types of branches to align (plus separated list of "
+ "types). The branches's types are combination of jcc, fused, "
+ "jmp, call, ret, indirect."),
+ cl::value_desc("jcc indicates conditional jumps, fused indicates fused "
+ "conditional jumps, jmp indicates unconditional jumps, call "
+ "indicates direct and indirect calls, ret indicates rets, "
+ "indirect indicates indirect jumps."),
+ cl::location(X86AlignBranchKindLoc));
+
+cl::opt<bool> X86AlignBranchWithin32BBoundaries(
+ "x86-branches-within-32B-boundaries", cl::init(false),
+ cl::desc(
+ "Align selected instructions to mitigate negative performance impact "
+ "of Intel's micro code update for errata skx102. May break "
+ "assumptions about labels corresponding to particular instructions, "
+ "and should be used with caution."));
class X86ELFObjectWriter : public MCELFObjectTargetWriter {
public:
@@ -71,9 +111,42 @@ public:
class X86AsmBackend : public MCAsmBackend {
const MCSubtargetInfo &STI;
+ std::unique_ptr<const MCInstrInfo> MCII;
+ X86AlignBranchKind AlignBranchType;
+ Align AlignBoundary;
+
+ bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
+
+ bool needAlign(MCObjectStreamer &OS) const;
+ bool needAlignInst(const MCInst &Inst) const;
+ MCBoundaryAlignFragment *
+ getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const;
+ MCInst PrevInst;
+
public:
X86AsmBackend(const Target &T, const MCSubtargetInfo &STI)
- : MCAsmBackend(support::little), STI(STI) {}
+ : MCAsmBackend(support::little), STI(STI),
+ MCII(T.createMCInstrInfo()) {
+ if (X86AlignBranchWithin32BBoundaries) {
+ // At the moment, this defaults to aligning fused branches, unconditional
+ // jumps, and (unfused) conditional jumps with nops. Both the
+ // instructions aligned and the alignment method (nop vs prefix) may
+ // change in the future.
+ AlignBoundary = assumeAligned(32);
+ AlignBranchType.addKind(X86::AlignBranchFused);
+ AlignBranchType.addKind(X86::AlignBranchJcc);
+ AlignBranchType.addKind(X86::AlignBranchJmp);
+ }
+ // Allow overriding defaults set by master flag
+ if (X86AlignBranchBoundary.getNumOccurrences())
+ AlignBoundary = assumeAligned(X86AlignBranchBoundary);
+ if (X86AlignBranch.getNumOccurrences())
+ AlignBranchType = X86AlignBranchKindLoc;
+ }
+
+ bool allowAutoPadding() const override;
+ void alignBranchesBegin(MCObjectStreamer &OS, const MCInst &Inst) override;
+ void alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) override;
unsigned getNumFixupKinds() const override {
return X86::NumTargetFixupKinds;
@@ -81,49 +154,15 @@ public:
Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
- const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
- const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
- {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"reloc_signed_4byte", 0, 32, 0},
- {"reloc_signed_4byte_relax", 0, 32, 0},
- {"reloc_global_offset_table", 0, 32, 0},
- {"reloc_global_offset_table8", 0, 64, 0},
- {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- };
-
- if (Kind < FirstTargetFixupKind)
- return MCAsmBackend::getFixupKindInfo(Kind);
-
- assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
- "Invalid kind!");
- assert(Infos[Kind - FirstTargetFixupKind].Name && "Empty fixup name!");
- return Infos[Kind - FirstTargetFixupKind];
- }
-
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override;
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
- const MCSubtargetInfo *STI) const override {
- unsigned Size = getFixupKindSize(Fixup.getKind());
-
- assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
-
- // Check that upper bits are either all zeros or all ones.
- // Specifically ignore overflow/underflow as long as the leakage is
- // limited to the lower bits. This is to remain compatible with
- // other assemblers.
- assert((Size == 0 || isIntN(Size * 8 + 1, Value)) &&
- "Value does not fit in the Fixup field");
-
- for (unsigned i = 0; i != Size; ++i)
- Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
- }
+ const MCSubtargetInfo *STI) const override;
bool mayNeedRelaxation(const MCInst &Inst,
const MCSubtargetInfo &STI) const override;
@@ -243,6 +282,200 @@ static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
return getRelaxedOpcodeBranch(Inst, is16BitMode);
}
+static X86::CondCode getCondFromBranch(const MCInst &MI,
+ const MCInstrInfo &MCII) {
+ unsigned Opcode = MI.getOpcode();
+ switch (Opcode) {
+ default:
+ return X86::COND_INVALID;
+ case X86::JCC_1: {
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ return static_cast<X86::CondCode>(
+ MI.getOperand(Desc.getNumOperands() - 1).getImm());
+ }
+ }
+}
+
+static X86::SecondMacroFusionInstKind
+classifySecondInstInMacroFusion(const MCInst &MI, const MCInstrInfo &MCII) {
+ X86::CondCode CC = getCondFromBranch(MI, MCII);
+ return classifySecondCondCodeInMacroFusion(CC);
+}
+
+/// Check if the instruction uses RIP relative addressing.
+static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ uint64_t TSFlags = Desc.TSFlags;
+ unsigned CurOp = X86II::getOperandBias(Desc);
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+ if (MemoryOperand < 0)
+ return false;
+ unsigned BaseRegNum = MemoryOperand + CurOp + X86::AddrBaseReg;
+ unsigned BaseReg = MI.getOperand(BaseRegNum).getReg();
+ return (BaseReg == X86::RIP);
+}
+
+/// Check if the instruction is valid as the first instruction in macro fusion.
+static bool isFirstMacroFusibleInst(const MCInst &Inst,
+ const MCInstrInfo &MCII) {
+ // An Intel instruction with RIP relative addressing is not macro fusible.
+ if (isRIPRelative(Inst, MCII))
+ return false;
+ X86::FirstMacroFusionInstKind FIK =
+ X86::classifyFirstOpcodeInMacroFusion(Inst.getOpcode());
+ return FIK != X86::FirstMacroFusionInstKind::Invalid;
+}
+
+/// Check if the two instructions will be macro-fused on the target cpu.
+bool X86AsmBackend::isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const {
+ const MCInstrDesc &InstDesc = MCII->get(Jcc.getOpcode());
+ if (!InstDesc.isConditionalBranch())
+ return false;
+ if (!isFirstMacroFusibleInst(Cmp, *MCII))
+ return false;
+ const X86::FirstMacroFusionInstKind CmpKind =
+ X86::classifyFirstOpcodeInMacroFusion(Cmp.getOpcode());
+ const X86::SecondMacroFusionInstKind BranchKind =
+ classifySecondInstInMacroFusion(Jcc, *MCII);
+ return X86::isMacroFused(CmpKind, BranchKind);
+}
+
+/// Check if the instruction has a variant symbol operand.
+static bool hasVariantSymbol(const MCInst &MI) {
+ for (auto &Operand : MI) {
+ if (!Operand.isExpr())
+ continue;
+ const MCExpr &Expr = *Operand.getExpr();
+ if (Expr.getKind() == MCExpr::SymbolRef &&
+ cast<MCSymbolRefExpr>(Expr).getKind() != MCSymbolRefExpr::VK_None)
+ return true;
+ }
+ return false;
+}
+
+bool X86AsmBackend::allowAutoPadding() const {
+ return (AlignBoundary != Align::None() &&
+ AlignBranchType != X86::AlignBranchNone);
+}
+
+bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const {
+ if (!OS.getAllowAutoPadding())
+ return false;
+ assert(allowAutoPadding() && "incorrect initialization!");
+
+ MCAssembler &Assembler = OS.getAssembler();
+ MCSection *Sec = OS.getCurrentSectionOnly();
+ // TODO: Currently we don't handle bundle cases.
+ if (Assembler.isBundlingEnabled() && Sec->isBundleLocked())
+ return false;
+
+ // Branches only need to be aligned in 32-bit or 64-bit mode.
+ if (!(STI.hasFeature(X86::Mode64Bit) || STI.hasFeature(X86::Mode32Bit)))
+ return false;
+
+ return true;
+}
+
+/// Check if the instruction operand needs to be aligned. Padding is disabled
+/// before instructions that may be rewritten by the linker (e.g. TLSCALL).
+bool X86AsmBackend::needAlignInst(const MCInst &Inst) const {
+ // Linker may rewrite the instruction with variant symbol operand.
+ if (hasVariantSymbol(Inst))
+ return false;
+
+ const MCInstrDesc &InstDesc = MCII->get(Inst.getOpcode());
+ return (InstDesc.isConditionalBranch() &&
+ (AlignBranchType & X86::AlignBranchJcc)) ||
+ (InstDesc.isUnconditionalBranch() &&
+ (AlignBranchType & X86::AlignBranchJmp)) ||
+ (InstDesc.isCall() &&
+ (AlignBranchType & X86::AlignBranchCall)) ||
+ (InstDesc.isReturn() &&
+ (AlignBranchType & X86::AlignBranchRet)) ||
+ (InstDesc.isIndirectBranch() &&
+ (AlignBranchType & X86::AlignBranchIndirect));
+}
+
+static bool canReuseBoundaryAlignFragment(const MCBoundaryAlignFragment &F) {
+ // If an MCBoundaryAlignFragment has not been used to emit NOPs, we can reuse it.
+ return !F.canEmitNops();
+}
+
+MCBoundaryAlignFragment *
+X86AsmBackend::getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const {
+ auto *F = dyn_cast_or_null<MCBoundaryAlignFragment>(OS.getCurrentFragment());
+ if (!F || !canReuseBoundaryAlignFragment(*F)) {
+ F = new MCBoundaryAlignFragment(AlignBoundary);
+ OS.insert(F);
+ }
+ return F;
+}
+
+/// Insert MCBoundaryAlignFragment before instructions to align branches.
+void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS,
+ const MCInst &Inst) {
+ if (!needAlign(OS))
+ return;
+
+ MCFragment *CF = OS.getCurrentFragment();
+ bool NeedAlignFused = AlignBranchType & X86::AlignBranchFused;
+ if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) {
+ // Macro fusion actually happens and there is no other fragment inserted
+ // after the previous instruction. NOP can be emitted in PF to align fused
+ // jcc.
+ if (auto *PF =
+ dyn_cast_or_null<MCBoundaryAlignFragment>(CF->getPrevNode())) {
+ const_cast<MCBoundaryAlignFragment *>(PF)->setEmitNops(true);
+ const_cast<MCBoundaryAlignFragment *>(PF)->setFused(true);
+ }
+ } else if (needAlignInst(Inst)) {
+ // Note: When there is at least one fragment, such as MCAlignFragment,
+ // inserted after the previous instruction, e.g.
+ //
+ // \code
+ // cmp %rax %rcx
+ // .align 16
+ // je .Label0
+ // \endcode
+ //
+ // We will treat the JCC as an unfused branch although it may be fused
+ // with the CMP.
+ auto *F = getOrCreateBoundaryAlignFragment(OS);
+ F->setEmitNops(true);
+ F->setFused(false);
+ } else if (NeedAlignFused && isFirstMacroFusibleInst(Inst, *MCII)) {
+ // We don't know whether macro fusion happens until we reach the next
+ // instruction, so a placeholder is put here if necessary.
+ getOrCreateBoundaryAlignFragment(OS);
+ }
+
+ PrevInst = Inst;
+}
+
+/// Insert a MCBoundaryAlignFragment to mark the end of the branch to be aligned
+/// if necessary.
+void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) {
+ if (!needAlign(OS))
+ return;
+ // If the branch is emitted into a MCRelaxableFragment, we can determine the
+ // size of the branch easily in MCAssembler::relaxBoundaryAlign. When the
+ // branch is fused, the fused pair (macro fusion pair) must be emitted into
+ // two fragments; when it is unfused, it must be emitted into one fragment.
+ // The MCRelaxableFragment naturally marks the end of the fused or unfused
+ // branch.
+ // Otherwise, we need to insert a MCBoundaryAlignFragment to mark the end of
+ // the branch. This MCBoundaryAlignFragment may be reused to emit NOPs to
+ // align other branches.
+ if (needAlignInst(Inst) && !isa<MCRelaxableFragment>(OS.getCurrentFragment()))
+ OS.insert(new MCBoundaryAlignFragment(AlignBoundary));
+
+ // Update the maximum alignment on the current section if necessary.
+ MCSection *Sec = OS.getCurrentSectionOnly();
+ if (AlignBoundary.value() > Sec->getAlignment())
+ Sec->setAlignment(AlignBoundary);
+}
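
// Standalone sketch (own helper, not code from this patch) of the property
// the alignment fragments establish: a branch of Size bytes placed at Offset
// must neither cross nor end exactly at a Boundary-byte boundary.
#include <cstdint>
#include <cstdio>

static bool needsPadding(uint64_t Offset, uint64_t Size, uint64_t Boundary) {
  uint64_t Begin = Offset / Boundary;
  uint64_t End = (Offset + Size) / Boundary; // first byte after the branch
  return Begin != End; // crosses the boundary or its last byte touches it
}

int main() {
  // With a 32-byte boundary, a 6-byte jcc at offset 28 spans bytes 28..33 and
  // therefore needs NOP padding in front of it; at offset 10 it does not.
  std::printf("%s\n", needsPadding(28, 6, 32) ? "pad" : "ok");
  std::printf("%s\n", needsPadding(10, 6, 32) ? "pad" : "ok");
  return 0;
}
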
+
Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
if (STI.getTargetTriple().isOSBinFormatELF()) {
if (STI.getTargetTriple().getArch() == Triple::x86_64) {
@@ -256,12 +489,100 @@ Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
return MCAsmBackend::getFixupKind(Name);
}
+const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = {
+ {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"reloc_signed_4byte", 0, 32, 0},
+ {"reloc_signed_4byte_relax", 0, 32, 0},
+ {"reloc_global_offset_table", 0, 32, 0},
+ {"reloc_global_offset_table8", 0, 64, 0},
+ {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ assert(Infos[Kind - FirstTargetFixupKind].Name && "Empty fixup name!");
+ return Infos[Kind - FirstTargetFixupKind];
+}
+
bool X86AsmBackend::shouldForceRelocation(const MCAssembler &,
const MCFixup &Fixup,
const MCValue &) {
return Fixup.getKind() == FK_NONE;
}
+static unsigned getFixupKindSize(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("invalid fixup kind!");
+ case FK_NONE:
+ return 0;
+ case FK_PCRel_1:
+ case FK_SecRel_1:
+ case FK_Data_1:
+ return 1;
+ case FK_PCRel_2:
+ case FK_SecRel_2:
+ case FK_Data_2:
+ return 2;
+ case FK_PCRel_4:
+ case X86::reloc_riprel_4byte:
+ case X86::reloc_riprel_4byte_relax:
+ case X86::reloc_riprel_4byte_relax_rex:
+ case X86::reloc_riprel_4byte_movq_load:
+ case X86::reloc_signed_4byte:
+ case X86::reloc_signed_4byte_relax:
+ case X86::reloc_global_offset_table:
+ case X86::reloc_branch_4byte_pcrel:
+ case FK_SecRel_4:
+ case FK_Data_4:
+ return 4;
+ case FK_PCRel_8:
+ case FK_SecRel_8:
+ case FK_Data_8:
+ case X86::reloc_global_offset_table8:
+ return 8;
+ }
+}
+
+void X86AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target,
+ MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const {
+ unsigned Size = getFixupKindSize(Fixup.getKind());
+
+ assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
+
+ int64_t SignedValue = static_cast<int64_t>(Value);
+ if ((Target.isAbsolute() || IsResolved) &&
+ getFixupKindInfo(Fixup.getKind()).Flags &
+ MCFixupKindInfo::FKF_IsPCRel) {
+ // Check that the PC-relative fixup value fits into the fixup size.
+ if (Size > 0 && !isIntN(Size * 8, SignedValue))
+ Asm.getContext().reportError(
+ Fixup.getLoc(), "value of " + Twine(SignedValue) +
+ " is too large for field of " + Twine(Size) +
+ ((Size == 1) ? " byte." : " bytes."));
+ } else {
+ // Check that upper bits are either all zeros or all ones.
+ // Specifically ignore overflow/underflow as long as the leakage is
+ // limited to the lower bits. This is to remain compatible with
+ // other assemblers.
+ assert((Size == 0 || isIntN(Size * 8 + 1, SignedValue)) &&
+ "Value does not fit in the Fixup field");
+ }
+
+ for (unsigned i = 0; i != Size; ++i)
+ Data[Fixup.getOffset() + i] = uint8_t(Value >> (i * 8));
+}
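
// Minimal sketch of the fixup patching done in applyFixup() above: check that
// the value fits the field, then write it little-endian into the fragment's
// bytes. The buffer, offset and value are made-up example numbers.
#include <cassert>
#include <cstdint>
#include <cstdio>

int main() {
  uint8_t Data[8] = {0};
  const unsigned Offset = 2, Size = 4;
  const int64_t Value = -16; // e.g. a small negative PC-relative distance
  assert(Value >= INT32_MIN && Value <= UINT32_MAX &&
         "value does not fit in a 4-byte field");
  for (unsigned i = 0; i != Size; ++i)
    Data[Offset + i] = uint8_t(static_cast<uint64_t>(Value) >> (i * 8));
  for (uint8_t B : Data)
    std::printf("%02x ", B); // 00 00 f0 ff ff ff 00 00
  std::printf("\n");
  return 0;
}
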
+
bool X86AsmBackend::mayNeedRelaxation(const MCInst &Inst,
const MCSubtargetInfo &STI) const {
// Branches can always be relaxed in either mode.
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index 6bd6c6cac7df..a4f8dd669e1e 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -101,6 +101,261 @@ namespace X86 {
COND_INVALID
};
+
+ // The classification for the first instruction in macro fusion.
+ enum class FirstMacroFusionInstKind {
+ // TEST
+ Test,
+ // CMP
+ Cmp,
+ // AND
+ And,
+ // ADD, SUB
+ AddSub,
+ // INC, DEC
+ IncDec,
+ // Not valid as a first macro fusion instruction
+ Invalid
+ };
+
+ enum class SecondMacroFusionInstKind {
+ // JA, JB and variants.
+ AB,
+ // JE, JL, JG and variants.
+ ELG,
+ // JS, JP, JO and variants
+ SPO,
+ // Not a fusible jump.
+ Invalid,
+ };
+
+ /// \returns the type of the first instruction in macro-fusion.
+ inline FirstMacroFusionInstKind
+ classifyFirstOpcodeInMacroFusion(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return FirstMacroFusionInstKind::Invalid;
+ // TEST
+ case X86::TEST16i16:
+ case X86::TEST16mr:
+ case X86::TEST16ri:
+ case X86::TEST16rr:
+ case X86::TEST32i32:
+ case X86::TEST32mr:
+ case X86::TEST32ri:
+ case X86::TEST32rr:
+ case X86::TEST64i32:
+ case X86::TEST64mr:
+ case X86::TEST64ri32:
+ case X86::TEST64rr:
+ case X86::TEST8i8:
+ case X86::TEST8mr:
+ case X86::TEST8ri:
+ case X86::TEST8rr:
+ return FirstMacroFusionInstKind::Test;
+ case X86::AND16i16:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND16rm:
+ case X86::AND16rr:
+ case X86::AND16rr_REV:
+ case X86::AND32i32:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND32rm:
+ case X86::AND32rr:
+ case X86::AND32rr_REV:
+ case X86::AND64i32:
+ case X86::AND64ri32:
+ case X86::AND64ri8:
+ case X86::AND64rm:
+ case X86::AND64rr:
+ case X86::AND64rr_REV:
+ case X86::AND8i8:
+ case X86::AND8ri:
+ case X86::AND8ri8:
+ case X86::AND8rm:
+ case X86::AND8rr:
+ case X86::AND8rr_REV:
+ return FirstMacroFusionInstKind::And;
+ // CMP
+ case X86::CMP16i16:
+ case X86::CMP16mr:
+ case X86::CMP16ri:
+ case X86::CMP16ri8:
+ case X86::CMP16rm:
+ case X86::CMP16rr:
+ case X86::CMP16rr_REV:
+ case X86::CMP32i32:
+ case X86::CMP32mr:
+ case X86::CMP32ri:
+ case X86::CMP32ri8:
+ case X86::CMP32rm:
+ case X86::CMP32rr:
+ case X86::CMP32rr_REV:
+ case X86::CMP64i32:
+ case X86::CMP64mr:
+ case X86::CMP64ri32:
+ case X86::CMP64ri8:
+ case X86::CMP64rm:
+ case X86::CMP64rr:
+ case X86::CMP64rr_REV:
+ case X86::CMP8i8:
+ case X86::CMP8mr:
+ case X86::CMP8ri:
+ case X86::CMP8ri8:
+ case X86::CMP8rm:
+ case X86::CMP8rr:
+ case X86::CMP8rr_REV:
+ return FirstMacroFusionInstKind::Cmp;
+ // ADD
+ case X86::ADD16i16:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD16rm:
+ case X86::ADD16rr:
+ case X86::ADD16rr_REV:
+ case X86::ADD32i32:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD32rm:
+ case X86::ADD32rr:
+ case X86::ADD32rr_REV:
+ case X86::ADD64i32:
+ case X86::ADD64ri32:
+ case X86::ADD64ri8:
+ case X86::ADD64rm:
+ case X86::ADD64rr:
+ case X86::ADD64rr_REV:
+ case X86::ADD8i8:
+ case X86::ADD8ri:
+ case X86::ADD8ri8:
+ case X86::ADD8rm:
+ case X86::ADD8rr:
+ case X86::ADD8rr_REV:
+ // SUB
+ case X86::SUB16i16:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB16rm:
+ case X86::SUB16rr:
+ case X86::SUB16rr_REV:
+ case X86::SUB32i32:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB32rm:
+ case X86::SUB32rr:
+ case X86::SUB32rr_REV:
+ case X86::SUB64i32:
+ case X86::SUB64ri32:
+ case X86::SUB64ri8:
+ case X86::SUB64rm:
+ case X86::SUB64rr:
+ case X86::SUB64rr_REV:
+ case X86::SUB8i8:
+ case X86::SUB8ri:
+ case X86::SUB8ri8:
+ case X86::SUB8rm:
+ case X86::SUB8rr:
+ case X86::SUB8rr_REV:
+ return FirstMacroFusionInstKind::AddSub;
+ // INC
+ case X86::INC16r:
+ case X86::INC16r_alt:
+ case X86::INC32r:
+ case X86::INC32r_alt:
+ case X86::INC64r:
+ case X86::INC8r:
+ // DEC
+ case X86::DEC16r:
+ case X86::DEC16r_alt:
+ case X86::DEC32r:
+ case X86::DEC32r_alt:
+ case X86::DEC64r:
+ case X86::DEC8r:
+ return FirstMacroFusionInstKind::IncDec;
+ }
+ }
+
+ /// \returns the type of the second instruction in macro-fusion.
+ inline SecondMacroFusionInstKind
+ classifySecondCondCodeInMacroFusion(X86::CondCode CC) {
+ if (CC == X86::COND_INVALID)
+ return SecondMacroFusionInstKind::Invalid;
+
+ switch (CC) {
+ default:
+ return SecondMacroFusionInstKind::Invalid;
+ // JE,JZ
+ case X86::COND_E:
+ // JNE,JNZ
+ case X86::COND_NE:
+ // JL,JNGE
+ case X86::COND_L:
+ // JLE,JNG
+ case X86::COND_LE:
+ // JG,JNLE
+ case X86::COND_G:
+ // JGE,JNL
+ case X86::COND_GE:
+ return SecondMacroFusionInstKind::ELG;
+ // JB,JC
+ case X86::COND_B:
+ // JNA,JBE
+ case X86::COND_BE:
+ // JA,JNBE
+ case X86::COND_A:
+ // JAE,JNC,JNB
+ case X86::COND_AE:
+ return SecondMacroFusionInstKind::AB;
+ // JS
+ case X86::COND_S:
+ // JNS
+ case X86::COND_NS:
+ // JP,JPE
+ case X86::COND_P:
+ // JNP,JPO
+ case X86::COND_NP:
+ // JO
+ case X86::COND_O:
+ // JNO
+ case X86::COND_NO:
+ return SecondMacroFusionInstKind::SPO;
+ }
+ }
+
+ /// \param FirstKind kind of the first instruction in macro fusion.
+ /// \param SecondKind kind of the second instruction in macro fusion.
+ ///
+ /// \returns true if the two instruction can be macro fused.
+ inline bool isMacroFused(FirstMacroFusionInstKind FirstKind,
+ SecondMacroFusionInstKind SecondKind) {
+ switch (FirstKind) {
+ case X86::FirstMacroFusionInstKind::Test:
+ case X86::FirstMacroFusionInstKind::And:
+ return true;
+ case X86::FirstMacroFusionInstKind::Cmp:
+ case X86::FirstMacroFusionInstKind::AddSub:
+ return SecondKind == X86::SecondMacroFusionInstKind::AB ||
+ SecondKind == X86::SecondMacroFusionInstKind::ELG;
+ case X86::FirstMacroFusionInstKind::IncDec:
+ return SecondKind == X86::SecondMacroFusionInstKind::ELG;
+ case X86::FirstMacroFusionInstKind::Invalid:
+ return false;
+ }
+ llvm_unreachable("unknown fusion type");
+ }
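
// Standalone restatement of the fusion table above using small local enums
// (not the X86:: ones): TEST/AND fuse with any fusible jcc, CMP/ADD/SUB only
// with the AB (above/below) and ELG (equal/less/greater) groups, and INC/DEC
// only with the ELG group.
#include <cstdio>

enum class First { Test, And, Cmp, AddSub, IncDec, Invalid };
enum class Second { AB, ELG, SPO, Invalid };

static bool fuses(First F, Second S) {
  if (S == Second::Invalid)
    return false;
  switch (F) {
  case First::Test:
  case First::And:
    return true;
  case First::Cmp:
  case First::AddSub:
    return S == Second::AB || S == Second::ELG;
  case First::IncDec:
    return S == Second::ELG;
  default:
    return false;
  }
}

int main() {
  std::printf("cmp+je  -> %d\n", fuses(First::Cmp, Second::ELG));    // 1
  std::printf("cmp+js  -> %d\n", fuses(First::Cmp, Second::SPO));    // 0
  std::printf("dec+jne -> %d\n", fuses(First::IncDec, Second::ELG)); // 1
  return 0;
}
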
+
+ /// Defines the possible values of the branch boundary alignment mask.
+ enum AlignBranchBoundaryKind : uint8_t {
+ AlignBranchNone = 0,
+ AlignBranchFused = 1U << 0,
+ AlignBranchJcc = 1U << 1,
+ AlignBranchJmp = 1U << 2,
+ AlignBranchCall = 1U << 3,
+ AlignBranchRet = 1U << 4,
+ AlignBranchIndirect = 1U << 5
+ };
} // end namespace X86;
/// X86II - This namespace holds all of the target specific flags that
@@ -645,9 +900,8 @@ namespace X86II {
NOTRACK = 1ULL << NoTrackShift
};
- // getBaseOpcodeFor - This function returns the "base" X86 opcode for the
- // specified machine instruction.
- //
+ /// \returns the "base" X86 opcode for the specified machine
+ /// instruction.
inline uint8_t getBaseOpcodeFor(uint64_t TSFlags) {
return TSFlags >> X86II::OpcodeShift;
}
@@ -656,8 +910,8 @@ namespace X86II {
return (TSFlags & X86II::ImmMask) != 0;
}
- /// getSizeOfImm - Decode the "size of immediate" field from the TSFlags field
- /// of the specified instruction.
+ /// Decode the "size of immediate" field from the TSFlags field of the
+ /// specified instruction.
inline unsigned getSizeOfImm(uint64_t TSFlags) {
switch (TSFlags & X86II::ImmMask) {
default: llvm_unreachable("Unknown immediate size");
@@ -673,9 +927,9 @@ namespace X86II {
}
}
- /// isImmPCRel - Return true if the immediate of the specified instruction's
- /// TSFlags indicates that it is pc relative.
- inline unsigned isImmPCRel(uint64_t TSFlags) {
+ /// \returns true if the immediate of the specified instruction's TSFlags
+ /// indicates that it is pc relative.
+ inline bool isImmPCRel(uint64_t TSFlags) {
switch (TSFlags & X86II::ImmMask) {
default: llvm_unreachable("Unknown immediate size");
case X86II::Imm8PCRel:
@@ -692,9 +946,9 @@ namespace X86II {
}
}
- /// isImmSigned - Return true if the immediate of the specified instruction's
+ /// \returns true if the immediate of the specified instruction's
/// TSFlags indicates that it is signed.
- inline unsigned isImmSigned(uint64_t TSFlags) {
+ inline bool isImmSigned(uint64_t TSFlags) {
switch (TSFlags & X86II::ImmMask) {
default: llvm_unreachable("Unknown immediate signedness");
case X86II::Imm32S:
@@ -711,8 +965,8 @@ namespace X86II {
}
}
- /// getOperandBias - compute whether all of the def operands are repeated
- /// in the uses and therefore should be skipped.
+ /// Compute whether all of the def operands are repeated in the uses and
+ /// therefore should be skipped.
/// This determines the start of the unique operand list. We need to determine
/// if all of the defs have a corresponding tied operand in the uses.
/// Unfortunately, the tied operand information is encoded in the uses not
@@ -750,8 +1004,8 @@ namespace X86II {
}
}
- /// getMemoryOperandNo - The function returns the MCInst operand # for the
- /// first field of the memory operand. If the instruction doesn't have a
+ /// The function returns the MCInst operand # for the first field of the
+ /// memory operand. If the instruction doesn't have a
/// memory operand, this returns -1.
///
/// Note that this ignores tied operands. If there is a tied register which
@@ -837,8 +1091,8 @@ namespace X86II {
}
}
- /// isX86_64ExtendedReg - Is the MachineOperand a x86-64 extended (r8 or
- /// higher) register? e.g. r8, xmm8, xmm13, etc.
+ /// \returns true if the MachineOperand is an x86-64 extended (r8 or
+ /// higher) register, e.g. r8, xmm8, xmm13, etc.
inline bool isX86_64ExtendedReg(unsigned RegNo) {
if ((RegNo >= X86::XMM8 && RegNo <= X86::XMM31) ||
(RegNo >= X86::YMM8 && RegNo <= X86::YMM31) ||
@@ -864,8 +1118,8 @@ namespace X86II {
return false;
}
- /// is32ExtendedReg - Is the MemoryOperand a 32 extended (zmm16 or higher)
- /// registers? e.g. zmm21, etc.
+ /// \returns true if the MemoryOperand is one of the 32 extended (zmm16 or
+ /// higher) registers, e.g. zmm21.
static inline bool is32ExtendedReg(unsigned RegNo) {
return ((RegNo >= X86::XMM16 && RegNo <= X86::XMM31) ||
(RegNo >= X86::YMM16 && RegNo <= X86::YMM31) ||
@@ -878,12 +1132,12 @@ namespace X86II {
reg == X86::SIL || reg == X86::DIL);
}
- /// isKMasked - Is this a masked instruction.
+ /// \returns true if this is a masked instruction.
inline bool isKMasked(uint64_t TSFlags) {
return (TSFlags & X86II::EVEX_K) != 0;
}
- /// isKMergedMasked - Is this a merge masked instruction.
+ /// \returns true if this is a merge masked instruction.
inline bool isKMergeMasked(uint64_t TSFlags) {
return isKMasked(TSFlags) && (TSFlags & X86II::EVEX_Z) == 0;
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index ea28bef42569..f4bb0fbf62cd 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -36,9 +36,9 @@ void X86IntelInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
OS << getRegisterName(RegNo);
}
-void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
- StringRef Annot,
- const MCSubtargetInfo &STI) {
+void X86IntelInstPrinter::printInst(const MCInst *MI, uint64_t Address,
+ StringRef Annot, const MCSubtargetInfo &STI,
+ raw_ostream &OS) {
printInstFlags(MI, OS);
// In 16-bit mode, print data16 as data32.
@@ -47,7 +47,7 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, raw_ostream &OS,
OS << "\tdata32";
} else if (!printAliasInstr(MI, OS) &&
!printVecCompareInstr(MI, OS))
- printInstruction(MI, OS);
+ printInstruction(MI, Address, OS);
// Next always print the annotation.
printAnnotation(OS, Annot);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
index f32f49f7c417..b409b20cbea8 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
@@ -25,8 +25,8 @@ public:
: X86InstPrinterCommon(MAI, MII, MRI) {}
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
- void printInst(const MCInst *MI, raw_ostream &OS, StringRef Annot,
- const MCSubtargetInfo &STI) override;
+ void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
+ const MCSubtargetInfo &STI, raw_ostream &OS) override;
bool printVecCompareInstr(const MCInst *MI, raw_ostream &OS);
// Autogenerated by tblgen, returns true if we successfully printed an
@@ -36,7 +36,7 @@ public:
unsigned PrintMethodIdx, raw_ostream &O);
// Autogenerated by tblgen.
- void printInstruction(const MCInst *MI, raw_ostream &O);
+ void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O) override;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index ac36bf3a12fa..54a293702bd0 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -42,91 +42,68 @@ class X86MCCodeEmitter : public MCCodeEmitter {
public:
X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
- : MCII(mcii), Ctx(ctx) {
- }
+ : MCII(mcii), Ctx(ctx) {}
X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
X86MCCodeEmitter &operator=(const X86MCCodeEmitter &) = delete;
~X86MCCodeEmitter() override = default;
- bool is64BitMode(const MCSubtargetInfo &STI) const {
- return STI.getFeatureBits()[X86::Mode64Bit];
- }
-
- bool is32BitMode(const MCSubtargetInfo &STI) const {
- return STI.getFeatureBits()[X86::Mode32Bit];
- }
-
- bool is16BitMode(const MCSubtargetInfo &STI) const {
- return STI.getFeatureBits()[X86::Mode16Bit];
- }
-
- /// Is16BitMemOperand - Return true if the specified instruction has
- /// a 16-bit memory operand. Op specifies the operand # of the memoperand.
- bool Is16BitMemOperand(const MCInst &MI, unsigned Op,
- const MCSubtargetInfo &STI) const {
- const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
- const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
- const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
+ void emitPrefix(const MCInst &MI, raw_ostream &OS,
+ const MCSubtargetInfo &STI) const override;
- if (is16BitMode(STI) && BaseReg.getReg() == 0 &&
- Disp.isImm() && Disp.getImm() < 0x10000)
- return true;
- if ((BaseReg.getReg() != 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
- (IndexReg.getReg() != 0 &&
- X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
- return true;
- return false;
- }
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
- unsigned GetX86RegNum(const MCOperand &MO) const {
+private:
+ unsigned getX86RegNum(const MCOperand &MO) const {
return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
}
unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const {
return Ctx.getRegisterInfo()->getEncodingValue(
- MI.getOperand(OpNum).getReg());
+ MI.getOperand(OpNum).getReg());
}
- // Does this register require a bit to be set in REX prefix.
+ /// \param MI a single low-level machine instruction.
+ /// \param OpNum the operand #.
+ /// \returns true if the OpNumth operand of MI requires a bit to be set in
+ /// the REX prefix.
bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const {
return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
}
- void EmitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const {
+ void emitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const {
OS << (char)C;
++CurByte;
}
- void EmitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
+ void emitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
raw_ostream &OS) const {
// Output the constant in little endian byte order.
for (unsigned i = 0; i != Size; ++i) {
- EmitByte(Val & 255, CurByte, OS);
+ emitByte(Val & 255, CurByte, OS);
Val >>= 8;
}
}
- void EmitImmediate(const MCOperand &Disp, SMLoc Loc,
- unsigned ImmSize, MCFixupKind FixupKind,
- unsigned &CurByte, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- int ImmOffset = 0) const;
+ void emitImmediate(const MCOperand &Disp, SMLoc Loc, unsigned ImmSize,
+ MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const;
- static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
+ static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
return RM | (RegOpcode << 3) | (Mod << 6);
}
- void EmitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
+ void emitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
unsigned &CurByte, raw_ostream &OS) const {
- EmitByte(ModRMByte(3, RegOpcodeFld, GetX86RegNum(ModRMReg)), CurByte, OS);
+ emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), CurByte, OS);
}
- void EmitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+ void emitSIBByte(unsigned SS, unsigned Index, unsigned Base,
unsigned &CurByte, raw_ostream &OS) const {
- // SIB byte is in the same format as the ModRMByte.
- EmitByte(ModRMByte(SS, Index, Base), CurByte, OS);
+ // SIB byte is in the same format as the modRMByte.
+ emitByte(modRMByte(SS, Index, Base), CurByte, OS);
}
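// A standalone sketch of the ModRM/SIB bit layout used by modRMByte() and
// emitSIBByte(); packModRM and the worked values below are illustrative, not
// part of the emitter.
#include <cassert>
#include <cstdint>

static uint8_t packModRM(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  // [7:6] Mod, [5:3] reg/opcode extension, [2:0] R/M - the SIB byte uses the
  // same layout with scale, index and base in those positions.
  return RM | (RegOpcode << 3) | (Mod << 6);
}

int main() {
  // Register-direct form: Mod=3, reg field 2, R/M field 1 -> 0xD1.
  assert(packModRM(3, 2, 1) == 0xD1);
  // SIB for [EAX + ECX*4]: SS=2 (scale 4), index=1 (ECX), base=0 (EAX) -> 0x88.
  assert(packModRM(2, 1, 0) == 0x88);
  return 0;
}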
void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField,
@@ -134,43 +111,39 @@ public:
raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- void encodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const override;
+ void emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, unsigned &CurByte,
+ bool &Rex, const MCInst &MI, const MCInstrDesc &Desc,
+ const MCSubtargetInfo &STI, raw_ostream &OS) const;
- void EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
+ void emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
const MCInst &MI, const MCInstrDesc &Desc,
raw_ostream &OS) const;
- void EmitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
+ void emitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
const MCInst &MI, raw_ostream &OS) const;
bool emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
const MCInst &MI, const MCInstrDesc &Desc,
const MCSubtargetInfo &STI, raw_ostream &OS) const;
- uint8_t DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
- int MemOperand, const MCInstrDesc &Desc) const;
-
- bool isPCRel32Branch(const MCInst &MI) const;
+ uint8_t determineREXPrefix(const MCInst &MI, uint64_t TSFlags, int MemOperand,
+ const MCInstrDesc &Desc) const;
};
} // end anonymous namespace
-/// isDisp8 - Return true if this signed displacement fits in a 8-bit
-/// sign-extended field.
-static bool isDisp8(int Value) {
- return Value == (int8_t)Value;
-}
+/// \returns true if this signed displacement fits in an 8-bit sign-extended
+/// field.
+static bool isDisp8(int Value) { return Value == (int8_t)Value; }
-/// isCDisp8 - Return true if this signed displacement fits in a 8-bit
-/// compressed dispacement field.
-static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
+/// \returns true if this signed displacement fits in an 8-bit compressed
+/// displacement field.
+static bool isCDisp8(uint64_t TSFlags, int Value, int &CValue) {
assert(((TSFlags & X86II::EncodingMask) == X86II::EVEX) &&
"Compressed 8-bit displacement is only valid for EVEX inst.");
unsigned CD8_Scale =
- (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift;
+ (TSFlags & X86II::CD8_Scale_Mask) >> X86II::CD8_Scale_Shift;
if (CD8_Scale == 0) {
CValue = Value;
return isDisp8(Value);
@@ -188,26 +161,49 @@ static bool isCDisp8(uint64_t TSFlags, int Value, int& CValue) {
return Ret;
}
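// A standalone sketch, assuming the usual EVEX disp8*N rule that isCDisp8()
// applies: the displacement must be a multiple of the memory-object scale and
// the quotient must fit in a signed byte. fitsCompressedDisp8 and the scale
// values are illustrative only, not read from TSFlags.
#include <cassert>
#include <cstdint>

static bool fitsCompressedDisp8(int Disp, unsigned Scale, int &Compressed) {
  if (Scale == 0 || (Scale & (Scale - 1)) != 0)
    return false; // The scale must be a non-zero power of two.
  if (Disp & int(Scale - 1))
    return false; // An unaligned displacement cannot be compressed.
  int Q = Disp / int(Scale);
  if (Q != int8_t(Q))
    return false; // The compressed value must still be a signed 8-bit number.
  Compressed = Q;
  return true;
}

int main() {
  int C = 0;
  assert(fitsCompressedDisp8(128, 64, C) && C == 2); // 128 = 2 * 64
  assert(!fitsCompressedDisp8(130, 64, C));          // not a multiple of 64
  assert(!fitsCompressedDisp8(64 * 200, 64, C));     // 200 overflows int8_t
  return 0;
}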
-/// getImmFixupKind - Return the appropriate fixup kind to use for an immediate
-/// in an instruction with the specified TSFlags.
+/// \returns the appropriate fixup kind to use for an immediate in an
+/// instruction with the specified TSFlags.
static MCFixupKind getImmFixupKind(uint64_t TSFlags) {
unsigned Size = X86II::getSizeOfImm(TSFlags);
bool isPCRel = X86II::isImmPCRel(TSFlags);
if (X86II::isImmSigned(TSFlags)) {
switch (Size) {
- default: llvm_unreachable("Unsupported signed fixup size!");
- case 4: return MCFixupKind(X86::reloc_signed_4byte);
+ default:
+ llvm_unreachable("Unsupported signed fixup size!");
+ case 4:
+ return MCFixupKind(X86::reloc_signed_4byte);
}
}
return MCFixup::getKindForSize(Size, isPCRel);
}
-/// Is32BitMemOperand - Return true if the specified instruction has
-/// a 32-bit memory operand. Op specifies the operand # of the memoperand.
-static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
- const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
- const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+/// \param Op operand # of the memory operand.
+///
+/// \returns true if the specified instruction has a 16-bit memory operand.
+static bool is16BitMemOperand(const MCInst &MI, unsigned Op,
+ const MCSubtargetInfo &STI) {
+ const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
+ const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp);
+
+ if (STI.hasFeature(X86::Mode16Bit) && BaseReg.getReg() == 0 && Disp.isImm() &&
+ Disp.getImm() < 0x10000)
+ return true;
+ if ((BaseReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(BaseReg.getReg())) ||
+ (IndexReg.getReg() != 0 &&
+ X86MCRegisterClasses[X86::GR16RegClassID].contains(IndexReg.getReg())))
+ return true;
+ return false;
+}
+
+/// \param Op operand # of the memory operand.
+///
+/// \returns true if the specified instruction has a 32-bit memory operand.
+static bool is32BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
if ((BaseReg.getReg() != 0 &&
X86MCRegisterClasses[X86::GR32RegClassID].contains(BaseReg.getReg())) ||
@@ -223,12 +219,13 @@ static bool Is32BitMemOperand(const MCInst &MI, unsigned Op) {
return false;
}
-/// Is64BitMemOperand - Return true if the specified instruction has
-/// a 64-bit memory operand. Op specifies the operand # of the memoperand.
+/// \param Op operand # of the memory operand.
+///
+/// \returns true if the specified instruction has a 64-bit memory operand.
#ifndef NDEBUG
-static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
- const MCOperand &BaseReg = MI.getOperand(Op+X86::AddrBaseReg);
- const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+static bool is64BitMemOperand(const MCInst &MI, unsigned Op) {
+ const MCOperand &BaseReg = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
if ((BaseReg.getReg() != 0 &&
X86MCRegisterClasses[X86::GR64RegClassID].contains(BaseReg.getReg())) ||
@@ -239,19 +236,15 @@ static bool Is64BitMemOperand(const MCInst &MI, unsigned Op) {
}
#endif
-/// StartsWithGlobalOffsetTable - Check if this expression starts with
-/// _GLOBAL_OFFSET_TABLE_ and if it is of the form
-/// _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on ELF
-/// i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only simple case that
-/// are know to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the start
-/// of a binary expression.
-enum GlobalOffsetTableExprKind {
- GOT_None,
- GOT_Normal,
- GOT_SymDiff
-};
+enum GlobalOffsetTableExprKind { GOT_None, GOT_Normal, GOT_SymDiff };
+
+/// Check if this expression starts with _GLOBAL_OFFSET_TABLE_ and if it is
+/// of the form _GLOBAL_OFFSET_TABLE_-symbol. This is needed to support PIC on
+/// ELF i386 as _GLOBAL_OFFSET_TABLE_ is magical. We check only the simple
+/// cases that are known to be used: _GLOBAL_OFFSET_TABLE_ by itself or at the
+/// start of a binary expression.
static GlobalOffsetTableExprKind
-StartsWithGlobalOffsetTable(const MCExpr *Expr) {
+startsWithGlobalOffsetTable(const MCExpr *Expr) {
const MCExpr *RHS = nullptr;
if (Expr->getKind() == MCExpr::Binary) {
const MCBinaryExpr *BE = static_cast<const MCBinaryExpr *>(Expr);
@@ -262,7 +255,7 @@ StartsWithGlobalOffsetTable(const MCExpr *Expr) {
if (Expr->getKind() != MCExpr::SymbolRef)
return GOT_None;
- const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
const MCSymbol &S = Ref->getSymbol();
if (S.getName() != "_GLOBAL_OFFSET_TABLE_")
return GOT_None;
@@ -271,15 +264,15 @@ StartsWithGlobalOffsetTable(const MCExpr *Expr) {
return GOT_Normal;
}
-static bool HasSecRelSymbolRef(const MCExpr *Expr) {
+static bool hasSecRelSymbolRef(const MCExpr *Expr) {
if (Expr->getKind() == MCExpr::SymbolRef) {
- const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr*>(Expr);
+ const MCSymbolRefExpr *Ref = static_cast<const MCSymbolRefExpr *>(Expr);
return Ref->getKind() == MCSymbolRefExpr::VK_SECREL;
}
return false;
}
-bool X86MCCodeEmitter::isPCRel32Branch(const MCInst &MI) const {
+static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) {
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MCII.get(Opcode);
if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4) ||
@@ -295,18 +288,18 @@ bool X86MCCodeEmitter::isPCRel32Branch(const MCInst &MI) const {
return Ref && Ref->getKind() == MCSymbolRefExpr::VK_None;
}
-void X86MCCodeEmitter::
-EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
- MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups, int ImmOffset) const {
+void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
+ unsigned Size, MCFixupKind FixupKind,
+ unsigned &CurByte, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ int ImmOffset) const {
const MCExpr *Expr = nullptr;
if (DispOp.isImm()) {
// If this is a simple integer displacement that doesn't require a
// relocation, emit it now.
- if (FixupKind != FK_PCRel_1 &&
- FixupKind != FK_PCRel_2 &&
+ if (FixupKind != FK_PCRel_1 && FixupKind != FK_PCRel_2 &&
FixupKind != FK_PCRel_4) {
- EmitConstant(DispOp.getImm()+ImmOffset, Size, CurByte, OS);
+ emitConstant(DispOp.getImm() + ImmOffset, Size, CurByte, OS);
return;
}
Expr = MCConstantExpr::create(DispOp.getImm(), Ctx);
@@ -315,10 +308,9 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
}
// If we have an immoffset, add it to the expression.
- if ((FixupKind == FK_Data_4 ||
- FixupKind == FK_Data_8 ||
+ if ((FixupKind == FK_Data_4 || FixupKind == FK_Data_8 ||
FixupKind == MCFixupKind(X86::reloc_signed_4byte))) {
- GlobalOffsetTableExprKind Kind = StartsWithGlobalOffsetTable(Expr);
+ GlobalOffsetTableExprKind Kind = startsWithGlobalOffsetTable(Expr);
if (Kind != GOT_None) {
assert(ImmOffset == 0);
@@ -332,13 +324,13 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
if (Kind == GOT_Normal)
ImmOffset = CurByte;
} else if (Expr->getKind() == MCExpr::SymbolRef) {
- if (HasSecRelSymbolRef(Expr)) {
+ if (hasSecRelSymbolRef(Expr)) {
FixupKind = MCFixupKind(FK_SecRel_4);
}
} else if (Expr->getKind() == MCExpr::Binary) {
- const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr*>(Expr);
- if (HasSecRelSymbolRef(Bin->getLHS())
- || HasSecRelSymbolRef(Bin->getRHS())) {
+ const MCBinaryExpr *Bin = static_cast<const MCBinaryExpr *>(Expr);
+ if (hasSecRelSymbolRef(Bin->getLHS()) ||
+ hasSecRelSymbolRef(Bin->getRHS())) {
FixupKind = MCFixupKind(FK_SecRel_4);
}
}
@@ -356,7 +348,7 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
// If this is a pc-relative load off _GLOBAL_OFFSET_TABLE_:
// leaq _GLOBAL_OFFSET_TABLE_(%rip), %r15
// this needs to be a GOTPC32 relocation.
- if (StartsWithGlobalOffsetTable(Expr) != GOT_None)
+ if (startsWithGlobalOffsetTable(Expr) != GOT_None)
FixupKind = MCFixupKind(X86::reloc_global_offset_table);
}
if (FixupKind == FK_PCRel_2)
@@ -370,7 +362,7 @@ EmitImmediate(const MCOperand &DispOp, SMLoc Loc, unsigned Size,
// Emit a symbolic constant as a fixup and 4 zeros.
Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc));
- EmitConstant(0, Size, CurByte, OS);
+ emitConstant(0, Size, CurByte, OS);
}
void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
@@ -379,19 +371,20 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
unsigned &CurByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
- const MCOperand &Disp = MI.getOperand(Op+X86::AddrDisp);
- const MCOperand &Base = MI.getOperand(Op+X86::AddrBaseReg);
- const MCOperand &Scale = MI.getOperand(Op+X86::AddrScaleAmt);
- const MCOperand &IndexReg = MI.getOperand(Op+X86::AddrIndexReg);
+ const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp);
+ const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
+ const MCOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt);
+ const MCOperand &IndexReg = MI.getOperand(Op + X86::AddrIndexReg);
unsigned BaseReg = Base.getReg();
bool HasEVEX = (TSFlags & X86II::EncodingMask) == X86II::EVEX;
// Handle %rip relative addressing.
if (BaseReg == X86::RIP ||
- BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode
- assert(is64BitMode(STI) && "Rip-relative addressing requires 64-bit mode");
+ BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode
+ assert(STI.hasFeature(X86::Mode64Bit) &&
+ "Rip-relative addressing requires 64-bit mode");
assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
- EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS);
unsigned Opcode = MI.getOpcode();
// movq loads are handled with a special relocation form which allows the
@@ -432,20 +425,20 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
? X86II::getSizeOfImm(TSFlags)
: 0;
- EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind),
- CurByte, OS, Fixups, -ImmSize);
+ emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
+ Fixups, -ImmSize);
return;
}
- unsigned BaseRegNo = BaseReg ? GetX86RegNum(Base) : -1U;
+ unsigned BaseRegNo = BaseReg ? getX86RegNum(Base) : -1U;
// 16-bit addressing forms of the ModR/M byte have a different encoding for
// the R/M field and are far more limited in which registers can be used.
- if (Is16BitMemOperand(MI, Op, STI)) {
+ if (is16BitMemOperand(MI, Op, STI)) {
if (BaseReg) {
// For 32-bit addressing, the row and column values in Table 2-2 are
// basically the same. It's AX/CX/DX/BX/SP/BP/SI/DI in that order, with
- // some special cases. And GetX86RegNum reflects that numbering.
+ // some special cases. And getX86RegNum reflects that numbering.
// For 16-bit addressing it's more fun, as shown in the SDM Vol 2A,
// Table 2-1 "16-Bit Addressing Forms with the ModR/M byte". We can only
// use SI/DI/BP/BX, which have "row" values 4-7 in no particular order,
@@ -454,13 +447,13 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
//
// R16Table[] is a lookup from the normal RegNo, to the row values from
// Table 2-1 for 16-bit addressing modes. Where zero means disallowed.
- static const unsigned R16Table[] = { 0, 0, 0, 7, 0, 6, 4, 5 };
+ static const unsigned R16Table[] = {0, 0, 0, 7, 0, 6, 4, 5};
unsigned RMfield = R16Table[BaseRegNo];
assert(RMfield && "invalid 16-bit base register");
if (IndexReg.getReg()) {
- unsigned IndexReg16 = R16Table[GetX86RegNum(IndexReg)];
+ unsigned IndexReg16 = R16Table[getX86RegNum(IndexReg)];
assert(IndexReg16 && "invalid 16-bit index register");
// We must have one of SI/DI (4,5), and one of BP/BX (6,7).
@@ -479,23 +472,23 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
if (Disp.isImm() && isDisp8(Disp.getImm())) {
if (Disp.getImm() == 0 && RMfield != 6) {
// There is no displacement; just the register.
- EmitByte(ModRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
return;
}
// Use the [REG]+disp8 form, including for [BP] which cannot be encoded.
- EmitByte(ModRMByte(1, RegOpcodeField, RMfield), CurByte, OS);
- EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ emitByte(modRMByte(1, RegOpcodeField, RMfield), CurByte, OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
return;
}
// This is the [REG]+disp16 case.
- EmitByte(ModRMByte(2, RegOpcodeField, RMfield), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, RMfield), CurByte, OS);
} else {
// There is no BaseReg; this is the plain [disp16] case.
- EmitByte(ModRMByte(0, RegOpcodeField, 6), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, 6), CurByte, OS);
}
// Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases.
- EmitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups);
+ emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups);
return;
}
@@ -504,7 +497,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// resolve addresses on-the-fly, otherwise use SIB (Intel Manual 2A, table
// 2-7) and absolute references.
- if (// The SIB byte must be used if there is an index register.
+ if ( // The SIB byte must be used if there is an index register.
IndexReg.getReg() == 0 &&
// The SIB byte must be used if the base is ESP/RSP/R12, all of which
// encode to an R/M value of 4, which indicates that a SIB byte is
@@ -512,11 +505,11 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
BaseRegNo != N86::ESP &&
// If there is no base register and we're in 64-bit mode, we need a SIB
// byte to emit an addr that is just 'disp32' (the non-RIP relative form).
- (!is64BitMode(STI) || BaseReg != 0)) {
+ (!STI.hasFeature(X86::Mode64Bit) || BaseReg != 0)) {
- if (BaseReg == 0) { // [disp32] in X86-32 mode
- EmitByte(ModRMByte(0, RegOpcodeField, 5), CurByte, OS);
- EmitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
+ if (BaseReg == 0) { // [disp32] in X86-32 mode
+ emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS);
+ emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
return;
}
@@ -526,7 +519,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// by emitting a displacement of 0 below.
if (BaseRegNo != N86::EBP) {
if (Disp.isImm() && Disp.getImm() == 0) {
- EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
return;
}
@@ -537,7 +530,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// This is exclusively used by call *a@tlscall(base). The relocation
// (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning.
Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc()));
- EmitByte(ModRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
return;
}
}
@@ -546,70 +539,70 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
if (Disp.isImm()) {
if (!HasEVEX && isDisp8(Disp.getImm())) {
- EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
- EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
return;
}
// Try EVEX compressed 8-bit displacement first; if failed, fall back to
// 32-bit displacement.
int CDisp8 = 0;
if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
- EmitByte(ModRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
- EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+ emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
CDisp8 - Disp.getImm());
return;
}
}
// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
- EmitByte(ModRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
unsigned Opcode = MI.getOpcode();
unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax
: X86::reloc_signed_4byte;
- EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
+ emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
Fixups);
return;
}
// We need a SIB byte, so start by outputting the ModR/M byte first
- assert(IndexReg.getReg() != X86::ESP &&
- IndexReg.getReg() != X86::RSP && "Cannot use ESP as index reg!");
+ assert(IndexReg.getReg() != X86::ESP && IndexReg.getReg() != X86::RSP &&
+ "Cannot use ESP as index reg!");
bool ForceDisp32 = false;
- bool ForceDisp8 = false;
+ bool ForceDisp8 = false;
int CDisp8 = 0;
int ImmOffset = 0;
if (BaseReg == 0) {
// If there is no base register, we emit the special case SIB byte with
// MOD=0, BASE=5, to JUST get the index, scale, and displacement.
- EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS);
ForceDisp32 = true;
} else if (!Disp.isImm()) {
// Emit the normal disp32 encoding.
- EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS);
ForceDisp32 = true;
} else if (Disp.getImm() == 0 &&
// Base reg can't be anything that ends up with '5' as the base
// reg, it is the magic [*] nomenclature that indicates no base.
BaseRegNo != N86::EBP) {
// Emit no displacement ModR/M byte
- EmitByte(ModRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS);
} else if (!HasEVEX && isDisp8(Disp.getImm())) {
// Emit the disp8 encoding.
- EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
- ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
} else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
// Emit the disp8 encoding.
- EmitByte(ModRMByte(1, RegOpcodeField, 4), CurByte, OS);
- ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
+ emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
ImmOffset = CDisp8 - Disp.getImm();
} else {
// Emit the normal disp32 encoding.
- EmitByte(ModRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS);
}
// Calculate what the SS field value should be...
- static const unsigned SSTable[] = { ~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3 };
+ static const unsigned SSTable[] = {~0U, 0, 1, ~0U, 2, ~0U, ~0U, ~0U, 3};
unsigned SS = SSTable[Scale.getImm()];
if (BaseReg == 0) {
@@ -617,30 +610,133 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// Manual 2A, table 2-7. The displacement has already been output.
unsigned IndexRegNo;
if (IndexReg.getReg())
- IndexRegNo = GetX86RegNum(IndexReg);
+ IndexRegNo = getX86RegNum(IndexReg);
else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
IndexRegNo = 4;
- EmitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+ emitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
} else {
unsigned IndexRegNo;
if (IndexReg.getReg())
- IndexRegNo = GetX86RegNum(IndexReg);
+ IndexRegNo = getX86RegNum(IndexReg);
else
- IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
- EmitSIBByte(SS, IndexRegNo, GetX86RegNum(Base), CurByte, OS);
+ IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
+ emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), CurByte, OS);
}
// Do we need to output a displacement?
if (ForceDisp8)
- EmitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups, ImmOffset);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+ ImmOffset);
else if (ForceDisp32 || Disp.getImm() != 0)
- EmitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
+ emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
CurByte, OS, Fixups);
}
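// A standalone sketch of the Mod-field choice the code above makes for a
// plain [reg + disp] operand without EVEX compression; pickMod and the
// register numbers are illustrative only.
#include <cassert>
#include <cstdint>

static unsigned pickMod(int Disp, unsigned BaseRegNo) {
  const unsigned EBP = 5; // The R/M value 5 has no displacement-free form.
  if (Disp == 0 && BaseRegNo != EBP)
    return 0;                 // [reg]
  if (Disp == int8_t(Disp))
    return 1;                 // [reg + disp8]
  return 2;                   // [reg + disp32]
}

int main() {
  assert(pickMod(0, /*EAX=*/0) == 0);
  assert(pickMod(0, /*EBP=*/5) == 1); // [EBP] is emitted as [EBP + 0].
  assert(pickMod(-100, 3) == 1);
  assert(pickMod(100000, 3) == 2);
  return 0;
}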
-/// EmitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix
+void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp,
+ unsigned &CurByte, bool &Rex,
+ const MCInst &MI, const MCInstrDesc &Desc,
+ const MCSubtargetInfo &STI,
+ raw_ostream &OS) const {
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+ if (MemoryOperand != -1)
+ MemoryOperand += CurOp;
+
+ // Emit segment override opcode prefix as needed.
+ if (MemoryOperand >= 0)
+ emitSegmentOverridePrefix(CurByte, MemoryOperand + X86::AddrSegmentReg, MI,
+ OS);
+
+ // Emit the repeat opcode prefix as needed.
+ unsigned Flags = MI.getFlags();
+ if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT)
+ emitByte(0xF3, CurByte, OS);
+ if (Flags & X86::IP_HAS_REPEAT_NE)
+ emitByte(0xF2, CurByte, OS);
+
+ // Emit the address size opcode prefix as needed.
+ bool need_address_override;
+ uint64_t AdSize = TSFlags & X86II::AdSizeMask;
+ if ((STI.hasFeature(X86::Mode16Bit) && AdSize == X86II::AdSize32) ||
+ (STI.hasFeature(X86::Mode32Bit) && AdSize == X86II::AdSize16) ||
+ (STI.hasFeature(X86::Mode64Bit) && AdSize == X86II::AdSize32)) {
+ need_address_override = true;
+ } else if (MemoryOperand < 0) {
+ need_address_override = false;
+ } else if (STI.hasFeature(X86::Mode64Bit)) {
+ assert(!is16BitMemOperand(MI, MemoryOperand, STI));
+ need_address_override = is32BitMemOperand(MI, MemoryOperand);
+ } else if (STI.hasFeature(X86::Mode32Bit)) {
+ assert(!is64BitMemOperand(MI, MemoryOperand));
+ need_address_override = is16BitMemOperand(MI, MemoryOperand, STI);
+ } else {
+ assert(STI.hasFeature(X86::Mode16Bit));
+ assert(!is64BitMemOperand(MI, MemoryOperand));
+ need_address_override = !is16BitMemOperand(MI, MemoryOperand, STI);
+ }
+
+ if (need_address_override)
+ emitByte(0x67, CurByte, OS);
+
+ // Encoding type for this instruction.
+ uint64_t Encoding = TSFlags & X86II::EncodingMask;
+ if (Encoding == 0)
+ Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
+ else
+ emitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+
+ uint64_t Form = TSFlags & X86II::FormMask;
+ switch (Form) {
+ default:
+ break;
+ case X86II::RawFrmDstSrc: {
+ unsigned siReg = MI.getOperand(1).getReg();
+ assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) ||
+ (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) ||
+ (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
+ "SI and DI register sizes do not match");
+ // Emit segment override opcode prefix as needed (not for %ds).
+ if (MI.getOperand(2).getReg() != X86::DS)
+ emitSegmentOverridePrefix(CurByte, 2, MI, OS);
+ // Emit AdSize prefix as needed.
+ if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) ||
+ (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI))
+ emitByte(0x67, CurByte, OS);
+ CurOp += 3; // Consume operands.
+ break;
+ }
+ case X86II::RawFrmSrc: {
+ unsigned siReg = MI.getOperand(0).getReg();
+ // Emit segment override opcode prefix as needed (not for %ds).
+ if (MI.getOperand(1).getReg() != X86::DS)
+ emitSegmentOverridePrefix(CurByte, 1, MI, OS);
+ // Emit AdSize prefix as needed.
+ if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) ||
+ (STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI))
+ emitByte(0x67, CurByte, OS);
+ CurOp += 2; // Consume operands.
+ break;
+ }
+ case X86II::RawFrmDst: {
+ unsigned siReg = MI.getOperand(0).getReg();
+ // Emit AdSize prefix as needed.
+ if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::EDI) ||
+ (STI.hasFeature(X86::Mode32Bit) && siReg == X86::DI))
+ emitByte(0x67, CurByte, OS);
+ ++CurOp; // Consume operand.
+ break;
+ }
+ case X86II::RawFrmMemOffs: {
+ // Emit segment override opcode prefix as needed.
+ emitSegmentOverridePrefix(CurByte, 1, MI, OS);
+ break;
+ }
+ }
+}
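// A standalone sketch that reduces the 0x67 address-size-override decision in
// emitPrefixImpl() to "does the memory operand's address width differ from the
// mode's default?"; it ignores the explicit AdSize16/AdSize32 flag cases and
// uses illustrative names throughout.
#include <cassert>

enum Mode { M16 = 16, M32 = 32, M64 = 64 };

static bool needsAddrSizeOverride(Mode M, unsigned MemAddrWidth) {
  return MemAddrWidth != unsigned(M);
}

int main() {
  assert(needsAddrSizeOverride(M64, 32));  // e.g. [eax] in 64-bit code
  assert(!needsAddrSizeOverride(M64, 64)); // [rax] needs no prefix
  assert(needsAddrSizeOverride(M32, 16));  // [bx+si] in 32-bit code
  assert(needsAddrSizeOverride(M16, 32));  // [eax] in 16-bit code
  return 0;
}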
+
+/// emitVEXOpcodePrefix - AVX instructions are encoded using an opcode prefix
/// called VEX.
-void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
+void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
int MemOperand, const MCInst &MI,
const MCInstrDesc &Desc,
raw_ostream &OS) const {
@@ -690,13 +786,26 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0b01010: XOP map select - 0Ah instructions with imm dword
uint8_t VEX_5M;
switch (TSFlags & X86II::OpMapMask) {
- default: llvm_unreachable("Invalid prefix!");
- case X86II::TB: VEX_5M = 0x1; break; // 0F
- case X86II::T8: VEX_5M = 0x2; break; // 0F 38
- case X86II::TA: VEX_5M = 0x3; break; // 0F 3A
- case X86II::XOP8: VEX_5M = 0x8; break;
- case X86II::XOP9: VEX_5M = 0x9; break;
- case X86II::XOPA: VEX_5M = 0xA; break;
+ default:
+ llvm_unreachable("Invalid prefix!");
+ case X86II::TB:
+ VEX_5M = 0x1;
+ break; // 0F
+ case X86II::T8:
+ VEX_5M = 0x2;
+ break; // 0F 38
+ case X86II::TA:
+ VEX_5M = 0x3;
+ break; // 0F 3A
+ case X86II::XOP8:
+ VEX_5M = 0x8;
+ break;
+ case X86II::XOP9:
+ VEX_5M = 0x9;
+ break;
+ case X86II::XOPA:
+ VEX_5M = 0xA;
+ break;
}
// VEX_4V (VEX vvvv field): a register specifier
@@ -724,9 +833,15 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
//
uint8_t VEX_PP = 0;
switch (TSFlags & X86II::OpPrefixMask) {
- case X86II::PD: VEX_PP = 0x1; break; // 66
- case X86II::XS: VEX_PP = 0x2; break; // F3
- case X86II::XD: VEX_PP = 0x3; break; // F2
+ case X86II::PD:
+ VEX_PP = 0x1;
+ break; // 66
+ case X86II::XS:
+ VEX_PP = 0x2;
+ break; // F3
+ case X86II::XD:
+ VEX_PP = 0x3;
+ break; // F2
}
// EVEX_U
@@ -751,7 +866,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
unsigned CurOp = X86II::getOperandBias(Desc);
switch (TSFlags & X86II::FormMask) {
- default: llvm_unreachable("Unexpected form in EmitVEXOpcodePrefix!");
+ default:
+ llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!");
case X86II::RawFrm:
break;
case X86II::MRMDestMem: {
@@ -762,7 +878,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
//
unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
VEX_B = ~(BaseRegEnc >> 3) & 1;
- unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ unsigned IndexRegEnc =
+ getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg);
VEX_X = ~(IndexRegEnc >> 3) & 1;
if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
@@ -807,7 +924,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
VEX_B = ~(BaseRegEnc >> 3) & 1;
- unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ unsigned IndexRegEnc =
+ getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg);
VEX_X = ~(IndexRegEnc >> 3) & 1;
if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
@@ -822,7 +940,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
VEX_B = ~(BaseRegEnc >> 3) & 1;
- unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ unsigned IndexRegEnc =
+ getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg);
VEX_X = ~(IndexRegEnc >> 3) & 1;
VEX_4V = ~getX86RegEncoding(MI, CurOp + X86::AddrNumOperands) & 0xf;
@@ -838,14 +957,19 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
VEX_B = ~(BaseRegEnc >> 3) & 1;
- unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ unsigned IndexRegEnc =
+ getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg);
VEX_X = ~(IndexRegEnc >> 3) & 1;
break;
}
- case X86II::MRM0m: case X86II::MRM1m:
- case X86II::MRM2m: case X86II::MRM3m:
- case X86II::MRM4m: case X86II::MRM5m:
- case X86II::MRM6m: case X86II::MRM7m: {
+ case X86II::MRM0m:
+ case X86II::MRM1m:
+ case X86II::MRM2m:
+ case X86II::MRM3m:
+ case X86II::MRM4m:
+ case X86II::MRM5m:
+ case X86II::MRM6m:
+ case X86II::MRM7m: {
// MRM[0-9]m instructions forms:
// MemAddr
// src1(VEX_4V), MemAddr
@@ -860,7 +984,8 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
unsigned BaseRegEnc = getX86RegEncoding(MI, MemOperand + X86::AddrBaseReg);
VEX_B = ~(BaseRegEnc >> 3) & 1;
- unsigned IndexRegEnc = getX86RegEncoding(MI, MemOperand+X86::AddrIndexReg);
+ unsigned IndexRegEnc =
+ getX86RegEncoding(MI, MemOperand + X86::AddrIndexReg);
VEX_X = ~(IndexRegEnc >> 3) & 1;
if (!HasVEX_4V) // Only needed with VSIB which don't use VVVV.
EVEX_V2 = ~(IndexRegEnc >> 4) & 1;
@@ -894,7 +1019,7 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
if (EVEX_b) {
if (HasEVEX_RC) {
- unsigned RcOperand = NumOps-1;
+ unsigned RcOperand = NumOps - 1;
assert(RcOperand >= CurOp);
EVEX_rc = MI.getOperand(RcOperand).getImm();
assert(EVEX_rc <= 3 && "Invalid rounding control!");
@@ -956,10 +1081,14 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
EncodeRC = true;
break;
}
- case X86II::MRM0r: case X86II::MRM1r:
- case X86II::MRM2r: case X86II::MRM3r:
- case X86II::MRM4r: case X86II::MRM5r:
- case X86II::MRM6r: case X86II::MRM7r: {
+ case X86II::MRM0r:
+ case X86II::MRM1r:
+ case X86II::MRM2r:
+ case X86II::MRM3r:
+ case X86II::MRM4r:
+ case X86II::MRM5r:
+ case X86II::MRM6r:
+ case X86II::MRM7r: {
// MRM0r-MRM7r instructions forms:
// dst(VEX_4V), src(ModR/M), imm8
if (HasVEX_4V) {
@@ -996,17 +1125,17 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
uint8_t LastByte = VEX_PP | (VEX_L << 2) | (VEX_4V << 3);
// Can we use the 2 byte VEX prefix?
- if (!(MI.getFlags() & X86::IP_USE_VEX3) &&
- Encoding == X86II::VEX && VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
- EmitByte(0xC5, CurByte, OS);
- EmitByte(LastByte | (VEX_R << 7), CurByte, OS);
+ if (!(MI.getFlags() & X86::IP_USE_VEX3) && Encoding == X86II::VEX &&
+ VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
+ emitByte(0xC5, CurByte, OS);
+ emitByte(LastByte | (VEX_R << 7), CurByte, OS);
return;
}
// 3 byte VEX prefix
- EmitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS);
- EmitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
- EmitByte(LastByte | (VEX_W << 7), CurByte, OS);
+ emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS);
+ emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
+ emitByte(LastByte | (VEX_W << 7), CurByte, OS);
} else {
assert(Encoding == X86II::EVEX && "unknown encoding!");
// EVEX opcode prefix can have 4 bytes
@@ -1014,39 +1143,30 @@ void X86MCCodeEmitter::EmitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// +-----+ +--------------+ +-------------------+ +------------------------+
// | 62h | | RXBR' | 00mm | | W | vvvv | U | pp | | z | L'L | b | v' | aaa |
// +-----+ +--------------+ +-------------------+ +------------------------+
- assert((VEX_5M & 0x3) == VEX_5M
- && "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
-
- EmitByte(0x62, CurByte, OS);
- EmitByte((VEX_R << 7) |
- (VEX_X << 6) |
- (VEX_B << 5) |
- (EVEX_R2 << 4) |
- VEX_5M, CurByte, OS);
- EmitByte((VEX_W << 7) |
- (VEX_4V << 3) |
- (EVEX_U << 2) |
- VEX_PP, CurByte, OS);
+ assert((VEX_5M & 0x3) == VEX_5M &&
+ "More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
+
+ emitByte(0x62, CurByte, OS);
+ emitByte((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | (EVEX_R2 << 4) |
+ VEX_5M,
+ CurByte, OS);
+ emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, CurByte,
+ OS);
if (EncodeRC)
- EmitByte((EVEX_z << 7) |
- (EVEX_rc << 5) |
- (EVEX_b << 4) |
- (EVEX_V2 << 3) |
- EVEX_aaa, CurByte, OS);
+ emitByte((EVEX_z << 7) | (EVEX_rc << 5) | (EVEX_b << 4) | (EVEX_V2 << 3) |
+ EVEX_aaa,
+ CurByte, OS);
else
- EmitByte((EVEX_z << 7) |
- (EVEX_L2 << 6) |
- (VEX_L << 5) |
- (EVEX_b << 4) |
- (EVEX_V2 << 3) |
- EVEX_aaa, CurByte, OS);
+ emitByte((EVEX_z << 7) | (EVEX_L2 << 6) | (VEX_L << 5) | (EVEX_b << 4) |
+ (EVEX_V2 << 3) | EVEX_aaa,
+ CurByte, OS);
}
}
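// A standalone sketch of the 2- versus 3-byte VEX packing done above; it
// ignores the XOP and forced-VEX3 cases, the R/X/B/vvvv inputs are assumed to
// already be in the inverted form the emitter stores, and packVEX plus the
// sample encodings are illustrative only.
#include <cassert>
#include <cstdint>
#include <vector>

static std::vector<uint8_t> packVEX(unsigned R, unsigned X, unsigned B,
                                    unsigned W, unsigned mmmmm, unsigned vvvv,
                                    unsigned L, unsigned pp) {
  uint8_t Last = pp | (L << 2) | (vvvv << 3);
  if (B && X && !W && mmmmm == 1) // Everything defaulted: 2-byte C5 form.
    return {0xC5, uint8_t(Last | (R << 7))};
  return {0xC4, // Otherwise the 3-byte C4 form carries X, B, W and the map.
          uint8_t((R << 7) | (X << 6) | (B << 5) | mmmmm),
          uint8_t(Last | (W << 7))};
}

int main() {
  // Map 0F, vvvv=0b1111, 128-bit, no pp, all inverted bits set -> C5 F8
  // (the prefix of e.g. vzeroupper).
  assert(packVEX(1, 1, 1, 0, 1, 0xF, 0, 0) ==
         (std::vector<uint8_t>{0xC5, 0xF8}));
  // Setting VEX.W forces the 3-byte form.
  assert(packVEX(1, 1, 1, 1, 1, 0xF, 0, 0) ==
         (std::vector<uint8_t>{0xC4, 0xE1, 0xF8}));
  return 0;
}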
-/// DetermineREXPrefix - Determine if the MCInst has to be encoded with a X86-64
-/// REX prefix which specifies 1) 64-bit instructions, 2) non-default operand
-/// size, and 3) use of X86-64 extended registers.
-uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
+/// Determine if the MCInst has to be encoded with an X86-64 REX prefix which
+/// specifies 1) 64-bit instructions, 2) non-default operand size, and 3) use
+/// of X86-64 extended registers.
+uint8_t X86MCCodeEmitter::determineREXPrefix(const MCInst &MI, uint64_t TSFlags,
int MemOperand,
const MCInstrDesc &Desc) const {
uint8_t REX = 0;
@@ -1055,7 +1175,8 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
if (TSFlags & X86II::REX_W)
REX |= 1 << 3; // set REX.W
- if (MI.getNumOperands() == 0) return REX;
+ if (MI.getNumOperands() == 0)
+ return REX;
unsigned NumOps = MI.getNumOperands();
unsigned CurOp = X86II::getOperandBias(Desc);
@@ -1063,12 +1184,13 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
// If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
for (unsigned i = CurOp; i != NumOps; ++i) {
const MCOperand &MO = MI.getOperand(i);
- if (!MO.isReg()) continue;
+ if (!MO.isReg())
+ continue;
unsigned Reg = MO.getReg();
if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
UsesHighByteReg = true;
if (X86II::isX86_64NonExtLowByteReg(Reg))
- // FIXME: The caller of DetermineREXPrefix slaps this prefix onto anything
+ // FIXME: The caller of determineREXPrefix slaps this prefix onto anything
// that returns non-zero.
REX |= 0x40; // REX fixed encoding prefix
}
@@ -1084,9 +1206,9 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
break;
case X86II::MRMSrcMem:
case X86II::MRMSrcMemCC:
- REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
- REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
- REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
CurOp += X86::AddrNumOperands;
break;
case X86II::MRMDestReg:
@@ -1094,57 +1216,82 @@ uint8_t X86MCCodeEmitter::DetermineREXPrefix(const MCInst &MI, uint64_t TSFlags,
REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
break;
case X86II::MRMDestMem:
- REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
- REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
CurOp += X86::AddrNumOperands;
REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
break;
- case X86II::MRMXmCC: case X86II::MRMXm:
- case X86II::MRM0m: case X86II::MRM1m:
- case X86II::MRM2m: case X86II::MRM3m:
- case X86II::MRM4m: case X86II::MRM5m:
- case X86II::MRM6m: case X86II::MRM7m:
- REX |= isREXExtendedReg(MI, MemOperand+X86::AddrBaseReg) << 0; // REX.B
- REX |= isREXExtendedReg(MI, MemOperand+X86::AddrIndexReg) << 1; // REX.X
+ case X86II::MRMXmCC:
+ case X86II::MRMXm:
+ case X86II::MRM0m:
+ case X86II::MRM1m:
+ case X86II::MRM2m:
+ case X86II::MRM3m:
+ case X86II::MRM4m:
+ case X86II::MRM5m:
+ case X86II::MRM6m:
+ case X86II::MRM7m:
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
break;
- case X86II::MRMXrCC: case X86II::MRMXr:
- case X86II::MRM0r: case X86II::MRM1r:
- case X86II::MRM2r: case X86II::MRM3r:
- case X86II::MRM4r: case X86II::MRM5r:
- case X86II::MRM6r: case X86II::MRM7r:
+ case X86II::MRMXrCC:
+ case X86II::MRMXr:
+ case X86II::MRM0r:
+ case X86II::MRM1r:
+ case X86II::MRM2r:
+ case X86II::MRM3r:
+ case X86II::MRM4r:
+ case X86II::MRM5r:
+ case X86II::MRM6r:
+ case X86II::MRM7r:
REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
break;
}
if (REX && UsesHighByteReg)
- report_fatal_error("Cannot encode high byte register in REX-prefixed instruction");
+ report_fatal_error(
+ "Cannot encode high byte register in REX-prefixed instruction");
return REX;
}
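// A standalone sketch of how the bits computed by determineREXPrefix() land in
// the prefix byte the caller emits as 0x40 | REX; rexPrefix and the worked
// instructions are illustrative only.
#include <cassert>
#include <cstdint>

static uint8_t rexPrefix(bool W, bool R, bool X, bool B) {
  // Bit 3 = REX.W, bit 2 = REX.R, bit 1 = REX.X, bit 0 = REX.B.
  return 0x40 | (uint8_t(W) << 3) | (uint8_t(R) << 2) | (uint8_t(X) << 1) |
         uint8_t(B);
}

int main() {
  // addq %rax, %rcx: only REX.W is needed -> 0x48.
  assert(rexPrefix(true, false, false, false) == 0x48);
  // movq %rax, %r9: REX.W plus REX.B for the extended r/m register -> 0x49.
  assert(rexPrefix(true, false, false, true) == 0x49);
  return 0;
}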
-/// EmitSegmentOverridePrefix - Emit segment override opcode prefix as needed
-void X86MCCodeEmitter::EmitSegmentOverridePrefix(unsigned &CurByte,
+/// Emit segment override opcode prefix as needed.
+void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned &CurByte,
unsigned SegOperand,
const MCInst &MI,
raw_ostream &OS) const {
// Check for explicit segment override on memory operand.
switch (MI.getOperand(SegOperand).getReg()) {
- default: llvm_unreachable("Unknown segment register!");
- case 0: break;
- case X86::CS: EmitByte(0x2E, CurByte, OS); break;
- case X86::SS: EmitByte(0x36, CurByte, OS); break;
- case X86::DS: EmitByte(0x3E, CurByte, OS); break;
- case X86::ES: EmitByte(0x26, CurByte, OS); break;
- case X86::FS: EmitByte(0x64, CurByte, OS); break;
- case X86::GS: EmitByte(0x65, CurByte, OS); break;
+ default:
+ llvm_unreachable("Unknown segment register!");
+ case 0:
+ break;
+ case X86::CS:
+ emitByte(0x2E, CurByte, OS);
+ break;
+ case X86::SS:
+ emitByte(0x36, CurByte, OS);
+ break;
+ case X86::DS:
+ emitByte(0x3E, CurByte, OS);
+ break;
+ case X86::ES:
+ emitByte(0x26, CurByte, OS);
+ break;
+ case X86::FS:
+ emitByte(0x64, CurByte, OS);
+ break;
+ case X86::GS:
+ emitByte(0x65, CurByte, OS);
+ break;
}
}
/// Emit all instruction prefixes prior to the opcode.
///
-/// MemOperand is the operand # of the start of a memory operand if present. If
-/// Not present, it is -1.
+/// \param MemOperand the operand # of the start of a memory operand if present.
+/// If not present, it is -1.
///
-/// Returns true if a REX prefix was used.
+/// \returns true if a REX prefix was used.
bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
int MemOperand, const MCInst &MI,
const MCInstrDesc &Desc,
@@ -1152,35 +1299,35 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
raw_ostream &OS) const {
bool Ret = false;
// Emit the operand size opcode prefix as needed.
- if ((TSFlags & X86II::OpSizeMask) == (is16BitMode(STI) ? X86II::OpSize32
- : X86II::OpSize16))
- EmitByte(0x66, CurByte, OS);
+ if ((TSFlags & X86II::OpSizeMask) ==
+ (STI.hasFeature(X86::Mode16Bit) ? X86II::OpSize32 : X86II::OpSize16))
+ emitByte(0x66, CurByte, OS);
// Emit the LOCK opcode prefix.
if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK)
- EmitByte(0xF0, CurByte, OS);
+ emitByte(0xF0, CurByte, OS);
// Emit the NOTRACK opcode prefix.
if (TSFlags & X86II::NOTRACK || MI.getFlags() & X86::IP_HAS_NOTRACK)
- EmitByte(0x3E, CurByte, OS);
+ emitByte(0x3E, CurByte, OS);
switch (TSFlags & X86II::OpPrefixMask) {
- case X86II::PD: // 66
- EmitByte(0x66, CurByte, OS);
+ case X86II::PD: // 66
+ emitByte(0x66, CurByte, OS);
break;
- case X86II::XS: // F3
- EmitByte(0xF3, CurByte, OS);
+ case X86II::XS: // F3
+ emitByte(0xF3, CurByte, OS);
break;
- case X86II::XD: // F2
- EmitByte(0xF2, CurByte, OS);
+ case X86II::XD: // F2
+ emitByte(0xF2, CurByte, OS);
break;
}
// Handle REX prefix.
// FIXME: Can this come before F2 etc to simplify emission?
- if (is64BitMode(STI)) {
- if (uint8_t REX = DetermineREXPrefix(MI, TSFlags, MemOperand, Desc)) {
- EmitByte(0x40 | REX, CurByte, OS);
+ if (STI.hasFeature(X86::Mode64Bit)) {
+ if (uint8_t REX = determineREXPrefix(MI, TSFlags, MemOperand, Desc)) {
+ emitByte(0x40 | REX, CurByte, OS);
Ret = true;
}
} else {
@@ -1189,33 +1336,50 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// 0x0F escape code must be emitted just before the opcode.
switch (TSFlags & X86II::OpMapMask) {
- case X86II::TB: // Two-byte opcode map
- case X86II::T8: // 0F 38
- case X86II::TA: // 0F 3A
- case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller.
- EmitByte(0x0F, CurByte, OS);
+ case X86II::TB: // Two-byte opcode map
+ case X86II::T8: // 0F 38
+ case X86II::TA: // 0F 3A
+ case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller.
+ emitByte(0x0F, CurByte, OS);
break;
}
switch (TSFlags & X86II::OpMapMask) {
- case X86II::T8: // 0F 38
- EmitByte(0x38, CurByte, OS);
+ case X86II::T8: // 0F 38
+ emitByte(0x38, CurByte, OS);
break;
- case X86II::TA: // 0F 3A
- EmitByte(0x3A, CurByte, OS);
+ case X86II::TA: // 0F 3A
+ emitByte(0x3A, CurByte, OS);
break;
}
return Ret;
}
-void X86MCCodeEmitter::
-encodeInstruction(const MCInst &MI, raw_ostream &OS,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS,
+ const MCSubtargetInfo &STI) const {
+ unsigned Opcode = MI.getOpcode();
+ const MCInstrDesc &Desc = MCII.get(Opcode);
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // Pseudo instructions don't get encoded.
+ if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+ return;
+
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ // Keep track of the current byte being emitted.
+ unsigned CurByte = 0;
+
+ bool Rex = false;
+ emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS);
+}
+
+void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MCII.get(Opcode);
uint64_t TSFlags = Desc.TSFlags;
- unsigned Flags = MI.getFlags();
// Pseudo instructions don't get encoded.
if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
@@ -1227,8 +1391,8 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
// Keep track of the current byte being emitted.
unsigned CurByte = 0;
- // Encoding type for this instruction.
- uint64_t Encoding = TSFlags & X86II::EncodingMask;
+ bool Rex = false;
+ emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS);
// It uses the VEX.VVVV field?
bool HasVEX_4V = TSFlags & X86II::VEX_4V;
@@ -1241,104 +1405,25 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
// Used if a register is encoded in 7:4 of immediate.
unsigned I8RegNum = 0;
- // Determine where the memory operand starts, if present.
- int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
- if (MemoryOperand != -1) MemoryOperand += CurOp;
-
- // Emit segment override opcode prefix as needed.
- if (MemoryOperand >= 0)
- EmitSegmentOverridePrefix(CurByte, MemoryOperand+X86::AddrSegmentReg,
- MI, OS);
-
- // Emit the repeat opcode prefix as needed.
- if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT)
- EmitByte(0xF3, CurByte, OS);
- if (Flags & X86::IP_HAS_REPEAT_NE)
- EmitByte(0xF2, CurByte, OS);
-
- // Emit the address size opcode prefix as needed.
- bool need_address_override;
- uint64_t AdSize = TSFlags & X86II::AdSizeMask;
- if ((is16BitMode(STI) && AdSize == X86II::AdSize32) ||
- (is32BitMode(STI) && AdSize == X86II::AdSize16) ||
- (is64BitMode(STI) && AdSize == X86II::AdSize32)) {
- need_address_override = true;
- } else if (MemoryOperand < 0) {
- need_address_override = false;
- } else if (is64BitMode(STI)) {
- assert(!Is16BitMemOperand(MI, MemoryOperand, STI));
- need_address_override = Is32BitMemOperand(MI, MemoryOperand);
- } else if (is32BitMode(STI)) {
- assert(!Is64BitMemOperand(MI, MemoryOperand));
- need_address_override = Is16BitMemOperand(MI, MemoryOperand, STI);
- } else {
- assert(is16BitMode(STI));
- assert(!Is64BitMemOperand(MI, MemoryOperand));
- need_address_override = !Is16BitMemOperand(MI, MemoryOperand, STI);
- }
-
- if (need_address_override)
- EmitByte(0x67, CurByte, OS);
-
- bool Rex = false;
- if (Encoding == 0)
- Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
- else
- EmitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
-
uint8_t BaseOpcode = X86II::getBaseOpcodeFor(TSFlags);
if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
- BaseOpcode = 0x0F; // Weird 3DNow! encoding.
+ BaseOpcode = 0x0F; // Weird 3DNow! encoding.
unsigned OpcodeOffset = 0;
uint64_t Form = TSFlags & X86II::FormMask;
switch (Form) {
- default: errs() << "FORM: " << Form << "\n";
+ default:
+ errs() << "FORM: " << Form << "\n";
llvm_unreachable("Unknown FormMask value in X86MCCodeEmitter!");
case X86II::Pseudo:
llvm_unreachable("Pseudo instruction shouldn't be emitted");
- case X86II::RawFrmDstSrc: {
- unsigned siReg = MI.getOperand(1).getReg();
- assert(((siReg == X86::SI && MI.getOperand(0).getReg() == X86::DI) ||
- (siReg == X86::ESI && MI.getOperand(0).getReg() == X86::EDI) ||
- (siReg == X86::RSI && MI.getOperand(0).getReg() == X86::RDI)) &&
- "SI and DI register sizes do not match");
- // Emit segment override opcode prefix as needed (not for %ds).
- if (MI.getOperand(2).getReg() != X86::DS)
- EmitSegmentOverridePrefix(CurByte, 2, MI, OS);
- // Emit AdSize prefix as needed.
- if ((!is32BitMode(STI) && siReg == X86::ESI) ||
- (is32BitMode(STI) && siReg == X86::SI))
- EmitByte(0x67, CurByte, OS);
- CurOp += 3; // Consume operands.
- EmitByte(BaseOpcode, CurByte, OS);
- break;
- }
- case X86II::RawFrmSrc: {
- unsigned siReg = MI.getOperand(0).getReg();
- // Emit segment override opcode prefix as needed (not for %ds).
- if (MI.getOperand(1).getReg() != X86::DS)
- EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
- // Emit AdSize prefix as needed.
- if ((!is32BitMode(STI) && siReg == X86::ESI) ||
- (is32BitMode(STI) && siReg == X86::SI))
- EmitByte(0x67, CurByte, OS);
- CurOp += 2; // Consume operands.
- EmitByte(BaseOpcode, CurByte, OS);
+ case X86II::RawFrmDstSrc:
+ case X86II::RawFrmSrc:
+ case X86II::RawFrmDst:
+ emitByte(BaseOpcode, CurByte, OS);
break;
- }
- case X86II::RawFrmDst: {
- unsigned siReg = MI.getOperand(0).getReg();
- // Emit AdSize prefix as needed.
- if ((!is32BitMode(STI) && siReg == X86::EDI) ||
- (is32BitMode(STI) && siReg == X86::DI))
- EmitByte(0x67, CurByte, OS);
- ++CurOp; // Consume operand.
- EmitByte(BaseOpcode, CurByte, OS);
- break;
- }
case X86II::AddCCFrm: {
// This will be added to the opcode in the fallthrough.
OpcodeOffset = MI.getOperand(NumOps - 1).getImm();
@@ -1346,49 +1431,47 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
--NumOps; // Drop the operand from the end.
LLVM_FALLTHROUGH;
case X86II::RawFrm:
- EmitByte(BaseOpcode + OpcodeOffset, CurByte, OS);
+ emitByte(BaseOpcode + OpcodeOffset, CurByte, OS);
- if (!is64BitMode(STI) || !isPCRel32Branch(MI))
+ if (!STI.hasFeature(X86::Mode64Bit) || !isPCRel32Branch(MI, MCII))
break;
const MCOperand &Op = MI.getOperand(CurOp++);
- EmitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags),
+ emitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags),
MCFixupKind(X86::reloc_branch_4byte_pcrel), CurByte, OS,
Fixups);
break;
}
case X86II::RawFrmMemOffs:
- // Emit segment override opcode prefix as needed.
- EmitSegmentOverridePrefix(CurByte, 1, MI, OS);
- EmitByte(BaseOpcode, CurByte, OS);
- EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ emitByte(BaseOpcode, CurByte, OS);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
CurByte, OS, Fixups);
++CurOp; // skip segment operand
break;
case X86II::RawFrmImm8:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ emitByte(BaseOpcode, CurByte, OS);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
CurByte, OS, Fixups);
- EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
OS, Fixups);
break;
case X86II::RawFrmImm16:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ emitByte(BaseOpcode, CurByte, OS);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
CurByte, OS, Fixups);
- EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
OS, Fixups);
break;
case X86II::AddRegFrm:
- EmitByte(BaseOpcode + GetX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+ emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
break;
case X86II::MRMDestReg: {
- EmitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
unsigned SrcRegNum = CurOp + 1;
if (HasEVEX_K) // Skip writemask
@@ -1397,13 +1480,13 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
- EmitRegModRMByte(MI.getOperand(CurOp),
- GetX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
+ emitRegModRMByte(MI.getOperand(CurOp),
+ getX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
CurOp = SrcRegNum + 1;
break;
}
case X86II::MRMDestMem: {
- EmitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
if (HasEVEX_K) // Skip writemask
@@ -1412,13 +1495,13 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
- emitMemModRMByte(MI, CurOp, GetX86RegNum(MI.getOperand(SrcRegNum)), TSFlags,
+ emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(SrcRegNum)), TSFlags,
Rex, CurByte, OS, Fixups, STI);
CurOp = SrcRegNum + 1;
break;
}
case X86II::MRMSrcReg: {
- EmitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
unsigned SrcRegNum = CurOp + 1;
if (HasEVEX_K) // Skip writemask
@@ -1427,8 +1510,8 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
- EmitRegModRMByte(MI.getOperand(SrcRegNum),
- GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ emitRegModRMByte(MI.getOperand(SrcRegNum),
+ getX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
CurOp = SrcRegNum + 1;
if (HasVEX_I8Reg)
I8RegNum = getX86RegEncoding(MI, CurOp++);
@@ -1438,17 +1521,17 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
break;
}
case X86II::MRMSrcReg4VOp3: {
- EmitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
unsigned SrcRegNum = CurOp + 1;
- EmitRegModRMByte(MI.getOperand(SrcRegNum),
- GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ emitRegModRMByte(MI.getOperand(SrcRegNum),
+ getX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
CurOp = SrcRegNum + 1;
++CurOp; // Encoded in VEX.VVVV
break;
}
case X86II::MRMSrcRegOp4: {
- EmitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
unsigned SrcRegNum = CurOp + 1;
// Skip 1st src (which is encoded in VEX_VVVV)
@@ -1458,8 +1541,8 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
I8RegNum = getX86RegEncoding(MI, SrcRegNum++);
- EmitRegModRMByte(MI.getOperand(SrcRegNum),
- GetX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ emitRegModRMByte(MI.getOperand(SrcRegNum),
+ getX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
CurOp = SrcRegNum + 1;
break;
}
@@ -1468,24 +1551,24 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
unsigned SecondOp = CurOp++;
unsigned CC = MI.getOperand(CurOp++).getImm();
- EmitByte(BaseOpcode + CC, CurByte, OS);
+ emitByte(BaseOpcode + CC, CurByte, OS);
- EmitRegModRMByte(MI.getOperand(SecondOp),
- GetX86RegNum(MI.getOperand(FirstOp)), CurByte, OS);
+ emitRegModRMByte(MI.getOperand(SecondOp),
+ getX86RegNum(MI.getOperand(FirstOp)), CurByte, OS);
break;
}
case X86II::MRMSrcMem: {
- unsigned FirstMemOp = CurOp+1;
+ unsigned FirstMemOp = CurOp + 1;
if (HasEVEX_K) // Skip writemask
++FirstMemOp;
if (HasVEX_4V)
- ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
+ ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
- EmitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
- emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+ emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
TSFlags, Rex, CurByte, OS, Fixups, STI);
CurOp = FirstMemOp + X86::AddrNumOperands;
if (HasVEX_I8Reg)
@@ -1493,28 +1576,28 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
break;
}
case X86II::MRMSrcMem4VOp3: {
- unsigned FirstMemOp = CurOp+1;
+ unsigned FirstMemOp = CurOp + 1;
- EmitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
- emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+ emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
TSFlags, Rex, CurByte, OS, Fixups, STI);
CurOp = FirstMemOp + X86::AddrNumOperands;
++CurOp; // Encoded in VEX.VVVV.
break;
}
case X86II::MRMSrcMemOp4: {
- unsigned FirstMemOp = CurOp+1;
+ unsigned FirstMemOp = CurOp + 1;
- ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
+ ++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
// Capture second register source (encoded in Imm[7:4])
assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
I8RegNum = getX86RegEncoding(MI, FirstMemOp++);
- EmitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
- emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(CurOp)),
+ emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
TSFlags, Rex, CurByte, OS, Fixups, STI);
CurOp = FirstMemOp + X86::AddrNumOperands;
break;
@@ -1525,9 +1608,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = FirstMemOp + X86::AddrNumOperands;
unsigned CC = MI.getOperand(CurOp++).getImm();
- EmitByte(BaseOpcode + CC, CurByte, OS);
+ emitByte(BaseOpcode + CC, CurByte, OS);
- emitMemModRMByte(MI, FirstMemOp, GetX86RegNum(MI.getOperand(RegOp)),
+ emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(RegOp)),
TSFlags, Rex, CurByte, OS, Fixups, STI);
break;
}
@@ -1536,24 +1619,28 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
unsigned RegOp = CurOp++;
unsigned CC = MI.getOperand(CurOp++).getImm();
- EmitByte(BaseOpcode + CC, CurByte, OS);
- EmitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS);
+ emitByte(BaseOpcode + CC, CurByte, OS);
+ emitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS);
break;
}
case X86II::MRMXr:
- case X86II::MRM0r: case X86II::MRM1r:
- case X86II::MRM2r: case X86II::MRM3r:
- case X86II::MRM4r: case X86II::MRM5r:
- case X86II::MRM6r: case X86II::MRM7r:
+ case X86II::MRM0r:
+ case X86II::MRM1r:
+ case X86II::MRM2r:
+ case X86II::MRM3r:
+ case X86II::MRM4r:
+ case X86II::MRM5r:
+ case X86II::MRM6r:
+ case X86II::MRM7r:
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
if (HasEVEX_K) // Skip writemask
++CurOp;
- EmitByte(BaseOpcode, CurByte, OS);
- EmitRegModRMByte(MI.getOperand(CurOp++),
- (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
- CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
+ emitRegModRMByte(MI.getOperand(CurOp++),
+ (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, CurByte,
+ OS);
break;
case X86II::MRMXmCC: {
@@ -1561,52 +1648,98 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = FirstMemOp + X86::AddrNumOperands;
unsigned CC = MI.getOperand(CurOp++).getImm();
- EmitByte(BaseOpcode + CC, CurByte, OS);
+ emitByte(BaseOpcode + CC, CurByte, OS);
emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, Rex, CurByte, OS, Fixups, STI);
break;
}
case X86II::MRMXm:
- case X86II::MRM0m: case X86II::MRM1m:
- case X86II::MRM2m: case X86II::MRM3m:
- case X86II::MRM4m: case X86II::MRM5m:
- case X86II::MRM6m: case X86II::MRM7m:
+ case X86II::MRM0m:
+ case X86II::MRM1m:
+ case X86II::MRM2m:
+ case X86II::MRM3m:
+ case X86II::MRM4m:
+ case X86II::MRM5m:
+ case X86II::MRM6m:
+ case X86II::MRM7m:
if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
++CurOp;
if (HasEVEX_K) // Skip writemask
++CurOp;
- EmitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
emitMemModRMByte(MI, CurOp,
(Form == X86II::MRMXm) ? 0 : Form - X86II::MRM0m, TSFlags,
Rex, CurByte, OS, Fixups, STI);
CurOp += X86::AddrNumOperands;
break;
- case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
- case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
- case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
- case X86II::MRM_C9: case X86II::MRM_CA: case X86II::MRM_CB:
- case X86II::MRM_CC: case X86II::MRM_CD: case X86II::MRM_CE:
- case X86II::MRM_CF: case X86II::MRM_D0: case X86II::MRM_D1:
- case X86II::MRM_D2: case X86II::MRM_D3: case X86II::MRM_D4:
- case X86II::MRM_D5: case X86II::MRM_D6: case X86II::MRM_D7:
- case X86II::MRM_D8: case X86II::MRM_D9: case X86II::MRM_DA:
- case X86II::MRM_DB: case X86II::MRM_DC: case X86II::MRM_DD:
- case X86II::MRM_DE: case X86II::MRM_DF: case X86II::MRM_E0:
- case X86II::MRM_E1: case X86II::MRM_E2: case X86II::MRM_E3:
- case X86II::MRM_E4: case X86II::MRM_E5: case X86II::MRM_E6:
- case X86II::MRM_E7: case X86II::MRM_E8: case X86II::MRM_E9:
- case X86II::MRM_EA: case X86II::MRM_EB: case X86II::MRM_EC:
- case X86II::MRM_ED: case X86II::MRM_EE: case X86II::MRM_EF:
- case X86II::MRM_F0: case X86II::MRM_F1: case X86II::MRM_F2:
- case X86II::MRM_F3: case X86II::MRM_F4: case X86II::MRM_F5:
- case X86II::MRM_F6: case X86II::MRM_F7: case X86II::MRM_F8:
- case X86II::MRM_F9: case X86II::MRM_FA: case X86II::MRM_FB:
- case X86II::MRM_FC: case X86II::MRM_FD: case X86II::MRM_FE:
+ case X86II::MRM_C0:
+ case X86II::MRM_C1:
+ case X86II::MRM_C2:
+ case X86II::MRM_C3:
+ case X86II::MRM_C4:
+ case X86II::MRM_C5:
+ case X86II::MRM_C6:
+ case X86II::MRM_C7:
+ case X86II::MRM_C8:
+ case X86II::MRM_C9:
+ case X86II::MRM_CA:
+ case X86II::MRM_CB:
+ case X86II::MRM_CC:
+ case X86II::MRM_CD:
+ case X86II::MRM_CE:
+ case X86II::MRM_CF:
+ case X86II::MRM_D0:
+ case X86II::MRM_D1:
+ case X86II::MRM_D2:
+ case X86II::MRM_D3:
+ case X86II::MRM_D4:
+ case X86II::MRM_D5:
+ case X86II::MRM_D6:
+ case X86II::MRM_D7:
+ case X86II::MRM_D8:
+ case X86II::MRM_D9:
+ case X86II::MRM_DA:
+ case X86II::MRM_DB:
+ case X86II::MRM_DC:
+ case X86II::MRM_DD:
+ case X86II::MRM_DE:
+ case X86II::MRM_DF:
+ case X86II::MRM_E0:
+ case X86II::MRM_E1:
+ case X86II::MRM_E2:
+ case X86II::MRM_E3:
+ case X86II::MRM_E4:
+ case X86II::MRM_E5:
+ case X86II::MRM_E6:
+ case X86II::MRM_E7:
+ case X86II::MRM_E8:
+ case X86II::MRM_E9:
+ case X86II::MRM_EA:
+ case X86II::MRM_EB:
+ case X86II::MRM_EC:
+ case X86II::MRM_ED:
+ case X86II::MRM_EE:
+ case X86II::MRM_EF:
+ case X86II::MRM_F0:
+ case X86II::MRM_F1:
+ case X86II::MRM_F2:
+ case X86II::MRM_F3:
+ case X86II::MRM_F4:
+ case X86II::MRM_F5:
+ case X86II::MRM_F6:
+ case X86II::MRM_F7:
+ case X86II::MRM_F8:
+ case X86II::MRM_F9:
+ case X86II::MRM_FA:
+ case X86II::MRM_FB:
+ case X86II::MRM_FC:
+ case X86II::MRM_FD:
+ case X86II::MRM_FE:
case X86II::MRM_FF:
- EmitByte(BaseOpcode, CurByte, OS);
- EmitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS);
+ emitByte(BaseOpcode, CurByte, OS);
+ emitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS);
break;
}
@@ -1620,21 +1753,21 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
assert(Val < 16 && "Immediate operand value out of range");
I8RegNum |= Val;
}
- EmitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1,
+ emitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1,
CurByte, OS, Fixups);
} else {
// If there is a remaining operand, it must be a trailing immediate. Emit it
// according to the right size for the instruction. Some instructions
// (SSE4a extrq and insertq) have two trailing immediates.
while (CurOp != NumOps && NumOps - CurOp <= 2) {
- EmitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
CurByte, OS, Fixups);
}
}
if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
- EmitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+ emitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
#ifndef NDEBUG
// FIXME: Verify.
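For illustration (not part of the patch): the MRM*r and MRM_C0..MRM_FF cases above all come down to assembling, or hard-coding, a ModRM byte. A minimal standalone C++ sketch of that encoding, with made-up field values:

#include <cstdint>
#include <cstdio>

// Assemble an x86 ModRM byte: mod (2 bits) | reg/opcode (3 bits) | r/m (3 bits).
static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  return uint8_t((Mod << 6) | ((RegOpcode & 7) << 3) | (RM & 7));
}

int main() {
  // Register-direct form (mod == 0b11), as emitted for the MRM*r cases above.
  std::printf("reg-direct ModRM: 0x%02X\n", (unsigned)modRMByte(3, /*RegOpcode=*/2, /*RM=*/1));
  // A fixed-form instruction just emits a constant in the 0xC0..0xFF range,
  // which is what the MRM_C0..MRM_FF cases compute as 0xC0 + (Form - MRM_C0).
  unsigned FormIndex = 9; // hypothetical stand-in for Form - X86II::MRM_C0
  std::printf("fixed-form byte:  0x%02X\n", (unsigned)(0xC0 + FormIndex));
  return 0;
}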
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index ced9eacc8b97..049a3a815984 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -290,12 +290,9 @@ void X86_MC::initLLVMToSEHAndCVRegMapping(MCRegisterInfo *MRI) {
MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
std::string ArchFS = X86_MC::ParseX86Triple(TT);
- if (!FS.empty()) {
- if (!ArchFS.empty())
- ArchFS = (Twine(ArchFS) + "," + FS).str();
- else
- ArchFS = FS;
- }
+ assert(!ArchFS.empty() && "Failed to parse X86 triple");
+ if (!FS.empty())
+ ArchFS = (Twine(ArchFS) + "," + FS).str();
std::string CPUName = CPU;
if (CPUName.empty())
@@ -323,7 +320,8 @@ static MCRegisterInfo *createX86MCRegisterInfo(const Triple &TT) {
}
static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
- const Triple &TheTriple) {
+ const Triple &TheTriple,
+ const MCTargetOptions &Options) {
bool is64Bit = TheTriple.getArch() == Triple::x86_64;
MCAsmInfo *MAI;
@@ -554,7 +552,7 @@ static MCInstrAnalysis *createX86MCInstrAnalysis(const MCInstrInfo *Info) {
}
// Force static initialization.
-extern "C" void LLVMInitializeX86TargetMC() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86TargetMC() {
for (Target *T : {&getTheX86_32Target(), &getTheX86_64Target()}) {
// Register the MC asm info.
RegisterMCAsmInfoFn X(*T, createX86MCAsmInfo);
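For illustration (not part of the patch): the ArchFS/FS handling above simply joins two comma-separated feature strings, with the triple-derived part now asserted to be non-empty. A rough sketch of that joining logic (the function name here is invented for the example):

#include <iostream>
#include <string>

// Join a triple-derived feature string with a user-supplied one; mirrors the
// ArchFS/FS handling above but is not the LLVM API.
static std::string joinFeatures(const std::string &ArchFS, const std::string &FS) {
  if (FS.empty())
    return ArchFS;
  return ArchFS.empty() ? FS : ArchFS + "," + FS;
}

int main() {
  std::cout << joinFeatures("+64bit-mode", "+sse4.2,+popcnt") << "\n";
  // -> +64bit-mode,+sse4.2,+popcnt
}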
diff --git a/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp b/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
index 47c41626a666..18cda8f591c3 100644
--- a/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
+++ b/llvm/lib/Target/X86/TargetInfo/X86TargetInfo.cpp
@@ -19,7 +19,7 @@ Target &llvm::getTheX86_64Target() {
return TheX86_64Target;
}
-extern "C" void LLVMInitializeX86TargetInfo() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86TargetInfo() {
RegisterTarget<Triple::x86, /*HasJIT=*/true> X(
getTheX86_32Target(), "x86", "32-bit X86: Pentium-Pro and above", "X86");
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 6840fc12751d..0481a40d462a 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -150,6 +150,18 @@ void initializeX86ExpandPseudoPass(PassRegistry &);
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
void initializeX86OptimizeLEAPassPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
+
+namespace X86AS {
+enum : unsigned {
+ GS = 256,
+ FS = 257,
+ SS = 258,
+ PTR32_SPTR = 270,
+ PTR32_UPTR = 271,
+ PTR64 = 272
+};
+} // End X86AS namespace
+
} // End llvm namespace
#endif
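For illustration (not part of the patch): the new X86AS constants give the segment-relative and mixed-pointer address spaces fixed numbers. A small sketch of how such a number might be classified (describe() is invented for the example; the sign/zero-extension wording for PTR32_SPTR/PTR32_UPTR is an assumption based on the usual __sptr/__uptr semantics):

#include <cstdio>

// Local mirror of the X86AS constants introduced above, for the example only.
namespace X86AS {
enum : unsigned { GS = 256, FS = 257, SS = 258,
                  PTR32_SPTR = 270, PTR32_UPTR = 271, PTR64 = 272 };
}

static const char *describe(unsigned AS) {
  switch (AS) {
  case X86AS::GS:         return "gs-relative";
  case X86AS::FS:         return "fs-relative";
  case X86AS::SS:         return "ss-relative";
  case X86AS::PTR32_SPTR: return "32-bit sign-extended pointer";
  case X86AS::PTR32_UPTR: return "32-bit zero-extended pointer";
  case X86AS::PTR64:      return "64-bit pointer";
  default:                return "default address space";
  }
}

int main() {
  std::printf("%u -> %s\n", 257u, describe(257));
}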
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index d8631aca2734..a2b11d55f650 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -304,12 +304,12 @@ def FeatureFastVariableShuffle
: SubtargetFeature<"fast-variable-shuffle",
"HasFastVariableShuffle",
"true", "Shuffles with variable masks are fast">;
-// On some X86 processors, there is no performance hazard to writing only the
-// lower parts of a YMM or ZMM register without clearing the upper part.
-def FeatureFastPartialYMMorZMMWrite
- : SubtargetFeature<"fast-partial-ymm-or-zmm-write",
- "HasFastPartialYMMorZMMWrite",
- "true", "Partial writes to YMM/ZMM registers are fast">;
+// On some X86 processors, a vzeroupper instruction should be inserted after
+// using ymm/zmm registers before executing code that may use SSE instructions.
+def FeatureInsertVZEROUPPER
+ : SubtargetFeature<"vzeroupper",
+ "InsertVZEROUPPER",
+ "true", "Should insert vzeroupper instructions">;
// FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
// than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
// vector FSQRT has higher throughput than the corresponding NR code.
@@ -386,6 +386,10 @@ def FeaturePrefer256Bit
: SubtargetFeature<"prefer-256-bit", "Prefer256Bit", "true",
"Prefer 256-bit AVX instructions">;
+def FeaturePreferMaskRegisters
+ : SubtargetFeature<"prefer-mask-registers", "PreferMaskRegisters", "true",
+ "Prefer AVX512 mask registers over PTEST/MOVMSK">;
+
// Lower indirect calls using a special construct called a `retpoline` to
// mitigate potential Spectre v2 attacks against them.
def FeatureRetpolineIndirectCalls
@@ -439,7 +443,7 @@ def FeatureFastHorizontalOps
: SubtargetFeature<
"fast-hops", "HasFastHorizontalOps", "true",
"Prefer horizontal vector math instructions (haddp, phsub, etc.) over "
- "normal vector instructions with shuffles", [FeatureSSE3]>;
+ "normal vector instructions with shuffles">;
def FeatureFastScalarShiftMasks
: SubtargetFeature<
@@ -451,6 +455,10 @@ def FeatureFastVectorShiftMasks
"fast-vector-shift-masks", "HasFastVectorShiftMasks", "true",
"Prefer a left/right vector logical shift pair over a shift+and pair">;
+def FeatureUseGLMDivSqrtCosts
+ : SubtargetFeature<"use-glm-div-sqrt-costs", "UseGLMDivSqrtCosts", "true",
+ "Use Goldmont specific floating point div/sqrt costs">;
+
// Merge branches using three-way conditional code.
def FeatureMergeToThreeWayBranch : SubtargetFeature<"merge-to-threeway-branch",
"ThreewayBranchProfitable", "true",
@@ -465,12 +473,6 @@ def FeatureUseAA : SubtargetFeature<"use-aa", "UseAA", "true",
def ProcIntelAtom : SubtargetFeature<"", "X86ProcFamily", "IntelAtom", "">;
// Silvermont
def ProcIntelSLM : SubtargetFeature<"", "X86ProcFamily", "IntelSLM", "">;
-// Goldmont
-def ProcIntelGLM : SubtargetFeature<"", "X86ProcFamily", "IntelGLM", "">;
-// Goldmont Plus
-def ProcIntelGLP : SubtargetFeature<"", "X86ProcFamily", "IntelGLP", "">;
-// Tremont
-def ProcIntelTRM : SubtargetFeature<"", "X86ProcFamily", "IntelTRM", "">;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -499,6 +501,7 @@ include "X86SchedHaswell.td"
include "X86SchedBroadwell.td"
include "X86ScheduleSLM.td"
include "X86ScheduleZnver1.td"
+include "X86ScheduleZnver2.td"
include "X86ScheduleBdVer2.td"
include "X86ScheduleBtVer2.td"
include "X86SchedSkylakeClient.td"
@@ -521,7 +524,8 @@ def ProcessorFeatures {
FeatureCMPXCHG16B,
FeaturePOPCNT,
FeatureLAHFSAHF,
- FeatureMacroFusion];
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> NHMSpecificFeatures = [];
list<SubtargetFeature> NHMFeatures =
!listconcat(NHMInheritableFeatures, NHMSpecificFeatures);
@@ -701,7 +705,8 @@ def ProcessorFeatures {
FeatureCMPXCHG16B,
FeatureMOVBE,
FeatureSlowTwoMemOps,
- FeatureLAHFSAHF];
+ FeatureLAHFSAHF,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> AtomSpecificFeatures = [ProcIntelAtom,
FeatureSlowUAMem16,
FeatureLEAForSP,
@@ -739,7 +744,7 @@ def ProcessorFeatures {
FeatureXSAVES,
FeatureCLFLUSHOPT,
FeatureFSGSBase];
- list<SubtargetFeature> GLMSpecificFeatures = [ProcIntelGLM,
+ list<SubtargetFeature> GLMSpecificFeatures = [FeatureUseGLMDivSqrtCosts,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> GLMInheritableFeatures =
!listconcat(SLMInheritableFeatures, GLMAdditionalFeatures);
@@ -750,7 +755,7 @@ def ProcessorFeatures {
list<SubtargetFeature> GLPAdditionalFeatures = [FeaturePTWRITE,
FeatureRDPID,
FeatureSGX];
- list<SubtargetFeature> GLPSpecificFeatures = [ProcIntelGLP];
+ list<SubtargetFeature> GLPSpecificFeatures = [FeatureUseGLMDivSqrtCosts];
list<SubtargetFeature> GLPInheritableFeatures =
!listconcat(GLMInheritableFeatures, GLPAdditionalFeatures);
list<SubtargetFeature> GLPFeatures =
@@ -762,7 +767,7 @@ def ProcessorFeatures {
FeatureMOVDIRI,
FeatureMOVDIR64B,
FeatureWAITPKG];
- list<SubtargetFeature> TRMSpecificFeatures = [ProcIntelTRM];
+ list<SubtargetFeature> TRMSpecificFeatures = [FeatureUseGLMDivSqrtCosts];
list<SubtargetFeature> TRMFeatures =
!listconcat(GLPInheritableFeatures, TRMAdditionalFeatures,
TRMSpecificFeatures);
@@ -801,8 +806,8 @@ def ProcessorFeatures {
FeatureBMI2,
FeatureFMA,
FeaturePRFCHW,
+ FeaturePreferMaskRegisters,
FeatureSlowTwoMemOps,
- FeatureFastPartialYMMorZMMWrite,
FeatureHasFastGather,
FeatureSlowPMADDWD];
// TODO Add AVX5124FMAPS/AVX5124VNNIW features
@@ -823,7 +828,8 @@ def ProcessorFeatures {
FeatureLAHFSAHF,
FeatureCMOV,
Feature64Bit,
- FeatureFastScalarShiftMasks];
+ FeatureFastScalarShiftMasks,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> BarcelonaFeatures = BarcelonaInheritableFeatures;
// Bobcat
@@ -845,7 +851,9 @@ def ProcessorFeatures {
FeatureFast15ByteNOP,
FeatureFastScalarShiftMasks,
FeatureFastVectorShiftMasks];
- list<SubtargetFeature> BtVer1Features = BtVer1InheritableFeatures;
+ list<SubtargetFeature> BtVer1SpecificFeatures = [FeatureInsertVZEROUPPER];
+ list<SubtargetFeature> BtVer1Features =
+ !listconcat(BtVer1InheritableFeatures, BtVer1SpecificFeatures);
// Jaguar
list<SubtargetFeature> BtVer2AdditionalFeatures = [FeatureAVX,
@@ -858,7 +866,6 @@ def ProcessorFeatures {
FeatureXSAVEOPT];
list<SubtargetFeature> BtVer2SpecificFeatures = [FeatureFastLZCNT,
FeatureFastBEXTR,
- FeatureFastPartialYMMorZMMWrite,
FeatureFastHorizontalOps];
list<SubtargetFeature> BtVer2InheritableFeatures =
!listconcat(BtVer1InheritableFeatures, BtVer2AdditionalFeatures);
@@ -886,7 +893,8 @@ def ProcessorFeatures {
FeatureLAHFSAHF,
FeatureFast11ByteNOP,
FeatureFastScalarShiftMasks,
- FeatureBranchFusion];
+ FeatureBranchFusion,
+ FeatureInsertVZEROUPPER];
list<SubtargetFeature> BdVer1Features = BdVer1InheritableFeatures;
// PileDriver
@@ -949,6 +957,7 @@ def ProcessorFeatures {
FeatureSHA,
FeatureSSE4A,
FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER,
FeatureX87,
FeatureXSAVE,
FeatureXSAVEC,
@@ -971,28 +980,32 @@ class Proc<string Name, list<SubtargetFeature> Features>
// NOTE: CMPXCHG8B is here for legacy compatbility so that it is only disabled
// if i386/i486 is specifically requested.
def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B]>;
-def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16]>;
-def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16]>;
+ FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
+def : Proc<"i386", [FeatureX87, FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER]>;
+def : Proc<"i486", [FeatureX87, FeatureSlowUAMem16,
+ FeatureInsertVZEROUPPER]>;
def : Proc<"i586", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B]>;
+ FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B]>;
+ FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium-mmx", [FeatureX87, FeatureSlowUAMem16,
- FeatureCMPXCHG8B, FeatureMMX]>;
+ FeatureCMPXCHG8B, FeatureMMX,
+ FeatureInsertVZEROUPPER]>;
def : Proc<"i686", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
def : Proc<"pentiumpro", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureCMOV, FeatureNOPL]>;
+ FeatureCMOV, FeatureNOPL, FeatureInsertVZEROUPPER]>;
def : Proc<"pentium2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureCMOV, FeatureFXSR,
- FeatureNOPL]>;
+ FeatureNOPL, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium3", "pentium3m"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,FeatureMMX,
- FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV]>;
+ FeatureSSE1, FeatureFXSR, FeatureNOPL, FeatureCMOV,
+ FeatureInsertVZEROUPPER]>;
}
// Enable the PostRAScheduler for SSE2 and SSE3 class cpus.
@@ -1008,29 +1021,29 @@ foreach P = ["pentium3", "pentium3m"] in {
def : ProcessorModel<"pentium-m", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
foreach P = ["pentium4", "pentium4m"] in {
def : ProcessorModel<P, GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE2, FeatureFXSR, FeatureNOPL,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
}
// Intel Quark.
-def : Proc<"lakemont", []>;
+def : Proc<"lakemont", [FeatureInsertVZEROUPPER]>;
// Intel Core Duo.
def : ProcessorModel<"yonah", SandyBridgeModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
// NetBurst.
def : ProcessorModel<"prescott", GenericPostRAModel,
[FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE3, FeatureFXSR, FeatureNOPL,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureX87,
FeatureSlowUAMem16,
@@ -1041,7 +1054,8 @@ def : ProcessorModel<"nocona", GenericPostRAModel, [
FeatureFXSR,
FeatureNOPL,
Feature64Bit,
- FeatureCMPXCHG16B
+ FeatureCMPXCHG16B,
+ FeatureInsertVZEROUPPER
]>;
// Intel Core 2 Solo/Duo.
@@ -1057,7 +1071,8 @@ def : ProcessorModel<"core2", SandyBridgeModel, [
Feature64Bit,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
- FeatureMacroFusion
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER
]>;
def : ProcessorModel<"penryn", SandyBridgeModel, [
FeatureX87,
@@ -1071,7 +1086,8 @@ def : ProcessorModel<"penryn", SandyBridgeModel, [
Feature64Bit,
FeatureCMPXCHG16B,
FeatureLAHFSAHF,
- FeatureMacroFusion
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER
]>;
// Atom CPUs.
@@ -1138,35 +1154,36 @@ def : ProcessorModel<"tigerlake", SkylakeServerModel,
// AMD CPUs.
def : Proc<"k6", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- FeatureMMX]>;
+ FeatureMMX, FeatureInsertVZEROUPPER]>;
def : Proc<"k6-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- Feature3DNow]>;
+ Feature3DNow, FeatureInsertVZEROUPPER]>;
def : Proc<"k6-3", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- Feature3DNow]>;
+ Feature3DNow, FeatureInsertVZEROUPPER]>;
foreach P = ["athlon", "athlon-tbird"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
- Feature3DNowA, FeatureNOPL, FeatureSlowSHLD]>;
+ Feature3DNowA, FeatureNOPL, FeatureSlowSHLD,
+ FeatureInsertVZEROUPPER]>;
}
foreach P = ["athlon-4", "athlon-xp", "athlon-mp"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureCMOV,
FeatureSSE1, Feature3DNowA, FeatureFXSR, FeatureNOPL,
- FeatureSlowSHLD]>;
+ FeatureSlowSHLD, FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8", "opteron", "athlon64", "athlon-fx"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureSSE2, Feature3DNowA, FeatureFXSR, FeatureNOPL,
Feature64Bit, FeatureSlowSHLD, FeatureCMOV,
- FeatureFastScalarShiftMasks]>;
+ FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>;
}
foreach P = ["k8-sse3", "opteron-sse3", "athlon64-sse3"] in {
def : Proc<P, [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B, FeatureSSE3,
Feature3DNowA, FeatureFXSR, FeatureNOPL, FeatureCMPXCHG16B,
FeatureSlowSHLD, FeatureCMOV, Feature64Bit,
- FeatureFastScalarShiftMasks]>;
+ FeatureFastScalarShiftMasks, FeatureInsertVZEROUPPER]>;
}
foreach P = ["amdfam10", "barcelona"] in {
@@ -1188,17 +1205,20 @@ def : Proc<"bdver3", ProcessorFeatures.BdVer3Features>;
def : Proc<"bdver4", ProcessorFeatures.BdVer4Features>;
def : ProcessorModel<"znver1", Znver1Model, ProcessorFeatures.ZNFeatures>;
-def : ProcessorModel<"znver2", Znver1Model, ProcessorFeatures.ZN2Features>;
+def : ProcessorModel<"znver2", Znver2Model, ProcessorFeatures.ZN2Features>;
def : Proc<"geode", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
- Feature3DNowA]>;
-
-def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX]>;
-def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
-def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow]>;
+ Feature3DNowA, FeatureInsertVZEROUPPER]>;
+
+def : Proc<"winchip-c6", [FeatureX87, FeatureSlowUAMem16, FeatureMMX,
+ FeatureInsertVZEROUPPER]>;
+def : Proc<"winchip2", [FeatureX87, FeatureSlowUAMem16, Feature3DNow,
+ FeatureInsertVZEROUPPER]>;
+def : Proc<"c3", [FeatureX87, FeatureSlowUAMem16, Feature3DNow,
+ FeatureInsertVZEROUPPER]>;
def : Proc<"c3-2", [FeatureX87, FeatureSlowUAMem16, FeatureCMPXCHG8B,
FeatureMMX, FeatureSSE1, FeatureFXSR,
- FeatureCMOV]>;
+ FeatureCMOV, FeatureInsertVZEROUPPER]>;
// We also provide a generic 64-bit specific x86 processor model which tries to
// be good for modern chips without enabling instruction set encodings past the
@@ -1221,7 +1241,8 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
Feature64Bit,
FeatureSlow3OpsLEA,
FeatureSlowIncDec,
- FeatureMacroFusion
+ FeatureMacroFusion,
+ FeatureInsertVZEROUPPER
]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 8d27be30a277..39d16e7999cd 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -218,9 +218,16 @@ void X86AsmPrinter::PrintOperand(const MachineInstr *MI, unsigned OpNo,
O << MO.getImm();
return;
+ case MachineOperand::MO_ConstantPoolIndex:
case MachineOperand::MO_GlobalAddress: {
- if (IsATT)
+ switch (MI->getInlineAsmDialect()) {
+ case InlineAsm::AD_ATT:
O << '$';
+ break;
+ case InlineAsm::AD_Intel:
+ O << "offset ";
+ break;
+ }
PrintSymbolOperand(MO, O);
break;
}
@@ -336,14 +343,22 @@ void X86AsmPrinter::PrintMemReference(const MachineInstr *MI, unsigned OpNo,
PrintLeaMemReference(MI, OpNo, O, Modifier);
}
+
void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
- unsigned OpNo, raw_ostream &O) {
+ unsigned OpNo, raw_ostream &O,
+ const char *Modifier) {
const MachineOperand &BaseReg = MI->getOperand(OpNo + X86::AddrBaseReg);
unsigned ScaleVal = MI->getOperand(OpNo + X86::AddrScaleAmt).getImm();
const MachineOperand &IndexReg = MI->getOperand(OpNo + X86::AddrIndexReg);
const MachineOperand &DispSpec = MI->getOperand(OpNo + X86::AddrDisp);
const MachineOperand &SegReg = MI->getOperand(OpNo + X86::AddrSegmentReg);
+ // If we really don't want to print out (rip), don't.
+ bool HasBaseReg = BaseReg.getReg() != 0;
+ if (HasBaseReg && Modifier && !strcmp(Modifier, "no-rip") &&
+ BaseReg.getReg() == X86::RIP)
+ HasBaseReg = false;
+
// If this has a segment register, print it.
if (SegReg.getReg()) {
PrintOperand(MI, OpNo + X86::AddrSegmentReg, O);
@@ -353,7 +368,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
O << '[';
bool NeedPlus = false;
- if (BaseReg.getReg()) {
+ if (HasBaseReg) {
PrintOperand(MI, OpNo + X86::AddrBaseReg, O);
NeedPlus = true;
}
@@ -371,7 +386,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
PrintOperand(MI, OpNo + X86::AddrDisp, O);
} else {
int64_t DispVal = DispSpec.getImm();
- if (DispVal || (!IndexReg.getReg() && !BaseReg.getReg())) {
+ if (DispVal || (!IndexReg.getReg() && !HasBaseReg)) {
if (NeedPlus) {
if (DispVal > 0)
O << " + ";
@@ -524,11 +539,6 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode,
raw_ostream &O) {
- if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
- PrintIntelMemReference(MI, OpNo, O);
- return false;
- }
-
if (ExtraCode && ExtraCode[0]) {
if (ExtraCode[1] != 0) return true; // Unknown modifier.
@@ -542,14 +552,26 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
// These only apply to registers, ignore on mem.
break;
case 'H':
- PrintMemReference(MI, OpNo, O, "H");
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
+ return true; // Unsupported modifier in Intel inline assembly.
+ } else {
+ PrintMemReference(MI, OpNo, O, "H");
+ }
return false;
case 'P': // Don't print @PLT, but do print as memory.
- PrintMemReference(MI, OpNo, O, "no-rip");
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
+ PrintIntelMemReference(MI, OpNo, O, "no-rip");
+ } else {
+ PrintMemReference(MI, OpNo, O, "no-rip");
+ }
return false;
}
}
- PrintMemReference(MI, OpNo, O, nullptr);
+ if (MI->getInlineAsmDialect() == InlineAsm::AD_Intel) {
+ PrintIntelMemReference(MI, OpNo, O, nullptr);
+ } else {
+ PrintMemReference(MI, OpNo, O, nullptr);
+ }
return false;
}
@@ -614,7 +636,7 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
Feat00Flags |= 1;
}
- if (M.getModuleFlag("cfguardtable"))
+ if (M.getModuleFlag("cfguard"))
Feat00Flags |= 0x800; // Object is CFG-aware.
OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
@@ -727,7 +749,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
//===----------------------------------------------------------------------===//
// Force static initialization.
-extern "C" void LLVMInitializeX86AsmPrinter() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86AsmPrinter() {
RegisterAsmPrinter<X86AsmPrinter> X(getTheX86_32Target());
RegisterAsmPrinter<X86AsmPrinter> Y(getTheX86_64Target());
}
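For illustration (not part of the patch): the new Intel-syntax path above mirrors the AT&T one in honoring a "no-rip" modifier, i.e. a RIP base register is dropped from the printed memory operand. A standalone sketch of that decision (formatIntelMem and its parameters are made up for the example):

#include <cstdio>
#include <cstring>
#include <string>

// Format a simplified Intel-syntax memory operand, suppressing a RIP base
// when the "no-rip" modifier is requested.
static std::string formatIntelMem(const char *Base, long Disp, const char *Modifier) {
  bool HasBase = Base && std::strcmp(Base, "") != 0;
  if (HasBase && Modifier && !std::strcmp(Modifier, "no-rip") &&
      !std::strcmp(Base, "rip"))
    HasBase = false;

  std::string Out = "[";
  if (HasBase)
    Out += Base;
  if (Disp || !HasBase)
    Out += (HasBase ? " + " : "") + std::to_string(Disp);
  return Out + "]";
}

int main() {
  std::printf("%s\n", formatIntelMem("rip", 16, nullptr).c_str());  // [rip + 16]
  std::printf("%s\n", formatIntelMem("rip", 16, "no-rip").c_str()); // [16]
}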
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index a011310970b3..ee79401dc80d 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -26,7 +26,7 @@ class MCStreamer;
class MCSymbol;
class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
- const X86Subtarget *Subtarget;
+ const X86Subtarget *Subtarget = nullptr;
StackMaps SM;
FaultMaps FM;
std::unique_ptr<MCCodeEmitter> CodeEmitter;
@@ -60,7 +60,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
// to emit any necessary padding-NOPs.
void emitShadowPadding(MCStreamer &OutStreamer, const MCSubtargetInfo &STI);
private:
- const MachineFunction *MF;
+ const MachineFunction *MF = nullptr;
bool InShadow = false;
// RequiredShadowSize holds the length of the shadow specified in the most
@@ -112,7 +112,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
void PrintMemReference(const MachineInstr *MI, unsigned OpNo, raw_ostream &O,
const char *Modifier);
void PrintIntelMemReference(const MachineInstr *MI, unsigned OpNo,
- raw_ostream &O);
+ raw_ostream &O, const char *Modifier);
public:
X86AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index 69c6b3356cbb..0f1d4b51062e 100644
--- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -46,6 +46,7 @@
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
+#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
using namespace llvm;
@@ -83,13 +84,13 @@ public:
}
private:
- MachineRegisterInfo *MRI;
- const X86InstrInfo *TII;
- const X86RegisterInfo *TRI;
+ MachineRegisterInfo *MRI = nullptr;
+ const X86InstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
SmallVector<std::pair<MachineInstr *, MachineInstr *>, 2>
BlockedLoadsStoresPairs;
SmallVector<MachineInstr *, 2> ForRemoval;
- AliasAnalysis *AA;
+ AliasAnalysis *AA = nullptr;
/// Returns couples of Load then Store to memory which look
/// like a memcpy.
diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index ad7e32b4efc8..f8faa572dffc 100644
--- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -115,12 +115,12 @@ private:
StringRef getPassName() const override { return "X86 Optimize Call Frame"; }
- const X86InstrInfo *TII;
- const X86FrameLowering *TFL;
- const X86Subtarget *STI;
- MachineRegisterInfo *MRI;
- unsigned SlotSize;
- unsigned Log2SlotSize;
+ const X86InstrInfo *TII = nullptr;
+ const X86FrameLowering *TFL = nullptr;
+ const X86Subtarget *STI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ unsigned SlotSize = 0;
+ unsigned Log2SlotSize = 0;
};
} // end anonymous namespace
diff --git a/llvm/lib/Target/X86/X86CallLowering.cpp b/llvm/lib/Target/X86/X86CallLowering.cpp
index 7ee637cfd523..57bf799cf89c 100644
--- a/llvm/lib/Target/X86/X86CallLowering.cpp
+++ b/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -115,7 +115,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
MIRBuilder.buildConstant(OffsetReg, Offset);
Register AddrReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+ MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
return AddrReg;
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index 4c49d68bec99..db1aef2fd09d 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -346,6 +346,10 @@ def RetCC_X86_Win64_C : CallingConv<[
// The X86-Win64 calling convention always returns __m64 values in RAX.
CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+ // GCC returns FP values in RAX on Win64.
+ CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
+ CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>,
+
// Otherwise, everything is the same as 'normal' X86-64 C CC.
CCDelegateTo<RetCC_X86_64_C>
]>;
@@ -434,6 +438,7 @@ def RetCC_X86_32 : CallingConv<[
// If FastCC, use RetCC_X86_32_Fast.
CCIfCC<"CallingConv::Fast", CCDelegateTo<RetCC_X86_32_Fast>>,
CCIfCC<"CallingConv::Tail", CCDelegateTo<RetCC_X86_32_Fast>>,
+ // CFGuard_Check never returns a value so does not need a RetCC.
// If HiPE, use RetCC_X86_32_HiPE.
CCIfCC<"CallingConv::HiPE", CCDelegateTo<RetCC_X86_32_HiPE>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<RetCC_X86_32_VectorCall>>,
@@ -606,10 +611,12 @@ def CC_X86_Win64_C : CallingConv<[
// A SwiftError is passed in R12.
CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[R12]>>>,
+ // The 'CFGuardTarget' parameter, if any, is passed in RAX.
+ CCIfCFGuardTarget<CCAssignToReg<[RAX]>>,
+
// 128 bit vectors are passed by pointer
CCIfType<[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64], CCPassIndirect<i64>>,
-
// 256 bit vectors are passed by pointer
CCIfType<[v32i8, v16i16, v8i32, v4i64, v8f32, v4f64], CCPassIndirect<i64>>,
@@ -622,6 +629,16 @@ def CC_X86_Win64_C : CallingConv<[
// The first 4 MMX vector arguments are passed in GPRs.
CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+ // If SSE was disabled, pass FP values smaller than 64-bits as integers in
+ // GPRs or on the stack.
+ CCIfType<[f32], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i32>>>,
+ CCIfType<[f64], CCIfNotSubtarget<"hasSSE1()", CCBitConvertToType<i64>>>,
+
+ // The first 4 FP/Vector arguments are passed in XMM registers.
+ CCIfType<[f32, f64],
+ CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
+ [RCX , RDX , R8 , R9 ]>>,
+
// The first 4 integer arguments are passed in integer registers.
CCIfType<[i8 ], CCAssignToRegWithShadow<[CL , DL , R8B , R9B ],
[XMM0, XMM1, XMM2, XMM3]>>,
@@ -639,11 +656,6 @@ def CC_X86_Win64_C : CallingConv<[
CCIfType<[i64], CCAssignToRegWithShadow<[RCX , RDX , R8 , R9 ],
[XMM0, XMM1, XMM2, XMM3]>>,
- // The first 4 FP/Vector arguments are passed in XMM registers.
- CCIfType<[f32, f64, v16i8, v8i16, v4i32, v2i64, v4f32, v2f64],
- CCAssignToRegWithShadow<[XMM0, XMM1, XMM2, XMM3],
- [RCX , RDX , R8 , R9 ]>>,
-
// Integer/FP values get stored in stack slots that are 8 bytes in size and
// 8-byte aligned if there are no more registers to hold them.
CCIfType<[i8, i16, i32, i64, f32, f64], CCAssignToStack<8, 8>>
@@ -936,6 +948,12 @@ def CC_X86_32_FastCC : CallingConv<[
CCDelegateTo<CC_X86_32_Common>
]>;
+def CC_X86_Win32_CFGuard_Check : CallingConv<[
+ // The CFGuard check call takes exactly one integer argument
+ // (i.e. the target function address), which is passed in ECX.
+ CCIfType<[i32], CCAssignToReg<[ECX]>>
+]>;
+
def CC_X86_32_GHC : CallingConv<[
// Promote i8/i16 arguments to i32.
CCIfType<[i8, i16], CCPromoteToType<i32>>,
@@ -1000,6 +1018,7 @@ def CC_X86_32 : CallingConv<[
CCIfCC<"CallingConv::X86_FastCall", CCDelegateTo<CC_X86_32_FastCall>>,
CCIfCC<"CallingConv::X86_VectorCall", CCDelegateTo<CC_X86_Win32_VectorCall>>,
CCIfCC<"CallingConv::X86_ThisCall", CCDelegateTo<CC_X86_32_ThisCall>>,
+ CCIfCC<"CallingConv::CFGuard_Check", CCDelegateTo<CC_X86_Win32_CFGuard_Check>>,
CCIfCC<"CallingConv::Fast", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::Tail", CCDelegateTo<CC_X86_32_FastCC>>,
CCIfCC<"CallingConv::GHC", CCDelegateTo<CC_X86_32_GHC>>,
@@ -1136,7 +1155,9 @@ def CSR_64_HHVM : CalleeSavedRegs<(add R12)>;
// Register calling convention preserves few GPR and XMM8-15
def CSR_32_RegCall_NoSSE : CalleeSavedRegs<(add ESI, EDI, EBX, EBP, ESP)>;
def CSR_32_RegCall : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE,
- (sequence "XMM%u", 4, 7))>;
+ (sequence "XMM%u", 4, 7))>;
+def CSR_Win32_CFGuard_Check_NoSSE : CalleeSavedRegs<(add CSR_32_RegCall_NoSSE, ECX)>;
+def CSR_Win32_CFGuard_Check : CalleeSavedRegs<(add CSR_32_RegCall, ECX)>;
def CSR_Win64_RegCall_NoSSE : CalleeSavedRegs<(add RBX, RBP, RSP,
(sequence "R%u", 10, 15))>;
def CSR_Win64_RegCall : CalleeSavedRegs<(add CSR_Win64_RegCall_NoSSE,
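For illustration (not part of the patch): CCBitConvertToType<i32>/<i64> above means the f32/f64 value is passed or returned as its raw bit pattern in an integer register (or stack slot) when SSE registers are unavailable. A tiny sketch of that reinterpretation:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Reinterpret FP bits as same-width integers, which is what the bit-convert
// calling-convention actions above amount to at the value level.
static uint32_t bitsOf(float F)  { uint32_t I; std::memcpy(&I, &F, sizeof I); return I; }
static uint64_t bitsOf(double D) { uint64_t I; std::memcpy(&I, &D, sizeof I); return I; }

int main() {
  std::printf("f32 1.0 -> 0x%08X\n", (unsigned)bitsOf(1.0f));            // 0x3F800000
  std::printf("f64 1.0 -> 0x%016llX\n",
              (unsigned long long)bitsOf(1.0));                          // 0x3FF0000000000000
}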
diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp
index 5123853f5455..fe43bf4cbbce 100644
--- a/llvm/lib/Target/X86/X86CmovConversion.cpp
+++ b/llvm/lib/Target/X86/X86CmovConversion.cpp
@@ -61,6 +61,7 @@
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
#include "llvm/MC/MCSchedule.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -111,9 +112,9 @@ public:
static char ID;
private:
- MachineRegisterInfo *MRI;
- const TargetInstrInfo *TII;
- const TargetRegisterInfo *TRI;
+ MachineRegisterInfo *MRI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
TargetSchedModel TSchedModel;
/// List of consecutive CMOV instructions.
diff --git a/llvm/lib/Target/X86/X86CondBrFolding.cpp b/llvm/lib/Target/X86/X86CondBrFolding.cpp
index 1bf2d5ba7b8f..7ede94664bf6 100644
--- a/llvm/lib/Target/X86/X86CondBrFolding.cpp
+++ b/llvm/lib/Target/X86/X86CondBrFolding.cpp
@@ -115,8 +115,6 @@ private:
void optimizeCondBr(MachineBasicBlock &MBB,
SmallVectorImpl<MachineBasicBlock *> &BranchPath);
- void fixBranchProb(MachineBasicBlock *NextMBB, MachineBasicBlock *RootMBB,
- SmallVectorImpl<MachineBasicBlock *> &BranchPath);
void replaceBrDest(MachineBasicBlock *MBB, MachineBasicBlock *OrigDest,
MachineBasicBlock *NewDest);
void fixupModifiedCond(MachineBasicBlock *MBB);
diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp
index b4cf5cafbc6e..438b9fd8eebb 100644
--- a/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -373,9 +373,9 @@ public:
};
class X86DomainReassignment : public MachineFunctionPass {
- const X86Subtarget *STI;
- MachineRegisterInfo *MRI;
- const X86InstrInfo *TII;
+ const X86Subtarget *STI = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ const X86InstrInfo *TII = nullptr;
/// All edges that are included in some closure
DenseSet<unsigned> EnclosedEdges;
diff --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86EvexToVex.cpp
index 24c8e6d6f6eb..f1cf9b94c9e5 100755
--- a/llvm/lib/Target/X86/X86EvexToVex.cpp
+++ b/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -84,7 +84,7 @@ public:
private:
/// Machine instruction info used throughout the class.
- const X86InstrInfo *TII;
+ const X86InstrInfo *TII = nullptr;
};
} // end anonymous namespace
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index 9126a1fbea52..d35d65914b34 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -41,11 +41,11 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
- const X86Subtarget *STI;
- const X86InstrInfo *TII;
- const X86RegisterInfo *TRI;
- const X86MachineFunctionInfo *X86FI;
- const X86FrameLowering *X86FL;
+ const X86Subtarget *STI = nullptr;
+ const X86InstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
+ const X86MachineFunctionInfo *X86FI = nullptr;
+ const X86FrameLowering *X86FL = nullptr;
bool runOnMachineFunction(MachineFunction &Fn) override;
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index e5e089d07d55..1dbf40683564 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -35,6 +35,7 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Operator.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCSymbol.h"
@@ -3218,6 +3219,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
case CallingConv::X86_ThisCall:
case CallingConv::Win64:
case CallingConv::X86_64_SysV:
+ case CallingConv::CFGuard_Check:
break;
}
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index 9f7c4afde760..f8c4a2adb851 100644
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -48,11 +48,14 @@
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/Debug.h"
@@ -113,6 +116,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<MachineLoopInfo>(); // Machine loop info is used to
// guide some heuristics.
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -127,19 +132,22 @@ public:
}
private:
- MachineFunction *MF;
+ MachineFunction *MF = nullptr;
/// Machine instruction info used throughout the class.
- const X86InstrInfo *TII;
+ const X86InstrInfo *TII = nullptr;
/// Local member for function's OptForSize attribute.
- bool OptForSize;
+ bool OptForSize = false;
/// Machine loop info used for guiding some heruistics.
- MachineLoopInfo *MLI;
+ MachineLoopInfo *MLI = nullptr;
/// Register Liveness information after the current instruction.
LivePhysRegs LiveRegs;
+
+ ProfileSummaryInfo *PSI;
+ MachineBlockFrequencyInfo *MBFI;
};
char FixupBWInstPass::ID = 0;
}
@@ -154,8 +162,11 @@ bool FixupBWInstPass::runOnMachineFunction(MachineFunction &MF) {
this->MF = &MF;
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
- OptForSize = MF.getFunction().hasOptSize();
MLI = &getAnalysis<MachineLoopInfo>();
+ PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ MBFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
LiveRegs.init(TII->getRegisterInfo());
LLVM_DEBUG(dbgs() << "Start X86FixupBWInsts\n";);
@@ -426,6 +437,9 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
// We run after PEI, so we need to AddPristinesAndCSRs.
LiveRegs.addLiveOuts(MBB);
+ OptForSize = MF.getFunction().hasOptSize() ||
+ llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+
for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
MachineInstr *MI = &*I;
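For illustration (not part of the patch): the change above moves the size-optimization decision from a single per-function flag to a per-block decision that also consults profile data. A boiled-down sketch, with booleans standing in for the real PSI/MBFI queries:

#include <cstdio>

// A block is treated as size-optimized if the whole function is optsize, or
// if profile data says the block is cold (simplified stand-in logic).
static bool optimizeBlockForSize(bool FunctionHasOptSize, bool HaveProfile,
                                 bool BlockIsCold) {
  return FunctionHasOptSize || (HaveProfile && BlockIsCold);
}

int main() {
  std::printf("%d\n", optimizeBlockForSize(false, true, true));   // 1: cold block
  std::printf("%d\n", optimizeBlockForSize(false, true, false));  // 0: hot block
  std::printf("%d\n", optimizeBlockForSize(true,  false, false)); // 1: optsize fn
}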
diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 543dc8b00fa0..9ac401bb0253 100644
--- a/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -113,8 +113,8 @@ public:
private:
TargetSchedModel TSM;
- const X86InstrInfo *TII;
- const X86RegisterInfo *TRI;
+ const X86InstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
};
}
@@ -650,6 +650,9 @@ void FixupLEAPass::processInstrForSlow3OpLEA(MachineBasicBlock::iterator &I,
.addReg(DestReg)
.add(Index);
LLVM_DEBUG(NewMI->dump(););
+
+ MBB.erase(I);
+ I = NewMI;
return;
}
diff --git a/llvm/lib/Target/X86/X86FixupSetCC.cpp b/llvm/lib/Target/X86/X86FixupSetCC.cpp
index cbde280aa280..924f429fc138 100644
--- a/llvm/lib/Target/X86/X86FixupSetCC.cpp
+++ b/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -43,19 +43,8 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
private:
- // Find the preceding instruction that imp-defs eflags.
- MachineInstr *findFlagsImpDef(MachineBasicBlock *MBB,
- MachineBasicBlock::reverse_iterator MI);
-
- // Return true if MI imp-uses eflags.
- bool impUsesFlags(MachineInstr *MI);
-
- // Return true if this is the opcode of a SetCC instruction with a register
- // output.
- bool isSetCCr(unsigned Opode);
-
- MachineRegisterInfo *MRI;
- const X86InstrInfo *TII;
+ MachineRegisterInfo *MRI = nullptr;
+ const X86InstrInfo *TII = nullptr;
enum { SearchBound = 16 };
@@ -67,31 +56,6 @@ char X86FixupSetCCPass::ID = 0;
FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
-// We expect the instruction *immediately* before the setcc to imp-def
-// EFLAGS (because of scheduling glue). To make this less brittle w.r.t
-// scheduling, look backwards until we hit the beginning of the
-// basic-block, or a small bound (to avoid quadratic behavior).
-MachineInstr *
-X86FixupSetCCPass::findFlagsImpDef(MachineBasicBlock *MBB,
- MachineBasicBlock::reverse_iterator MI) {
- // FIXME: Should this be instr_rend(), and MI be reverse_instr_iterator?
- auto MBBStart = MBB->rend();
- for (int i = 0; (i < SearchBound) && (MI != MBBStart); ++i, ++MI)
- for (auto &Op : MI->implicit_operands())
- if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isDef())
- return &*MI;
-
- return nullptr;
-}
-
-bool X86FixupSetCCPass::impUsesFlags(MachineInstr *MI) {
- for (auto &Op : MI->implicit_operands())
- if (Op.isReg() && (Op.getReg() == X86::EFLAGS) && Op.isUse())
- return true;
-
- return false;
-}
-
bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
MRI = &MF.getRegInfo();
@@ -100,7 +64,12 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineInstr*, 4> ToErase;
for (auto &MBB : MF) {
+ MachineInstr *FlagsDefMI = nullptr;
for (auto &MI : MBB) {
+ // Remember the most recent preceding eflags defining instruction.
+ if (MI.definesRegister(X86::EFLAGS))
+ FlagsDefMI = &MI;
+
// Find a setcc that is used by a zext.
// This doesn't have to be the only use, the transformation is safe
// regardless.
@@ -115,9 +84,6 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
if (!ZExt)
continue;
- // Find the preceding instruction that imp-defs eflags.
- MachineInstr *FlagsDefMI = findFlagsImpDef(
- MI.getParent(), MachineBasicBlock::reverse_iterator(&MI));
if (!FlagsDefMI)
continue;
@@ -126,7 +92,7 @@ bool X86FixupSetCCPass::runOnMachineFunction(MachineFunction &MF) {
// it, itself, by definition, clobbers eflags. But it may happen that
// FlagsDefMI also *uses* eflags, in which case the transformation is
// invalid.
- if (impUsesFlags(FlagsDefMI))
+ if (FlagsDefMI->readsRegister(X86::EFLAGS))
continue;
++NumSubstZexts;
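For illustration (not part of the patch): the rewrite above replaces a bounded backwards search for the EFLAGS-defining instruction with a single forward walk that remembers the most recent definition. A generic sketch of that pattern (the Instr struct is invented for the example):

#include <cstdio>
#include <vector>

struct Instr { bool DefinesFlags; bool IsSetCC; int Id; };

int main() {
  std::vector<Instr> Block = {{true, false, 0}, {false, true, 1},
                              {false, false, 2}, {true, false, 3},
                              {false, true, 4}};
  const Instr *FlagsDef = nullptr;   // most recent preceding flags definition
  for (const Instr &I : Block) {
    if (I.DefinesFlags)
      FlagsDef = &I;
    if (I.IsSetCC && FlagsDef)
      std::printf("setcc #%d pairs with flags def #%d\n", I.Id, FlagsDef->Id);
  }
}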
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index cfba06fb6533..b1d2de29c896 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -87,12 +87,12 @@ public:
static char ID;
private:
- MachineRegisterInfo *MRI;
- const X86Subtarget *Subtarget;
- const X86InstrInfo *TII;
- const TargetRegisterInfo *TRI;
- const TargetRegisterClass *PromoteRC;
- MachineDominatorTree *MDT;
+ MachineRegisterInfo *MRI = nullptr;
+ const X86Subtarget *Subtarget = nullptr;
+ const X86InstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
+ const TargetRegisterClass *PromoteRC = nullptr;
+ MachineDominatorTree *MDT = nullptr;
CondRegArray collectCondsInRegs(MachineBasicBlock &MBB,
MachineBasicBlock::iterator CopyDefI);
@@ -115,6 +115,10 @@ private:
MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
MachineInstr &CMovI, MachineOperand &FlagUse,
CondRegArray &CondRegs);
+ void rewriteFCMov(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
+ MachineInstr &CMovI, MachineOperand &FlagUse,
+ CondRegArray &CondRegs);
void rewriteCondJmp(MachineBasicBlock &TestMBB,
MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
MachineInstr &JmpI, CondRegArray &CondRegs);
@@ -334,6 +338,28 @@ static MachineBasicBlock &splitBlock(MachineBasicBlock &MBB,
return NewMBB;
}
+static X86::CondCode getCondFromFCMOV(unsigned Opcode) {
+ switch (Opcode) {
+ default: return X86::COND_INVALID;
+ case X86::CMOVBE_Fp32: case X86::CMOVBE_Fp64: case X86::CMOVBE_Fp80:
+ return X86::COND_BE;
+ case X86::CMOVB_Fp32: case X86::CMOVB_Fp64: case X86::CMOVB_Fp80:
+ return X86::COND_B;
+ case X86::CMOVE_Fp32: case X86::CMOVE_Fp64: case X86::CMOVE_Fp80:
+ return X86::COND_E;
+ case X86::CMOVNBE_Fp32: case X86::CMOVNBE_Fp64: case X86::CMOVNBE_Fp80:
+ return X86::COND_A;
+ case X86::CMOVNB_Fp32: case X86::CMOVNB_Fp64: case X86::CMOVNB_Fp80:
+ return X86::COND_AE;
+ case X86::CMOVNE_Fp32: case X86::CMOVNE_Fp64: case X86::CMOVNE_Fp80:
+ return X86::COND_NE;
+ case X86::CMOVNP_Fp32: case X86::CMOVNP_Fp64: case X86::CMOVNP_Fp80:
+ return X86::COND_NP;
+ case X86::CMOVP_Fp32: case X86::CMOVP_Fp64: case X86::CMOVP_Fp80:
+ return X86::COND_P;
+ }
+}
+
bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
<< " **********\n");
@@ -593,6 +619,8 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// Otherwise we can just rewrite in-place.
if (X86::getCondFromCMov(MI) != X86::COND_INVALID) {
rewriteCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
+ } else if (getCondFromFCMOV(MI.getOpcode()) != X86::COND_INVALID) {
+ rewriteFCMov(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
} else if (X86::getCondFromSETCC(MI) != X86::COND_INVALID) {
rewriteSetCC(*TestMBB, TestPos, TestLoc, MI, *FlagUse, CondRegs);
} else if (MI.getOpcode() == TargetOpcode::COPY) {
@@ -674,6 +702,9 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
}
Blocks.push_back(SuccMBB);
+
+ // After this, EFLAGS will be recreated before each use.
+ SuccMBB->removeLiveIn(X86::EFLAGS);
}
} while (!Blocks.empty());
@@ -779,10 +810,10 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic(
CondRegArray &CondRegs) {
// Arithmetic is either reading CF or OF. Figure out which condition we need
// to preserve in a register.
- X86::CondCode Cond;
+ X86::CondCode Cond = X86::COND_INVALID;
// The addend to use to reset CF or OF when added to the flag value.
- int Addend;
+ int Addend = 0;
switch (getMnemonicFromOpcode(MI.getOpcode())) {
case FlagArithMnemonic::ADC:
@@ -852,6 +883,51 @@ void X86FlagsCopyLoweringPass::rewriteCMov(MachineBasicBlock &TestMBB,
LLVM_DEBUG(dbgs() << " fixed cmov: "; CMovI.dump());
}
+void X86FlagsCopyLoweringPass::rewriteFCMov(MachineBasicBlock &TestMBB,
+ MachineBasicBlock::iterator TestPos,
+ DebugLoc TestLoc,
+ MachineInstr &CMovI,
+ MachineOperand &FlagUse,
+ CondRegArray &CondRegs) {
+ // First get the register containing this specific condition.
+ X86::CondCode Cond = getCondFromFCMOV(CMovI.getOpcode());
+ unsigned CondReg;
+ bool Inverted;
+ std::tie(CondReg, Inverted) =
+ getCondOrInverseInReg(TestMBB, TestPos, TestLoc, Cond, CondRegs);
+
+ MachineBasicBlock &MBB = *CMovI.getParent();
+
+ // Insert a direct test of the saved register.
+ insertTest(MBB, CMovI.getIterator(), CMovI.getDebugLoc(), CondReg);
+
+ auto getFCMOVOpcode = [](unsigned Opcode, bool Inverted) {
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case X86::CMOVBE_Fp32: case X86::CMOVNBE_Fp32:
+ case X86::CMOVB_Fp32: case X86::CMOVNB_Fp32:
+ case X86::CMOVE_Fp32: case X86::CMOVNE_Fp32:
+ case X86::CMOVP_Fp32: case X86::CMOVNP_Fp32:
+ return Inverted ? X86::CMOVE_Fp32 : X86::CMOVNE_Fp32;
+ case X86::CMOVBE_Fp64: case X86::CMOVNBE_Fp64:
+ case X86::CMOVB_Fp64: case X86::CMOVNB_Fp64:
+ case X86::CMOVE_Fp64: case X86::CMOVNE_Fp64:
+ case X86::CMOVP_Fp64: case X86::CMOVNP_Fp64:
+ return Inverted ? X86::CMOVE_Fp64 : X86::CMOVNE_Fp64;
+ case X86::CMOVBE_Fp80: case X86::CMOVNBE_Fp80:
+ case X86::CMOVB_Fp80: case X86::CMOVNB_Fp80:
+ case X86::CMOVE_Fp80: case X86::CMOVNE_Fp80:
+ case X86::CMOVP_Fp80: case X86::CMOVNP_Fp80:
+ return Inverted ? X86::CMOVE_Fp80 : X86::CMOVNE_Fp80;
+ }
+ };
+
+ // Rewrite the CMov to use the !ZF flag from the test.
+ CMovI.setDesc(TII->get(getFCMOVOpcode(CMovI.getOpcode(), Inverted)));
+ FlagUse.setIsKill(true);
+ LLVM_DEBUG(dbgs() << " fixed fcmov: "; CMovI.dump());
+}
+
void X86FlagsCopyLoweringPass::rewriteCondJmp(
MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc, MachineInstr &JmpI, CondRegArray &CondRegs) {
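For illustration (not part of the patch): after the condition has been saved into a byte register (possibly inverted), rewriteFCMov re-expresses the FP cmov as a test of that byte followed by a select on (non-)zero, which is why getFCMOVOpcode only ever returns the E/NE variants. A value-level sketch of that idea (selectOnSavedCond is invented for the example):

#include <cstdio>

// Select based on a saved condition byte, undoing any inversion applied when
// the condition was materialized.
static int selectOnSavedCond(int CondByte, bool Inverted, int TrueVal, int FalseVal) {
  bool Take = (CondByte != 0) != Inverted;  // XOR cancels the inversion
  return Take ? TrueVal : FalseVal;
}

int main() {
  std::printf("%d\n", selectOnSavedCond(1, /*Inverted=*/false, 10, 20)); // 10
  std::printf("%d\n", selectOnSavedCond(1, /*Inverted=*/true, 10, 20));  // 20
}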
diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
index fcfb5bc91314..13bbd6ccfce4 100644
--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -40,6 +40,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -83,7 +84,7 @@ namespace {
StringRef getPassName() const override { return "X86 FP Stackifier"; }
private:
- const TargetInstrInfo *TII; // Machine instruction info.
+ const TargetInstrInfo *TII = nullptr; // Machine instruction info.
// Two CFG edges are related if they leave the same block, or enter the same
// block. The transitive closure of an edge under this relation is a
@@ -119,7 +120,7 @@ namespace {
SmallVector<LiveBundle, 8> LiveBundles;
// The edge bundle analysis provides indices into the LiveBundles vector.
- EdgeBundles *Bundles;
+ EdgeBundles *Bundles = nullptr;
// Return a bitmask of FP registers in block's live-in list.
static unsigned calcLiveInMask(MachineBasicBlock *MBB, bool RemoveFPs) {
@@ -143,14 +144,14 @@ namespace {
// Partition all the CFG edges into LiveBundles.
void bundleCFGRecomputeKillFlags(MachineFunction &MF);
- MachineBasicBlock *MBB; // Current basic block
+ MachineBasicBlock *MBB = nullptr; // Current basic block
// The hardware keeps track of how many FP registers are live, so we have
// to model that exactly. Usually, each live register corresponds to an
// FP<n> register, but when dealing with calls, returns, and inline
// assembly, it is sometimes necessary to have live scratch registers.
unsigned Stack[8]; // FP<n> Registers in each stack slot...
- unsigned StackTop; // The current top of the FP stack.
+ unsigned StackTop = 0; // The current top of the FP stack.
enum {
NumFPRegs = 8 // Including scratch pseudo-registers.
@@ -666,9 +667,12 @@ static const TableEntry OpcodeTable[] = {
{ X86::CMOVP_Fp32 , X86::CMOVP_F },
{ X86::CMOVP_Fp64 , X86::CMOVP_F },
{ X86::CMOVP_Fp80 , X86::CMOVP_F },
- { X86::COS_Fp32 , X86::COS_F },
- { X86::COS_Fp64 , X86::COS_F },
- { X86::COS_Fp80 , X86::COS_F },
+ { X86::COM_FpIr32 , X86::COM_FIr },
+ { X86::COM_FpIr64 , X86::COM_FIr },
+ { X86::COM_FpIr80 , X86::COM_FIr },
+ { X86::COM_Fpr32 , X86::COM_FST0r },
+ { X86::COM_Fpr64 , X86::COM_FST0r },
+ { X86::COM_Fpr80 , X86::COM_FST0r },
{ X86::DIVR_Fp32m , X86::DIVR_F32m },
{ X86::DIVR_Fp64m , X86::DIVR_F64m },
{ X86::DIVR_Fp64m32 , X86::DIVR_F32m },
@@ -741,9 +745,6 @@ static const TableEntry OpcodeTable[] = {
{ X86::MUL_FpI32m32 , X86::MUL_FI32m },
{ X86::MUL_FpI32m64 , X86::MUL_FI32m },
{ X86::MUL_FpI32m80 , X86::MUL_FI32m },
- { X86::SIN_Fp32 , X86::SIN_F },
- { X86::SIN_Fp64 , X86::SIN_F },
- { X86::SIN_Fp80 , X86::SIN_F },
{ X86::SQRT_Fp32 , X86::SQRT_F },
{ X86::SQRT_Fp64 , X86::SQRT_F },
{ X86::SQRT_Fp80 , X86::SQRT_F },
@@ -803,6 +804,10 @@ static unsigned getConcreteOpcode(unsigned Opcode) {
static const TableEntry PopTable[] = {
{ X86::ADD_FrST0 , X86::ADD_FPrST0 },
+ { X86::COMP_FST0r, X86::FCOMPP },
+ { X86::COM_FIr , X86::COM_FIPr },
+ { X86::COM_FST0r , X86::COMP_FST0r },
+
{ X86::DIVR_FrST0, X86::DIVR_FPrST0 },
{ X86::DIV_FrST0 , X86::DIV_FPrST0 },
@@ -841,7 +846,7 @@ void FPS::popStackAfter(MachineBasicBlock::iterator &I) {
int Opcode = Lookup(PopTable, I->getOpcode());
if (Opcode != -1) {
I->setDesc(TII->get(Opcode));
- if (Opcode == X86::UCOM_FPPr)
+ if (Opcode == X86::FCOMPP || Opcode == X86::UCOM_FPPr)
I->RemoveOperand(0);
} else { // Insert an explicit pop
I = BuildMI(*MBB, ++I, dl, TII->get(X86::ST_FPrr)).addReg(X86::ST0);
@@ -971,22 +976,23 @@ void FPS::shuffleStackTop(const unsigned char *FixStack,
//===----------------------------------------------------------------------===//
void FPS::handleCall(MachineBasicBlock::iterator &I) {
+ MachineInstr &MI = *I;
unsigned STReturns = 0;
- const MachineFunction* MF = I->getParent()->getParent();
- for (const auto &MO : I->operands()) {
- if (!MO.isReg())
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ MachineOperand &Op = MI.getOperand(i);
+ if (!Op.isReg() || Op.getReg() < X86::FP0 || Op.getReg() > X86::FP6)
continue;
- unsigned R = MO.getReg() - X86::FP0;
+ assert(Op.isImplicit() && "Expected implicit def/use");
- if (R < 8) {
- if (MF->getFunction().getCallingConv() != CallingConv::X86_RegCall) {
- assert(MO.isDef() && MO.isImplicit());
- }
+ if (Op.isDef())
+ STReturns |= 1 << getFPReg(Op);
- STReturns |= 1 << R;
- }
+ // Remove the operand so that later passes don't see it.
+ MI.RemoveOperand(i);
+ --i;
+ --e;
}
unsigned N = countTrailingOnes(STReturns);
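For illustration (not part of the patch): handleCall above records each FP register defined by the call as one bit in STReturns and then counts how many consecutive return values start at ST(0). A small sketch of that bitmask arithmetic (countTrailingOnesU32 is a stand-in for llvm::countTrailingOnes):

#include <cstdint>
#include <cstdio>

// Count consecutive set bits starting from bit 0.
static unsigned countTrailingOnesU32(uint32_t V) {
  unsigned N = 0;
  while (V & 1) { ++N; V >>= 1; }
  return N;
}

int main() {
  uint32_t STReturns = 0;
  for (unsigned Reg : {0u, 1u})   // call returns values in FP0 and FP1
    STReturns |= 1u << Reg;
  std::printf("mask = 0x%X, trailing ones = %u\n",
              STReturns, countTrailingOnesU32(STReturns)); // mask = 0x3, 2
}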
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 1b469a814adc..799c1f5d1285 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -92,7 +92,7 @@ bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
MFI.hasCopyImplyingStackAdjustment());
}
-static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
+static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
if (IsLP64) {
if (isInt<8>(Imm))
return X86::SUB64ri8;
@@ -104,7 +104,7 @@ static unsigned getSUBriOpcode(unsigned IsLP64, int64_t Imm) {
}
}
-static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
+static unsigned getADDriOpcode(bool IsLP64, int64_t Imm) {
if (IsLP64) {
if (isInt<8>(Imm))
return X86::ADD64ri8;
@@ -116,12 +116,12 @@ static unsigned getADDriOpcode(unsigned IsLP64, int64_t Imm) {
}
}
-static unsigned getSUBrrOpcode(unsigned isLP64) {
- return isLP64 ? X86::SUB64rr : X86::SUB32rr;
+static unsigned getSUBrrOpcode(bool IsLP64) {
+ return IsLP64 ? X86::SUB64rr : X86::SUB32rr;
}
-static unsigned getADDrrOpcode(unsigned isLP64) {
- return isLP64 ? X86::ADD64rr : X86::ADD32rr;
+static unsigned getADDrrOpcode(bool IsLP64) {
+ return IsLP64 ? X86::ADD64rr : X86::ADD32rr;
}
static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
@@ -135,7 +135,7 @@ static unsigned getANDriOpcode(bool IsLP64, int64_t Imm) {
return X86::AND32ri;
}
-static unsigned getLEArOpcode(unsigned IsLP64) {
+static unsigned getLEArOpcode(bool IsLP64) {
return IsLP64 ? X86::LEA64r : X86::LEA32r;
}
@@ -993,8 +993,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
bool NeedsWinFPO =
!IsFunclet && STI.isTargetWin32() && MMI.getModule()->getCodeViewFlag();
bool NeedsWinCFI = NeedsWin64CFI || NeedsWinFPO;
- bool NeedsDwarfCFI =
- !IsWin64Prologue && (MMI.hasDebugInfo() || Fn.needsUnwindTableEntry());
+ bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
Register FramePtr = TRI->getFrameRegister(MF);
const Register MachineFramePtr =
STI.isTarget64BitILP32()
@@ -1262,7 +1261,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (Is64Bit) {
// Handle the 64-bit Windows ABI case where we need to call __chkstk.
// Function prologue is responsible for adjusting the stack pointer.
- int Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;
+ int64_t Alloc = isEAXAlive ? NumBytes - 8 : NumBytes;
if (isUInt<32>(Alloc)) {
BuildMI(MBB, MBBI, DL, TII.get(X86::MOV32ri), X86::EAX)
.addImm(Alloc)
@@ -1614,10 +1613,9 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
bool HasFP = hasFP(MF);
uint64_t NumBytes = 0;
- bool NeedsDwarfCFI =
- (!MF.getTarget().getTargetTriple().isOSDarwin() &&
- !MF.getTarget().getTargetTriple().isOSWindows()) &&
- (MF.getMMI().hasDebugInfo() || MF.getFunction().needsUnwindTableEntry());
+ bool NeedsDwarfCFI = (!MF.getTarget().getTargetTriple().isOSDarwin() &&
+ !MF.getTarget().getTargetTriple().isOSWindows()) &&
+ MF.needsFrameMoves();
if (IsFunclet) {
assert(HasFP && "EH funclets without FP not yet implemented");
@@ -1862,7 +1860,7 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF,
return getFrameIndexReference(MF, FI, FrameReg);
FrameReg = TRI->getStackRegister();
- return alignTo(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second;
+ return alignDown(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second;
}
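The fix above swaps alignTo for alignDown, i.e. the maximum call-frame size is now rounded down to the stack alignment rather than up; for a size that is not already aligned the two results differ by one alignment unit. A quick standalone sketch of the two roundings (simple re-implementations, not the llvm::alignTo/alignDown declarations themselves):

#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t alignToSketch(uint64_t Value, uint64_t Align) {
  assert(Align && (Align & (Align - 1)) == 0 && "power-of-two alignment");
  return (Value + Align - 1) & ~(Align - 1); // round up
}

static uint64_t alignDownSketch(uint64_t Value, uint64_t Align) {
  assert(Align && (Align & (Align - 1)) == 0 && "power-of-two alignment");
  return Value & ~(Align - 1); // round down
}

int main() {
  // e.g. a 40-byte max call frame with 16-byte stack alignment:
  std::printf("up:   %llu\n", (unsigned long long)alignToSketch(40, 16));   // 48
  std::printf("down: %llu\n", (unsigned long long)alignDownSketch(40, 16)); // 32
}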
int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
@@ -2812,11 +2810,9 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
unsigned StackAlign = getStackAlignment();
Amount = alignTo(Amount, StackAlign);
- MachineModuleInfo &MMI = MF.getMMI();
const Function &F = MF.getFunction();
bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
- bool DwarfCFI = !WindowsCFI &&
- (MMI.hasDebugInfo() || F.needsUnwindTableEntry());
+ bool DwarfCFI = !WindowsCFI && MF.needsFrameMoves();
// If we have any exception handlers in this function, and we adjust
// the SP before calls, we may need to indicate this to the unwinder
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 5b546d42d98a..bf33f399db28 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -335,7 +336,7 @@ namespace {
// Do not want to hoist if we're not optimizing for size.
// TODO: We'd like to remove this restriction.
// See the comment in X86InstrInfo.td for more info.
- if (!OptForSize)
+ if (!CurDAG->shouldOptForSize())
return false;
// Walk all the users of the immediate.
@@ -536,12 +537,17 @@ namespace {
// type.
static bool isLegalMaskCompare(SDNode *N, const X86Subtarget *Subtarget) {
unsigned Opcode = N->getOpcode();
- if (Opcode == X86ISD::CMPM || Opcode == ISD::SETCC ||
- Opcode == X86ISD::CMPM_SAE || Opcode == X86ISD::VFPCLASS) {
+ if (Opcode == X86ISD::CMPM || Opcode == X86ISD::STRICT_CMPM ||
+ Opcode == ISD::SETCC || Opcode == X86ISD::CMPM_SAE ||
+ Opcode == X86ISD::VFPCLASS) {
// We can get 256-bit 8 element types here without VLX being enabled. When
// this happens we will use 512-bit operations and the mask will not be
// zero extended.
EVT OpVT = N->getOperand(0).getValueType();
+ // The first operand of X86ISD::STRICT_CMPM is chain, so we need to get the
+ // second operand.
+ if (Opcode == X86ISD::STRICT_CMPM)
+ OpVT = N->getOperand(1).getValueType();
if (OpVT.is256BitVector() || OpVT.is128BitVector())
return Subtarget->hasVLX();
@@ -575,6 +581,12 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
if (!N.hasOneUse())
return false;
+ // FIXME: Temporary hack to prevent strict floating point nodes from
+ // folding into masked operations illegally.
+ if (U == Root && Root->getOpcode() == ISD::VSELECT &&
+ N.getOpcode() != ISD::LOAD && N.getOpcode() != X86ISD::VBROADCAST_LOAD)
+ return false;
+
if (N.getOpcode() != ISD::LOAD)
return true;
@@ -804,8 +816,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
}
switch (N->getOpcode()) {
+ case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND:
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: {
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT: {
// Replace vector fp_to_s/uint with their X86 specific equivalent so we
// don't need 2 sets of patterns.
if (!N->getSimpleValueType(0).isVector())
@@ -814,13 +830,24 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
unsigned NewOpc;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
- case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
- case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
+ case ISD::FP_ROUND: NewOpc = X86ISD::VFPROUND; break;
+ case ISD::STRICT_FP_ROUND: NewOpc = X86ISD::STRICT_VFPROUND; break;
+ case ISD::STRICT_FP_TO_SINT: NewOpc = X86ISD::STRICT_CVTTP2SI; break;
+ case ISD::FP_TO_SINT: NewOpc = X86ISD::CVTTP2SI; break;
+ case ISD::STRICT_FP_TO_UINT: NewOpc = X86ISD::STRICT_CVTTP2UI; break;
+ case ISD::FP_TO_UINT: NewOpc = X86ISD::CVTTP2UI; break;
}
- SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
- N->getOperand(0));
+ SDValue Res;
+ if (N->isStrictFPOpcode())
+ Res =
+ CurDAG->getNode(NewOpc, SDLoc(N), {N->getValueType(0), MVT::Other},
+ {N->getOperand(0), N->getOperand(1)});
+ else
+ Res =
+ CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
+ N->getOperand(0));
--I;
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
++I;
CurDAG->DeleteNode(N);
continue;
@@ -869,27 +896,45 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
continue;
}
case ISD::FCEIL:
+ case ISD::STRICT_FCEIL:
case ISD::FFLOOR:
+ case ISD::STRICT_FFLOOR:
case ISD::FTRUNC:
+ case ISD::STRICT_FTRUNC:
case ISD::FNEARBYINT:
- case ISD::FRINT: {
+ case ISD::STRICT_FNEARBYINT:
+ case ISD::FRINT:
+ case ISD::STRICT_FRINT: {
// Replace fp rounding with their X86 specific equivalent so we don't
// need 2 sets of patterns.
unsigned Imm;
switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode!");
+ case ISD::STRICT_FCEIL:
case ISD::FCEIL: Imm = 0xA; break;
+ case ISD::STRICT_FFLOOR:
case ISD::FFLOOR: Imm = 0x9; break;
+ case ISD::STRICT_FTRUNC:
case ISD::FTRUNC: Imm = 0xB; break;
+ case ISD::STRICT_FNEARBYINT:
case ISD::FNEARBYINT: Imm = 0xC; break;
+ case ISD::STRICT_FRINT:
case ISD::FRINT: Imm = 0x4; break;
}
SDLoc dl(N);
- SDValue Res = CurDAG->getNode(
- X86ISD::VRNDSCALE, dl, N->getValueType(0), N->getOperand(0),
- CurDAG->getTargetConstant(Imm, dl, MVT::i8));
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Res;
+ if (IsStrict)
+ Res = CurDAG->getNode(X86ISD::STRICT_VRNDSCALE, dl,
+ {N->getValueType(0), MVT::Other},
+ {N->getOperand(0), N->getOperand(1),
+ CurDAG->getTargetConstant(Imm, dl, MVT::i8)});
+ else
+ Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, N->getValueType(0),
+ N->getOperand(0),
+ CurDAG->getTargetConstant(Imm, dl, MVT::i8));
--I;
- CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
++I;
CurDAG->DeleteNode(N);
continue;
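The immediates chosen here follow the SSE4.1 ROUNDPS/ROUNDPD control byte: bits 1:0 select the rounding mode, bit 2 (0x4) says "use the current MXCSR rounding mode instead of the immediate", and bit 3 (0x8) suppresses the precision (inexact) exception. A compact sketch of how the constants above decompose; the enum names are illustrative, not an LLVM definition:

#include <cstdio>

// SSE4.1 ROUND* immediate fields (illustrative names).
enum RoundImm : unsigned {
  RoundNearest  = 0x0, // imm[1:0] = 00
  RoundDown     = 0x1, // floor
  RoundUp       = 0x2, // ceil
  RoundTruncate = 0x3, // trunc
  UseMXCSR      = 0x4, // ignore imm[1:0], use the current rounding mode
  NoPrecisionEx = 0x8, // suppress the inexact exception
};

int main() {
  std::printf("FCEIL      -> 0x%X\n", NoPrecisionEx | RoundUp);       // 0xA
  std::printf("FFLOOR     -> 0x%X\n", NoPrecisionEx | RoundDown);     // 0x9
  std::printf("FTRUNC     -> 0x%X\n", NoPrecisionEx | RoundTruncate); // 0xB
  std::printf("FNEARBYINT -> 0x%X\n", NoPrecisionEx | UseMXCSR);      // 0xC
  std::printf("FRINT      -> 0x%X\n", (unsigned)UseMXCSR);            // 0x4
}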
@@ -1017,12 +1062,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
// FPStack has extload and truncstore. SSE can fold direct loads into other
// operations. Based on this, decide what we want to do.
- MVT MemVT;
- if (N->getOpcode() == ISD::FP_ROUND)
- MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
- else
- MemVT = SrcIsSSE ? SrcVT : DstVT;
-
+ MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
SDLoc dl(N);
@@ -1075,22 +1115,47 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// Here we could have an FP stack truncation or an FPStack <-> SSE convert.
// FPStack has extload and truncstore. SSE can fold direct loads into other
// operations. Based on this, decide what we want to do.
- MVT MemVT;
- if (N->getOpcode() == ISD::STRICT_FP_ROUND)
- MemVT = DstVT; // FP_ROUND must use DstVT, we can't do a 'trunc load'.
- else
- MemVT = SrcIsSSE ? SrcVT : DstVT;
-
+ MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
//Since the operation is StrictFP, use the preexisting chain.
- SDValue Store = CurDAG->getTruncStore(N->getOperand(0), dl, N->getOperand(1),
- MemTmp, MachinePointerInfo(), MemVT);
- SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
- MachinePointerInfo(), MemVT);
+ SDValue Store, Result;
+ if (!SrcIsSSE) {
+ SDVTList VTs = CurDAG->getVTList(MVT::Other);
+ SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
+ Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
+ MachinePointerInfo(), 0,
+ MachineMemOperand::MOStore);
+ if (N->getFlags().hasNoFPExcept()) {
+ SDNodeFlags Flags = Store->getFlags();
+ Flags.setNoFPExcept(true);
+ Store->setFlags(Flags);
+ }
+ } else {
+ assert(SrcVT == MemVT && "Unexpected VT!");
+ Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
+ MachinePointerInfo());
+ }
+
+ if (!DstIsSSE) {
+ SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
+ SDValue Ops[] = {Store, MemTmp};
+ Result = CurDAG->getMemIntrinsicNode(X86ISD::FLD, dl, VTs, Ops, MemVT,
+ MachinePointerInfo(), 0,
+ MachineMemOperand::MOLoad);
+ if (N->getFlags().hasNoFPExcept()) {
+ SDNodeFlags Flags = Result->getFlags();
+ Flags.setNoFPExcept(true);
+ Result->setFlags(Flags);
+ }
+ } else {
+ assert(DstVT == MemVT && "Unexpected VT!");
+ Result =
+ CurDAG->getLoad(DstVT, dl, Store, MemTmp, MachinePointerInfo());
+ }
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
// extload we created. This will cause general havok on the dag because
@@ -2224,12 +2289,11 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
- // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
- if (AddrSpace == 256)
+ if (AddrSpace == X86AS::GS)
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
- if (AddrSpace == 257)
+ if (AddrSpace == X86AS::FS)
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
- if (AddrSpace == 258)
+ if (AddrSpace == X86AS::SS)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
SDLoc DL(N);
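The magic numbers in selectVectorAddr are replaced with the X86AS address-space enumerators, which keep the historic values 256/257/258 for GS/FS/SS. A minimal sketch of the mapping as this code assumes it; the namespace and helper below are stand-ins for illustration, not the real header:

#include <cstdio>

// Stand-in for the X86 address-space numbering this lowering relies on.
namespace X86ASSketch {
enum : unsigned { GS = 256, FS = 257, SS = 258 };
}

static const char *segmentFor(unsigned AddrSpace) {
  switch (AddrSpace) {
  case X86ASSketch::GS: return "%gs";
  case X86ASSketch::FS: return "%fs";
  case X86ASSketch::SS: return "%ss";
  default:              return "(default segment)";
  }
}

int main() {
  std::printf("%s\n", segmentFor(257)); // prints %fs
}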
@@ -3019,7 +3083,7 @@ bool X86DAGToDAGISel::foldLoadStoreIntoMemOperand(SDNode *Node) {
LLVM_FALLTHROUGH;
case X86ISD::ADD:
// Try to match inc/dec.
- if (!Subtarget->slowIncDec() || OptForSize) {
+ if (!Subtarget->slowIncDec() || CurDAG->shouldOptForSize()) {
bool IsOne = isOneConstant(StoredVal.getOperand(1));
bool IsNegOne = isAllOnesConstant(StoredVal.getOperand(1));
// ADD/SUB with 1/-1 and carry flag isn't used can use inc/dec.
@@ -4410,6 +4474,8 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
ReplaceNode(Node, CNode);
return;
}
+
+ break;
}
}
@@ -5094,6 +5160,17 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
MachineSDNode *NewNode;
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
if (tryFoldLoad(Node, N0.getNode(), Reg, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4)) {
+ if (auto *LoadN = dyn_cast<LoadSDNode>(N0.getOperand(0).getNode())) {
+ if (!LoadN->isSimple()) {
+ unsigned NumVolBits = LoadN->getValueType(0).getSizeInBits();
+ if (MOpc == X86::TEST8mi && NumVolBits != 8)
+ break;
+ else if (MOpc == X86::TEST16mi && NumVolBits != 16)
+ break;
+ else if (MOpc == X86::TEST32mi && NumVolBits != 32)
+ break;
+ }
+ }
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, Imm,
Reg.getOperand(0) };
NewNode = CurDAG->getMachineNode(MOpc, dl, MVT::i32, MVT::Other, Ops);
@@ -5190,34 +5267,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (foldLoadStoreIntoMemOperand(Node))
return;
break;
- case ISD::FCEIL:
- case ISD::FFLOOR:
- case ISD::FTRUNC:
- case ISD::FNEARBYINT:
- case ISD::FRINT: {
- // Replace fp rounding with their X86 specific equivalent so we don't
- // need 2 sets of patterns.
- // FIXME: This can only happen when the nodes started as STRICT_* and have
- // been mutated into their non-STRICT equivalents. Eventually this
- // mutation will be removed and we should switch the STRICT_ nodes to a
- // strict version of RNDSCALE in PreProcessISelDAG.
- unsigned Imm;
- switch (Node->getOpcode()) {
- default: llvm_unreachable("Unexpected opcode!");
- case ISD::FCEIL: Imm = 0xA; break;
- case ISD::FFLOOR: Imm = 0x9; break;
- case ISD::FTRUNC: Imm = 0xB; break;
- case ISD::FNEARBYINT: Imm = 0xC; break;
- case ISD::FRINT: Imm = 0x4; break;
- }
- SDLoc dl(Node);
- SDValue Res = CurDAG->getNode(X86ISD::VRNDSCALE, dl, Node->getValueType(0),
- Node->getOperand(0),
- CurDAG->getTargetConstant(Imm, dl, MVT::i8));
- ReplaceNode(Node, Res.getNode());
- SelectCode(Res.getNode());
- return;
- }
}
SelectCode(Node);
@@ -5230,10 +5279,6 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
switch (ConstraintID) {
default:
llvm_unreachable("Unexpected asm memory constraint");
- case InlineAsm::Constraint_i:
- // FIXME: It seems strange that 'i' is needed here since it's supposed to
- // be an immediate and not a memory constraint.
- LLVM_FALLTHROUGH;
case InlineAsm::Constraint_o: // offsetable ??
case InlineAsm::Constraint_v: // not offsetable ??
case InlineAsm::Constraint_m: // memory
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index ed975e9248a8..0f152968ddfd 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -25,7 +25,9 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -154,17 +156,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::X86_StdCall);
}
- if (Subtarget.isTargetDarwin()) {
- // Darwin should use _setjmp/_longjmp instead of setjmp/longjmp.
- setUseUnderscoreSetJmp(false);
- setUseUnderscoreLongJmp(false);
- } else if (Subtarget.isTargetWindowsGNU()) {
- // MS runtime is weird: it exports _setjmp, but longjmp!
- setUseUnderscoreSetJmp(true);
- setUseUnderscoreLongJmp(false);
- } else {
- setUseUnderscoreSetJmp(true);
- setUseUnderscoreLongJmp(true);
+ if (Subtarget.getTargetTriple().isOSMSVCRT()) {
+ // MSVCRT doesn't have powi; fall back to pow
+ setLibcallName(RTLIB::POWI_F32, nullptr);
+ setLibcallName(RTLIB::POWI_F64, nullptr);
}
// If we don't have cmpxchg8b (meaning this is a 386/486), limit atomic size to

@@ -217,72 +212,69 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ShiftOp , MVT::i64 , Custom);
}
- // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
- // operation.
- setOperationAction(ISD::UINT_TO_FP , MVT::i1 , Promote);
- setOperationAction(ISD::UINT_TO_FP , MVT::i8 , Promote);
- setOperationAction(ISD::UINT_TO_FP , MVT::i16 , Promote);
-
if (!Subtarget.useSoftFloat()) {
- // We have an algorithm for SSE2->double, and we turn this into a
- // 64-bit FILD followed by conditional FADD for other targets.
- setOperationAction(ISD::UINT_TO_FP , MVT::i64 , Custom);
+ // Promote all UINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have this
+ // operation.
+ setOperationAction(ISD::UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i16, Promote);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i16, Promote);
// We have an algorithm for SSE2, and we turn this into a 64-bit
// FILD or VCVTUSI2SS/SD for other targets.
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::UINT_TO_FP , MVT::i32 , Expand);
- }
-
- // Promote i1/i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
- // this operation.
- setOperationAction(ISD::SINT_TO_FP , MVT::i1 , Promote);
- setOperationAction(ISD::SINT_TO_FP , MVT::i8 , Promote);
-
- if (!Subtarget.useSoftFloat()) {
- // SSE has no i16 to fp conversion, only i32.
- if (X86ScalarSSEf32) {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
- // f32 and f64 cases are Legal, f80 case is not
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Custom);
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Custom);
- }
- } else {
- setOperationAction(ISD::SINT_TO_FP , MVT::i16 , Promote);
- setOperationAction(ISD::SINT_TO_FP , MVT::i32 , Expand);
- }
-
- // Promote i1/i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
- // this operation.
- setOperationAction(ISD::FP_TO_SINT , MVT::i1 , Promote);
- setOperationAction(ISD::FP_TO_SINT , MVT::i8 , Promote);
-
- if (!Subtarget.useSoftFloat()) {
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
+ // We have an algorithm for SSE2->double, and we turn this into a
+ // 64-bit FILD followed by conditional FADD for other targets.
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
+
+ // Promote i8 SINT_TO_FP to larger SINT_TO_FP's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::SINT_TO_FP, MVT::i8, Promote);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i8, Promote);
+ // SSE has no i16 to fp conversion, only i32. We promote in the handler
+ // to allow f80 to use i16 and f64 to use i16 with SSE1 only.
+ setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i16, Custom);
+ // f32 and f64 cases are Legal with SSE1/SSE2, f80 case is not
+ setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
// In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
// are Legal, f80 is custom lowered.
- setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Custom);
- setOperationAction(ISD::SINT_TO_FP , MVT::i64 , Custom);
-
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Custom);
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Custom);
- } else {
- setOperationAction(ISD::FP_TO_SINT , MVT::i16 , Promote);
- setOperationAction(ISD::FP_TO_SINT , MVT::i32 , Expand);
- setOperationAction(ISD::FP_TO_SINT , MVT::i64 , Expand);
- }
-
- // Handle FP_TO_UINT by promoting the destination to a larger signed
- // conversion.
- setOperationAction(ISD::FP_TO_UINT , MVT::i1 , Promote);
- setOperationAction(ISD::FP_TO_UINT , MVT::i8 , Promote);
- setOperationAction(ISD::FP_TO_UINT , MVT::i16 , Promote);
-
- if (!Subtarget.useSoftFloat()) {
- setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
- }
+ setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
+
+ // Promote i8 FP_TO_SINT to larger FP_TO_SINTS's, as X86 doesn't have
+ // this operation.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i8, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i16, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ // In 32-bit mode these are custom lowered. In 64-bit mode F32 and F64
+ // are Legal, f80 is custom lowered.
+ setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
+
+ // Handle FP_TO_UINT by promoting the destination to a larger signed
+ // conversion.
+ setOperationAction(ISD::FP_TO_UINT, MVT::i8, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i8, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i16, Promote);
+ // FIXME: This doesn't generate invalid exception when it should. PR44019.
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i16, Promote);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+ }
+
+ // Handle address space casts between mixed sized pointers.
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i32, Custom);
+ setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
// TODO: when we have SSE, these could be more efficient, by using movd/movq.
if (!X86ScalarSSEf64) {
@@ -409,12 +401,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.hasMOVBE())
setOperationAction(ISD::BSWAP , MVT::i16 , Expand);
- // These should be promoted to a larger select which is supported.
- setOperationAction(ISD::SELECT , MVT::i1 , Promote);
// X86 wants to expand cmov itself.
for (auto VT : { MVT::f32, MVT::f64, MVT::f80, MVT::f128 }) {
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
}
for (auto VT : { MVT::i8, MVT::i16, MVT::i32, MVT::i64 }) {
if (VT == MVT::i64 && !Subtarget.is64Bit())
@@ -619,6 +611,20 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
} else // SSE immediates.
addLegalFPImmediate(APFloat(+0.0)); // xorpd
}
+ // Handle constrained floating-point operations of scalar.
+ setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
// We don't support FMA.
setOperationAction(ISD::FMA, MVT::f64, Expand);
@@ -659,6 +665,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
setOperationAction(ISD::LRINT, MVT::f80, Expand);
setOperationAction(ISD::LLRINT, MVT::f80, Expand);
+
+ // Handle constrained floating-point operations of scalar.
+ setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FSUB , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FMUL , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FDIV , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FSQRT , MVT::f80, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f80, Legal);
+ // FIXME: When the target is 64-bit, STRICT_FP_ROUND will be overwritten
+ // as Custom.
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f80, Legal);
}
// f128 uses xmm registers, but most operations require libcalls.
@@ -668,22 +685,32 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
addLegalFPImmediate(APFloat::getZero(APFloat::IEEEquad())); // xorps
- setOperationAction(ISD::FADD, MVT::f128, Custom);
- setOperationAction(ISD::FSUB, MVT::f128, Custom);
- setOperationAction(ISD::FDIV, MVT::f128, Custom);
- setOperationAction(ISD::FMUL, MVT::f128, Custom);
- setOperationAction(ISD::FMA, MVT::f128, Expand);
+ setOperationAction(ISD::FADD, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FADD, MVT::f128, LibCall);
+ setOperationAction(ISD::FSUB, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f128, LibCall);
+ setOperationAction(ISD::FDIV, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f128, LibCall);
+ setOperationAction(ISD::FMUL, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f128, LibCall);
+ setOperationAction(ISD::FMA, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FMA, MVT::f128, LibCall);
setOperationAction(ISD::FABS, MVT::f128, Custom);
setOperationAction(ISD::FNEG, MVT::f128, Custom);
setOperationAction(ISD::FCOPYSIGN, MVT::f128, Custom);
- setOperationAction(ISD::FSIN, MVT::f128, Expand);
- setOperationAction(ISD::FCOS, MVT::f128, Expand);
- setOperationAction(ISD::FSINCOS, MVT::f128, Expand);
- setOperationAction(ISD::FSQRT, MVT::f128, Expand);
-
- setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::FSIN, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSIN, MVT::f128, LibCall);
+ setOperationAction(ISD::FCOS, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FCOS, MVT::f128, LibCall);
+ setOperationAction(ISD::FSINCOS, MVT::f128, LibCall);
+ // No STRICT_FSINCOS
+ setOperationAction(ISD::FSQRT, MVT::f128, LibCall);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f128, LibCall);
+
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Custom);
// We need to custom handle any FP_ROUND with an f128 input, but
// LegalizeDAG uses the result type to know when to run a custom handler.
// So we have to list all legal floating point result types here.
@@ -820,12 +847,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::VSELECT, MVT::v4f32, Custom);
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v4f32, Custom);
setOperationAction(ISD::SELECT, MVT::v4f32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
setOperationAction(ISD::LOAD, MVT::v2f32, Custom);
setOperationAction(ISD::STORE, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Custom);
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE2()) {
@@ -895,6 +925,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ABS, VT, Custom);
@@ -933,37 +965,38 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::v4i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i32, Custom);
// Custom legalize these to avoid over promotion or custom promotion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i8, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i16, Custom);
- setOperationAction(ISD::FP_TO_SINT, MVT::v4i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i8, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i8, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i8, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i16, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i16, Custom);
-
- // By marking FP_TO_SINT v8i16 as Custom, will trick type legalization into
- // promoting v8i8 FP_TO_UINT into FP_TO_SINT. When the v8i16 FP_TO_SINT is
- // split again based on the input type, this will cause an AssertSExt i16 to
- // be emitted instead of an AssertZExt. This will allow packssdw followed by
- // packuswb to be used to truncate to v8i8. This is necessary since packusdw
- // isn't available until sse4.1.
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i16, Custom);
+ for (auto VT : {MVT::v2i8, MVT::v4i8, MVT::v8i8, MVT::v2i16, MVT::v4i16}) {
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, VT, Custom);
+ }
setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2i32, Custom);
+
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32, Custom);
// Fast v2f32 UINT_TO_FP( v2i32 ) custom conversion.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v2f32, Custom);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v2f32, Custom);
// We want to legalize this to an f64 load rather than an i64 load on
// 64-bit targets and two 32-bit loads on a 32-bit target. Similar for
@@ -1008,6 +1041,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// With AVX512, expanding (and promoting the shifts) is better.
if (!Subtarget.hasAVX512())
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
+
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
}
if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
@@ -1029,11 +1068,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasSSE41()) {
for (MVT RoundedTy : {MVT::f32, MVT::f64, MVT::v4f32, MVT::v2f64}) {
- setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
- setOperationAction(ISD::FCEIL, RoundedTy, Legal);
- setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
- setOperationAction(ISD::FRINT, RoundedTy, Legal);
- setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, RoundedTy, Legal);
+ setOperationAction(ISD::FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, RoundedTy, Legal);
+ setOperationAction(ISD::FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, RoundedTy, Legal);
+ setOperationAction(ISD::FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
+ setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
@@ -1072,6 +1116,17 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// i8 vectors are custom because the source register and source
// memory operand types are not the same width.
setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v16i8, Custom);
+
+ if (Subtarget.is64Bit() && !Subtarget.hasAVX512()) {
+ // We need to scalarize v4i64->v4f32 uint_to_fp using cvtsi2ss, but we can
+ // do the pre and post work in the vector domain.
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i64, Custom);
+ // We need to mark SINT_TO_FP as Custom even though we want to expand it
+ // so that DAG combine doesn't try to turn it into uint_to_fp.
+ setOperationAction(ISD::SINT_TO_FP, MVT::v4i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v4i64, Custom);
+ }
}
if (!Subtarget.useSoftFloat() && Subtarget.hasXOP()) {
@@ -1105,25 +1160,45 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
: &X86::VR256RegClass);
for (auto VT : { MVT::v8f32, MVT::v4f64 }) {
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
- setOperationAction(ISD::FNEG, VT, Custom);
- setOperationAction(ISD::FABS, VT, Custom);
- setOperationAction(ISD::FCOPYSIGN, VT, Custom);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FNEG, VT, Custom);
+ setOperationAction(ISD::FABS, VT, Custom);
+ setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
// (fp_to_int:v8i16 (v8f32 ..)) requires the result type to be promoted
// even though v8i16 is a legal type.
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i16, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i16, MVT::v8i32);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v8i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::v8i32, Legal);
-
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i32, Legal);
+
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v4f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v8f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f64, Legal);
if (!Subtarget.hasAVX512())
setOperationAction(ISD::BITCAST, MVT::v32i1, Custom);
@@ -1169,6 +1244,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::CTLZ, VT, Custom);
@@ -1180,8 +1257,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasAnyFMA()) {
for (auto VT : { MVT::f32, MVT::f64, MVT::v4f32, MVT::v8f32,
- MVT::v2f64, MVT::v4f64 })
+ MVT::v2f64, MVT::v4f64 }) {
setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
+ }
}
for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32, MVT::v4i64 }) {
@@ -1233,6 +1312,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// The custom lowering for UINT_TO_FP for v8i32 becomes interesting
// when we have a 256bit-wide blend with immediate.
setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32, Custom);
// AVX2 also has wider vector sign/zero extending loads, VPMOV[SZ]X
for (auto LoadExtOp : { ISD::SEXTLOAD, ISD::ZEXTLOAD }) {
@@ -1299,12 +1379,18 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v1i1, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v1i1, Custom);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
- setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v8i1, MVT::v8i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, MVT::v4i1, MVT::v4i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, MVT::v4i1, MVT::v4i32);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2i1, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i1, Custom);
// There is no byte sized k-register load or store without AVX512DQ.
if (!Subtarget.hasDQI()) {
@@ -1331,6 +1417,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
setOperationAction(ISD::UADDSAT, VT, Custom);
@@ -1372,21 +1460,37 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FMA, VT, Legal);
+ setOperationAction(ISD::STRICT_FMA, VT, Legal);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
}
- setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i16, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i8, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_SINT, MVT::v16i1, MVT::v16i32);
- setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i1, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i8, MVT::v16i32);
- setOperationPromotedToType(ISD::FP_TO_UINT, MVT::v16i16, MVT::v16i32);
- setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
-
- setOperationAction(ISD::STRICT_FP_ROUND, MVT::v16f32, Custom);
+ for (MVT VT : { MVT::v16i1, MVT::v16i8, MVT::v16i16 }) {
+ setOperationPromotedToType(ISD::FP_TO_SINT , VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::FP_TO_UINT , VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_SINT, VT, MVT::v16i32);
+ setOperationPromotedToType(ISD::STRICT_FP_TO_UINT, VT, MVT::v16i32);
+ }
+ setOperationAction(ISD::FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v16i32, Legal);
+ setOperationAction(ISD::SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v16i32, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v16i32, Legal);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FADD, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v16f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::v8f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::v8f32, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i8, Legal);
setTruncStoreAction(MVT::v8i64, MVT::v8i16, Legal);
@@ -1420,11 +1524,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::FFLOOR, VT, Legal);
- setOperationAction(ISD::FCEIL, VT, Legal);
- setOperationAction(ISD::FTRUNC, VT, Legal);
- setOperationAction(ISD::FRINT, VT, Legal);
- setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::FFLOOR, VT, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, VT, Legal);
+ setOperationAction(ISD::FCEIL, VT, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, VT, Legal);
+ setOperationAction(ISD::FTRUNC, VT, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, VT, Legal);
+ setOperationAction(ISD::FRINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FRINT, VT, Legal);
+ setOperationAction(ISD::FNEARBYINT, VT, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
setOperationAction(ISD::SELECT, VT, Custom);
}
@@ -1459,6 +1568,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
setOperationAction(ISD::SETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
// The condition codes aren't legal in SSE/AVX and under AVX512 we use
@@ -1470,8 +1581,12 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI()) {
setOperationAction(ISD::SINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::v8i64, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v8i64, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i64, Legal);
setOperationAction(ISD::MUL, MVT::v8i64, Legal);
}
@@ -1532,13 +1647,25 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (!Subtarget.useSoftFloat() && Subtarget.hasAVX512()) {
// These operations are handled on non-VLX by artificially widening in
// isel patterns.
- // TODO: Custom widen in lowering on non-VLX and drop the isel patterns?
- setOperationAction(ISD::FP_TO_UINT, MVT::v8i32, Legal);
- setOperationAction(ISD::FP_TO_UINT, MVT::v4i32, Legal);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::v2i32, Custom);
- setOperationAction(ISD::UINT_TO_FP, MVT::v8i32, Legal);
- setOperationAction(ISD::UINT_TO_FP, MVT::v4i32, Legal);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2i32, Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::UINT_TO_FP, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v8i32,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
+ Subtarget.hasVLX() ? Legal : Custom);
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
@@ -1563,12 +1690,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI()) {
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
- setOperationAction(ISD::SINT_TO_FP, VT, Legal);
- setOperationAction(ISD::UINT_TO_FP, VT, Legal);
- setOperationAction(ISD::FP_TO_SINT, VT, Legal);
- setOperationAction(ISD::FP_TO_UINT, VT, Legal);
-
- setOperationAction(ISD::MUL, VT, Legal);
+ setOperationAction(ISD::SINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::UINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_SINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, VT,
+ Subtarget.hasVLX() ? Legal : Custom);
+ setOperationAction(ISD::MUL, VT, Legal);
}
}
@@ -1739,12 +1877,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.hasDQI()) {
// Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
// v2f32 UINT_TO_FP is already custom under SSE2.
- setOperationAction(ISD::SINT_TO_FP, MVT::v2f32, Custom);
assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
+ isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
"Unexpected operation action!");
// v2i64 FP_TO_S/UINT(v2f32) custom conversion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
}
if (Subtarget.hasBWI()) {
@@ -1828,8 +1968,15 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
if (Subtarget.is32Bit() &&
(Subtarget.isTargetWindowsMSVC() || Subtarget.isTargetWindowsItanium()))
for (ISD::NodeType Op :
- {ISD::FCEIL, ISD::FCOS, ISD::FEXP, ISD::FFLOOR, ISD::FREM, ISD::FLOG,
- ISD::FLOG10, ISD::FPOW, ISD::FSIN})
+ {ISD::FCEIL, ISD::STRICT_FCEIL,
+ ISD::FCOS, ISD::STRICT_FCOS,
+ ISD::FEXP, ISD::STRICT_FEXP,
+ ISD::FFLOOR, ISD::STRICT_FFLOOR,
+ ISD::FREM, ISD::STRICT_FREM,
+ ISD::FLOG, ISD::STRICT_FLOG,
+ ISD::FLOG10, ISD::STRICT_FLOG10,
+ ISD::FPOW, ISD::STRICT_FPOW,
+ ISD::FSIN, ISD::STRICT_FSIN})
if (isOperationExpand(Op, MVT::f32))
setOperationAction(Op, MVT::f32, Promote);
@@ -1870,6 +2017,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::UINT_TO_FP);
+ setTargetDAGCombine(ISD::STRICT_SINT_TO_FP);
+ setTargetDAGCombine(ISD::STRICT_UINT_TO_FP);
setTargetDAGCombine(ISD::SETCC);
setTargetDAGCombine(ISD::MUL);
setTargetDAGCombine(ISD::XOR);
@@ -1901,6 +2050,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setPrefFunctionAlignment(Align(16));
verifyIntrinsicTables();
+
+ // Default to having -disable-strictnode-mutation on
+ IsStrictFPEnabled = true;
}
// This has so far only been implemented for 64-bit MachO.
@@ -1910,7 +2062,7 @@ bool X86TargetLowering::useLoadStackGuardNode() const {
bool X86TargetLowering::useStackGuardXorFP() const {
// Currently only MSVC CRTs XOR the frame pointer into the stack guard value.
- return Subtarget.getTargetTriple().isOSMSVCRT();
+ return Subtarget.getTargetTriple().isOSMSVCRT() && !Subtarget.isTargetMachO();
}
SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
@@ -1946,9 +2098,13 @@ MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return MVT::i8;
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ CC != CallingConv::X86_RegCall)
+ return MVT::v32i1;
// FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
- Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
+ Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
return MVT::v16i32;
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
@@ -1966,9 +2122,13 @@ unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
(VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
(VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
return VT.getVectorNumElements();
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ CC != CallingConv::X86_RegCall)
+ return 2;
// FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) &&
- Subtarget.hasAVX512() && !Subtarget.hasBWI() && !EnableOldKNLABI)
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
+ Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
return 1;
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
@@ -1988,6 +2148,15 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
return NumIntermediates;
}
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
+ CC != CallingConv::X86_RegCall) {
+ RegisterVT = MVT::v32i1;
+ IntermediateVT = MVT::v32i1;
+ NumIntermediates = 2;
+ return 2;
+ }
+
return TargetLowering::getVectorTypeBreakdownForCallingConv(Context, CC, VT, IntermediateVT,
NumIntermediates, RegisterVT);
}
@@ -2383,6 +2552,10 @@ bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
unsigned DestAS) const {
assert(SrcAS != DestAS && "Expected different address spaces!");
+ const TargetMachine &TM = getTargetMachine();
+ if (TM.getPointerSize(SrcAS) != TM.getPointerSize(DestAS))
+ return false;
+
return SrcAS < 256 && DestAS < 256;
}
@@ -2520,18 +2693,16 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VA.getLocInfo() != CCValAssign::FPExt &&
"Unexpected FP-extend for return value.");
- // If this is x86-64, and we disabled SSE, we can't return FP values,
- // or SSE or MMX vectors.
- if ((ValVT == MVT::f32 || ValVT == MVT::f64 ||
- VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE1())) {
+ // Report an error if we have attempted to return a value via an XMM
+ // register and SSE was disabled.
+ if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
- } else if (ValVT == MVT::f64 &&
- (Subtarget.is64Bit() && !Subtarget.hasSSE2())) {
- // Likewise we can't return F64 values with SSE1 only. gcc does so, but
- // llvm-gcc has never done it right and no one has noticed, so this
- // should be OK for now.
+ } else if (!Subtarget.hasSSE2() &&
+ X86::FR64XRegClass.contains(VA.getLocReg()) &&
+ ValVT == MVT::f64) {
+ // When returning a double via an XMM register, report an error if SSE2 is
+ // not enabled.
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
@@ -2826,7 +2997,6 @@ SDValue X86TargetLowering::LowerCallResult(
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
// Assign locations to each value returned by this call.
SmallVector<CCValAssign, 16> RVLocs;
- bool Is64Bit = Subtarget.is64Bit();
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
CCInfo.AnalyzeCallResult(Ins, RetCC_X86);
@@ -2845,15 +3015,22 @@ SDValue X86TargetLowering::LowerCallResult(
RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
}
- // If this is x86-64, and we disabled SSE, we can't return FP values
- if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
- ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
+ // Report an error if there was an attempt to return FP values via XMM
+ // registers.
+ if (!Subtarget.hasSSE1() && X86::FR32XRegClass.contains(VA.getLocReg())) {
errorUnsupported(DAG, dl, "SSE register return with SSE disabled");
- VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
- } else if (CopyVT == MVT::f64 &&
- (Is64Bit && !Subtarget.hasSSE2())) {
+ if (VA.getLocReg() == X86::XMM1)
+ VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
+ else
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ } else if (!Subtarget.hasSSE2() &&
+ X86::FR64XRegClass.contains(VA.getLocReg()) &&
+ CopyVT == MVT::f64) {
errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled");
- VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
+ if (VA.getLocReg() == X86::XMM1)
+ VA.convertToReg(X86::FP1); // Set reg to FP1, avoid hitting asserts.
+ else
+ VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts.
}
// If we prefer to use the value in xmm registers, copy it out as f80 and
@@ -2895,6 +3072,9 @@ SDValue X86TargetLowering::LowerCallResult(
Val = DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val);
}
+ if (VA.getLocInfo() == CCValAssign::BCvt)
+ Val = DAG.getBitcast(VA.getValVT(), Val);
+
InVals.push_back(Val);
}
@@ -2993,9 +3173,7 @@ static bool shouldGuaranteeTCO(CallingConv::ID CC, bool GuaranteedTailCallOpt) {
}
bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
- auto Attr =
- CI->getParent()->getParent()->getFnAttribute("disable-tail-calls");
- if (!CI->isTailCall() || Attr.getValueAsString() == "true")
+ if (!CI->isTailCall())
return false;
ImmutableCallSite CS(CI);
@@ -3464,8 +3642,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
FuncInfo->getForwardedMustTailRegParms();
CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
- // Conservatively forward AL on x86_64, since it might be used for varargs.
- if (Is64Bit && !CCInfo.isAllocated(X86::AL)) {
+ // Forward AL for SysV x86_64 targets, since it is used for varargs.
+ if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
}
@@ -3618,7 +3796,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
- auto Attr = MF.getFunction().getFnAttribute("disable-tail-calls");
const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
@@ -3634,9 +3811,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
- if (Attr.getValueAsString() == "true")
- isTailCall = false;
-
if (Subtarget.isPICStyleGOT() && !IsGuaranteeTCO) {
// If we are using a GOT, disable tail calls to external symbols with
// default visibility. Tail calling such a symbol requires using a GOT
@@ -3728,7 +3902,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
"the only memory argument");
}
- if (!IsSibcall)
+ if (!IsSibcall && !IsMustTail)
Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush,
NumBytes - NumBytesToPush, dl);
@@ -4013,7 +4187,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
SmallVector<SDValue, 8> Ops;
- if (!IsSibcall && isTailCall) {
+ if (!IsSibcall && isTailCall && !IsMustTail) {
Chain = DAG.getCALLSEQ_END(Chain,
DAG.getIntPtrConstant(NumBytesToPop, dl, true),
DAG.getIntPtrConstant(0, dl, true), InFlag, dl);
@@ -4183,23 +4357,13 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// Align the stack argument size so that, once the return-address slot is
/// pushed, the total is a multiple of the stack alignment (e.g. 16n + 12
/// for a 4-byte slot and a 16-byte alignment requirement).
unsigned
-X86TargetLowering::GetAlignedArgumentStackSize(unsigned StackSize,
- SelectionDAG& DAG) const {
- const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- unsigned StackAlignment = TFI.getStackAlignment();
- uint64_t AlignMask = StackAlignment - 1;
- int64_t Offset = StackSize;
- unsigned SlotSize = RegInfo->getSlotSize();
- if ( (Offset & AlignMask) <= (StackAlignment - SlotSize) ) {
- // Number smaller than 12 so just add the difference.
- Offset += ((StackAlignment - SlotSize) - (Offset & AlignMask));
- } else {
- // Mask out lower bits, add stackalignment once plus the 12 bytes.
- Offset = ((~AlignMask) & Offset) + StackAlignment +
- (StackAlignment-SlotSize);
- }
- return Offset;
+X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
+ SelectionDAG &DAG) const {
+ const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());
+ const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
+ assert(StackSize % SlotSize == 0 &&
+ "StackSize must be a multiple of SlotSize");
+ return alignTo(StackSize + SlotSize, StackAlignment) - SlotSize;
}
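
As a rough scalar sketch of the new computation (hypothetical helper, not part of the patch): the returned size is the smallest value such that, once a return-address slot of SlotSize bytes is pushed on top of it, the total is a multiple of the stack alignment.

#include <cassert>
#include <cstdint>

// Minimal sketch, assuming a power-of-two alignment. For SlotSize == 4 and a
// 16-byte alignment this produces the "16n + 12" shape described above.
static uint64_t alignedArgStackSize(uint64_t StackSize, uint64_t SlotSize,
                                    uint64_t StackAlignment) {
  assert(StackSize % SlotSize == 0 && "StackSize must be a multiple of SlotSize");
  uint64_t Aligned =
      (StackSize + SlotSize + StackAlignment - 1) & ~(StackAlignment - 1);
  return Aligned - SlotSize;
}

// alignedArgStackSize(40, 4, 16) == 44  (44 + 4 == 48, a multiple of 16)
// alignedArgStackSize(44, 4, 16) == 44  (already of the right shape)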
/// Return true if the given stack call argument is already available in the
@@ -4643,8 +4807,8 @@ bool X86::isCalleePop(CallingConv::ID CallingConv,
}
}
-/// Return true if the condition is an unsigned comparison operation.
-static bool isX86CCUnsigned(unsigned X86CC) {
+/// Return true if the condition is a signed comparison operation.
+static bool isX86CCSigned(unsigned X86CC) {
switch (X86CC) {
default:
llvm_unreachable("Invalid integer condition!");
@@ -4654,12 +4818,12 @@ static bool isX86CCUnsigned(unsigned X86CC) {
case X86::COND_A:
case X86::COND_BE:
case X86::COND_AE:
- return true;
+ return false;
case X86::COND_G:
case X86::COND_GE:
case X86::COND_L:
case X86::COND_LE:
- return false;
+ return true;
}
}
@@ -4700,7 +4864,7 @@ static X86::CondCode TranslateX86CC(ISD::CondCode SetCCOpcode, const SDLoc &DL,
// X >= 0 -> X == 0, jump on !sign.
return X86::COND_NS;
}
- if (SetCCOpcode == ISD::SETLT && RHSC->getAPIntValue() == 1) {
+ if (SetCCOpcode == ISD::SETLT && RHSC->isOne()) {
// X < 1 -> X <= 0
RHS = DAG.getConstant(0, DL, RHS.getValueType());
return X86::COND_LE;
@@ -4949,12 +5113,6 @@ bool X86TargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
(1 - MulC).isPowerOf2() || (-(MulC + 1)).isPowerOf2();
}
-bool X86TargetLowering::shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
- bool IsSigned) const {
- // f80 UINT_TO_FP is more efficient using Strict code if FCMOV is available.
- return !IsSigned && FpVT == MVT::f80 && Subtarget.hasCMov();
-}
-
bool X86TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const {
if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
@@ -5334,15 +5492,18 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
static bool canWidenShuffleElements(ArrayRef<int> Mask,
const APInt &Zeroable,
+ bool V2IsZero,
SmallVectorImpl<int> &WidenedMask) {
- SmallVector<int, 32> TargetMask(Mask.begin(), Mask.end());
- for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
- if (TargetMask[i] == SM_SentinelUndef)
- continue;
- if (Zeroable[i])
- TargetMask[i] = SM_SentinelZero;
+ // Create an alternative mask with info about zeroable elements.
+ // Here we do not set undef elements as zeroable.
+ SmallVector<int, 64> ZeroableMask(Mask.begin(), Mask.end());
+ if (V2IsZero) {
+ assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
+ for (int i = 0, Size = Mask.size(); i != Size; ++i)
+ if (Mask[i] != SM_SentinelUndef && Zeroable[i])
+ ZeroableMask[i] = SM_SentinelZero;
}
- return canWidenShuffleElements(TargetMask, WidenedMask);
+ return canWidenShuffleElements(ZeroableMask, WidenedMask);
}
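
For reference, a toy model of the pair-widening step this overload feeds into (simplified sentinels and pairing rules; the real routine also accepts pairs where one half is undef):

#include <vector>

// Toy widening check: -1 == undef, -2 == zero. Each adjacent pair of narrow
// mask elements must describe one wide element, or be entirely undef/zero.
static bool widenPairs(const std::vector<int> &Mask, std::vector<int> &Out) {
  Out.clear();
  for (size_t i = 0; i + 1 < Mask.size(); i += 2) {
    int Lo = Mask[i], Hi = Mask[i + 1];
    if (Lo == -1 && Hi == -1) { Out.push_back(-1); continue; }
    if (Lo == -2 && Hi == -2) { Out.push_back(-2); continue; }
    if (Lo >= 0 && (Lo % 2) == 0 && Hi == Lo + 1) { Out.push_back(Lo / 2); continue; }
    return false; // this pair cannot be expressed as one wide element
  }
  return true;
}

// widenPairs({0, 1, 6, 7}, Out)   -> true,  Out == {0, 3}
// widenPairs({0, 1, -2, -2}, Out) -> true,  Out == {0, -2}   (zeroable pair)
// widenPairs({1, 2, 4, 5}, Out)   -> false  (first pair straddles wide elements)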
static bool canWidenShuffleElements(ArrayRef<int> Mask) {
@@ -5764,11 +5925,29 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
// Widen the vector if needed.
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT, Undef, Vec, ZeroIdx);
- // Clear the upper bits of the subvector and move it to its insert position.
unsigned ShiftLeft = NumElems - SubVecNumElems;
+ unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
+
+  // Do an optimization for the most frequently used types.
+ if (WideOpVT != MVT::v64i1 || Subtarget.is64Bit()) {
+ APInt Mask0 = APInt::getBitsSet(NumElems, IdxVal, IdxVal + SubVecNumElems);
+ Mask0.flipAllBits();
+ SDValue CMask0 = DAG.getConstant(Mask0, dl, MVT::getIntegerVT(NumElems));
+ SDValue VMask0 = DAG.getNode(ISD::BITCAST, dl, WideOpVT, CMask0);
+ Vec = DAG.getNode(ISD::AND, dl, WideOpVT, Vec, VMask0);
+ SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
+ SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
+ DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
+ Op = DAG.getNode(ISD::OR, dl, WideOpVT, Vec, SubVec);
+
+ // Reduce to original width if needed.
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OpVT, Op, ZeroIdx);
+ }
+
+ // Clear the upper bits of the subvector and move it to its insert position.
SubVec = DAG.getNode(X86ISD::KSHIFTL, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftLeft, dl, MVT::i8));
- unsigned ShiftRight = NumElems - SubVecNumElems - IdxVal;
SubVec = DAG.getNode(X86ISD::KSHIFTR, dl, WideOpVT, SubVec,
DAG.getTargetConstant(ShiftRight, dl, MVT::i8));
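
A plain-integer model of the optimized AND/KSHIFT/OR path added above, for a 32-element mask register (hypothetical helper; assumes SubNumElems < 32 and IdxVal + SubNumElems <= 32):

#include <cstdint>

static uint32_t insertMaskBits(uint32_t Vec, uint32_t SubVec,
                               unsigned SubNumElems, unsigned IdxVal) {
  uint32_t Slot = ((1u << SubNumElems) - 1) << IdxVal;    // bits being replaced
  uint32_t Cleared = Vec & ~Slot;                         // AND with the flipped Mask0
  unsigned ShiftLeft = 32 - SubNumElems;                  // drop SubVec's stray upper bits
  unsigned ShiftRight = 32 - SubNumElems - IdxVal;        // land the value at IdxVal
  uint32_t Placed = (SubVec << ShiftLeft) >> ShiftRight;  // KSHIFTL then KSHIFTR
  return Cleared | Placed;                                // final OR
}

// insertMaskBits(0xffffffff, 0xab, 8, 8) == 0xffffabff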
@@ -5850,7 +6029,7 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
"Expected VTs to be the same size!");
unsigned Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
In = extractSubVector(In, 0, DAG, DL,
- std::max(128U, VT.getSizeInBits() / Scale));
+ std::max(128U, (unsigned)VT.getSizeInBits() / Scale));
InVT = In.getValueType();
}
@@ -6719,9 +6898,97 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return true;
}
+/// Compute whether each element of a shuffle is zeroable.
+///
+/// A "zeroable" vector shuffle element is one which can be lowered to zero.
+/// Either it is an undef element in the shuffle mask, the element of the input
+/// referenced is undef, or the element of the input referenced is known to be
+/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
+/// as many lanes with this technique as possible to simplify the remaining
+/// shuffle.
+static void computeZeroableShuffleElements(ArrayRef<int> Mask,
+ SDValue V1, SDValue V2,
+ APInt &KnownUndef, APInt &KnownZero) {
+ int Size = Mask.size();
+ KnownUndef = KnownZero = APInt::getNullValue(Size);
+
+ V1 = peekThroughBitcasts(V1);
+ V2 = peekThroughBitcasts(V2);
+
+ bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
+ bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+
+ int VectorSizeInBits = V1.getValueSizeInBits();
+ int ScalarSizeInBits = VectorSizeInBits / Size;
+ assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
+
+ for (int i = 0; i < Size; ++i) {
+ int M = Mask[i];
+ // Handle the easy cases.
+ if (M < 0) {
+ KnownUndef.setBit(i);
+ continue;
+ }
+ if ((M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+ KnownZero.setBit(i);
+ continue;
+ }
+
+ // Determine shuffle input and normalize the mask.
+ SDValue V = M < Size ? V1 : V2;
+ M %= Size;
+
+ // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
+ if (V.getOpcode() != ISD::BUILD_VECTOR)
+ continue;
+
+    // If the BUILD_VECTOR has fewer elements, then the bitcasted portion of
+ // the (larger) source element must be UNDEF/ZERO.
+ if ((Size % V.getNumOperands()) == 0) {
+ int Scale = Size / V->getNumOperands();
+ SDValue Op = V.getOperand(M / Scale);
+ if (Op.isUndef())
+ KnownUndef.setBit(i);
+ if (X86::isZeroNode(Op))
+ KnownZero.setBit(i);
+ else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
+ APInt Val = Cst->getAPIntValue();
+ Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
+ if (Val == 0)
+ KnownZero.setBit(i);
+ } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
+ APInt Val = Cst->getValueAPF().bitcastToAPInt();
+ Val = Val.extractBits(ScalarSizeInBits, (M % Scale) * ScalarSizeInBits);
+ if (Val == 0)
+ KnownZero.setBit(i);
+ }
+ continue;
+ }
+
+    // If the BUILD_VECTOR has more elements, then all the (smaller) source
+ // elements must be UNDEF or ZERO.
+ if ((V.getNumOperands() % Size) == 0) {
+ int Scale = V->getNumOperands() / Size;
+ bool AllUndef = true;
+ bool AllZero = true;
+ for (int j = 0; j < Scale; ++j) {
+ SDValue Op = V.getOperand((M * Scale) + j);
+ AllUndef &= Op.isUndef();
+ AllZero &= X86::isZeroNode(Op);
+ }
+ if (AllUndef)
+ KnownUndef.setBit(i);
+ if (AllZero)
+ KnownZero.setBit(i);
+ continue;
+ }
+ }
+}
+
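
A self-contained check of what this computes for one assumed input, where V1 is a BUILD_VECTOR {a, b, 0, undef} and V2 is all zeros (values purely illustrative):

#include <cstdio>

int main() {
  const int Mask[4] = {0, 6, 3, -1};          // -1 == undef mask element
  const bool V1Undef[4] = {false, false, false, true};
  const bool V1Zero[4]  = {false, false, true,  false};
  unsigned KnownUndef = 0, KnownZero = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)  { KnownUndef |= 1u << i; continue; } // undef mask element
    if (M >= 4) { KnownZero  |= 1u << i; continue; } // reads the all-zero vector V2
    if (V1Undef[M]) KnownUndef |= 1u << i;           // reads an undef element of V1
    if (V1Zero[M])  KnownZero  |= 1u << i;           // reads a known-zero element of V1
  }
  std::printf("undef=0x%x zero=0x%x\n", KnownUndef, KnownZero); // undef=0xc zero=0x2
}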
/// Decode a target shuffle mask and inputs and see if any values are
/// known to be undef or zero from their inputs.
/// Returns true if the target shuffle mask was decoded.
+/// FIXME: Merge this with computeZeroableShuffleElements?
static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
APInt &KnownUndef, APInt &KnownZero) {
@@ -6741,7 +7008,7 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
V1 = peekThroughBitcasts(V1);
V2 = peekThroughBitcasts(V2);
- assert((VT.getSizeInBits() % Mask.size()) == 0 &&
+ assert((VT.getSizeInBits() % Size) == 0 &&
"Illegal split of shuffle value type");
unsigned EltSizeInBits = VT.getSizeInBits() / Size;
@@ -6810,7 +7077,8 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
// Replace target shuffle mask elements with known undef/zero sentinels.
static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
const APInt &KnownUndef,
- const APInt &KnownZero) {
+ const APInt &KnownZero,
+                                              bool ResolveKnownZeros = true) {
unsigned NumElts = Mask.size();
assert(KnownUndef.getBitWidth() == NumElts &&
KnownZero.getBitWidth() == NumElts && "Shuffle mask size mismatch");
@@ -6818,7 +7086,7 @@ static void resolveTargetShuffleFromZeroables(SmallVectorImpl<int> &Mask,
for (unsigned i = 0; i != NumElts; ++i) {
if (KnownUndef[i])
Mask[i] = SM_SentinelUndef;
- else if (KnownZero[i])
+ else if (ResolveKnownZeros && KnownZero[i])
Mask[i] = SM_SentinelZero;
}
}
@@ -8306,7 +8574,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
// TODO: If multiple splats are generated to load the same constant,
// it may be detrimental to overall size. There needs to be a way to detect
// that condition to know if this is truly a size win.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
// Handle broadcasting a single constant scalar from the constant pool
// into a vector.
@@ -8552,7 +8820,7 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
ImmH = DAG.getBitcast(MVT::v32i1, ImmH);
DstVec = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, ImmL, ImmH);
} else {
- MVT ImmVT = MVT::getIntegerVT(std::max(VT.getSizeInBits(), 8U));
+ MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
SDValue Imm = DAG.getConstant(Immediate, dl, ImmVT);
MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
DstVec = DAG.getBitcast(VecVT, Imm);
@@ -10130,13 +10398,18 @@ static bool isNoopShuffleMask(ArrayRef<int> Mask) {
return true;
}
-/// Test whether there are elements crossing 128-bit lanes in this
+/// Test whether there are elements crossing LaneSizeInBits lanes in this
/// shuffle mask.
///
/// X86 divides up its shuffles into in-lane and cross-lane shuffle operations
/// and we routinely test for these.
-static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
- int LaneSize = 128 / VT.getScalarSizeInBits();
+static bool isLaneCrossingShuffleMask(unsigned LaneSizeInBits,
+ unsigned ScalarSizeInBits,
+ ArrayRef<int> Mask) {
+ assert(LaneSizeInBits && ScalarSizeInBits &&
+ (LaneSizeInBits % ScalarSizeInBits) == 0 &&
+ "Illegal shuffle lane size");
+ int LaneSize = LaneSizeInBits / ScalarSizeInBits;
int Size = Mask.size();
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
@@ -10144,6 +10417,12 @@ static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
return false;
}
+/// Test whether there are elements crossing 128-bit lanes in this
+/// shuffle mask.
+static bool is128BitLaneCrossingShuffleMask(MVT VT, ArrayRef<int> Mask) {
+ return isLaneCrossingShuffleMask(128, VT.getScalarSizeInBits(), Mask);
+}
+
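
The test is pure index arithmetic; the same loop on plain arrays (illustrative names only):

// For a v8f32 mask with 128-bit lanes, LaneSize == 128 / 32 == 4.
static bool crossesLanes(const int *Mask, int Size, int LaneSize) {
  for (int i = 0; i < Size; ++i)
    if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
      return true;
  return false;
}

// int Id[8]   = {0, 1, 2, 3, 4, 5, 6, 7};  crossesLanes(Id,   8, 4) -> false
// int Swap[8] = {4, 5, 6, 7, 0, 1, 2, 3};  crossesLanes(Swap, 8, 4) -> true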
/// Test whether a shuffle mask is equivalent within each sub-lane.
///
/// This checks a shuffle mask to see if it is performing the same
@@ -10424,84 +10703,6 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
return DAG.getTargetConstant(getV4X86ShuffleImm(Mask), DL, MVT::i8);
}
-/// Compute whether each element of a shuffle is zeroable.
-///
-/// A "zeroable" vector shuffle element is one which can be lowered to zero.
-/// Either it is an undef element in the shuffle mask, the element of the input
-/// referenced is undef, or the element of the input referenced is known to be
-/// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
-/// as many lanes with this technique as possible to simplify the remaining
-/// shuffle.
-static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
- SDValue V1, SDValue V2) {
- APInt Zeroable(Mask.size(), 0);
- V1 = peekThroughBitcasts(V1);
- V2 = peekThroughBitcasts(V2);
-
- bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
- bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
-
- int VectorSizeInBits = V1.getValueSizeInBits();
- int ScalarSizeInBits = VectorSizeInBits / Mask.size();
- assert(!(VectorSizeInBits % ScalarSizeInBits) && "Illegal shuffle mask size");
-
- for (int i = 0, Size = Mask.size(); i < Size; ++i) {
- int M = Mask[i];
- // Handle the easy cases.
- if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
- Zeroable.setBit(i);
- continue;
- }
-
- // Determine shuffle input and normalize the mask.
- SDValue V = M < Size ? V1 : V2;
- M %= Size;
-
- // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
- if (V.getOpcode() != ISD::BUILD_VECTOR)
- continue;
-
- // If the BUILD_VECTOR has fewer elements then the bitcasted portion of
- // the (larger) source element must be UNDEF/ZERO.
- if ((Size % V.getNumOperands()) == 0) {
- int Scale = Size / V->getNumOperands();
- SDValue Op = V.getOperand(M / Scale);
- if (Op.isUndef() || X86::isZeroNode(Op))
- Zeroable.setBit(i);
- else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
- APInt Val = Cst->getAPIntValue();
- Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
- Val = Val.getLoBits(ScalarSizeInBits);
- if (Val == 0)
- Zeroable.setBit(i);
- } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
- APInt Val = Cst->getValueAPF().bitcastToAPInt();
- Val.lshrInPlace((M % Scale) * ScalarSizeInBits);
- Val = Val.getLoBits(ScalarSizeInBits);
- if (Val == 0)
- Zeroable.setBit(i);
- }
- continue;
- }
-
- // If the BUILD_VECTOR has more elements then all the (smaller) source
- // elements must be UNDEF or ZERO.
- if ((V.getNumOperands() % Size) == 0) {
- int Scale = V->getNumOperands() / Size;
- bool AllZeroable = true;
- for (int j = 0; j < Scale; ++j) {
- SDValue Op = V.getOperand((M * Scale) + j);
- AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
- }
- if (AllZeroable)
- Zeroable.setBit(i);
- continue;
- }
- }
-
- return Zeroable;
-}
-
// The shuffle result is as follows:
// 0*a[0]0*a[1]...0*a[n], n >= 0, where the a[] elements appear in ascending order.
// Each Zeroable element corresponds to a particular Mask element.
@@ -10616,11 +10817,11 @@ static SDValue lowerShuffleToEXPAND(const SDLoc &DL, MVT VT,
return DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector, ZeroVector, VMask);
}
-static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
- unsigned &UnpackOpcode, bool IsUnary,
- ArrayRef<int> TargetMask,
- const SDLoc &DL, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static bool matchShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
+ unsigned &UnpackOpcode, bool IsUnary,
+ ArrayRef<int> TargetMask, const SDLoc &DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
int NumElts = VT.getVectorNumElements();
bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
@@ -10728,8 +10929,8 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
return SDValue();
}
-static bool matchVectorShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
- int Delta) {
+static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
+ int Delta) {
int Size = (int)Mask.size();
int Split = Size / Delta;
int TruncatedVectorStart = SwappedOps ? Size : 0;
@@ -10814,8 +11015,8 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
// The first half/quarter of the mask should refer to every second/fourth
// element of the vector truncated and bitcasted.
- if (!matchVectorShuffleAsVPMOV(Mask, SwappedOps, 2) &&
- !matchVectorShuffleAsVPMOV(Mask, SwappedOps, 4))
+ if (!matchShuffleAsVPMOV(Mask, SwappedOps, 2) &&
+ !matchShuffleAsVPMOV(Mask, SwappedOps, 4))
return SDValue();
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
@@ -10823,11 +11024,10 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
-static bool matchVectorShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1,
- SDValue &V2, unsigned &PackOpcode,
- ArrayRef<int> TargetMask,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
+ unsigned &PackOpcode, ArrayRef<int> TargetMask,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
unsigned NumElts = VT.getVectorNumElements();
unsigned BitSize = VT.getScalarSizeInBits();
MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
@@ -10880,8 +11080,8 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
- if (matchVectorShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
- Subtarget))
+ if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
+ Subtarget))
return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
DAG.getBitcast(PackVT, V2));
@@ -10972,10 +11172,10 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG);
-static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
- MutableArrayRef<int> Mask,
- const APInt &Zeroable, bool &ForceV1Zero,
- bool &ForceV2Zero, uint64_t &BlendMask) {
+static bool matchShuffleAsBlend(SDValue V1, SDValue V2,
+ MutableArrayRef<int> Mask,
+ const APInt &Zeroable, bool &ForceV1Zero,
+ bool &ForceV2Zero, uint64_t &BlendMask) {
bool V1IsZeroOrUndef =
V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
bool V2IsZeroOrUndef =
@@ -11038,8 +11238,8 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 64> Mask(Original.begin(), Original.end());
- if (!matchVectorShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
- BlendMask))
+ if (!matchShuffleAsBlend(V1, V2, Mask, Zeroable, ForceV1Zero, ForceV2Zero,
+ BlendMask))
return SDValue();
// Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
@@ -11161,7 +11361,7 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
case MVT::v32i16:
case MVT::v64i8: {
// Attempt to lower to a bitmask if we can. Only if not optimizing for size.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize) {
if (SDValue Masked = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
Subtarget, DAG))
@@ -11609,9 +11809,11 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
}
/// Try to lower a vector shuffle as a byte shift sequence.
-static SDValue lowerVectorShuffleAsByteShiftMask(
- const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
- const APInt &Zeroable, const X86Subtarget &Subtarget, SelectionDAG &DAG) {
+static SDValue lowerShuffleAsByteShiftMask(const SDLoc &DL, MVT VT, SDValue V1,
+ SDValue V2, ArrayRef<int> Mask,
+ const APInt &Zeroable,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
assert(VT.is128BitVector() && "Only 128-bit vectors supported");
@@ -14056,8 +14258,8 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return BitBlend;
// Try to use byte shift instructions to mask.
- if (SDValue V = lowerVectorShuffleAsByteShiftMask(
- DL, MVT::v8i16, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v8i16, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return V;
// Try to lower by permuting the inputs into an unpack instruction.
@@ -14318,8 +14520,8 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
// Try to use byte shift instructions to mask.
- if (SDValue V = lowerVectorShuffleAsByteShiftMask(
- DL, MVT::v16i8, V1, V2, Mask, Zeroable, Subtarget, DAG))
+ if (SDValue V = lowerShuffleAsByteShiftMask(DL, MVT::v16i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
return V;
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
@@ -14686,6 +14888,36 @@ static SDValue lowerShuffleAsSplitOrBlend(const SDLoc &DL, MVT VT, SDValue V1,
DAG);
}
+// Lower as SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+// TODO: Extend to support v8f32 (+ 512-bit shuffles).
+static SDValue lowerShuffleAsLanePermuteAndSHUFP(const SDLoc &DL, MVT VT,
+ SDValue V1, SDValue V2,
+ ArrayRef<int> Mask,
+ SelectionDAG &DAG) {
+ assert(VT == MVT::v4f64 && "Only for v4f64 shuffles");
+
+ int LHSMask[4] = {-1, -1, -1, -1};
+ int RHSMask[4] = {-1, -1, -1, -1};
+ unsigned SHUFPMask = 0;
+
+ // As SHUFPD uses a single LHS/RHS element per lane, we can always
+ // perform the shuffle once the lanes have been shuffled in place.
+ for (int i = 0; i != 4; ++i) {
+ int M = Mask[i];
+ if (M < 0)
+ continue;
+ int LaneBase = i & ~1;
+ auto &LaneMask = (i & 1) ? RHSMask : LHSMask;
+ LaneMask[LaneBase + (M & 1)] = M;
+ SHUFPMask |= (M & 1) << i;
+ }
+
+ SDValue LHS = DAG.getVectorShuffle(VT, DL, V1, V2, LHSMask);
+ SDValue RHS = DAG.getVectorShuffle(VT, DL, V1, V2, RHSMask);
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, LHS, RHS,
+ DAG.getTargetConstant(SHUFPMask, DL, MVT::i8));
+}
+
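
A worked example of the decomposition for the concrete v4f64 mask {2, 1, 3, 0}, running the same loop on plain arrays (illustrative only):

#include <cstdio>

int main() {
  const int Mask[4] = {2, 1, 3, 0};
  int LHSMask[4] = {-1, -1, -1, -1}, RHSMask[4] = {-1, -1, -1, -1};
  unsigned SHUFPMask = 0;
  for (int i = 0; i != 4; ++i) {
    int M = Mask[i];
    if (M < 0)
      continue;
    int LaneBase = i & ~1;
    int *LaneMask = (i & 1) ? RHSMask : LHSMask;
    LaneMask[LaneBase + (M & 1)] = M;
    SHUFPMask |= (M & 1) << i;
  }
  // LHSMask == {2, -1, -1, 3}, RHSMask == {-1, 1, 0, -1}, SHUFPMask == 0x6.
  // SHUFPD with imm 0x6 then picks LHS[0], RHS[1], LHS[3], RHS[2], i.e.
  // elements 2, 1, 3, 0 of the original vector, reproducing the mask.
  std::printf("imm = 0x%x\n", SHUFPMask);
}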
/// Lower a vector shuffle crossing multiple 128-bit lanes as
/// a lane permutation followed by a per-lane permutation.
///
@@ -14764,13 +14996,22 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
int Size = Mask.size();
int LaneSize = Size / 2;
+ // Fold to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+ // Only do this if the elements aren't all from the lower lane,
+ // otherwise we're (probably) better off doing a split.
+ if (VT == MVT::v4f64 &&
+ !all_of(Mask, [LaneSize](int M) { return M < LaneSize; }))
+ if (SDValue V =
+ lowerShuffleAsLanePermuteAndSHUFP(DL, VT, V1, V2, Mask, DAG))
+ return V;
+
// If there are only inputs from one 128-bit lane, splitting will in fact be
// less expensive. The flags track whether the given lane contains an element
// that crosses to another lane.
if (!Subtarget.hasAVX2()) {
bool LaneCrossing[2] = {false, false};
for (int i = 0; i < Size; ++i)
- if (Mask[i] >= 0 && (Mask[i] % Size) / LaneSize != i / LaneSize)
+ if (Mask[i] >= 0 && ((Mask[i] % Size) / LaneSize) != (i / LaneSize))
LaneCrossing[(Mask[i] % Size) / LaneSize] = true;
if (!LaneCrossing[0] || !LaneCrossing[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
@@ -14778,7 +15019,7 @@ static SDValue lowerShuffleAsLanePermuteAndShuffle(
bool LaneUsed[2] = {false, false};
for (int i = 0; i < Size; ++i)
if (Mask[i] >= 0)
- LaneUsed[(Mask[i] / LaneSize)] = true;
+ LaneUsed[(Mask[i] % Size) / LaneSize] = true;
if (!LaneUsed[0] || !LaneUsed[1])
return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
}
@@ -14817,8 +15058,10 @@ static SDValue lowerV2X128Shuffle(const SDLoc &DL, MVT VT, SDValue V1,
if (Subtarget.hasAVX2() && V2.isUndef())
return SDValue();
+ bool V2IsZero = !V2.isUndef() && ISD::isBuildVectorAllZeros(V2.getNode());
+
SmallVector<int, 4> WidenedMask;
- if (!canWidenShuffleElements(Mask, Zeroable, WidenedMask))
+ if (!canWidenShuffleElements(Mask, Zeroable, V2IsZero, WidenedMask))
return SDValue();
bool IsLowZero = (Zeroable & 0x3) == 0x3;
@@ -15637,6 +15880,18 @@ static SDValue lowerV4F64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return Op;
+ // If we have lane crossing shuffles AND they don't all come from the lower
+ // lane elements, lower to SHUFPD(VPERM2F128(V1, V2), VPERM2F128(V1, V2)).
+  // TODO: Handle BUILD_VECTOR sources, which getVectorShuffle currently
+  // canonicalizes to a blend of splats; that isn't necessary for this combine.
+ if (is128BitLaneCrossingShuffleMask(MVT::v4f64, Mask) &&
+ !all_of(Mask, [](int M) { return M < 2 || (4 <= M && M < 6); }) &&
+ (V1.getOpcode() != ISD::BUILD_VECTOR) &&
+ (V2.getOpcode() != ISD::BUILD_VECTOR))
+ if (SDValue Op = lowerShuffleAsLanePermuteAndSHUFP(DL, MVT::v4f64, V1, V2,
+ Mask, DAG))
+ return Op;
+
// If we have one input in place, then we can permute the other input and
// blend the result.
if (isShuffleMaskInputInPlace(0, Mask) || isShuffleMaskInputInPlace(1, Mask))
@@ -16950,6 +17205,10 @@ static SDValue lower1BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
ExtVT = Subtarget.canExtendTo512BW() ? MVT::v32i16 : MVT::v32i8;
break;
case MVT::v64i1:
+ // Fall back to scalarization. FIXME: We can do better if the shuffle
+ // can be partitioned cleanly.
+ if (!Subtarget.useBWIRegs())
+ return SDValue();
ExtVT = MVT::v64i8;
break;
}
@@ -17039,8 +17298,8 @@ static bool canonicalizeShuffleMaskWithCommute(ArrayRef<int> Mask) {
/// above in helper routines. The canonicalization attempts to widen shuffles
/// to involve fewer lanes of wider elements, consolidate symmetric patterns
/// s.t. only one of the two inputs needs to be tested, etc.
-static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
- SelectionDAG &DAG) {
+static SDValue lowerVECTOR_SHUFFLE(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> OrigMask = SVOp->getMask();
SDValue V1 = Op.getOperand(0);
@@ -17086,29 +17345,22 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
// We actually see shuffles that are entirely re-arrangements of a set of
// zero inputs. This mostly happens while decomposing complex shuffles into
// simple ones. Directly lower these as a buildvector of zeros.
- APInt Zeroable = computeZeroableShuffleElements(OrigMask, V1, V2);
+ APInt KnownUndef, KnownZero;
+ computeZeroableShuffleElements(OrigMask, V1, V2, KnownUndef, KnownZero);
+
+ APInt Zeroable = KnownUndef | KnownZero;
if (Zeroable.isAllOnesValue())
return getZeroVector(VT, Subtarget, DAG, DL);
bool V2IsZero = !V2IsUndef && ISD::isBuildVectorAllZeros(V2.getNode());
- // Create an alternative mask with info about zeroable elements.
- // Here we do not set undef elements as zeroable.
- SmallVector<int, 64> ZeroableMask(OrigMask.begin(), OrigMask.end());
- if (V2IsZero) {
- assert(!Zeroable.isNullValue() && "V2's non-undef elements are used?!");
- for (int i = 0; i != NumElements; ++i)
- if (OrigMask[i] != SM_SentinelUndef && Zeroable[i])
- ZeroableMask[i] = SM_SentinelZero;
- }
-
// Try to collapse shuffles into using a vector type with fewer elements but
// wider element types. We cap this to not form integers or floating point
// elements wider than 64 bits, but it might be interesting to form i128
// integers to handle flipping the low and high halves of AVX 256-bit vectors.
SmallVector<int, 16> WidenedMask;
if (VT.getScalarSizeInBits() < 64 && !Is1BitVector &&
- canWidenShuffleElements(ZeroableMask, WidenedMask)) {
+ canWidenShuffleElements(OrigMask, Zeroable, V2IsZero, WidenedMask)) {
// Shuffle mask widening should not interfere with a broadcast opportunity
// by obfuscating the operands with bitcasts.
// TODO: Avoid lowering directly from this top-level function: make this
@@ -18307,7 +18559,7 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
"Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
if (!OptForSize && Subtarget.isSHLDSlow())
return SDValue();
@@ -18328,8 +18580,13 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert((Op.getOpcode() == ISD::SINT_TO_FP ||
- Op.getOpcode() == ISD::UINT_TO_FP) && "Unexpected opcode!");
- SDValue Src = Op.getOperand(0);
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_UINT_TO_FP ||
+ Op.getOpcode() == ISD::UINT_TO_FP) &&
+ "Unexpected opcode!");
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
@@ -18346,7 +18603,17 @@ static SDValue LowerI64IntToFP_AVX512DQ(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Op);
SDValue InVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VecInVT, Src);
+ if (IsStrict) {
+ SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, {VecVT, MVT::Other},
+ {Op.getOperand(0), InVec});
+ SDValue Chain = CvtVec.getValue(1);
+ SDValue Value = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
+ DAG.getIntPtrConstant(0, dl));
+ return DAG.getMergeValues({Value, Chain}, dl);
+ }
+
SDValue CvtVec = DAG.getNode(Op.getOpcode(), dl, VecVT, InVec);
+
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, CvtVec,
DAG.getIntPtrConstant(0, dl));
}
@@ -18415,44 +18682,157 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, DL));
}
+static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ bool IsStrict = Op->isStrictFPOpcode();
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue Src = Op->getOperand(IsStrict ? 1 : 0);
+
+ if (Subtarget.hasDQI()) {
+ assert(!Subtarget.hasVLX() && "Unexpected features");
+
+ assert((Src.getSimpleValueType() == MVT::v2i64 ||
+ Src.getSimpleValueType() == MVT::v4i64) &&
+ "Unsupported custom type");
+
+ // With AVX512DQ, but not VLX we need to widen to get a 512-bit result type.
+ assert((VT == MVT::v4f32 || VT == MVT::v2f64 || VT == MVT::v4f64) &&
+ "Unexpected VT!");
+ MVT WideVT = VT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
+
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ SDValue Tmp = IsStrict ? DAG.getConstant(0, DL, MVT::v8i64)
+ : DAG.getUNDEF(MVT::v8i64);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i64, Tmp, Src,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Op.getOpcode(), DL, {WideVT, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Op.getOpcode(), DL, WideVT, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
+ }
+
+ bool IsSigned = Op->getOpcode() == ISD::SINT_TO_FP ||
+ Op->getOpcode() == ISD::STRICT_SINT_TO_FP;
+ if (VT != MVT::v4f32 || IsSigned)
+ return SDValue();
+
+ SDValue Zero = DAG.getConstant(0, DL, MVT::v4i64);
+ SDValue One = DAG.getConstant(1, DL, MVT::v4i64);
+ SDValue Sign = DAG.getNode(ISD::OR, DL, MVT::v4i64,
+ DAG.getNode(ISD::SRL, DL, MVT::v4i64, Src, One),
+ DAG.getNode(ISD::AND, DL, MVT::v4i64, Src, One));
+ SDValue IsNeg = DAG.getSetCC(DL, MVT::v4i64, Src, Zero, ISD::SETLT);
+ SDValue SignSrc = DAG.getSelect(DL, MVT::v4i64, IsNeg, Sign, Src);
+ SmallVector<SDValue, 4> SignCvts(4);
+ SmallVector<SDValue, 4> Chains(4);
+ for (int i = 0; i != 4; ++i) {
+ SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
+ DAG.getIntPtrConstant(i, DL));
+ if (IsStrict) {
+ SignCvts[i] =
+ DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
+ {Op.getOperand(0), Src});
+ Chains[i] = SignCvts[i].getValue(1);
+ } else {
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src);
+ }
+ }
+ SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
+
+ SDValue Slow, Chain;
+ if (IsStrict) {
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+ Slow = DAG.getNode(ISD::STRICT_FADD, DL, {MVT::v4f32, MVT::Other},
+ {Chain, SignCvt, SignCvt});
+ Chain = Slow.getValue(1);
+ } else {
+ Slow = DAG.getNode(ISD::FADD, DL, MVT::v4f32, SignCvt, SignCvt);
+ }
+
+ IsNeg = DAG.getNode(ISD::TRUNCATE, DL, MVT::v4i32, IsNeg);
+ SDValue Cvt = DAG.getSelect(DL, MVT::v4f32, IsNeg, Slow, SignCvt);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cvt, Chain}, DL);
+
+ return Cvt;
+}
+
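
The code above is the vector form of the usual unsigned-64-to-float conversion trick; a scalar sketch with an assumed helper name:

#include <cstdint>

static float u64ToF32(uint64_t X) {
  if ((int64_t)X >= 0)
    return (float)(int64_t)X;            // fits in a signed i64: convert directly
  uint64_t Halved = (X >> 1) | (X & 1);  // halve, keeping the low bit sticky
  float F = (float)(int64_t)Halved;      // now fits in a signed i64
  return F + F;                          // the final add restores the magnitude
}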
SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- SDValue Src = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
+ SDValue Chain = IsStrict ? Op->getOperand(0) : DAG.getEntryNode();
MVT SrcVT = Src.getSimpleValueType();
MVT VT = Op.getSimpleValueType();
SDLoc dl(Op);
- if (VT == MVT::f128)
- return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
-
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
+      // Note: Since v2f64 is a legal type, we don't need to zero extend the
+      // source for strict FP.
+ if (IsStrict)
+ return DAG.getNode(
+ X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
+ {Chain, DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getUNDEF(SrcVT))});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT,
DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
DAG.getUNDEF(SrcVT)));
}
+ if (SrcVT == MVT::v2i64 || SrcVT == MVT::v4i64)
+ return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
+
return SDValue();
}
assert(SrcVT <= MVT::i64 && SrcVT >= MVT::i16 &&
"Unknown SINT_TO_FP to lower!");
+ bool UseSSEReg = isScalarFPTypeInSSEReg(VT);
+
// These are really Legal; return the operand so the caller accepts it as
// Legal.
- if (SrcVT == MVT::i32 && isScalarFPTypeInSSEReg(VT))
+ if (SrcVT == MVT::i32 && UseSSEReg)
return Op;
- if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) && Subtarget.is64Bit())
+ if (SrcVT == MVT::i64 && UseSSEReg && Subtarget.is64Bit())
return Op;
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
return V;
- SDValue ValueToStore = Op.getOperand(0);
- if (SrcVT == MVT::i64 && isScalarFPTypeInSSEReg(VT) &&
- !Subtarget.is64Bit())
+ // SSE doesn't have an i16 conversion so we need to promote.
+ if (SrcVT == MVT::i16 && (UseSSEReg || VT == MVT::f128)) {
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i32, Src);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {Chain, Ext});
+
+ return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Ext);
+ }
+
+ if (VT == MVT::f128)
+ return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
+
+ SDValue ValueToStore = Src;
+ if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
@@ -18463,13 +18843,18 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
auto PtrVT = getPointerTy(MF.getDataLayout());
int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- SDValue Chain = DAG.getStore(
- DAG.getEntryNode(), dl, ValueToStore, StackSlot,
+ Chain = DAG.getStore(
+ Chain, dl, ValueToStore, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
- return BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+ std::pair<SDValue, SDValue> Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
+
+ return Tmp.first;
}
-SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue StackSlot,
SelectionDAG &DAG) const {
// Build the FILD
@@ -18498,9 +18883,9 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
SDValue Result =
DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
Tys, FILDOps, SrcVT, LoadMMO);
+ Chain = Result.getValue(1);
if (useSSE) {
- Chain = Result.getValue(1);
SDValue InFlag = Result.getValue(2);
// FIXME: Currently the FST is glued to the FILD_FLAG. This
@@ -18522,9 +18907,10 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
Result = DAG.getLoad(
Op.getValueType(), DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
+ Chain = Result.getValue(1);
}
- return Result;
+ return { Result, Chain };
}
/// Horizontal vector math instructions may be slower than normal math with
@@ -18532,7 +18918,7 @@ SDValue X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
/// implementation, and likely shuffle complexity of the alternate sequence.
static bool shouldUseHorizontalOp(bool IsSingleSource, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- bool IsOptimizingSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool IsOptimizingSize = DAG.shouldOptForSize();
bool HasFastHOps = Subtarget.hasFastHorizontalOps();
return !IsSingleSource || IsOptimizingSize || HasFastHOps;
}
@@ -18553,6 +18939,8 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
#endif
*/
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
SDLoc dl(Op);
LLVMContext *Context = DAG.getContext();
@@ -18573,8 +18961,8 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
// Load the 64-bit value into an XMM register.
- SDValue XR1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
- Op.getOperand(0));
+ SDValue XR1 =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64, Op.getOperand(OpNo));
SDValue CLod0 =
DAG.getLoad(MVT::v4i32, dl, DAG.getEntryNode(), CPIdx0,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
@@ -18587,51 +18975,81 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
/* Alignment = */ 16);
SDValue XR2F = DAG.getBitcast(MVT::v2f64, Unpck1);
+ SDValue Sub;
+ SDValue Chain;
// TODO: Are there any fast-math-flags to propagate here?
- SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
+ if (IsStrict) {
+ Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), XR2F, CLod1});
+ Chain = Sub.getValue(1);
+ } else
+ Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, XR2F, CLod1);
SDValue Result;
- if (Subtarget.hasSSE3() && shouldUseHorizontalOp(true, DAG, Subtarget)) {
+ if (!IsStrict && Subtarget.hasSSE3() &&
+ shouldUseHorizontalOp(true, DAG, Subtarget)) {
+ // FIXME: Do we need a STRICT version of FHADD?
Result = DAG.getNode(X86ISD::FHADD, dl, MVT::v2f64, Sub, Sub);
} else {
SDValue Shuffle = DAG.getVectorShuffle(MVT::v2f64, dl, Sub, Sub, {1,-1});
- Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
+ if (IsStrict) {
+ Result = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v2f64, MVT::Other},
+ {Chain, Shuffle, Sub});
+ Chain = Result.getValue(1);
+ } else
+ Result = DAG.getNode(ISD::FADD, dl, MVT::v2f64, Shuffle, Sub);
}
+ Result = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Result, Chain}, dl);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Result,
- DAG.getIntPtrConstant(0, dl));
+ return Result;
}
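
A scalar sketch of what the punpckldq/subpd/add sequence above computes (assumed helper name; the hexfloat constants need C++17):

#include <cstdint>
#include <cstring>

static double u64ToF64(uint64_t X) {
  // Bias the two 32-bit halves into the mantissas of 2^52 and 2^84.
  uint64_t LoBits = (X & 0xffffffffULL) | 0x4330000000000000ULL; // 2^52 + lo
  uint64_t HiBits = (X >> 32)           | 0x4530000000000000ULL; // 2^84 + hi * 2^32
  double Lo, Hi;
  std::memcpy(&Lo, &LoBits, sizeof(Lo));
  std::memcpy(&Hi, &HiBits, sizeof(Hi));
  // Both subtractions are exact; the single final add does the only rounding.
  return (Hi - 0x1.0p84) + (Lo - 0x1.0p52);
}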
/// 32-bit unsigned integer to float expansion.
static SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
SDLoc dl(Op);
// FP constant to bias correct the final result.
SDValue Bias = DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl,
MVT::f64);
// Load the 32-bit value into an XMM register.
- SDValue Load = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32,
- Op.getOperand(0));
+ SDValue Load =
+ DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Op.getOperand(OpNo));
// Zero out the upper parts of the register.
Load = getShuffleVectorZeroOrUndef(Load, 0, true, Subtarget, DAG);
- Load = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
- DAG.getBitcast(MVT::v2f64, Load),
- DAG.getIntPtrConstant(0, dl));
-
// Or the load with the bias.
SDValue Or = DAG.getNode(
ISD::OR, dl, MVT::v2i64,
- DAG.getBitcast(MVT::v2i64,
- DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Load)),
+ DAG.getBitcast(MVT::v2i64, Load),
DAG.getBitcast(MVT::v2i64,
DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2f64, Bias)));
Or =
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
DAG.getBitcast(MVT::v2f64, Or), DAG.getIntPtrConstant(0, dl));
+ if (Op.getNode()->isStrictFPOpcode()) {
+ // Subtract the bias.
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Chain = Op.getOperand(0);
+ SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::f64, MVT::Other},
+ {Chain, Or, Bias});
+
+ if (Op.getValueType() == Sub.getValueType())
+ return Sub;
+
+ // Handle final rounding.
+ std::pair<SDValue, SDValue> ResultPair = DAG.getStrictFPExtendOrRound(
+ Sub, Sub.getValue(1), dl, Op.getSimpleValueType());
+
+ return DAG.getMergeValues({ResultPair.first, ResultPair.second}, dl);
+ }
+
// Subtract the bias.
// TODO: Are there any fast-math-flags to propagate here?
SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::f64, Or, Bias);
@@ -18646,38 +19064,123 @@ static SDValue lowerUINT_TO_FP_v2i32(SDValue Op, SelectionDAG &DAG,
if (Op.getSimpleValueType() != MVT::v2f64)
return SDValue();
- SDValue N0 = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+
+ SDValue N0 = Op.getOperand(IsStrict ? 1 : 0);
assert(N0.getSimpleValueType() == MVT::v2i32 && "Unexpected input type");
- // Legalize to v4i32 type.
- N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
- DAG.getUNDEF(MVT::v2i32));
+ if (Subtarget.hasAVX512()) {
+ if (!Subtarget.hasVLX()) {
+ // Let generic type legalization widen this.
+ if (!IsStrict)
+ return SDValue();
+ // Otherwise pad the integer input with 0s and widen the operation.
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getConstant(0, DL, MVT::v2i32));
+ SDValue Res = DAG.getNode(Op->getOpcode(), DL, {MVT::v4f64, MVT::Other},
+ {Op.getOperand(0), N0});
+ SDValue Chain = Res.getValue(1);
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2f64, Res,
+ DAG.getIntPtrConstant(0, DL));
+ return DAG.getMergeValues({Res, Chain}, DL);
+ }
- if (Subtarget.hasAVX512())
+ // Legalize to v4i32 type.
+ N0 = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+ DAG.getUNDEF(MVT::v2i32));
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_CVTUI2P, DL, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), N0});
return DAG.getNode(X86ISD::CVTUI2P, DL, MVT::v2f64, N0);
+ }
- // Same implementation as VectorLegalizer::ExpandUINT_TO_FLOAT,
- // but using v2i32 to v2f64 with X86ISD::CVTSI2P.
- SDValue HalfWord = DAG.getConstant(16, DL, MVT::v4i32);
- SDValue HalfWordMask = DAG.getConstant(0x0000FFFF, DL, MVT::v4i32);
-
- // Two to the power of half-word-size.
- SDValue TWOHW = DAG.getConstantFP((double)(1 << 16), DL, MVT::v2f64);
-
- // Clear upper part of LO, lower HI.
- SDValue HI = DAG.getNode(ISD::SRL, DL, MVT::v4i32, N0, HalfWord);
- SDValue LO = DAG.getNode(ISD::AND, DL, MVT::v4i32, N0, HalfWordMask);
-
- SDValue fHI = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, HI);
- fHI = DAG.getNode(ISD::FMUL, DL, MVT::v2f64, fHI, TWOHW);
- SDValue fLO = DAG.getNode(X86ISD::CVTSI2P, DL, MVT::v2f64, LO);
+ // Zero extend to 2i64, OR with the floating point representation of 2^52.
+ // This gives us the floating point equivalent of 2^52 + the i32 integer
+  // since double has 52 bits of mantissa. Then subtract 2^52 in floating
+  // point, leaving just our i32 integers in double format.
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v2i64, N0);
+ SDValue VBias =
+ DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), DL, MVT::v2f64);
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v2i64, ZExtIn,
+ DAG.getBitcast(MVT::v2i64, VBias));
+ Or = DAG.getBitcast(MVT::v2f64, Or);
- // Add the two halves.
- return DAG.getNode(ISD::FADD, DL, MVT::v2f64, fHI, fLO);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v2f64, MVT::Other},
+ {Op.getOperand(0), Or, VBias});
+ return DAG.getNode(ISD::FSUB, DL, MVT::v2f64, Or, VBias);
}
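
The comment above is the standard 2^52 mantissa-bias trick; a scalar sketch with an assumed helper name:

#include <cstdint>
#include <cstring>

static double u32ToF64(uint32_t X) {
  uint64_t Bits = (uint64_t)X | 0x4330000000000000ULL; // bit pattern of 2^52 + X
  double D;
  std::memcpy(&D, &Bits, sizeof(D));
  return D - 0x1.0p52; // exact: the result is X as a double
}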
static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ SDLoc DL(Op);
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue V = Op->getOperand(IsStrict ? 1 : 0);
+ MVT VecIntVT = V.getSimpleValueType();
+ assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
+ "Unsupported custom type");
+
+ if (Subtarget.hasAVX512()) {
+ // With AVX512, but not VLX we need to widen to get a 512-bit result type.
+ assert(!Subtarget.hasVLX() && "Unexpected features");
+ MVT VT = Op->getSimpleValueType(0);
+
+ // v8i32->v8f64 is legal with AVX512 so just return it.
+ if (VT == MVT::v8f64)
+ return Op;
+
+ assert((VT == MVT::v4f32 || VT == MVT::v8f32 || VT == MVT::v4f64) &&
+ "Unexpected VT!");
+ MVT WideVT = VT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
+ MVT WideIntVT = VT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ SDValue Tmp =
+ IsStrict ? DAG.getConstant(0, DL, WideIntVT) : DAG.getUNDEF(WideIntVT);
+ V = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, WideIntVT, Tmp, V,
+ DAG.getIntPtrConstant(0, DL));
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_UINT_TO_FP, DL, {WideVT, MVT::Other},
+ {Op->getOperand(0), V});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(ISD::UINT_TO_FP, DL, WideVT, V);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Res,
+ DAG.getIntPtrConstant(0, DL));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, DL);
+ return Res;
+ }
+
+ if (Subtarget.hasAVX() && VecIntVT == MVT::v4i32 &&
+ Op->getSimpleValueType(0) == MVT::v4f64) {
+ SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::v4i64, V);
+ Constant *Bias = ConstantFP::get(
+ *DAG.getContext(),
+ APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
+ auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8);
+ SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
+ SDValue VBias = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
+ /*Alignment*/ 8, MachineMemOperand::MOLoad);
+
+ SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
+ DAG.getBitcast(MVT::v4i64, VBias));
+ Or = DAG.getBitcast(MVT::v4f64, Or);
+
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_FSUB, DL, {MVT::v4f64, MVT::Other},
+ {Op.getOperand(0), Or, VBias});
+ return DAG.getNode(ISD::FSUB, DL, MVT::v4f64, Or, VBias);
+ }
+
// The algorithm is the following:
// #ifdef __SSE4_1__
// uint4 lo = _mm_blend_epi16( v, (uint4) 0x4b000000, 0xaa);
@@ -18690,18 +19193,6 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
// return (float4) lo + fhi;
- // We shouldn't use it when unsafe-fp-math is enabled though: we might later
- // reassociate the two FADDs, and if we do that, the algorithm fails
- // spectacularly (PR24512).
- // FIXME: If we ever have some kind of Machine FMF, this should be marked
- // as non-fast and always be enabled. Why isn't SDAG FMF enough? Because
- // there's also the MachineCombiner reassociations happening on Machine IR.
- if (DAG.getTarget().Options.UnsafeFPMath)
- return SDValue();
-
- SDLoc DL(Op);
- SDValue V = Op->getOperand(0);
- MVT VecIntVT = V.getSimpleValueType();
bool Is128 = VecIntVT == MVT::v4i32;
MVT VecFloatVT = Is128 ? MVT::v4f32 : MVT::v8f32;
// If we convert to something else than the supported type, e.g., to v4f64,
@@ -18709,9 +19200,6 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
if (VecFloatVT != Op->getSimpleValueType(0))
return SDValue();
- assert((VecIntVT == MVT::v4i32 || VecIntVT == MVT::v8i32) &&
- "Unsupported custom type");
-
// In the #idef/#else code, we have in common:
// - The vector of constants:
// -- 0x4b000000
@@ -18756,23 +19244,35 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
High = DAG.getNode(ISD::OR, DL, VecIntVT, HighShift, VecCstHigh);
}
- // Create the vector constant for -(0x1.0p39f + 0x1.0p23f).
- SDValue VecCstFAdd = DAG.getConstantFP(
- APFloat(APFloat::IEEEsingle(), APInt(32, 0xD3000080)), DL, VecFloatVT);
+ // Create the vector constant for (0x1.0p39f + 0x1.0p23f).
+ SDValue VecCstFSub = DAG.getConstantFP(
+ APFloat(APFloat::IEEEsingle(), APInt(32, 0x53000080)), DL, VecFloatVT);
// float4 fhi = (float4) hi - (0x1.0p39f + 0x1.0p23f);
+ // NOTE: By using fsub of a positive constant instead of fadd of a negative
+ // constant, we avoid reassociation in MachineCombiner when unsafe-fp-math is
+ // enabled. See PR24512.
SDValue HighBitcast = DAG.getBitcast(VecFloatVT, High);
// TODO: Are there any fast-math-flags to propagate here?
- SDValue FHigh =
- DAG.getNode(ISD::FADD, DL, VecFloatVT, HighBitcast, VecCstFAdd);
- // return (float4) lo + fhi;
+ // (float4) lo;
SDValue LowBitcast = DAG.getBitcast(VecFloatVT, Low);
+ // return (float4) lo + fhi;
+ if (IsStrict) {
+ SDValue FHigh = DAG.getNode(ISD::STRICT_FSUB, DL, {VecFloatVT, MVT::Other},
+ {Op.getOperand(0), HighBitcast, VecCstFSub});
+ return DAG.getNode(ISD::STRICT_FADD, DL, {VecFloatVT, MVT::Other},
+ {FHigh.getValue(1), LowBitcast, FHigh});
+ }
+
+ SDValue FHigh =
+ DAG.getNode(ISD::FSUB, DL, VecFloatVT, HighBitcast, VecCstFSub);
return DAG.getNode(ISD::FADD, DL, VecFloatVT, LowBitcast, FHigh);
}
static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDValue N0 = Op.getOperand(0);
+ unsigned OpNo = Op.getNode()->isStrictFPOpcode() ? 1 : 0;
+ SDValue N0 = Op.getOperand(OpNo);
MVT SrcVT = N0.getSimpleValueType();
SDLoc dl(Op);
@@ -18783,18 +19283,23 @@ static SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG,
return lowerUINT_TO_FP_v2i32(Op, DAG, Subtarget, dl);
case MVT::v4i32:
case MVT::v8i32:
- assert(!Subtarget.hasAVX512());
return lowerUINT_TO_FP_vXi32(Op, DAG, Subtarget);
+ case MVT::v2i64:
+ case MVT::v4i64:
+ return lowerINT_TO_FP_vXi64(Op, DAG, Subtarget);
}
}
SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SelectionDAG &DAG) const {
- SDValue N0 = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Src = Op.getOperand(OpNo);
SDLoc dl(Op);
auto PtrVT = getPointerTy(DAG.getDataLayout());
- MVT SrcVT = N0.getSimpleValueType();
- MVT DstVT = Op.getSimpleValueType();
+ MVT SrcVT = Src.getSimpleValueType();
+ MVT DstVT = Op->getSimpleValueType(0);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
if (DstVT == MVT::f128)
return LowerF128Call(Op, DAG, RTLIB::getUINTTOFP(SrcVT, DstVT));
@@ -18814,8 +19319,11 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// Promote i32 to i64 and use a signed conversion on 64-bit targets.
if (SrcVT == MVT::i32 && Subtarget.is64Bit()) {
- N0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, N0);
- return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, N0);
+ Src = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, Src);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {DstVT, MVT::Other},
+ {Chain, Src});
+ return DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
}
if (SDValue V = LowerI64IntToFP_AVX512DQ(Op, DAG, Subtarget))
@@ -18823,7 +19331,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
if (SrcVT == MVT::i64 && DstVT == MVT::f64 && X86ScalarSSEf64)
return LowerUINT_TO_FP_i64(Op, DAG, Subtarget);
- if (SrcVT == MVT::i32 && X86ScalarSSEf64)
+ if (SrcVT == MVT::i32 && X86ScalarSSEf64 && DstVT != MVT::f80)
return LowerUINT_TO_FP_i32(Op, DAG, Subtarget);
if (Subtarget.is64Bit() && SrcVT == MVT::i64 && DstVT == MVT::f32)
return SDValue();
@@ -18832,23 +19340,28 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
if (SrcVT == MVT::i32) {
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
- SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op.getOperand(0),
- StackSlot, MachinePointerInfo());
+ SDValue Store1 =
+ DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo());
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
OffsetSlot, MachinePointerInfo());
- SDValue Fild = BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
- return Fild;
+ std::pair<SDValue, SDValue> Tmp =
+ BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
+
+ return Tmp.first;
}
assert(SrcVT == MVT::i64 && "Unexpected type in UINT_TO_FP");
- SDValue ValueToStore = Op.getOperand(0);
- if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit())
+ SDValue ValueToStore = Src;
+ if (isScalarFPTypeInSSEReg(Op.getValueType()) && !Subtarget.is64Bit()) {
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, ValueToStore, StackSlot,
- MachinePointerInfo());
+ }
+ SDValue Store =
+ DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo());
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
@@ -18863,32 +19376,42 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Ops[] = { Store, StackSlot };
SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
MVT::i64, MMO);
+ Chain = Fild.getValue(1);
- APInt FF(32, 0x5F800000ULL);
// Check whether the sign bit is set.
SDValue SignSet = DAG.getSetCC(
dl, getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
- Op.getOperand(0), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
+ Op.getOperand(OpNo), DAG.getConstant(0, dl, MVT::i64), ISD::SETLT);
- // Build a 64 bit pair (0, FF) in the constant pool, with FF in the lo bits.
+ // Build a 64 bit pair (FF, 0) in the constant pool, with FF in the hi bits.
+ APInt FF(64, 0x5F80000000000000ULL);
SDValue FudgePtr = DAG.getConstantPool(
- ConstantInt::get(*DAG.getContext(), FF.zext(64)), PtrVT);
+ ConstantInt::get(*DAG.getContext(), FF), PtrVT);
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
SDValue Four = DAG.getIntPtrConstant(4, dl);
- SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
+ SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Four, Zero);
FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
// Load the value out, extending it from f32 to f80.
- // FIXME: Avoid the extend by constructing the right constant pool?
SDValue Fudge = DAG.getExtLoad(
- ISD::EXTLOAD, dl, MVT::f80, DAG.getEntryNode(), FudgePtr,
+ ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
/* Alignment = */ 4);
+ Chain = Fudge.getValue(1);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
+ if (IsStrict) {
+ SDValue Add = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::f80, MVT::Other},
+ {Chain, Fild, Fudge});
+ // STRICT_FP_ROUND can't handle equal types.
+ if (DstVT == MVT::f80)
+ return Add;
+ return DAG.getNode(ISD::STRICT_FP_ROUND, dl, {DstVT, MVT::Other},
+ {Add.getValue(1), Add, DAG.getIntPtrConstant(0, dl)});
+ }
SDValue Add = DAG.getNode(ISD::FADD, dl, MVT::f80, Fild, Fudge);
return DAG.getNode(ISD::FP_ROUND, dl, DstVT, Add,
DAG.getIntPtrConstant(0, dl));
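A minimal standalone sketch of the fudge-factor trick above, in plain C++ rather than SelectionDAG nodes (the helper name u64_to_f80 is illustrative, not from the patch): FILD only converts signed integers, so the u64 goes through its signed interpretation and 2^64 -- the 0x5F800000 constant-pool float -- is added back whenever the sign bit was set.

#include <cstdint>
#include <cstdio>

long double u64_to_f80(uint64_t X) {
  // Signed interpretation, as the x87 FILD of the 64-bit stack slot sees it.
  long double V = static_cast<long double>(static_cast<int64_t>(X));
  // If the sign bit of the u64 was set, the signed value is X - 2^64, so add
  // the 2^64 fudge factor back (the Offset select above picks it from the
  // constant pool only in that case).
  if (static_cast<int64_t>(X) < 0)
    V += 18446744073709551616.0L; // 2^64
  return V;
}

int main() {
  std::printf("%.0Lf\n", u64_to_f80(~0ULL)); // 18446744073709551615
  return 0;
}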
@@ -18902,11 +19425,13 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
// result.
SDValue
X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
- bool IsSigned) const {
+ bool IsSigned, SDValue &Chain) const {
+ bool IsStrict = Op->isStrictFPOpcode();
SDLoc DL(Op);
EVT DstTy = Op.getValueType();
- EVT TheVT = Op.getOperand(0).getValueType();
+ SDValue Value = Op.getOperand(IsStrict ? 1 : 0);
+ EVT TheVT = Value.getValueType();
auto PtrVT = getPointerTy(DAG.getDataLayout());
if (TheVT != MVT::f32 && TheVT != MVT::f64 && TheVT != MVT::f80) {
@@ -18920,6 +19445,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// used for the 32-bit subtarget, but also for f80 on a 64-bit target.
bool UnsignedFixup = !IsSigned && DstTy == MVT::i64;
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i32. PR44019
if (!IsSigned && DstTy != MVT::i64) {
// Replace the fp-to-uint32 operation with an fp-to-sint64 FIST.
// The low 32 bits of the fist result will have the correct uint32 result.
@@ -18938,8 +19465,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- SDValue Chain = DAG.getEntryNode();
- SDValue Value = Op.getOperand(0);
+ Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
+
SDValue Adjust; // 0x0 or 0x80000000, for result sign bit adjustment.
if (UnsignedFixup) {
@@ -18949,8 +19476,9 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// of a signed i64. Let Thresh be the FP equivalent of
// 0x8000000000000000ULL.
//
- // Adjust i32 = (Value < Thresh) ? 0 : 0x80000000;
- // FistSrc = (Value < Thresh) ? Value : (Value - Thresh);
+ // Adjust = (Value < Thresh) ? 0 : 0x80000000;
+  //  FltOfs = (Value < Thresh) ? 0.0 : Thresh;
+  //  FistSrc = Value - FltOfs;
// Fist-to-mem64 FistSrc
// Add 0 or 0x800...0ULL to the 64-bit result, which is equivalent
// to XOR'ing the high 32 bits with Adjust.
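A hedged scalar model of the Thresh/FltOfs/Adjust scheme described above, in plain C++ (the helper name f64_to_u64 is illustrative, not from the patch):

#include <cstdint>
#include <cstdio>

uint64_t f64_to_u64(double Value) {
  const double Thresh = 9223372036854775808.0;               // 2^63
  uint64_t Adjust = Value < Thresh ? 0 : 0x8000000000000000ULL;
  double   FltOfs = Value < Thresh ? 0.0 : Thresh;
  // Only a signed fp-to-i64 conversion (FIST) is available; the offset keeps
  // the operand in signed range, and Adjust restores the high bit afterwards.
  int64_t FistSrc = static_cast<int64_t>(Value - FltOfs);
  return static_cast<uint64_t>(FistSrc) ^ Adjust;            // same as adding Adjust
}

int main() {
  std::printf("%llu\n",
              (unsigned long long)f64_to_u64(18446744073709549568.0)); // near 2^64
  return 0;
}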
@@ -18975,19 +19503,31 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
SDValue ThreshVal = DAG.getConstantFP(Thresh, DL, TheVT);
- SDValue Cmp = DAG.getSetCC(DL,
- getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), TheVT),
- Value, ThreshVal, ISD::SETLT);
+ EVT ResVT = getSetCCResultType(DAG.getDataLayout(),
+ *DAG.getContext(), TheVT);
+ SDValue Cmp;
+ if (IsStrict) {
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT,
+ Chain, /*IsSignaling*/ true);
+ Chain = Cmp.getValue(1);
+ } else {
+ Cmp = DAG.getSetCC(DL, ResVT, Value, ThreshVal, ISD::SETLT);
+ }
+
Adjust = DAG.getSelect(DL, MVT::i64, Cmp,
DAG.getConstant(0, DL, MVT::i64),
DAG.getConstant(APInt::getSignMask(64),
DL, MVT::i64));
- SDValue Sub = DAG.getNode(ISD::FSUB, DL, TheVT, Value, ThreshVal);
- Cmp = DAG.getSetCC(DL, getSetCCResultType(DAG.getDataLayout(),
- *DAG.getContext(), TheVT),
- Value, ThreshVal, ISD::SETLT);
- Value = DAG.getSelect(DL, TheVT, Cmp, Value, Sub);
+ SDValue FltOfs = DAG.getSelect(DL, TheVT, Cmp,
+ DAG.getConstantFP(0.0, DL, TheVT),
+ ThreshVal);
+
+ if (IsStrict) {
+ Value = DAG.getNode(ISD::STRICT_FSUB, DL, { TheVT, MVT::Other},
+ { Chain, Value, FltOfs });
+ Chain = Value.getValue(1);
+ } else
+ Value = DAG.getNode(ISD::FSUB, DL, TheVT, Value, FltOfs);
}
MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
@@ -19017,6 +19557,7 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
Ops, DstTy, MMO);
SDValue Res = DAG.getLoad(Op.getValueType(), SDLoc(Op), FIST, StackSlot, MPI);
+ Chain = Res.getValue(1);
// If we need an unsigned fixup, XOR the result with adjust.
if (UnsignedFixup)
@@ -19036,7 +19577,7 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
assert((Opc == ISD::ANY_EXTEND || Opc == ISD::ZERO_EXTEND) &&
"Unexpected extension opcode");
- assert(VT.getVectorNumElements() == VT.getVectorNumElements() &&
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
@@ -19512,48 +20053,137 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
}
SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
- bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
- MVT VT = Op.getSimpleValueType();
- SDValue Src = Op.getOperand(0);
+ bool IsStrict = Op->isStrictFPOpcode();
+ bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT;
+ MVT VT = Op->getSimpleValueType(0);
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
MVT SrcVT = Src.getSimpleValueType();
SDLoc dl(Op);
- if (SrcVT == MVT::f128) {
- RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::FP_TO_SINT)
- LC = RTLIB::getFPTOSINT(SrcVT, VT);
- else
- LC = RTLIB::getFPTOUINT(SrcVT, VT);
-
- MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, VT, Src, CallOptions, SDLoc(Op)).first;
- }
-
if (VT.isVector()) {
if (VT == MVT::v2i1 && SrcVT == MVT::v2f64) {
MVT ResVT = MVT::v4i32;
MVT TruncVT = MVT::v4i1;
- unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ unsigned Opc;
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+
if (!IsSigned && !Subtarget.hasVLX()) {
+ assert(Subtarget.useAVX512Regs() && "Unexpected features!");
// Widen to 512-bits.
ResVT = MVT::v8i32;
TruncVT = MVT::v8i1;
- Opc = ISD::FP_TO_UINT;
- Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64,
- DAG.getUNDEF(MVT::v8f64),
- Src, DAG.getIntPtrConstant(0, dl));
+ Opc = Op.getOpcode();
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp = IsStrict ? DAG.getConstantFP(0.0, dl, MVT::v8f64)
+ : DAG.getUNDEF(MVT::v8f64);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, MVT::v8f64, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+ }
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res =
+ DAG.getNode(Opc, dl, {ResVT, MVT::Other}, {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Opc, dl, ResVT, Src);
}
- SDValue Res = DAG.getNode(Opc, dl, ResVT, Src);
+
Res = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Res);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
- DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i1, Res,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
+ // v8f64->v8i32 is legal, but we need v8i32 to be custom for v8f32.
+ if (VT == MVT::v8i32 && SrcVT == MVT::v8f64) {
+ assert(!IsSigned && "Expected unsigned conversion!");
+ assert(Subtarget.useAVX512Regs() && "Requires avx512f");
+ return Op;
+ }
+
+ // Widen vXi32 fp_to_uint with avx512f to 512-bit source.
+ if ((VT == MVT::v4i32 || VT == MVT::v8i32) &&
+ (SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32 || SrcVT == MVT::v8f32)) {
+ assert(!IsSigned && "Expected unsigned conversion!");
+ assert(Subtarget.useAVX512Regs() && !Subtarget.hasVLX() &&
+ "Unexpected features!");
+ MVT WideVT = SrcVT == MVT::v4f64 ? MVT::v8f64 : MVT::v16f32;
+ MVT ResVT = SrcVT == MVT::v4f64 ? MVT::v8i32 : MVT::v16i32;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp =
+ IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_UINT, dl, {ResVT, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(ISD::FP_TO_UINT, dl, ResVT, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
+ }
+
+ // Widen vXi64 fp_to_uint/fp_to_sint with avx512dq to 512-bit source.
+ if ((VT == MVT::v2i64 || VT == MVT::v4i64) &&
+ (SrcVT == MVT::v2f64 || SrcVT == MVT::v4f64 || SrcVT == MVT::v4f32)) {
+ assert(Subtarget.useAVX512Regs() && Subtarget.hasDQI() &&
+ !Subtarget.hasVLX() && "Unexpected features!");
+ MVT WideVT = SrcVT == MVT::v4f32 ? MVT::v8f32 : MVT::v8f64;
+ // Need to concat with zero vector for strict fp to avoid spurious
+ // exceptions.
+ // TODO: Should we just do this for non-strict as well?
+ SDValue Tmp =
+ IsStrict ? DAG.getConstantFP(0.0, dl, WideVT) : DAG.getUNDEF(WideVT);
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideVT, Tmp, Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
+ {Op->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Op.getOpcode(), dl, MVT::v8i64, Src);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+ return Res;
}
- assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
- return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
- DAG.getUNDEF(MVT::v2f32)));
+ assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
+ SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getUNDEF(MVT::v2f32));
+ if (IsStrict) {
+ unsigned Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI
+ : X86ISD::STRICT_CVTTP2UI;
+ return DAG.getNode(Opc, dl, {VT, MVT::Other}, {Op->getOperand(0), Tmp});
+ }
+ unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ return DAG.getNode(Opc, dl, VT, Tmp);
}
return SDValue();
@@ -19575,9 +20205,21 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
assert(VT == MVT::i32 && "Unexpected VT!");
// Promote i32 to i64 and use a signed operation on 64-bit targets.
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i32. PR44019
if (Subtarget.is64Bit()) {
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i64, MVT::Other},
+ { Op.getOperand(0), Src });
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i64, Src);
+
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ if (IsStrict)
+ return DAG.getMergeValues({ Res, Chain }, dl);
+ return Res;
}
// Use default expansion for SSE1/2 targets without SSE3. With SSE3 we can
@@ -19586,28 +20228,65 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
- // Promote i16 to i32 if we can use a SSE operation.
- if (VT == MVT::i16 && UseSSEReg) {
+ // Promote i16 to i32 if we can use a SSE operation or the type is f128.
+ // FIXME: This does not generate an invalid exception if the input does not
+ // fit in i16. PR44019
+ if (VT == MVT::i16 && (UseSSEReg || SrcVT == MVT::f128)) {
assert(IsSigned && "Expected i16 FP_TO_UINT to have been promoted!");
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
- return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ SDValue Res, Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, { MVT::i32, MVT::Other},
+ { Op.getOperand(0), Src });
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, MVT::i32, Src);
+
+ Res = DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+ if (IsStrict)
+ return DAG.getMergeValues({ Res, Chain }, dl);
+ return Res;
}
- // If this is a SINT_TO_FP using SSEReg we're done.
+ // If this is a FP_TO_SINT using SSEReg we're done.
if (UseSSEReg && IsSigned)
return Op;
+ // fp128 needs to use a libcall.
+ if (SrcVT == MVT::f128) {
+ RTLIB::Libcall LC;
+ if (IsSigned)
+ LC = RTLIB::getFPTOSINT(SrcVT, VT);
+ else
+ LC = RTLIB::getFPTOUINT(SrcVT, VT);
+
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, Src, CallOptions,
+ SDLoc(Op), Chain);
+
+ if (IsStrict)
+ return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+
+ return Tmp.first;
+ }
+
// Fall back to X87.
- if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned))
+ SDValue Chain;
+ if (SDValue V = FP_TO_INTHelper(Op, DAG, IsSigned, Chain)) {
+ if (IsStrict)
+ return DAG.getMergeValues({V, Chain}, dl);
return V;
+ }
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+
SDLoc DL(Op);
MVT VT = Op.getSimpleValueType();
- SDValue In = Op.getOperand(0);
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
if (VT == MVT::f128) {
@@ -19617,14 +20296,19 @@ SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
assert(SVT == MVT::v2f32 && "Only customize MVT::v2f32 type legalization!");
- return DAG.getNode(X86ISD::VFPEXT, DL, VT,
- DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32,
- In, DAG.getUNDEF(SVT)));
+ SDValue Res =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4f32, In, DAG.getUNDEF(SVT));
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_VFPEXT, DL, {VT, MVT::Other},
+ {Op->getOperand(0), Res});
+ return DAG.getNode(X86ISD::VFPEXT, DL, VT, Res);
}
SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
+ bool IsStrict = Op->isStrictFPOpcode();
+
MVT VT = Op.getSimpleValueType();
- SDValue In = Op.getOperand(0);
+ SDValue In = Op.getOperand(IsStrict ? 1 : 0);
MVT SVT = In.getSimpleValueType();
// It's legal except when f128 is involved
@@ -19636,17 +20320,17 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
+
+ SDLoc dl(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, VT, In, CallOptions, SDLoc(Op)).first;
-}
+ std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, LC, VT, In, CallOptions,
+ dl, Chain);
-// FIXME: This is a hack to allow FP_ROUND to be marked Custom without breaking
-// the default expansion of STRICT_FP_ROUND.
-static SDValue LowerSTRICT_FP_ROUND(SDValue Op, SelectionDAG &DAG) {
- // FIXME: Need to form a libcall with an input chain for f128.
- assert(Op.getOperand(0).getValueType() != MVT::f128 &&
- "Don't know how to handle f128 yet!");
- return Op;
+ if (IsStrict)
+ return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+
+ return Tmp.first;
}
/// Depending on uarch and/or optimizing for size, we might prefer to use a
@@ -19724,12 +20408,6 @@ static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
- if (Op.getValueType() == MVT::f128) {
- RTLIB::Libcall LC = Op.getOpcode() == ISD::FADD ? RTLIB::ADD_F128
- : RTLIB::SUB_F128;
- return LowerF128Call(Op, DAG, LC);
- }
-
assert((Op.getValueType() == MVT::f32 || Op.getValueType() == MVT::f64) &&
"Only expecting float/double");
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
@@ -20013,6 +20691,19 @@ static bool hasNonFlagsUse(SDValue Op) {
return false;
}
+// Transform to an x86-specific ALU node with flags if there is a chance of
+// using an RMW op or only the flags are used. Otherwise, leave
+// the node alone and emit a 'cmp' or 'test' instruction.
+static bool isProfitableToUseFlagOp(SDValue Op) {
+ for (SDNode *U : Op->uses())
+ if (U->getOpcode() != ISD::CopyToReg &&
+ U->getOpcode() != ISD::SETCC &&
+ U->getOpcode() != ISD::STORE)
+ return false;
+
+ return true;
+}
+
/// Emit nodes that will be selected as "test Op0,Op0", or something
/// equivalent.
static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
@@ -20076,15 +20767,8 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
case ISD::SUB:
case ISD::OR:
case ISD::XOR:
- // Transform to an x86-specific ALU node with flags if there is a chance of
- // using an RMW op or only the flags are used. Otherwise, leave
- // the node alone and emit a 'test' instruction.
- for (SDNode::use_iterator UI = Op.getNode()->use_begin(),
- UE = Op.getNode()->use_end(); UI != UE; ++UI)
- if (UI->getOpcode() != ISD::CopyToReg &&
- UI->getOpcode() != ISD::SETCC &&
- UI->getOpcode() != ISD::STORE)
- goto default_case;
+ if (!isProfitableToUseFlagOp(Op))
+ break;
// Otherwise use a regular EFLAGS-setting instruction.
switch (ArithOp.getOpcode()) {
@@ -20112,7 +20796,6 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
Op->getOperand(1)).getValue(1);
}
default:
- default_case:
break;
}
@@ -20131,15 +20814,26 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
-SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
- const SDLoc &dl, SelectionDAG &DAG) const {
+static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1,
+ unsigned X86CC, const SDLoc &dl,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget,
+ SDValue Chain, bool IsSignaling) {
if (isNullConstant(Op1))
- return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
+ return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain);
EVT CmpVT = Op0.getValueType();
- if (CmpVT.isFloatingPoint())
- return DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1);
+ if (CmpVT.isFloatingPoint()) {
+ if (Chain) {
+ SDValue Res =
+ DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
+ dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
+ return std::make_pair(Res, Res.getValue(1));
+ }
+ return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1),
+ SDValue());
+ }
assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
@@ -20154,7 +20848,7 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
if ((COp0 && !COp0->getAPIntValue().isSignedIntN(8)) ||
(COp1 && !COp1->getAPIntValue().isSignedIntN(8))) {
unsigned ExtendOp =
- isX86CCUnsigned(X86CC) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND;
+ isX86CCSigned(X86CC) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
if (X86CC == X86::COND_E || X86CC == X86::COND_NE) {
// For equality comparisons try to use SIGN_EXTEND if the input was
// truncate from something with enough sign bits.
@@ -20178,10 +20872,22 @@ SDValue X86TargetLowering::EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
Op1 = DAG.getNode(ExtendOp, dl, CmpVT, Op1);
}
}
+
+ // Try to shrink i64 compares if the input has enough zero bits.
+ // FIXME: Do this for non-constant compares for constant on LHS?
+ if (CmpVT == MVT::i64 && isa<ConstantSDNode>(Op1) && !isX86CCSigned(X86CC) &&
+ Op0.hasOneUse() && // Hacky way to not break CSE opportunities with sub.
+ cast<ConstantSDNode>(Op1)->getAPIntValue().getActiveBits() <= 32 &&
+ DAG.MaskedValueIsZero(Op0, APInt::getHighBitsSet(64, 32))) {
+ CmpVT = MVT::i32;
+ Op0 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op0);
+ Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
+ }
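A hedged illustration of why the compare shrink above is safe: when the upper 32 bits of the left operand are known zero and the constant has at most 32 active bits, unsigned and equality predicates give the same answer on the truncated low halves (plain C++, values are arbitrary examples):

#include <cstdint>
#include <cassert>

int main() {
  uint64_t Op0 = 0x00000000DEADBEEFULL; // high 32 bits known to be zero
  uint64_t Op1 = 0x12345678ULL;         // constant with <= 32 active bits
  // For unsigned and equality predicates, the 64-bit result matches the
  // 32-bit result on the truncated operands.
  assert((Op0 < Op1) == ((uint32_t)Op0 < (uint32_t)Op1));
  assert((Op0 == Op1) == ((uint32_t)Op0 == (uint32_t)Op1));
  return 0;
}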
+
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return Sub.getValue(1);
+ return std::make_pair(Sub.getValue(1), SDValue());
}
/// Convert a comparison if required by the subtarget.
@@ -20189,16 +20895,19 @@ SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
SelectionDAG &DAG) const {
// If the subtarget does not support the FUCOMI instruction, floating-point
// comparisons have to be converted.
- if (Subtarget.hasCMov() ||
- Cmp.getOpcode() != X86ISD::CMP ||
- !Cmp.getOperand(0).getValueType().isFloatingPoint() ||
- !Cmp.getOperand(1).getValueType().isFloatingPoint())
+ bool IsCmp = Cmp.getOpcode() == X86ISD::CMP;
+ bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP ||
+ Cmp.getOpcode() == X86ISD::STRICT_FCMPS;
+
+ if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) ||
+ !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() ||
+ !Cmp.getOperand(IsStrictCmp ? 2 : 1).getValueType().isFloatingPoint())
return Cmp;
// The instruction selector will select an FUCOM instruction instead of
// FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
// build an SDNode sequence that transfers the result from FPSW into EFLAGS:
- // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86cmp ...)), 8))))
+ // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8))))
SDLoc dl(Cmp);
SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
@@ -20399,7 +21108,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
} else {
// Use BT if the immediate can't be encoded in a TEST instruction or we
    // are optimizing for size and the immediate won't fit in a byte.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
if ((!isUInt<32>(AndRHSVal) || (OptForSize && !isUInt<8>(AndRHSVal))) &&
isPowerOf2_64(AndRHSVal)) {
Src = AndLHS;
@@ -20442,7 +21151,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
/// Turns an ISD::CondCode into a value suitable for SSE floating-point mask
/// CMPs.
static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
- SDValue &Op1) {
+ SDValue &Op1, bool &IsAlwaysSignaling) {
unsigned SSECC;
bool Swap = false;
@@ -20481,6 +21190,22 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
if (Swap)
std::swap(Op0, Op1);
+ switch (SetCCOpcode) {
+ default:
+ IsAlwaysSignaling = true;
+ break;
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ case ISD::SETUEQ:
+ case ISD::SETNE:
+ case ISD::SETONE:
+ case ISD::SETUNE:
+ case ISD::SETO:
+ case ISD::SETUO:
+ IsAlwaysSignaling = false;
+ break;
+ }
+
return SSECC;
}
@@ -20625,12 +21350,14 @@ static SDValue LowerVSETCCWithSUBUS(SDValue Op0, SDValue Op1, MVT VT,
static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- SDValue CC = Op.getOperand(2);
- MVT VT = Op.getSimpleValueType();
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
+ Op.getOpcode() == ISD::STRICT_FSETCCS;
+ SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
+ SDValue CC = Op.getOperand(IsStrict ? 3 : 2);
+ MVT VT = Op->getSimpleValueType(0);
ISD::CondCode Cond = cast<CondCodeSDNode>(CC)->get();
- bool isFP = Op.getOperand(1).getSimpleValueType().isFloatingPoint();
+ bool isFP = Op1.getSimpleValueType().isFloatingPoint();
SDLoc dl(Op);
if (isFP) {
@@ -20639,57 +21366,119 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
assert(EltVT == MVT::f32 || EltVT == MVT::f64);
#endif
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+
unsigned Opc;
if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
assert(VT.getVectorNumElements() <= 16);
- Opc = X86ISD::CMPM;
+ Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
- Opc = X86ISD::CMPP;
+ Opc = IsStrict ? X86ISD::STRICT_CMPP : X86ISD::CMPP;
// The SSE/AVX packed FP comparison nodes are defined with a
// floating-point vector result that matches the operand type. This allows
// them to work with an SSE1 target (integer vector types are not legal).
VT = Op0.getSimpleValueType();
}
- // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
- // emit two comparisons and a logic op to tie them together.
SDValue Cmp;
- unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1);
- if (SSECC >= 8 && !Subtarget.hasAVX()) {
- // LLVM predicate is SETUEQ or SETONE.
- unsigned CC0, CC1;
- unsigned CombineOpc;
- if (Cond == ISD::SETUEQ) {
- CC0 = 3; // UNORD
- CC1 = 0; // EQ
- CombineOpc = X86ISD::FOR;
+ bool IsAlwaysSignaling;
+ unsigned SSECC = translateX86FSETCC(Cond, Op0, Op1, IsAlwaysSignaling);
+ if (!Subtarget.hasAVX()) {
+      // TODO: We could use the following steps to handle a quiet compare with
+      // signaling encodings:
+      // 1. Get ordered masks from a quiet ISD::SETO.
+      // 2. Use the masks to mask potential unordered elements in operands A and B.
+      // 3. Get the compare results of the masked A and B.
+      // 4. Calculate the final result using the mask and the result from 3.
+      // For now, we just fall back to scalar operations.
+ if (IsStrict && IsAlwaysSignaling && !IsSignaling)
+ return SDValue();
+
+ // Insert an extra signaling instruction to raise exception.
+ if (IsStrict && !IsAlwaysSignaling && IsSignaling) {
+ SDValue SignalCmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(1, dl, MVT::i8)}); // LT_OS
+        // FIXME: It seems we need to update the flags of all new strict nodes.
+        // Otherwise, mayRaiseFPException in MI will return false because
+        // NoFPExcept is false by default. However, other patches do not appear
+        // to do this.
+ SignalCmp->setFlags(Op->getFlags());
+ Chain = SignalCmp.getValue(1);
+ }
+
+ // In the two cases not handled by SSE compare predicates (SETUEQ/SETONE),
+ // emit two comparisons and a logic op to tie them together.
+ if (SSECC >= 8) {
+ // LLVM predicate is SETUEQ or SETONE.
+ unsigned CC0, CC1;
+ unsigned CombineOpc;
+ if (Cond == ISD::SETUEQ) {
+ CC0 = 3; // UNORD
+ CC1 = 0; // EQ
+ CombineOpc = X86ISD::FOR;
+ } else {
+ assert(Cond == ISD::SETONE);
+ CC0 = 7; // ORD
+ CC1 = 4; // NEQ
+ CombineOpc = X86ISD::FAND;
+ }
+
+ SDValue Cmp0, Cmp1;
+ if (IsStrict) {
+ Cmp0 = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8)});
+ Cmp1 = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8)});
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Cmp0.getValue(1),
+ Cmp1.getValue(1));
+ } else {
+ Cmp0 = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC0, dl, MVT::i8));
+ Cmp1 = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(CC1, dl, MVT::i8));
+ }
+ Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
- assert(Cond == ISD::SETONE);
- CC0 = 7; // ORD
- CC1 = 4; // NEQ
- CombineOpc = X86ISD::FAND;
+ if (IsStrict) {
+ Cmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
+ Chain = Cmp.getValue(1);
+ } else
+ Cmp = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
-
- SDValue Cmp0 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getTargetConstant(CC0, dl, MVT::i8));
- SDValue Cmp1 = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getTargetConstant(CC1, dl, MVT::i8));
- Cmp = DAG.getNode(CombineOpc, dl, VT, Cmp0, Cmp1);
} else {
// Handle all other FP comparisons here.
- Cmp = DAG.getNode(Opc, dl, VT, Op0, Op1,
- DAG.getTargetConstant(SSECC, dl, MVT::i8));
+ if (IsStrict) {
+        // Set bit 4 of the AVX CC when the requested signaling behavior
+        // differs from the predicate's default, flipping quiet and signaling.
+ SSECC |= (IsAlwaysSignaling ^ IsSignaling) << 4;
+ Cmp = DAG.getNode(
+ Opc, dl, {VT, MVT::Other},
+ {Chain, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8)});
+ Chain = Cmp.getValue(1);
+ } else
+ Cmp = DAG.getNode(
+ Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
// If this is SSE/AVX CMPP, bitcast the result back to integer to match the
// result type of SETCC. The bitcast is expected to be optimized away
// during combining/isel.
- if (Opc == X86ISD::CMPP)
- Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Cmp, Chain}, dl);
return Cmp;
}
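A hedged scalar model of the two-compare decomposition used above for the predicates SSE cannot express directly, UEQ as UNORD-or-EQ and ONE as ORD-and-NEQ (plain C++, not DAG code):

#include <cmath>
#include <cassert>

int main() {
  double A = NAN, B = 1.0;
  bool Unord = std::isnan(A) || std::isnan(B);
  // SETUEQ = CMPUNORD | CMPEQ, SETONE = CMPORD & CMPNEQ.
  bool UEQ = Unord || (A == B);
  bool ONE = !Unord && (A != B);
  assert(UEQ && !ONE); // an unordered pair is "unordered or equal"
  A = 2.0;
  Unord = std::isnan(A) || std::isnan(B);
  UEQ = Unord || (A == B);
  ONE = !Unord && (A != B);
  assert(!UEQ && ONE); // distinct ordinary values are "ordered and not equal"
  return 0;
}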
+ assert(!IsStrict && "Strict SETCC only handles FP operands.");
+
MVT VTOp0 = Op0.getSimpleValueType();
(void)VTOp0;
assert(VTOp0 == Op1.getSimpleValueType() &&
@@ -20860,6 +21649,30 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
if (Opc == X86ISD::PCMPGT && !Subtarget.hasSSE42()) {
assert(Subtarget.hasSSE2() && "Don't know how to lower!");
+ // Special case for sign bit test. We can use a v4i32 PCMPGT and shuffle
+ // the odd elements over the even elements.
+ if (!FlipSigns && !Invert && ISD::isBuildVectorAllZeros(Op0.getNode())) {
+ Op0 = DAG.getConstant(0, dl, MVT::v4i32);
+ Op1 = DAG.getBitcast(MVT::v4i32, Op1);
+
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ return DAG.getBitcast(VT, Result);
+ }
+
+ if (!FlipSigns && !Invert && ISD::isBuildVectorAllOnes(Op1.getNode())) {
+ Op0 = DAG.getBitcast(MVT::v4i32, Op0);
+ Op1 = DAG.getConstant(-1, dl, MVT::v4i32);
+
+ SDValue GT = DAG.getNode(X86ISD::PCMPGT, dl, MVT::v4i32, Op0, Op1);
+ static const int MaskHi[] = { 1, 1, 3, 3 };
+ SDValue Result = DAG.getVectorShuffle(MVT::v4i32, dl, GT, GT, MaskHi);
+
+ return DAG.getBitcast(VT, Result);
+ }
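A hedged scalar model of the sign-bit special case above: for a 64-bit x < 0 test it is enough to compare the high 32 bits against zero and replicate that result into both halves of the lane, which is what the {1, 1, 3, 3} shuffle does (plain C++, illustrative only):

#include <cstdint>
#include <cassert>

static uint64_t SignMask(int64_t X) {
  // v4i32 PCMPGT (0 > high dword), then the {1,1,3,3} shuffle copies the
  // high-half result into both 32-bit halves of the lane.
  int32_t Hi = static_cast<int32_t>(static_cast<uint64_t>(X) >> 32);
  uint32_t M = (0 > Hi) ? 0xFFFFFFFFu : 0u;
  return (static_cast<uint64_t>(M) << 32) | M;
}

int main() {
  assert(SignMask(-42) == ~0ULL); // negative lane -> all-ones mask
  assert(SignMask(42) == 0);      // non-negative lane -> zero mask
  return 0;
}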
+
// Since SSE has no unsigned integer comparisons, we need to flip the sign
// bits of the inputs before performing those operations. The lower
// compare is always unsigned.
@@ -20999,8 +21812,9 @@ static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
/// corresponding X86 condition code constant in X86CC.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
- SelectionDAG &DAG,
- SDValue &X86CC) const {
+ SelectionDAG &DAG, SDValue &X86CC,
+ SDValue &Chain,
+ bool IsSignaling) const {
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
@@ -21043,12 +21857,32 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
}
}
+  // Try to use the carry flag from the add in place of a separate CMP for:
+ // (seteq (add X, -1), -1). Similar for setne.
+ if (isAllOnesConstant(Op1) && Op0.getOpcode() == ISD::ADD &&
+ Op0.getOperand(1) == Op1 && (CC == ISD::SETEQ || CC == ISD::SETNE)) {
+ if (isProfitableToUseFlagOp(Op0)) {
+ SDVTList VTs = DAG.getVTList(Op0.getValueType(), MVT::i32);
+
+ SDValue New = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(0),
+ Op0.getOperand(1));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op0.getNode(), 0), New);
+ X86::CondCode CCode = CC == ISD::SETEQ ? X86::COND_AE : X86::COND_B;
+ X86CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
+ return SDValue(New.getNode(), 1);
+ }
+ }
+
bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
if (CondCode == X86::COND_INVALID)
return SDValue();
- SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG);
+ std::pair<SDValue, SDValue> Tmp =
+ EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling);
+ SDValue EFLAGS = Tmp.first;
+ if (Chain)
+ Chain = Tmp.second;
EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
@@ -21056,35 +21890,48 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
- MVT VT = Op.getSimpleValueType();
+ bool IsStrict = Op.getOpcode() == ISD::STRICT_FSETCC ||
+ Op.getOpcode() == ISD::STRICT_FSETCCS;
+ MVT VT = Op->getSimpleValueType(0);
if (VT.isVector()) return LowerVSETCC(Op, Subtarget, DAG);
assert(VT == MVT::i8 && "SetCC type must be 8-bit integer");
- SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Op0 = Op.getOperand(IsStrict ? 1 : 0);
+ SDValue Op1 = Op.getOperand(IsStrict ? 2 : 1);
SDLoc dl(Op);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ ISD::CondCode CC =
+ cast<CondCodeSDNode>(Op.getOperand(IsStrict ? 3 : 2))->get();
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets handled by emitFlagsForSetcc.
if (Op0.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1);
+ softenSetCCOperands(DAG, MVT::f128, Op0, Op1, CC, dl, Op0, Op1, Chain,
+ Op.getOpcode() == ISD::STRICT_FSETCCS);
// If softenSetCCOperands returned a scalar, use it.
if (!Op1.getNode()) {
assert(Op0.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
+ if (IsStrict)
+ return DAG.getMergeValues({Op0, Chain}, dl);
return Op0;
}
}
SDValue X86CC;
- SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
+ SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain,
+ Op.getOpcode() == ISD::STRICT_FSETCCS);
if (!EFLAGS)
return SDValue();
- return DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
@@ -21215,8 +22062,10 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
(Subtarget.hasSSE1() && VT == MVT::f32)) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
- unsigned SSECC = translateX86FSETCC(
- cast<CondCodeSDNode>(Cond.getOperand(2))->get(), CondOp0, CondOp1);
+ bool IsAlwaysSignaling;
+ unsigned SSECC =
+ translateX86FSETCC(cast<CondCodeSDNode>(Cond.getOperand(2))->get(),
+ CondOp0, CondOp1, IsAlwaysSignaling);
if (Subtarget.hasAVX512()) {
SDValue Cmp =
@@ -21454,8 +22303,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (AddTest) {
CC = DAG.getTargetConstant(X86::COND_NE, DL, MVT::i8);
- Cond = EmitCmp(Cond, DAG.getConstant(0, DL, Cond.getValueType()),
- X86::COND_NE, DL, DAG);
+ Cond = EmitTest(Cond, X86::COND_NE, DL, DAG, Subtarget);
}
// a < b ? -1 : 0 -> RES = ~setcc_carry
@@ -21711,7 +22559,7 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
return LowerSIGN_EXTEND_Mask(Op, Subtarget, DAG);
assert(VT.isVector() && InVT.isVector() && "Expected vector type");
- assert(VT.getVectorNumElements() == VT.getVectorNumElements() &&
+ assert(VT.getVectorNumElements() == InVT.getVectorNumElements() &&
"Expected same number of elements");
assert((VT.getVectorElementType() == MVT::i16 ||
VT.getVectorElementType() == MVT::i32 ||
@@ -21765,12 +22613,14 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
"Expecting 256/512-bit op");
// Splitting volatile memory ops is not allowed unless the operation was not
- // legal to begin with. We are assuming the input op is legal (this transform
- // is only used for targets with AVX).
+ // legal to begin with. Assume the input store is legal (this transform is
+ // only used for targets with AVX). Note: It is possible that we have an
+ // illegal type like v2i128, and so we could allow splitting a volatile store
+ // in that case if that is important.
if (!Store->isSimple())
return SDValue();
- MVT StoreVT = StoredVal.getSimpleValueType();
+ EVT StoreVT = StoredVal.getValueType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
@@ -22174,8 +23024,7 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
if (addTest) {
X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
- Cond = EmitCmp(Cond, DAG.getConstant(0, dl, Cond.getValueType()),
- X86Cond, dl, DAG);
+ Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget);
}
Cond = ConvertCmpIfNecessary(Cond, DAG);
return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
@@ -22201,7 +23050,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- unsigned Align = Op.getConstantOperandVal(2);
+ MaybeAlign Alignment(Op.getConstantOperandVal(2));
EVT VT = Node->getValueType(0);
// Chain the dynamic stack allocation so that it doesn't modify the stack
@@ -22221,11 +23070,12 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- unsigned StackAlign = TFI.getStackAlignment();
+ const Align StackAlign(TFI.getStackAlignment());
Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
- if (Align > StackAlign)
- Result = DAG.getNode(ISD::AND, dl, VT, Result,
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ if (Alignment && Alignment > StackAlign)
+ Result =
+ DAG.getNode(ISD::AND, dl, VT, Result,
+ DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Result); // Output chain
} else if (SplitStack) {
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -22256,9 +23106,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, SPTy);
Chain = SP.getValue(1);
- if (Align) {
+ if (Alignment) {
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, SP);
}
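A small hedged note on the ~(Alignment - 1) mask above: rounding the stack pointer down to a power-of-two alignment is just clearing the low bits (plain C++, the values are hypothetical):

#include <cstdint>
#include <cassert>

int main() {
  uint64_t SP = 0x7fffffffe6c8ULL; // hypothetical stack pointer after the SUB
  uint64_t Alignment = 64;         // requested over-alignment (power of two)
  uint64_t Aligned = SP & ~(Alignment - 1ULL); // clear the low log2(Align) bits
  assert(Aligned % Alignment == 0 && Aligned <= SP);
  return 0;
}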
@@ -22777,6 +23627,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
unsigned IntNo = Op.getConstantOperandVal(0);
MVT VT = Op.getSimpleValueType();
const IntrinsicData* IntrData = getIntrinsicWithoutChain(IntNo);
+
if (IntrData) {
switch(IntrData->Type) {
case INTR_TYPE_1OP: {
@@ -22794,7 +23645,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
- return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1));
+ return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
+ Op.getOperand(1));
}
case INTR_TYPE_1OP_SAE: {
SDValue Sae = Op.getOperand(2);
@@ -22866,7 +23718,7 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
}
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(),
- Src1, Src2, Src3);
+ {Src1, Src2, Src3});
}
case INTR_TYPE_4OP:
return DAG.getNode(IntrData->Opc0, dl, Op.getValueType(), Op.getOperand(1),
@@ -22890,8 +23742,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!isRoundModeCurDirection(Rnd))
return SDValue();
}
- return getVectorMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src),
- Mask, PassThru, Subtarget, DAG);
+ return getVectorMaskingNode(
+ DAG.getNode(IntrData->Opc0, dl, VT, Src), Mask, PassThru,
+ Subtarget, DAG);
}
case INTR_TYPE_1OP_MASK_SAE: {
SDValue Src = Op.getOperand(1);
@@ -22907,8 +23760,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
else
return SDValue();
- return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src),
- Mask, PassThru, Subtarget, DAG);
+ return getVectorMaskingNode(DAG.getNode(Opc, dl, VT, Src), Mask, PassThru,
+ Subtarget, DAG);
}
case INTR_TYPE_SCALAR_MASK: {
SDValue Src1 = Op.getOperand(1);
@@ -23114,8 +23967,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return SDValue();
}
//default rounding mode
- return DAG.getNode(IntrData->Opc0, dl, MaskVT, Op.getOperand(1),
- Op.getOperand(2), CC);
+ return DAG.getNode(IntrData->Opc0, dl, MaskVT,
+ {Op.getOperand(1), Op.getOperand(2), CC});
}
case CMP_MASK_SCALAR_CC: {
SDValue Src1 = Op.getOperand(1);
@@ -23315,8 +24168,8 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
MVT SrcVT = Src.getSimpleValueType();
MVT MaskVT = MVT::getVectorVT(MVT::i1, SrcVT.getVectorNumElements());
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(), Src, PassThru,
- Mask);
+ return DAG.getNode(IntrData->Opc1, dl, Op.getValueType(),
+ {Src, PassThru, Mask});
}
case CVTPS2PH_MASK: {
SDValue Src = Op.getOperand(1);
@@ -23622,9 +24475,12 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SDValue ShAmt = Op.getOperand(2);
// If the argument is a constant, convert it to a target constant.
if (auto *C = dyn_cast<ConstantSDNode>(ShAmt)) {
- ShAmt = DAG.getTargetConstant(C->getZExtValue(), DL, MVT::i32);
+      // Clamp out-of-bounds shift amounts since they will otherwise be masked
+      // to 8 bits, which may make them no longer out of bounds.
+ unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
- Op.getOperand(0), Op.getOperand(1), ShAmt);
+ Op.getOperand(0), Op.getOperand(1),
+ DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
}
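A hedged illustration of the clamp above: an out-of-range amount such as 256 would be truncated to 0 by the 8-bit immediate, silently becoming a no-op shift, whereas clamping to 255 keeps it out of range so the shift still produces zero (plain C++, illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t Requested = 256;                                // out of bounds for a byte immediate
  uint8_t  Truncated = static_cast<uint8_t>(Requested);    // 0: would mean "no shift"
  uint32_t Clamped   = Requested > 255 ? 255u : Requested; // still >= the element width
  std::printf("truncated=%u clamped=%u\n", (unsigned)Truncated, (unsigned)Clamped);
  return 0; // prints: truncated=0 clamped=255
}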
unsigned NewIntrinsic;
@@ -23977,7 +24833,7 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MFI.setHasCopyImplyingStackAdjustment(true);
    // Don't do anything here; we will expand these intrinsics out later
// during FinalizeISel in EmitInstrWithCustomInserter.
- return SDValue();
+ return Op;
}
case Intrinsic::x86_lwpins32:
case Intrinsic::x86_lwpins64:
@@ -24152,9 +25008,11 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
MVT MaskVT = MVT::getVectorVT(MVT::i1, MemVT.getVectorNumElements());
SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
+ SDValue Offset = DAG.getUNDEF(VMask.getValueType());
- return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, VMask, MemVT,
- MemIntr->getMemOperand(), true /* truncating */);
+ return DAG.getMaskedStore(Chain, dl, DataToTruncate, Addr, Offset, VMask,
+ MemVT, MemIntr->getMemOperand(), ISD::UNINDEXED,
+ true /* truncating */);
}
case X86ISD::VTRUNCUS:
case X86ISD::VTRUNCS: {
@@ -24249,7 +25107,7 @@ SDValue X86TargetLowering::LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const {
// FIXME? Maybe this could be a TableGen attribute on some registers and
// this table could be generated automatically from RegInfo.
-Register X86TargetLowering::getRegisterByName(const char* RegName, EVT VT,
+Register X86TargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
@@ -24538,12 +25396,13 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- unsigned StackAlignment = TFI.getStackAlignment();
+ const Align StackAlignment(TFI.getStackAlignment());
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
- int SSFI = MF.getFrameInfo().CreateStackObject(2, StackAlignment, false);
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
@@ -27464,12 +28323,11 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
if (PassThru.isUndef() || ISD::isBuildVectorAllZeros(PassThru.getNode()))
return Op;
- SDValue NewLoad = DAG.getMaskedLoad(VT, dl, N->getChain(),
- N->getBasePtr(), Mask,
- getZeroVector(VT, Subtarget, DAG, dl),
- N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType(),
- N->isExpandingLoad());
+ SDValue NewLoad = DAG.getMaskedLoad(
+ VT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ getZeroVector(VT, Subtarget, DAG, dl), N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
+ N->isExpandingLoad());
// Emit a blend.
SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
PassThru);
@@ -27503,11 +28361,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
MVT WideMaskVT = MVT::getVectorVT(MVT::i1, NumEltsInWideVec);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
- SDValue NewLoad = DAG.getMaskedLoad(WideDataVT, dl, N->getChain(),
- N->getBasePtr(), Mask, PassThru,
- N->getMemoryVT(), N->getMemOperand(),
- N->getExtensionType(),
- N->isExpandingLoad());
+ SDValue NewLoad = DAG.getMaskedLoad(
+ WideDataVT, dl, N->getChain(), N->getBasePtr(), N->getOffset(), Mask,
+ PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
+ N->getExtensionType(), N->isExpandingLoad());
SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
NewLoad.getValue(0),
@@ -27553,7 +28410,8 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
DataToStore = ExtendToType(DataToStore, WideDataVT, DAG);
Mask = ExtendToType(Mask, WideMaskVT, DAG, true);
return DAG.getMaskedStore(N->getChain(), dl, DataToStore, N->getBasePtr(),
- Mask, N->getMemoryVT(), N->getMemOperand(),
+ N->getOffset(), Mask, N->getMemoryVT(),
+ N->getMemOperand(), N->getAddressingMode(),
N->isTruncatingStore(), N->isCompressingStore());
}
@@ -27607,29 +28465,31 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
}
-SDValue X86TargetLowering::LowerGC_TRANSITION_START(SDValue Op,
- SelectionDAG &DAG) const {
- // TODO: Eventually, the lowering of these nodes should be informed by or
- // deferred to the GC strategy for the function in which they appear. For
- // now, however, they must be lowered to something. Since they are logically
- // no-ops in the case of a null GC strategy (or a GC strategy which does not
- // require special handling for these nodes), lower them as literal NOOPs for
- // the time being.
- SmallVector<SDValue, 2> Ops;
+static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ SDValue Src = Op.getOperand(0);
+ MVT DstVT = Op.getSimpleValueType();
- Ops.push_back(Op.getOperand(0));
- if (Op->getGluedNode())
- Ops.push_back(Op->getOperand(Op->getNumOperands() - 1));
+ AddrSpaceCastSDNode *N = cast<AddrSpaceCastSDNode>(Op.getNode());
+ unsigned SrcAS = N->getSrcAddressSpace();
- SDLoc OpDL(Op);
- SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue NOOP(DAG.getMachineNode(X86::NOOP, SDLoc(Op), VTs, Ops), 0);
+ assert(SrcAS != N->getDestAddressSpace() &&
+ "addrspacecast must be between different address spaces");
- return NOOP;
+ if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64) {
+ Op = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
+ } else if (DstVT == MVT::i64) {
+ Op = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
+ } else if (DstVT == MVT::i32) {
+ Op = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
+ } else {
+ report_fatal_error("Bad address space in addrspacecast");
+ }
+ return Op;
}
-SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerGC_TRANSITION(SDValue Op,
+ SelectionDAG &DAG) const {
// TODO: Eventually, the lowering of these nodes should be informed by or
// deferred to the GC strategy for the function in which they appear. For
// now, however, they must be lowered to something. Since they are logically
@@ -27651,9 +28511,21 @@ SDValue X86TargetLowering::LowerGC_TRANSITION_END(SDValue Op,
SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
- SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
+
+ SDLoc dl(Op);
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
+ std::pair<SDValue, SDValue> Tmp = makeLibCall(DAG, Call, MVT::f128, Ops,
+ CallOptions, dl, Chain);
+
+ if (IsStrict)
+ return DAG.getMergeValues({ Tmp.first, Tmp.second }, dl);
+
+ return Tmp.first;
}
/// Provide custom lowering hooks for some operations.
@@ -27673,7 +28545,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::BITREVERSE: return LowerBITREVERSE(Op, Subtarget, DAG);
case ISD::BUILD_VECTOR: return LowerBUILD_VECTOR(Op, DAG);
case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, Subtarget, DAG);
- case ISD::VECTOR_SHUFFLE: return lowerVectorShuffle(Op, Subtarget, DAG);
+ case ISD::VECTOR_SHUFFLE: return lowerVECTOR_SHUFFLE(Op, Subtarget, DAG);
case ISD::VSELECT: return LowerVSELECT(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG);
@@ -27690,7 +28562,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL_PARTS: return LowerShiftParts(Op, DAG);
case ISD::FSHL:
case ISD::FSHR: return LowerFunnelShift(Op, Subtarget, DAG);
+ case ISD::STRICT_SINT_TO_FP:
case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG);
+ case ISD::STRICT_UINT_TO_FP:
case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG);
case ISD::TRUNCATE: return LowerTRUNCATE(Op, DAG);
case ISD::ZERO_EXTEND: return LowerZERO_EXTEND(Op, Subtarget, DAG);
@@ -27700,21 +28574,24 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::SIGN_EXTEND_VECTOR_INREG:
return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
- case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
- case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
- case ISD::STRICT_FP_ROUND: return LowerSTRICT_FP_ROUND(Op, DAG);
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT: return LowerFP_TO_INT(Op, DAG);
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG);
- case ISD::FMUL: return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
- case ISD::FDIV: return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
- case ISD::SETCC: return LowerSETCC(Op, DAG);
+ case ISD::SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
case ISD::SETCCCARRY: return LowerSETCCCARRY(Op, DAG);
case ISD::SELECT: return LowerSELECT(Op, DAG);
case ISD::BRCOND: return LowerBRCOND(Op, DAG);
@@ -27778,8 +28655,9 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MGATHER: return LowerMGATHER(Op, Subtarget, DAG);
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
- return LowerGC_TRANSITION_START(Op, DAG);
- case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION_END(Op, DAG);
+ case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
+ case ISD::ADDRSPACECAST:
+ return LowerADDRSPACECAST(Op, DAG);
}
}
@@ -27865,8 +28743,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
case X86ISD::VPMADDWD:
case X86ISD::AVG: {
- // Legalize types for ISD::UADDSAT/SADDSAT/USUBSAT/SSUBSAT and
- // X86ISD::AVG/VPMADDWD by widening.
+ // Legalize types for X86ISD::AVG/VPMADDWD by widening.
assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
EVT VT = N->getValueType(0);
@@ -28114,10 +28991,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::FP_TO_SINT:
- case ISD::FP_TO_UINT: {
- bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT;
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_UINT: {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::FP_TO_SINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_SINT;
EVT VT = N->getValueType(0);
- SDValue Src = N->getOperand(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (VT.isVector() && VT.getScalarSizeInBits() < 32) {
@@ -28128,13 +29009,19 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
unsigned NewEltWidth = std::min(128 / VT.getVectorNumElements(), 32U);
MVT PromoteVT = MVT::getVectorVT(MVT::getIntegerVT(NewEltWidth),
VT.getVectorNumElements());
- SDValue Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
+ SDValue Res;
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::STRICT_FP_TO_SINT, dl, {PromoteVT, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(ISD::FP_TO_SINT, dl, PromoteVT, Src);
// Preserve what we know about the size of the original result. Except
// when the result is v2i32 since we can't widen the assert.
if (PromoteVT != MVT::v2i32)
- Res = DAG.getNode(N->getOpcode() == ISD::FP_TO_UINT ? ISD::AssertZext
- : ISD::AssertSext,
+ Res = DAG.getNode(!IsSigned ? ISD::AssertZext : ISD::AssertSext,
dl, PromoteVT, Res,
DAG.getValueType(VT.getVectorElementType()));
@@ -28149,6 +29036,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
ConcatOps[0] = Res;
Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, ConcatVT, ConcatOps);
Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
return;
}
@@ -28160,16 +29049,49 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
assert(getTypeAction(*DAG.getContext(), VT) == TypeWidenVector &&
"Unexpected type action!");
if (Src.getValueType() == MVT::v2f64) {
+ unsigned Opc;
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+
+      // If we have VLX we can emit a target specific FP_TO_UINT node.
if (!IsSigned && !Subtarget.hasVLX()) {
- // If we have VLX we can emit a target specific FP_TO_UINT node,
- // otherwise we can defer to the generic legalizer which will widen
+ // Otherwise we can defer to the generic legalizer which will widen
// the input as well. This will be further widened during op
// legalization to v8i32<-v8f64.
- return;
+ // For strict nodes we'll need to widen ourselves.
+ // FIXME: Fix the type legalizer to safely widen strict nodes?
+ if (!IsStrict)
+ return;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f64, Src,
+ DAG.getConstantFP(0.0, dl, MVT::v2f64));
+ Opc = N->getOpcode();
+ }
+ SDValue Res;
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(Opc, dl, {MVT::v4i32, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
}
- unsigned Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
- SDValue Res = DAG.getNode(Opc, dl, MVT::v4i32, Src);
Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
+
+ // Custom widen strict v2f32->v2i32 by padding with zeros.
+ // FIXME: Should generic type legalizer do this?
+ if (Src.getValueType() == MVT::v2f32 && IsStrict) {
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getConstantFP(0.0, dl, MVT::v2f32));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4i32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
return;
}
@@ -28183,64 +29105,168 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (Subtarget.hasDQI() && VT == MVT::i64 &&
(SrcVT == MVT::f32 || SrcVT == MVT::f64)) {
assert(!Subtarget.is64Bit() && "i64 should be legal");
- unsigned NumElts = Subtarget.hasVLX() ? 4 : 8;
- // Using a 256-bit input here to guarantee 128-bit input for f32 case.
- // TODO: Use 128-bit vectors for f64 case?
- // TODO: Use 128-bit vectors for f32 by using CVTTP2SI/CVTTP2UI.
+ unsigned NumElts = Subtarget.hasVLX() ? 2 : 8;
+ // If we use a 128-bit result we might need to use a target specific node.
+ unsigned SrcElts =
+ std::max(NumElts, 128U / (unsigned)SrcVT.getSizeInBits());
MVT VecVT = MVT::getVectorVT(MVT::i64, NumElts);
- MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), NumElts);
+ MVT VecInVT = MVT::getVectorVT(SrcVT.getSimpleVT(), SrcElts);
+ unsigned Opc = N->getOpcode();
+ if (NumElts != SrcElts) {
+ if (IsStrict)
+ Opc = IsSigned ? X86ISD::STRICT_CVTTP2SI : X86ISD::STRICT_CVTTP2UI;
+ else
+ Opc = IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI;
+ }
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VecInVT,
DAG.getConstantFP(0.0, dl, VecInVT), Src,
ZeroIdx);
- Res = DAG.getNode(N->getOpcode(), SDLoc(N), VecVT, Res);
+ SDValue Chain;
+ if (IsStrict) {
+ SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
+ Res = DAG.getNode(Opc, SDLoc(N), Tys, N->getOperand(0), Res);
+ Chain = Res.getValue(1);
+ } else
+ Res = DAG.getNode(Opc, SDLoc(N), VecVT, Res);
Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Res, ZeroIdx);
Results.push_back(Res);
+ if (IsStrict)
+ Results.push_back(Chain);
return;
}
- if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned))
+ SDValue Chain;
+ if (SDValue V = FP_TO_INTHelper(SDValue(N, 0), DAG, IsSigned, Chain)) {
Results.push_back(V);
+ if (IsStrict)
+ Results.push_back(Chain);
+ }
return;
}
- case ISD::SINT_TO_FP: {
- assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL!");
- SDValue Src = N->getOperand(0);
- if (N->getValueType(0) != MVT::v2f32 || Src.getValueType() != MVT::v2i64)
- return;
- Results.push_back(DAG.getNode(X86ISD::CVTSI2P, dl, MVT::v4f32, Src));
- return;
- }
- case ISD::UINT_TO_FP: {
- assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP: {
+ bool IsStrict = N->isStrictFPOpcode();
+ bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP ||
+ N->getOpcode() == ISD::STRICT_SINT_TO_FP;
EVT VT = N->getValueType(0);
if (VT != MVT::v2f32)
return;
- SDValue Src = N->getOperand(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
EVT SrcVT = Src.getValueType();
if (Subtarget.hasDQI() && Subtarget.hasVLX() && SrcVT == MVT::v2i64) {
- Results.push_back(DAG.getNode(X86ISD::CVTUI2P, dl, MVT::v4f32, Src));
+ if (IsStrict) {
+ unsigned Opc = IsSigned ? X86ISD::STRICT_CVTSI2P
+ : X86ISD::STRICT_CVTUI2P;
+ SDValue Res = DAG.getNode(Opc, dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ } else {
+ unsigned Opc = IsSigned ? X86ISD::CVTSI2P : X86ISD::CVTUI2P;
+ Results.push_back(DAG.getNode(Opc, dl, MVT::v4f32, Src));
+ }
return;
}
+ if (SrcVT == MVT::v2i64 && !IsSigned && Subtarget.is64Bit() &&
+ Subtarget.hasSSE41() && !Subtarget.hasAVX512()) {
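+      // Lower unsigned v2i64->v2f32 via signed i64->f32 conversions: lanes
+      // with the sign bit set are logically halved first ((Src >> 1) | (Src & 1),
+      // which preserves the rounding bit), converted as signed scalars, and
+      // then doubled with an FADD; the final select keeps the plain signed
+      // conversion for lanes that already fit in a signed i64.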
+ SDValue Zero = DAG.getConstant(0, dl, SrcVT);
+ SDValue One = DAG.getConstant(1, dl, SrcVT);
+ SDValue Sign = DAG.getNode(ISD::OR, dl, SrcVT,
+ DAG.getNode(ISD::SRL, dl, SrcVT, Src, One),
+ DAG.getNode(ISD::AND, dl, SrcVT, Src, One));
+ SDValue IsNeg = DAG.getSetCC(dl, MVT::v2i64, Src, Zero, ISD::SETLT);
+ SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
+ SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
+ for (int i = 0; i != 2; ++i) {
+ SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ SignSrc, DAG.getIntPtrConstant(i, dl));
+ if (IsStrict)
+ SignCvts[i] =
+ DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
+ {N->getOperand(0), Src});
+ else
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src);
+      }
+ SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
+ SDValue Slow, Chain;
+ if (IsStrict) {
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ SignCvts[0].getValue(1), SignCvts[1].getValue(1));
+ Slow = DAG.getNode(ISD::STRICT_FADD, dl, {MVT::v4f32, MVT::Other},
+ {Chain, SignCvt, SignCvt});
+ Chain = Slow.getValue(1);
+ } else {
+ Slow = DAG.getNode(ISD::FADD, dl, MVT::v4f32, SignCvt, SignCvt);
+ }
+ IsNeg = DAG.getBitcast(MVT::v4i32, IsNeg);
+ IsNeg =
+ DAG.getVectorShuffle(MVT::v4i32, dl, IsNeg, IsNeg, {1, 3, -1, -1});
+ SDValue Cvt = DAG.getSelect(dl, MVT::v4f32, IsNeg, Slow, SignCvt);
+ Results.push_back(Cvt);
+ if (IsStrict)
+ Results.push_back(Chain);
+ return;
+ }
+
if (SrcVT != MVT::v2i32)
return;
+
+ if (IsSigned || Subtarget.hasAVX512()) {
+ if (!IsStrict)
+ return;
+
+ // Custom widen strict v2i32->v2f32 to avoid scalarization.
+ // FIXME: Should generic type legalizer do this?
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4i32, Src,
+ DAG.getConstant(0, dl, MVT::v2i32));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), Src});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ return;
+ }
+
+ assert(Subtarget.hasSSE2() && "Requires at least SSE2!");
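+    // Lower v2i32 uint_to_fp by zero-extending to v2i64 and ORing the bits
+    // into the mantissa of 2^52 (0x4330000000000000); subtracting 2^52 again
+    // gives the exact value as a double, which is then rounded to v4f32.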
SDValue ZExtIn = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::v2i64, Src);
SDValue VBias =
DAG.getConstantFP(BitsToDouble(0x4330000000000000ULL), dl, MVT::v2f64);
SDValue Or = DAG.getNode(ISD::OR, dl, MVT::v2i64, ZExtIn,
DAG.getBitcast(MVT::v2i64, VBias));
Or = DAG.getBitcast(MVT::v2f64, Or);
- // TODO: Are there any fast-math-flags to propagate here?
- SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
- Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ if (IsStrict) {
+ SDValue Sub = DAG.getNode(ISD::STRICT_FSUB, dl, {MVT::v2f64, MVT::Other},
+ {N->getOperand(0), Or, VBias});
+ SDValue Res = DAG.getNode(X86ISD::STRICT_VFPROUND, dl,
+ {MVT::v4f32, MVT::Other},
+ {Sub.getValue(1), Sub});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ } else {
+ // TODO: Are there any fast-math-flags to propagate here?
+ SDValue Sub = DAG.getNode(ISD::FSUB, dl, MVT::v2f64, Or, VBias);
+ Results.push_back(DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, Sub));
+ }
return;
}
+ case ISD::STRICT_FP_ROUND:
case ISD::FP_ROUND: {
- if (!isTypeLegal(N->getOperand(0).getValueType()))
- return;
- SDValue V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ if (!isTypeLegal(Src.getValueType()))
+ return;
+ SDValue V;
+ if (IsStrict)
+ V = DAG.getNode(X86ISD::STRICT_VFPROUND, dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), N->getOperand(1)});
+ else
+ V = DAG.getNode(X86ISD::VFPROUND, dl, MVT::v4f32, N->getOperand(0));
Results.push_back(V);
+ if (IsStrict)
+ Results.push_back(V.getValue(1));
return;
}
case ISD::FP_EXTEND: {
@@ -28543,6 +29569,28 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(Res.getValue(1));
return;
}
+ case ISD::ADDRSPACECAST: {
+ SDValue Src = N->getOperand(0);
+ EVT DstVT = N->getValueType(0);
+ AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
+ unsigned SrcAS = CastN->getSrcAddressSpace();
+
+ assert(SrcAS != CastN->getDestAddressSpace() &&
+ "addrspacecast must be between different address spaces");
+
+ SDValue Res;
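+    // Extending casts out of the unsigned 32-bit address space zero-extend,
+    // other 32->64-bit casts sign-extend, and 64->32-bit casts truncate.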
+ if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64)
+ Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
+ else if (DstVT == MVT::i64)
+ Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
+ else if (DstVT == MVT::i32)
+ Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
+ else
+ report_fatal_error("Unrecognized addrspacecast type legalization");
+
+ Results.push_back(Res);
+ return;
+ }
}
}
@@ -28566,9 +29614,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CALL: return "X86ISD::CALL";
case X86ISD::BT: return "X86ISD::BT";
case X86ISD::CMP: return "X86ISD::CMP";
+ case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP";
+ case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS";
case X86ISD::COMI: return "X86ISD::COMI";
case X86ISD::UCOMI: return "X86ISD::UCOMI";
case X86ISD::CMPM: return "X86ISD::CMPM";
+ case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM";
case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
case X86ISD::SETCC: return "X86ISD::SETCC";
case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
@@ -28653,10 +29704,12 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
+ case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT";
case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
+ case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND";
case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
@@ -28676,6 +29729,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VROTRI: return "X86ISD::VROTRI";
case X86ISD::VPPERM: return "X86ISD::VPPERM";
case X86ISD::CMPP: return "X86ISD::CMPP";
+ case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP";
case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
@@ -28776,6 +29830,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
+ case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE";
case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
@@ -28837,6 +29892,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
+ case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI";
+ case X86ISD::STRICT_CVTTP2UI: return "X86ISD::STRICT_CVTTP2UI";
case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
@@ -28847,6 +29904,8 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
+ case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P";
+ case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P";
case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
@@ -29099,8 +30158,8 @@ bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return true;
}
-bool
-X86TargetLowering::isFMAFasterThanFMulAndFAdd(EVT VT) const {
+bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const {
if (!Subtarget.hasAnyFMA())
return false;
@@ -31518,28 +32577,26 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
case X86ISD::VSRAI:
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
- Known.setAllZero();
- break;
- }
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= VT.getScalarSizeInBits()) {
+ Known.setAllZero();
+ break;
+ }
- Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- unsigned ShAmt = ShiftImm->getZExtValue();
- if (Opc == X86ISD::VSHLI) {
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
- // Low bits are known zero.
- Known.Zero.setLowBits(ShAmt);
- } else if (Opc == X86ISD::VSRLI) {
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
- // High bits are known zero.
- Known.Zero.setHighBits(ShAmt);
- } else {
- Known.Zero.ashrInPlace(ShAmt);
- Known.One.ashrInPlace(ShAmt);
- }
+ Known = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Opc == X86ISD::VSHLI) {
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
+ // Low bits are known zero.
+ Known.Zero.setLowBits(ShAmt);
+ } else if (Opc == X86ISD::VSRLI) {
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
+ // High bits are known zero.
+ Known.Zero.setHighBits(ShAmt);
+ } else {
+ Known.Zero.ashrInPlace(ShAmt);
+ Known.One.ashrInPlace(ShAmt);
}
break;
}
@@ -32103,8 +33160,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
if (((MaskVT == MVT::v8i16 || MaskVT == MVT::v16i8) && Subtarget.hasSSE2()) ||
((MaskVT == MVT::v16i16 || MaskVT == MVT::v32i8) && Subtarget.hasInt256()) ||
((MaskVT == MVT::v32i16 || MaskVT == MVT::v64i8) && Subtarget.hasBWI())) {
- if (matchVectorShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
- Subtarget)) {
+ if (matchShuffleWithPACK(MaskVT, SrcVT, V1, V2, Shuffle, Mask, DAG,
+ Subtarget)) {
DstVT = MaskVT;
return true;
}
@@ -32116,8 +33173,8 @@ static bool matchBinaryShuffle(MVT MaskVT, ArrayRef<int> Mask,
(MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
(MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
(MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
- if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
- DAG, Subtarget)) {
+ if (matchShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL, DAG,
+ Subtarget)) {
SrcVT = DstVT = MaskVT;
if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
SrcVT = DstVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
@@ -32155,8 +33212,8 @@ static bool matchBinaryPermuteShuffle(
uint64_t BlendMask = 0;
bool ForceV1Zero = false, ForceV2Zero = false;
SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
- if (matchVectorShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
- ForceV2Zero, BlendMask)) {
+ if (matchShuffleAsBlend(V1, V2, TargetMask, Zeroable, ForceV1Zero,
+ ForceV2Zero, BlendMask)) {
if (MaskVT == MVT::v16i16) {
// We can only use v16i16 PBLENDW if the lanes are repeated.
SmallVector<int, 8> RepeatedMask;
@@ -32410,10 +33467,9 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
(!MaskVT.is256BitVector() || Subtarget.hasAVX2());
// Determine zeroable mask elements.
- APInt Zeroable(NumMaskElts, 0);
- for (unsigned i = 0; i != NumMaskElts; ++i)
- if (isUndefOrZero(Mask[i]))
- Zeroable.setBit(i);
+ APInt KnownUndef, KnownZero;
+ resolveZeroablesFromTargetShuffle(Mask, KnownUndef, KnownZero);
+ APInt Zeroable = KnownUndef | KnownZero;
if (UnaryShuffle) {
// If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
@@ -32834,7 +33890,8 @@ static SDValue combineX86ShuffleChainWithExtract(
Offset += Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
}
- WideSizeInBits = std::max(WideSizeInBits, Src.getValueSizeInBits());
+ WideSizeInBits = std::max(WideSizeInBits,
+ (unsigned)Src.getValueSizeInBits());
assert((Offset % BaseVT.getVectorNumElements()) == 0 &&
"Unexpected subvector extraction");
Offset /= BaseVT.getVectorNumElements();
@@ -33026,6 +34083,10 @@ static SDValue combineX86ShufflesRecursively(
ArrayRef<int> RootMask, ArrayRef<const SDNode *> SrcNodes, unsigned Depth,
bool HasVariableMask, bool AllowVariableMask, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
+ assert(RootMask.size() > 0 &&
+ (RootMask.size() > 1 || (RootMask[0] == 0 && SrcOpIndex == 0)) &&
+ "Illegal shuffle root mask");
+
// Bound the depth of our recursive combine because this is ultimately
// quadratic in nature.
const unsigned MaxRecursionDepth = 8;
@@ -33056,106 +34117,137 @@ static SDValue combineX86ShufflesRecursively(
OpZero, DAG, Depth, false))
return SDValue();
- resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
-
- // Add the inputs to the Ops list, avoiding duplicates.
- SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
-
- auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
- // Attempt to find an existing match.
- SDValue InputBC = peekThroughBitcasts(Input);
- for (int i = 0, e = Ops.size(); i < e; ++i)
- if (InputBC == peekThroughBitcasts(Ops[i]))
- return i;
- // Match failed - should we replace an existing Op?
- if (InsertionPoint >= 0) {
- Ops[InsertionPoint] = Input;
- return InsertionPoint;
+ SmallVector<int, 64> Mask;
+ SmallVector<SDValue, 16> Ops;
+
+ // We don't need to merge masks if the root is empty.
+ bool EmptyRoot = (Depth == 0) && (RootMask.size() == 1);
+ if (EmptyRoot) {
+ // Only resolve zeros if it will remove an input, otherwise we might end
+ // up in an infinite loop.
+ bool ResolveKnownZeros = true;
+ if (!OpZero.isNullValue()) {
+ APInt UsedInputs = APInt::getNullValue(OpInputs.size());
+ for (int i = 0, e = OpMask.size(); i != e; ++i) {
+ int M = OpMask[i];
+ if (OpUndef[i] || OpZero[i] || isUndefOrZero(M))
+ continue;
+ UsedInputs.setBit(M / OpMask.size());
+ if (UsedInputs.isAllOnesValue()) {
+ ResolveKnownZeros = false;
+ break;
+ }
+ }
}
- // Add to the end of the Ops list.
- Ops.push_back(Input);
- return Ops.size() - 1;
- };
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero,
+ ResolveKnownZeros);
- SmallVector<int, 2> OpInputIdx;
- for (SDValue OpInput : OpInputs)
- OpInputIdx.push_back(AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
-
- assert(((RootMask.size() > OpMask.size() &&
- RootMask.size() % OpMask.size() == 0) ||
- (OpMask.size() > RootMask.size() &&
- OpMask.size() % RootMask.size() == 0) ||
- OpMask.size() == RootMask.size()) &&
- "The smaller number of elements must divide the larger.");
-
- // This function can be performance-critical, so we rely on the power-of-2
- // knowledge that we have about the mask sizes to replace div/rem ops with
- // bit-masks and shifts.
- assert(isPowerOf2_32(RootMask.size()) && "Non-power-of-2 shuffle mask sizes");
- assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
- unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
- unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
-
- unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
- unsigned RootRatio = std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
- unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
- assert((RootRatio == 1 || OpRatio == 1) &&
- "Must not have a ratio for both incoming and op masks!");
-
- assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
- assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
- assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
- unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
- unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
-
- SmallVector<int, 64> Mask(MaskWidth, SM_SentinelUndef);
-
- // Merge this shuffle operation's mask into our accumulated mask. Note that
- // this shuffle's mask will be the first applied to the input, followed by the
- // root mask to get us all the way to the root value arrangement. The reason
- // for this order is that we are recursing up the operation chain.
- for (unsigned i = 0; i < MaskWidth; ++i) {
- unsigned RootIdx = i >> RootRatioLog2;
- if (RootMask[RootIdx] < 0) {
- // This is a zero or undef lane, we're done.
- Mask[i] = RootMask[RootIdx];
- continue;
- }
+ Mask = OpMask;
+ Ops.append(OpInputs.begin(), OpInputs.end());
+ } else {
+ resolveTargetShuffleFromZeroables(OpMask, OpUndef, OpZero);
+
+ // Add the inputs to the Ops list, avoiding duplicates.
+ Ops.append(SrcOps.begin(), SrcOps.end());
+
+ auto AddOp = [&Ops](SDValue Input, int InsertionPoint) -> int {
+ // Attempt to find an existing match.
+ SDValue InputBC = peekThroughBitcasts(Input);
+ for (int i = 0, e = Ops.size(); i < e; ++i)
+ if (InputBC == peekThroughBitcasts(Ops[i]))
+ return i;
+ // Match failed - should we replace an existing Op?
+ if (InsertionPoint >= 0) {
+ Ops[InsertionPoint] = Input;
+ return InsertionPoint;
+ }
+ // Add to the end of the Ops list.
+ Ops.push_back(Input);
+ return Ops.size() - 1;
+ };
- unsigned RootMaskedIdx =
- RootRatio == 1
- ? RootMask[RootIdx]
- : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
+ SmallVector<int, 2> OpInputIdx;
+ for (SDValue OpInput : OpInputs)
+ OpInputIdx.push_back(
+ AddOp(OpInput, OpInputIdx.empty() ? SrcOpIndex : -1));
+
+ assert(((RootMask.size() > OpMask.size() &&
+ RootMask.size() % OpMask.size() == 0) ||
+ (OpMask.size() > RootMask.size() &&
+ OpMask.size() % RootMask.size() == 0) ||
+ OpMask.size() == RootMask.size()) &&
+ "The smaller number of elements must divide the larger.");
+
+ // This function can be performance-critical, so we rely on the power-of-2
+ // knowledge that we have about the mask sizes to replace div/rem ops with
+ // bit-masks and shifts.
+ assert(isPowerOf2_32(RootMask.size()) &&
+ "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpMask.size()) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootMaskSizeLog2 = countTrailingZeros(RootMask.size());
+ unsigned OpMaskSizeLog2 = countTrailingZeros(OpMask.size());
+
+ unsigned MaskWidth = std::max<unsigned>(OpMask.size(), RootMask.size());
+ unsigned RootRatio =
+ std::max<unsigned>(1, OpMask.size() >> RootMaskSizeLog2);
+ unsigned OpRatio = std::max<unsigned>(1, RootMask.size() >> OpMaskSizeLog2);
+ assert((RootRatio == 1 || OpRatio == 1) &&
+ "Must not have a ratio for both incoming and op masks!");
+
+ assert(isPowerOf2_32(MaskWidth) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(RootRatio) && "Non-power-of-2 shuffle mask sizes");
+ assert(isPowerOf2_32(OpRatio) && "Non-power-of-2 shuffle mask sizes");
+ unsigned RootRatioLog2 = countTrailingZeros(RootRatio);
+ unsigned OpRatioLog2 = countTrailingZeros(OpRatio);
+
+ Mask.resize(MaskWidth, SM_SentinelUndef);
+
+ // Merge this shuffle operation's mask into our accumulated mask. Note that
+ // this shuffle's mask will be the first applied to the input, followed by
+ // the root mask to get us all the way to the root value arrangement. The
+ // reason for this order is that we are recursing up the operation chain.
+ for (unsigned i = 0; i < MaskWidth; ++i) {
+ unsigned RootIdx = i >> RootRatioLog2;
+ if (RootMask[RootIdx] < 0) {
+ // This is a zero or undef lane, we're done.
+ Mask[i] = RootMask[RootIdx];
+ continue;
+ }
- // Just insert the scaled root mask value if it references an input other
- // than the SrcOp we're currently inserting.
- if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
- (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
- Mask[i] = RootMaskedIdx;
- continue;
- }
+ unsigned RootMaskedIdx =
+ RootRatio == 1
+ ? RootMask[RootIdx]
+ : (RootMask[RootIdx] << RootRatioLog2) + (i & (RootRatio - 1));
- RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
- unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
- if (OpMask[OpIdx] < 0) {
- // The incoming lanes are zero or undef, it doesn't matter which ones we
- // are using.
- Mask[i] = OpMask[OpIdx];
- continue;
- }
+ // Just insert the scaled root mask value if it references an input other
+ // than the SrcOp we're currently inserting.
+ if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
+ (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
+ Mask[i] = RootMaskedIdx;
+ continue;
+ }
- // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
- unsigned OpMaskedIdx =
- OpRatio == 1
- ? OpMask[OpIdx]
- : (OpMask[OpIdx] << OpRatioLog2) + (RootMaskedIdx & (OpRatio - 1));
+ RootMaskedIdx = RootMaskedIdx & (MaskWidth - 1);
+ unsigned OpIdx = RootMaskedIdx >> OpRatioLog2;
+ if (OpMask[OpIdx] < 0) {
+ // The incoming lanes are zero or undef, it doesn't matter which ones we
+ // are using.
+ Mask[i] = OpMask[OpIdx];
+ continue;
+ }
- OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
- int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
- assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
- OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
+ // Ok, we have non-zero lanes, map them through to one of the Op's inputs.
+ unsigned OpMaskedIdx = OpRatio == 1 ? OpMask[OpIdx]
+ : (OpMask[OpIdx] << OpRatioLog2) +
+ (RootMaskedIdx & (OpRatio - 1));
- Mask[i] = OpMaskedIdx;
+ OpMaskedIdx = OpMaskedIdx & (MaskWidth - 1);
+ int InputIdx = OpMask[OpIdx] / (int)OpMask.size();
+ assert(0 <= OpInputIdx[InputIdx] && "Unknown target shuffle input");
+ OpMaskedIdx += OpInputIdx[InputIdx] * MaskWidth;
+
+ Mask[i] = OpMaskedIdx;
+ }
}
// Remove unused/repeated shuffle source ops.
@@ -33189,13 +34281,18 @@ static SDValue combineX86ShufflesRecursively(
// the remaining recursion depth.
if (Ops.size() < (MaxRecursionDepth - Depth)) {
for (int i = 0, e = Ops.size(); i < e; ++i) {
+ // For empty roots, we need to resolve zeroable elements before combining
+ // them with other shuffles.
+ SmallVector<int, 64> ResolvedMask = Mask;
+ if (EmptyRoot)
+ resolveTargetShuffleFromZeroables(ResolvedMask, OpUndef, OpZero);
bool AllowVar = false;
if (Ops[i].getNode()->hasOneUse() ||
SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
AllowVar = AllowVariableMask;
if (SDValue Res = combineX86ShufflesRecursively(
- Ops, i, Root, Mask, CombinedNodes, Depth + 1, HasVariableMask,
- AllowVar, DAG, Subtarget))
+ Ops, i, Root, ResolvedMask, CombinedNodes, Depth + 1,
+ HasVariableMask, AllowVar, DAG, Subtarget))
return Res;
}
}
@@ -34207,6 +35304,15 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
In.getOperand(0).getValueType() == MVT::v2i64)
return N->getOperand(0); // return the bitcast
break;
+ case X86ISD::STRICT_CVTTP2SI:
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::STRICT_CVTSI2P:
+ case X86ISD::STRICT_CVTUI2P:
+ case X86ISD::STRICT_VFPROUND:
+ if (In.getOperand(1).getValueType() == MVT::v2f64 ||
+ In.getOperand(1).getValueType() == MVT::v2i64)
+ return N->getOperand(0);
+ break;
}
}
@@ -34698,6 +35804,23 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
return true;
}
+ // If we don't demand all elements, then attempt to combine to a simpler
+ // shuffle.
+ // TODO: Handle other depths, but first we need to handle the fact that
+ // it might combine to the same shuffle.
+ if (!DemandedElts.isAllOnesValue() && Depth == 0) {
+ SmallVector<int, 64> DemandedMask(NumElts, SM_SentinelUndef);
+ for (int i = 0; i != NumElts; ++i)
+ if (DemandedElts[i])
+ DemandedMask[i] = i;
+
+ SDValue NewShuffle = combineX86ShufflesRecursively(
+ {Op}, 0, Op, DemandedMask, {}, Depth, /*HasVarMask*/ false,
+ /*AllowVarMask*/ true, TLO.DAG, Subtarget);
+ if (NewShuffle)
+ return TLO.CombineTo(Op, NewShuffle);
+ }
+
return false;
}
@@ -34739,117 +35862,110 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
case X86ISD::VSHLI: {
SDValue Op0 = Op.getOperand(0);
- SDValue Op1 = Op.getOperand(1);
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
- if (ShiftImm->getAPIntValue().uge(BitWidth))
- break;
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= BitWidth)
+ break;
- unsigned ShAmt = ShiftImm->getZExtValue();
- APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
-
- // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
- // single shift. We can do this if the bottom bits (which are shifted
- // out) are never demanded.
- if (Op0.getOpcode() == X86ISD::VSRLI &&
- OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
- if (auto *Shift2Imm = dyn_cast<ConstantSDNode>(Op0.getOperand(1))) {
- if (Shift2Imm->getAPIntValue().ult(BitWidth)) {
- int Diff = ShAmt - Shift2Imm->getZExtValue();
- if (Diff == 0)
- return TLO.CombineTo(Op, Op0.getOperand(0));
-
- unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
- SDValue NewShift = TLO.DAG.getNode(
- NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
- TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
- return TLO.CombineTo(Op, NewShift);
- }
- }
+ APInt DemandedMask = OriginalDemandedBits.lshr(ShAmt);
+
+ // If this is ((X >>u C1) << ShAmt), see if we can simplify this into a
+ // single shift. We can do this if the bottom bits (which are shifted
+ // out) are never demanded.
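+    // e.g. ((X >>u 2) << 5) with the low 5 bits undemanded becomes (X << 3).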
+ if (Op0.getOpcode() == X86ISD::VSRLI &&
+ OriginalDemandedBits.countTrailingZeros() >= ShAmt) {
+ unsigned Shift2Amt = Op0.getConstantOperandVal(1);
+ if (Shift2Amt < BitWidth) {
+ int Diff = ShAmt - Shift2Amt;
+ if (Diff == 0)
+ return TLO.CombineTo(Op, Op0.getOperand(0));
+
+ unsigned NewOpc = Diff < 0 ? X86ISD::VSRLI : X86ISD::VSHLI;
+ SDValue NewShift = TLO.DAG.getNode(
+ NewOpc, SDLoc(Op), VT, Op0.getOperand(0),
+ TLO.DAG.getTargetConstant(std::abs(Diff), SDLoc(Op), MVT::i8));
+ return TLO.CombineTo(Op, NewShift);
}
+ }
- if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
- TLO, Depth + 1))
- return true;
+ if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero <<= ShAmt;
- Known.One <<= ShAmt;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero <<= ShAmt;
+ Known.One <<= ShAmt;
- // Low bits known zero.
- Known.Zero.setLowBits(ShAmt);
- }
+ // Low bits known zero.
+ Known.Zero.setLowBits(ShAmt);
break;
}
case X86ISD::VSRLI: {
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
- if (ShiftImm->getAPIntValue().uge(BitWidth))
- break;
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ if (ShAmt >= BitWidth)
+ break;
- unsigned ShAmt = ShiftImm->getZExtValue();
- APInt DemandedMask = OriginalDemandedBits << ShAmt;
+ APInt DemandedMask = OriginalDemandedBits << ShAmt;
- if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
- OriginalDemandedElts, Known, TLO, Depth + 1))
- return true;
+ if (SimplifyDemandedBits(Op.getOperand(0), DemandedMask,
+ OriginalDemandedElts, Known, TLO, Depth + 1))
+ return true;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
- // High bits known zero.
- Known.Zero.setHighBits(ShAmt);
- }
+ // High bits known zero.
+ Known.Zero.setHighBits(ShAmt);
break;
}
case X86ISD::VSRAI: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
- if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op1)) {
- if (ShiftImm->getAPIntValue().uge(BitWidth))
- break;
+ unsigned ShAmt = cast<ConstantSDNode>(Op1)->getZExtValue();
+ if (ShAmt >= BitWidth)
+ break;
- unsigned ShAmt = ShiftImm->getZExtValue();
- APInt DemandedMask = OriginalDemandedBits << ShAmt;
+ APInt DemandedMask = OriginalDemandedBits << ShAmt;
- // If we just want the sign bit then we don't need to shift it.
- if (OriginalDemandedBits.isSignMask())
- return TLO.CombineTo(Op, Op0);
+ // If we just want the sign bit then we don't need to shift it.
+ if (OriginalDemandedBits.isSignMask())
+ return TLO.CombineTo(Op, Op0);
- // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
- if (Op0.getOpcode() == X86ISD::VSHLI && Op1 == Op0.getOperand(1)) {
- SDValue Op00 = Op0.getOperand(0);
- unsigned NumSignBits =
- TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
- if (ShAmt < NumSignBits)
- return TLO.CombineTo(Op, Op00);
- }
+ // fold (VSRAI (VSHLI X, C1), C1) --> X iff NumSignBits(X) > C1
+ if (Op0.getOpcode() == X86ISD::VSHLI &&
+ Op.getOperand(1) == Op0.getOperand(1)) {
+ SDValue Op00 = Op0.getOperand(0);
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op00, OriginalDemandedElts);
+ if (ShAmt < NumSignBits)
+ return TLO.CombineTo(Op, Op00);
+ }
- // If any of the demanded bits are produced by the sign extension, we also
- // demand the input sign bit.
- if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
- DemandedMask.setSignBit();
+ // If any of the demanded bits are produced by the sign extension, we also
+ // demand the input sign bit.
+ if (OriginalDemandedBits.countLeadingZeros() < ShAmt)
+ DemandedMask.setSignBit();
- if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
- TLO, Depth + 1))
- return true;
+ if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
+ TLO, Depth + 1))
+ return true;
- assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- Known.Zero.lshrInPlace(ShAmt);
- Known.One.lshrInPlace(ShAmt);
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+ Known.Zero.lshrInPlace(ShAmt);
+ Known.One.lshrInPlace(ShAmt);
- // If the input sign bit is known to be zero, or if none of the top bits
- // are demanded, turn this into an unsigned shift right.
- if (Known.Zero[BitWidth - ShAmt - 1] ||
- OriginalDemandedBits.countLeadingZeros() >= ShAmt)
- return TLO.CombineTo(
- Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
+ // If the input sign bit is known to be zero, or if none of the top bits
+ // are demanded, turn this into an unsigned shift right.
+ if (Known.Zero[BitWidth - ShAmt - 1] ||
+ OriginalDemandedBits.countLeadingZeros() >= ShAmt)
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(X86ISD::VSRLI, SDLoc(Op), VT, Op0, Op1));
- // High bits are known one.
- if (Known.One[BitWidth - ShAmt - 1])
- Known.One.setHighBits(ShAmt);
- }
+ // High bits are known one.
+ if (Known.One[BitWidth - ShAmt - 1])
+ Known.One.setHighBits(ShAmt);
break;
}
case X86ISD::PEXTRB:
@@ -35005,6 +36121,13 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
return Vec;
break;
}
+ case X86ISD::PCMPGT:
+ // icmp sgt(0, R) == ashr(R, BitWidth-1).
+ // iff we only need the sign bit then we can use R directly.
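+    // (pcmpgt(0, R) is all-ones exactly when R is negative, so its sign bit
+    //  always matches the sign bit of R.)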
+ if (DemandedBits.isSignMask() &&
+ ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
+ return Op.getOperand(1);
+ break;
}
APInt ShuffleUndef, ShuffleZero;
@@ -35053,123 +36176,6 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
Op, DemandedBits, DemandedElts, DAG, Depth);
}
-/// Check if a vector extract from a target-specific shuffle of a load can be
-/// folded into a single element load.
-/// Similar handling for VECTOR_SHUFFLE is performed by DAGCombiner, but
-/// shuffles have been custom lowered so we need to handle those here.
-static SDValue
-XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- SDValue InVec = N->getOperand(0);
- SDValue EltNo = N->getOperand(1);
- EVT EltVT = N->getValueType(0);
-
- if (!isa<ConstantSDNode>(EltNo))
- return SDValue();
-
- EVT OriginalVT = InVec.getValueType();
- unsigned NumOriginalElts = OriginalVT.getVectorNumElements();
-
- // Peek through bitcasts, don't duplicate a load with other uses.
- InVec = peekThroughOneUseBitcasts(InVec);
-
- EVT CurrentVT = InVec.getValueType();
- if (!CurrentVT.isVector())
- return SDValue();
-
- unsigned NumCurrentElts = CurrentVT.getVectorNumElements();
- if ((NumOriginalElts % NumCurrentElts) != 0)
- return SDValue();
-
- if (!isTargetShuffle(InVec.getOpcode()))
- return SDValue();
-
- // Don't duplicate a load with other uses.
- if (!InVec.hasOneUse())
- return SDValue();
-
- SmallVector<int, 16> ShuffleMask;
- SmallVector<SDValue, 2> ShuffleOps;
- bool UnaryShuffle;
- if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true,
- ShuffleOps, ShuffleMask, UnaryShuffle))
- return SDValue();
-
- unsigned Scale = NumOriginalElts / NumCurrentElts;
- if (Scale > 1) {
- SmallVector<int, 16> ScaledMask;
- scaleShuffleMask<int>(Scale, ShuffleMask, ScaledMask);
- ShuffleMask = std::move(ScaledMask);
- }
- assert(ShuffleMask.size() == NumOriginalElts && "Shuffle mask size mismatch");
-
- // Select the input vector, guarding against out of range extract vector.
- int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- int Idx = (Elt > (int)NumOriginalElts) ? SM_SentinelUndef : ShuffleMask[Elt];
-
- if (Idx == SM_SentinelZero)
- return EltVT.isInteger() ? DAG.getConstant(0, SDLoc(N), EltVT)
- : DAG.getConstantFP(+0.0, SDLoc(N), EltVT);
- if (Idx == SM_SentinelUndef)
- return DAG.getUNDEF(EltVT);
-
- // Bail if any mask element is SM_SentinelZero - getVectorShuffle below
- // won't handle it.
- if (llvm::any_of(ShuffleMask, [](int M) { return M == SM_SentinelZero; }))
- return SDValue();
-
- assert(0 <= Idx && Idx < (int)(2 * NumOriginalElts) &&
- "Shuffle index out of range");
- SDValue LdNode = (Idx < (int)NumOriginalElts) ? ShuffleOps[0] : ShuffleOps[1];
-
- // If inputs to shuffle are the same for both ops, then allow 2 uses
- unsigned AllowedUses =
- (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1;
-
- if (LdNode.getOpcode() == ISD::BITCAST) {
- // Don't duplicate a load with other uses.
- if (!LdNode.getNode()->hasNUsesOfValue(AllowedUses, 0))
- return SDValue();
-
- AllowedUses = 1; // only allow 1 load use if we have a bitcast
- LdNode = LdNode.getOperand(0);
- }
-
- if (!ISD::isNormalLoad(LdNode.getNode()))
- return SDValue();
-
- LoadSDNode *LN0 = cast<LoadSDNode>(LdNode);
-
- if (!LN0 || !LN0->hasNUsesOfValue(AllowedUses, 0) || !LN0->isSimple())
- return SDValue();
-
- // If there's a bitcast before the shuffle, check if the load type and
- // alignment is valid.
- unsigned Align = LN0->getAlignment();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
- EltVT.getTypeForEVT(*DAG.getContext()));
-
- if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, EltVT))
- return SDValue();
-
- // All checks match so transform back to vector_shuffle so that DAG combiner
- // can finish the job
- SDLoc dl(N);
-
- // Create shuffle node taking into account the case that its a unary shuffle
- SDValue Shuffle = UnaryShuffle ? DAG.getUNDEF(OriginalVT)
- : DAG.getBitcast(OriginalVT, ShuffleOps[1]);
- Shuffle = DAG.getVectorShuffle(OriginalVT, dl,
- DAG.getBitcast(OriginalVT, ShuffleOps[0]),
- Shuffle, ShuffleMask);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle,
- EltNo);
-}
-
// Helper to peek through bitops/setcc to determine size of source vector.
// Allows combineBitcastvxi1 to determine what size vector generated a <X x i1>.
static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
@@ -35714,7 +36720,7 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
const X86Subtarget &Subtarget) {
// Find the appropriate width for the PSADBW.
EVT InVT = Zext0.getOperand(0).getValueType();
- unsigned RegSize = std::max(128u, InVT.getSizeInBits());
+ unsigned RegSize = std::max(128u, (unsigned)InVT.getSizeInBits());
// "Zero-extend" the i8 vectors. This is not a per-element zext, rather we
// fill in the missing vector elements with 0.
@@ -36263,6 +37269,10 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
assert(ExtElt->getOpcode() == ISD::EXTRACT_VECTOR_ELT && "Unexpected caller");
+  // We need at least SSE2 to do anything here.
+ if (!Subtarget.hasSSE2())
+ return SDValue();
+
ISD::NodeType Opc;
SDValue Rdx =
DAG.matchBinOpReduction(ExtElt, Opc, {ISD::ADD, ISD::FADD}, true);
@@ -36382,8 +37392,9 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
SDLoc dl(InputVector);
bool IsPextr = N->getOpcode() != ISD::EXTRACT_VECTOR_ELT;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
- if (CIdx && CIdx->getAPIntValue().uge(SrcVT.getVectorNumElements()))
+ if (CIdx && CIdx->getAPIntValue().uge(NumSrcElts))
return IsPextr ? DAG.getConstant(0, dl, VT) : DAG.getUNDEF(VT);
// Integer Constant Folding.
@@ -36419,14 +37430,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
}
// TODO - Remove this once we can handle the implicit zero-extension of
- // X86ISD::PEXTRW/X86ISD::PEXTRB in XFormVExtractWithShuffleIntoLoad,
- // combineHorizontalPredicateResult and combineBasicSADPattern.
+ // X86ISD::PEXTRW/X86ISD::PEXTRB in combineHorizontalPredicateResult and
+ // combineBasicSADPattern.
return SDValue();
}
- if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
- return NewOp;
-
// Detect mmx extraction of all bits as a i64. It works better as a bitcast.
if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
@@ -36482,7 +37490,6 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
};
if (all_of(InputVector->uses(), IsBoolExtract) &&
BoolExtracts.size() > 1) {
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
EVT BCVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcElts);
if (SDValue BC =
combineBitcastvxi1(DAG, BCVT, InputVector, dl, Subtarget)) {
@@ -36568,9 +37575,8 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
if (TValIsAllZeros || FValIsAllOnes) {
SDValue CC = Cond.getOperand(2);
- ISD::CondCode NewCC =
- ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
- Cond.getOperand(0).getValueType().isInteger());
+ ISD::CondCode NewCC = ISD::getSetCCInverse(
+ cast<CondCodeSDNode>(CC)->get(), Cond.getOperand(0).getValueType());
Cond = DAG.getSetCC(DL, CondVT, Cond.getOperand(0), Cond.getOperand(1),
NewCC);
std::swap(LHS, RHS);
@@ -36761,37 +37767,117 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
if (VT.is512BitVector())
return SDValue();
- // TODO: Add other opcodes eventually lowered into BLEND.
- for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
- UI != UE; ++UI)
- if ((UI->getOpcode() != ISD::VSELECT &&
- UI->getOpcode() != X86ISD::BLENDV) ||
- UI.getOperandNo() != 0)
+ auto OnlyUsedAsSelectCond = [](SDValue Cond) {
+ for (SDNode::use_iterator UI = Cond->use_begin(), UE = Cond->use_end();
+ UI != UE; ++UI)
+ if ((UI->getOpcode() != ISD::VSELECT &&
+ UI->getOpcode() != X86ISD::BLENDV) ||
+ UI.getOperandNo() != 0)
+ return false;
+
+ return true;
+ };
+
+ if (OnlyUsedAsSelectCond(Cond)) {
+ APInt DemandedMask(APInt::getSignMask(BitWidth));
+ KnownBits Known;
+ TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+ !DCI.isBeforeLegalizeOps());
+ if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
return SDValue();
+ // If we changed the computation somewhere in the DAG, this change will
+ // affect all users of Cond. Update all the nodes so that we do not use
+ // the generic VSELECT anymore. Otherwise, we may perform wrong
+ // optimizations as we messed with the actual expectation for the vector
+ // boolean values.
+ for (SDNode *U : Cond->uses()) {
+ if (U->getOpcode() == X86ISD::BLENDV)
+ continue;
+
+ SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
+ Cond, U->getOperand(1), U->getOperand(2));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
+ DCI.AddToWorklist(U);
+ }
+ DCI.CommitTargetLoweringOpt(TLO);
+ return SDValue(N, 0);
+ }
+
+ // Otherwise we can still at least try to simplify multiple use bits.
APInt DemandedMask(APInt::getSignMask(BitWidth));
+ APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements()));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
- if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
+ if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask,
+ DemandedElts, DAG, 0))
+ return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
+ V, N->getOperand(1), N->getOperand(2));
+
+ return SDValue();
+}
+
+// Try to match:
+// (or (and (M, (sub 0, X)), (pandn M, X)))
+// which is a special case of:
+// (select M, (sub 0, X), X)
+// Per:
+// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
+// We know that, if fNegate is 0 or 1:
+// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
+//
+// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
+// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
+// ( M ? -X : X) == ((X ^ M ) + (M & 1))
+// This lets us transform our vselect to:
+// (add (xor X, M), (and M, 1))
+// And further to:
+// (sub (xor X, M), M)
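+//
+// For example, with X = 5:
+//   M == -1 (all ones): (5 ^ -1) = -6, and (-6) - (-1) = -5 == -X
+//   M ==  0:            (5 ^  0) =  5, and   5  -   0  =  5 ==  X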
+static SDValue combineLogicBlendIntoConditionalNegate(
+ EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget) {
+ EVT MaskVT = Mask.getValueType();
+ assert(MaskVT.isInteger() &&
+ DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
+ "Mask must be zero/all-bits");
+
+ if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
+ return SDValue();
+ if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
return SDValue();
- // If we changed the computation somewhere in the DAG, this change will
- // affect all users of Cond. Update all the nodes so that we do not use
- // the generic VSELECT anymore. Otherwise, we may perform wrong
- // optimizations as we messed with the actual expectation for the vector
- // boolean values.
- for (SDNode *U : Cond->uses()) {
- if (U->getOpcode() == X86ISD::BLENDV)
- continue;
+ auto IsNegV = [](SDNode *N, SDValue V) {
+ return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
+ ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
+ };
- SDValue SB = DAG.getNode(X86ISD::BLENDV, SDLoc(U), U->getValueType(0),
- Cond, U->getOperand(1), U->getOperand(2));
- DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
- DCI.AddToWorklist(U);
- }
- DCI.CommitTargetLoweringOpt(TLO);
- return SDValue(N, 0);
+ SDValue V;
+ if (IsNegV(Y.getNode(), X))
+ V = X;
+ else if (IsNegV(X.getNode(), Y))
+ V = Y;
+ else
+ return SDValue();
+
+ SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
+ SDValue SubOp2 = Mask;
+
+ // If the negate was on the false side of the select, then
+ // the operands of the SUB need to be swapped. PR 27251.
+ // This is because the pattern being matched above is
+  // (vselect M, (sub 0, X), X) -> (sub (xor X, M), M)
+  // but if the pattern matched was
+  // (vselect M, X, (sub 0, X)), that is really negation of the pattern
+ // above, -(vselect M, (sub 0, X), X), and therefore the replacement
+ // pattern also needs to be a negation of the replacement pattern above.
+ // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
+ // sub accomplishes the negation of the replacement pattern.
+ if (V == Y)
+ std::swap(SubOp1, SubOp2);
+
+ SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
+ return DAG.getBitcast(VT, Res);
}
/// Do target-specific dag combines on SELECT and VSELECT nodes.
@@ -36811,10 +37897,21 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
EVT VT = LHS.getValueType();
EVT CondVT = Cond.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ bool CondConstantVector = ISD::isBuildVectorOfConstantSDNodes(Cond.getNode());
+
+ // Attempt to combine (select M, (sub 0, X), X) -> (sub (xor X, M), M).
+ // Limit this to cases of non-constant masks that createShuffleMaskFromVSELECT
+ // can't catch, plus vXi8 cases where we'd likely end up with BLENDV.
+ if (CondVT.isVector() && CondVT.isInteger() &&
+ CondVT.getScalarSizeInBits() == VT.getScalarSizeInBits() &&
+ (!CondConstantVector || CondVT.getScalarType() == MVT::i8) &&
+ DAG.ComputeNumSignBits(Cond) == CondVT.getScalarSizeInBits())
+ if (SDValue V = combineLogicBlendIntoConditionalNegate(VT, Cond, RHS, LHS,
+ DL, DAG, Subtarget))
+ return V;
// Convert vselects with constant condition into shuffles.
- if (ISD::isBuildVectorOfConstantSDNodes(Cond.getNode()) &&
- DCI.isBeforeLegalizeOps()) {
+ if (CondConstantVector && DCI.isBeforeLegalizeOps()) {
SmallVector<int, 64> Mask;
if (createShuffleMaskFromVSELECT(Mask, Cond))
return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask);
@@ -36843,7 +37940,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
@@ -36854,7 +37951,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
case ISD::SETOLE:
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMIN;
@@ -36873,7 +37970,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
case ISD::SETOGE:
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) && !DAG.isKnownNeverZeroFloat(RHS))
break;
Opcode = X86ISD::FMAX;
@@ -36883,7 +37980,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// the operands would cause it to handle comparisons between positive
// and negative zero incorrectly.
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)) {
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS)))
break;
@@ -36911,7 +38008,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a min would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!(DAG.isKnownNeverZeroFloat(LHS) ||
DAG.isKnownNeverZeroFloat(RHS))) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
@@ -36922,8 +38019,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
break;
case ISD::SETUGT:
// Converting this to a min would handle NaNs incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
- (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS)))
+ if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
break;
Opcode = X86ISD::FMIN;
break;
@@ -36948,7 +38044,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
// Converting this to a max would handle comparisons between positive
// and negative zero incorrectly, and swapping the operands would
// cause it to handle NaNs incorrectly.
- if (!DAG.getTarget().Options.UnsafeFPMath &&
+ if (!DAG.getTarget().Options.NoSignedZerosFPMath &&
!DAG.isKnownNeverZeroFloat(LHS) &&
!DAG.isKnownNeverZeroFloat(RHS)) {
if (!DAG.isKnownNeverNaN(LHS) || !DAG.isKnownNeverNaN(RHS))
@@ -37093,7 +38189,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
SDValue Other;
if (ISD::isBuildVectorAllZeros(LHS.getNode())) {
Other = RHS;
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllZeros(RHS.getNode())) {
Other = LHS;
}
@@ -37165,7 +38261,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
SDValue Other;
if (ISD::isBuildVectorAllOnes(LHS.getNode())) {
Other = RHS;
- CC = ISD::getSetCCInverse(CC, true);
+ CC = ISD::getSetCCInverse(CC, VT.getVectorElementType());
} else if (ISD::isBuildVectorAllOnes(RHS.getNode())) {
Other = LHS;
}
@@ -37788,7 +38884,7 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
}
/// Different mul shrinking modes.
-enum ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
+enum class ShrinkMode { MULS8, MULU8, MULS16, MULU16 };
static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
EVT VT = N->getOperand(0).getValueType();
@@ -37809,16 +38905,16 @@ static bool canReduceVMulWidth(SDNode *N, SelectionDAG &DAG, ShrinkMode &Mode) {
unsigned MinSignBits = std::min(SignBits[0], SignBits[1]);
// When ranges are from -128 ~ 127, use MULS8 mode.
if (MinSignBits >= 25)
- Mode = MULS8;
+ Mode = ShrinkMode::MULS8;
// When ranges are from 0 ~ 255, use MULU8 mode.
else if (AllPositive && MinSignBits >= 24)
- Mode = MULU8;
+ Mode = ShrinkMode::MULU8;
// When ranges are from -32768 ~ 32767, use MULS16 mode.
else if (MinSignBits >= 17)
- Mode = MULS16;
+ Mode = ShrinkMode::MULS16;
// When ranges are from 0 ~ 65535, use MULU16 mode.
else if (AllPositive && MinSignBits >= 16)
- Mode = MULU16;
+ Mode = ShrinkMode::MULU16;
else
return false;
return true;
@@ -37888,15 +38984,17 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
// Generate the lower part of mul: pmullw. For MULU8/MULS8, only the
// lower part is needed.
SDValue MulLo = DAG.getNode(ISD::MUL, DL, ReducedVT, NewN0, NewN1);
- if (Mode == MULU8 || Mode == MULS8)
- return DAG.getNode((Mode == MULU8) ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND,
+ if (Mode == ShrinkMode::MULU8 || Mode == ShrinkMode::MULS8)
+ return DAG.getNode((Mode == ShrinkMode::MULU8) ? ISD::ZERO_EXTEND
+ : ISD::SIGN_EXTEND,
DL, VT, MulLo);
MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
- SDValue MulHi = DAG.getNode(Mode == MULS16 ? ISD::MULHS : ISD::MULHU, DL,
- ReducedVT, NewN0, NewN1);
+ SDValue MulHi =
+ DAG.getNode(Mode == ShrinkMode::MULS16 ? ISD::MULHS : ISD::MULHU, DL,
+ ReducedVT, NewN0, NewN1);
// Repack the lower part and higher part result of mul into a wider
// result.
@@ -38294,7 +39392,7 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
// We shift all of the values by one. In many cases we do not have
// hardware support for this operation. This is better expressed as an ADD
// of two values.
- if (N1SplatC->getAPIntValue() == 1)
+ if (N1SplatC->isOne())
return DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N0);
}
@@ -38546,15 +39644,15 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
unsigned NumBitsPerElt = VT.getScalarSizeInBits();
assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
"Unexpected value type");
- assert(N1.getValueType() == MVT::i8 && "Unexpected shift amount type");
+ assert(N->getOperand(1).getValueType() == MVT::i8 &&
+ "Unexpected shift amount type");
// Out of range logical bit shifts are guaranteed to be zero.
// Out of range arithmetic bit shifts splat the sign bit.
- unsigned ShiftVal = cast<ConstantSDNode>(N1)->getZExtValue();
+ unsigned ShiftVal = N->getConstantOperandVal(1);
if (ShiftVal >= NumBitsPerElt) {
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
@@ -39094,6 +40192,71 @@ static SDValue combineParity(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::ZERO_EXTEND, DL, N->getValueType(0), Setnp);
}
+
+// Look for (and (bitcast (vXi1 (concat_vectors (vYi1 setcc), undef, ...))), C)
+// where C is a mask containing the same number of bits as the setcc and
+// where the setcc will freely zero the upper bits of the k-register. We can
+// replace the undef in the concat with 0s and remove the AND. This mainly
+// helps with v2i1/v4i1 setcc being cast to scalar.
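+// e.g. (and (bitcast (v8i1 (concat (v2i1 setcc), undef, undef, undef))), 3)
+// can become (bitcast (v8i1 (concat (v2i1 setcc), zero, zero, zero))).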
+static SDValue combineScalarAndWithMaskSetcc(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::AND && "Unexpected opcode!");
+
+ EVT VT = N->getValueType(0);
+
+ // Make sure this is an AND with constant. We will check the value of the
+ // constant later.
+ if (!isa<ConstantSDNode>(N->getOperand(1)))
+ return SDValue();
+
+ // This is implied by the ConstantSDNode.
+ assert(!VT.isVector() && "Expected scalar VT!");
+
+ if (N->getOperand(0).getOpcode() != ISD::BITCAST ||
+ !N->getOperand(0).hasOneUse() ||
+ !N->getOperand(0).getOperand(0).hasOneUse())
+ return SDValue();
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ SDValue Src = N->getOperand(0).getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::i1 ||
+ !TLI.isTypeLegal(SrcVT))
+ return SDValue();
+
+ if (Src.getOpcode() != ISD::CONCAT_VECTORS)
+ return SDValue();
+
+  // We only care about the first subvector of the concat; we expect the
+  // other subvectors to be ignored due to the AND if we make the change.
+ SDValue SubVec = Src.getOperand(0);
+ EVT SubVecVT = SubVec.getValueType();
+
+ // First subvector should be a setcc with a legal result type. The RHS of the
+ // AND should be a mask with this many bits.
+ if (SubVec.getOpcode() != ISD::SETCC || !TLI.isTypeLegal(SubVecVT) ||
+ !N->getConstantOperandAPInt(1).isMask(SubVecVT.getVectorNumElements()))
+ return SDValue();
+
+ EVT SetccVT = SubVec.getOperand(0).getValueType();
+ if (!TLI.isTypeLegal(SetccVT) ||
+ !(Subtarget.hasVLX() || SetccVT.is512BitVector()))
+ return SDValue();
+
+ if (!(Subtarget.hasBWI() || SetccVT.getScalarSizeInBits() >= 32))
+ return SDValue();
+
+ // We passed all the checks. Rebuild the concat_vectors with zeroes
+ // and cast it back to VT.
+ SDLoc dl(N);
+ SmallVector<SDValue, 4> Ops(Src.getNumOperands(),
+ DAG.getConstant(0, dl, SubVecVT));
+ Ops[0] = SubVec;
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, dl, SrcVT,
+ Ops);
+ return DAG.getBitcast(VT, Concat);
+}
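A scalar sketch of why the AND becomes redundant once the undef upper lanes are zeroed (undefUppersDontMatter is an illustrative helper):

    #include <cstdint>

    // Whatever the undef upper lanes hold, the AND with the setcc-width mask
    // only keeps the setcc lanes, so filling the upper lanes with zeros and
    // dropping the AND yields the same value.
    constexpr bool undefUppersDontMatter(uint16_t setccBits, unsigned n,
                                         uint16_t garbage) {
      uint16_t laneMask  = static_cast<uint16_t>((1u << n) - 1);
      uint16_t withUndef = static_cast<uint16_t>((setccBits & laneMask) |
                                                 static_cast<uint16_t>(garbage << n));
      uint16_t withZeros = static_cast<uint16_t>(setccBits & laneMask);
      return static_cast<uint16_t>(withUndef & laneMask) == withZeros;
    }
    static_assert(undefUppersDontMatter(0b1011, 4, 0xABC), "");
    static_assert(undefUppersDontMatter(0b0101, 4, 0xFFF), "");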
+
static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -39132,9 +40295,12 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
SrcOps.size() == 1) {
SDLoc dl(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
+ Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
APInt AllBits = APInt::getAllOnesValue(NumElts);
return DAG.getSetCC(dl, MVT::i1, Mask,
@@ -39143,6 +40309,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
}
}
+ if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
+ return V;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -39290,68 +40459,6 @@ static bool matchLogicBlend(SDNode *N, SDValue &X, SDValue &Y, SDValue &Mask) {
return true;
}
-// Try to match:
-// (or (and (M, (sub 0, X)), (pandn M, X)))
-// which is a special case of vselect:
-// (vselect M, (sub 0, X), X)
-// Per:
-// http://graphics.stanford.edu/~seander/bithacks.html#ConditionalNegate
-// We know that, if fNegate is 0 or 1:
-// (fNegate ? -v : v) == ((v ^ -fNegate) + fNegate)
-//
-// Here, we have a mask, M (all 1s or 0), and, similarly, we know that:
-// ((M & 1) ? -X : X) == ((X ^ -(M & 1)) + (M & 1))
-// ( M ? -X : X) == ((X ^ M ) + (M & 1))
-// This lets us transform our vselect to:
-// (add (xor X, M), (and M, 1))
-// And further to:
-// (sub (xor X, M), M)
-static SDValue combineLogicBlendIntoConditionalNegate(
- EVT VT, SDValue Mask, SDValue X, SDValue Y, const SDLoc &DL,
- SelectionDAG &DAG, const X86Subtarget &Subtarget) {
- EVT MaskVT = Mask.getValueType();
- assert(MaskVT.isInteger() &&
- DAG.ComputeNumSignBits(Mask) == MaskVT.getScalarSizeInBits() &&
- "Mask must be zero/all-bits");
-
- if (X.getValueType() != MaskVT || Y.getValueType() != MaskVT)
- return SDValue();
- if (!DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT))
- return SDValue();
-
- auto IsNegV = [](SDNode *N, SDValue V) {
- return N->getOpcode() == ISD::SUB && N->getOperand(1) == V &&
- ISD::isBuildVectorAllZeros(N->getOperand(0).getNode());
- };
-
- SDValue V;
- if (IsNegV(Y.getNode(), X))
- V = X;
- else if (IsNegV(X.getNode(), Y))
- V = Y;
- else
- return SDValue();
-
- SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask);
- SDValue SubOp2 = Mask;
-
- // If the negate was on the false side of the select, then
- // the operands of the SUB need to be swapped. PR 27251.
- // This is because the pattern being matched above is
- // (vselect M, (sub (0, X), X) -> (sub (xor X, M), M)
- // but if the pattern matched was
- // (vselect M, X, (sub (0, X))), that is really negation of the pattern
- // above, -(vselect M, (sub 0, X), X), and therefore the replacement
- // pattern also needs to be a negation of the replacement pattern above.
- // And -(sub X, Y) is just sub (Y, X), so swapping the operands of the
- // sub accomplishes the negation of the replacement pattern.
- if (V == Y)
- std::swap(SubOp1, SubOp2);
-
- SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2);
- return DAG.getBitcast(VT, Res);
-}
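A one-lane check of the conditional-negate identity quoted in the comment above (conditionalNegate is illustrative only):

    #include <cstdint>

    // With the mask M either all zeros or all ones,
    // (M ? -X : X) == sub(xor(X, M), M).
    constexpr bool conditionalNegate(int32_t x, bool negate) {
      uint32_t m        = negate ? 0xFFFFFFFFu : 0u;                     // lane mask
      uint32_t expected = negate ? uint32_t(0) - uint32_t(x) : uint32_t(x);
      return ((static_cast<uint32_t>(x) ^ m) - m) == expected;
    }
    static_assert(conditionalNegate(12345, true) && conditionalNegate(12345, false), "");
    static_assert(conditionalNegate(-7, true) && conditionalNegate(-7, false), "");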
-
// Try to fold:
// (or (and (m, y), (pandn m, x)))
// into:
@@ -39512,66 +40619,20 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return Ret;
}
-static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI,
- const X86Subtarget &Subtarget) {
+static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- // If this is SSE1 only convert to FOR to avoid scalarization.
- if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
- return DAG.getBitcast(MVT::v4i32,
- DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
- DAG.getBitcast(MVT::v4f32, N0),
- DAG.getBitcast(MVT::v4f32, N1)));
- }
-
- // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
- // TODO: Support multiple SrcOps.
- if (VT == MVT::i1) {
- SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
- SrcOps.size() == 1) {
- SDLoc dl(N);
- unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
- EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
- SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
- if (Mask) {
- APInt AllBits = APInt::getNullValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
- }
- }
- }
-
- if (DCI.isBeforeLegalizeOps())
- return SDValue();
-
- if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
- return R;
-
- if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
- return FPLogic;
-
- if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
- return R;
-
- if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
- return R;
-
- // Attempt to recursively combine an OR of shuffles.
- if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
- SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
- return Res;
- }
-
- if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+ if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
+ !TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
return SDValue();
// fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool OptForSize = DAG.shouldOptForSize();
unsigned Bits = VT.getScalarSizeInBits();
// SHLD/SHRD instructions have lower register pressure, but on some
@@ -39589,11 +40650,13 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (!N0.hasOneUse() || !N1.hasOneUse())
return SDValue();
+ EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
+
SDValue ShAmt0 = N0.getOperand(1);
- if (ShAmt0.getValueType() != MVT::i8)
+ if (ShAmt0.getValueType() != ShiftVT)
return SDValue();
SDValue ShAmt1 = N1.getOperand(1);
- if (ShAmt1.getValueType() != MVT::i8)
+ if (ShAmt1.getValueType() != ShiftVT)
return SDValue();
// Peek through any modulo shift masks.
@@ -39628,12 +40691,12 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
std::swap(ShMsk0, ShMsk1);
}
- auto GetFunnelShift = [&DAG, &DL, VT, Opc](SDValue Op0, SDValue Op1,
- SDValue Amt) {
+ auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
+ SDValue Amt) {
if (Opc == ISD::FSHR)
std::swap(Op0, Op1);
return DAG.getNode(Opc, DL, VT, Op0, Op1,
- DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Amt));
+ DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
};
// OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
@@ -39674,7 +40737,7 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
(ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
if (Op1.getOpcode() == InnerShift &&
isa<ConstantSDNode>(Op1.getOperand(1)) &&
- Op1.getConstantOperandAPInt(1) == 1) {
+ Op1.getConstantOperandAPInt(1).isOneValue()) {
return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
}
// Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
@@ -39689,6 +40752,70 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
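A standalone sketch of the funnel-shift identity this fold relies on (fshl64 is illustrative; it uses the __int128 extension available in Clang and GCC):

    #include <cstdint>

    // fshl concatenates x:y and takes the high 64 bits after shifting left by
    // c, which for 0 < c < 64 is exactly (x << c) | (y >> (64 - c)), i.e. what
    // SHLD computes.
    constexpr uint64_t fshl64(uint64_t x, uint64_t y, unsigned c) {
      unsigned __int128 concat = (static_cast<unsigned __int128>(x) << 64) | y;
      return static_cast<uint64_t>((concat << c) >> 64);
    }
    static_assert(fshl64(0x0123456789ABCDEFull, 0xFEDCBA9876543210ull, 8) ==
                      ((0x0123456789ABCDEFull << 8) | (0xFEDCBA9876543210ull >> 56)),
                  "");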
+static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ EVT VT = N->getValueType(0);
+
+ // If this is SSE1 only convert to FOR to avoid scalarization.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && VT == MVT::v4i32) {
+ return DAG.getBitcast(MVT::v4i32,
+ DAG.getNode(X86ISD::FOR, SDLoc(N), MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, N0),
+ DAG.getBitcast(MVT::v4f32, N1)));
+ }
+
+ // Match any-of bool scalar reductions into a bitcast/movmsk + cmp.
+ // TODO: Support multiple SrcOps.
+ if (VT == MVT::i1) {
+ SmallVector<SDValue, 2> SrcOps;
+ if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
+ SrcOps.size() == 1) {
+ SDLoc dl(N);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned NumElts = SrcOps[0].getValueType().getVectorNumElements();
+ EVT MaskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
+ SDValue Mask = combineBitcastvxi1(DAG, MaskVT, SrcOps[0], dl, Subtarget);
+ if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
+ Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
+ if (Mask) {
+ APInt AllBits = APInt::getNullValue(NumElts);
+ return DAG.getSetCC(dl, MVT::i1, Mask,
+ DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
+ }
+ }
+ }
+
+ if (DCI.isBeforeLegalizeOps())
+ return SDValue();
+
+ if (SDValue R = combineCompareEqual(N, DAG, DCI, Subtarget))
+ return R;
+
+ if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
+ return FPLogic;
+
+ if (SDValue R = canonicalizeBitSelect(N, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
+ return R;
+
+ // Attempt to recursively combine an OR of shuffles.
+ if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
+
+ return SDValue();
+}
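A sketch of what the packed-mask reductions boil down to once the lanes are in a bitmask: the any-of form above compares against zero, and the matching all-of form in combineAnd compares against the all-ones mask (allOf and anyOf are illustrative helpers):

    #include <cstdint>

    // The bitmask is what the bitcast/MOVMSK of the bool vector produces.
    constexpr bool allOf(uint32_t mask, unsigned lanes) {
      uint32_t allOnes = lanes == 32 ? 0xFFFFFFFFu : (1u << lanes) - 1;
      return mask == allOnes;     // AND-reduction: every lane set
    }
    constexpr bool anyOf(uint32_t mask) { return mask != 0; } // OR-reduction
    static_assert(allOf(0b1111, 4) && !allOf(0b1011, 4), "");
    static_assert(anyOf(0b0100) && !anyOf(0), "");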
+
/// Try to turn tests against the signbit in the form of:
/// XOR(TRUNCATE(SRL(X, size(X)-1)), 1)
/// into:
@@ -39758,8 +40885,8 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
default: return SDValue();
case MVT::v16i8:
case MVT::v8i16:
- case MVT::v4i32: if (!Subtarget.hasSSE2()) return SDValue(); break;
- case MVT::v2i64: if (!Subtarget.hasSSE42()) return SDValue(); break;
+ case MVT::v4i32:
+ case MVT::v2i64: if (!Subtarget.hasSSE2()) return SDValue(); break;
case MVT::v32i8:
case MVT::v16i16:
case MVT::v8i32:
@@ -39783,7 +40910,7 @@ static SDValue foldVectorXorShiftIntoCmp(SDNode *N, SelectionDAG &DAG,
// Create a greater-than comparison against -1. We don't use the more obvious
// greater-than-or-equal-to-zero because SSE/AVX don't have that instruction.
- return DAG.getNode(X86ISD::PCMPGT, SDLoc(N), VT, Shift.getOperand(0), Ones);
+ return DAG.getSetCC(SDLoc(N), VT, Shift.getOperand(0), Ones, ISD::SETGT);
}
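A one-lane check of the equivalence used above (notSignSplatIsGtMinusOne is an illustrative helper):

    #include <cstdint>

    // NOT of the sign-splatting arithmetic shift is the same lane value as the
    // signed compare "x > -1" that PCMPGT against an all-ones vector produces.
    // Assumes arithmetic >> on signed values.
    constexpr bool notSignSplatIsGtMinusOne(int32_t x) {
      uint32_t signSplat  = static_cast<uint32_t>(x >> 31); // splat of the sign bit
      uint32_t gtMinusOne = (x > -1) ? 0xFFFFFFFFu : 0u;    // PCMPGTD lane result
      return static_cast<uint32_t>(~signSplat) == gtMinusOne;
    }
    static_assert(notSignSplatIsGtMinusOne(42) && notSignSplatIsGtMinusOne(0), "");
    static_assert(notSignSplatIsGtMinusOne(-1) && notSignSplatIsGtMinusOne(INT32_MIN), "");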
/// Detect patterns of truncation with unsigned saturation:
@@ -39950,7 +41077,7 @@ static SDValue combineTruncateWithSat(SDValue In, EVT VT, const SDLoc &DL,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.isTypeLegal(InVT) && InVT.isVector() && SVT != MVT::i1 &&
Subtarget.hasAVX512() && (InSVT != MVT::i16 || Subtarget.hasBWI())) {
- unsigned TruncOpc;
+ unsigned TruncOpc = 0;
SDValue SatVal;
if (auto SSatVal = detectSSatPattern(In, VT)) {
SatVal = SSatVal;
@@ -40252,6 +41379,7 @@ static bool getParamsForOneTrueMaskedElt(MaskedLoadStoreSDNode *MaskedOp,
static SDValue
reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ assert(ML->isUnindexed() && "Unexpected indexed masked load!");
// TODO: This is not x86-specific, so it could be lifted to DAGCombiner.
// However, some target hooks may need to be added to know when the transform
// is profitable. Endianness would also have to be considered.
@@ -40279,6 +41407,7 @@ reduceMaskedLoadToScalarLoad(MaskedLoadSDNode *ML, SelectionDAG &DAG,
static SDValue
combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ assert(ML->isUnindexed() && "Unexpected indexed masked load!");
if (!ISD::isBuildVectorOfConstantSDNodes(ML->getMask().getNode()))
return SDValue();
@@ -40314,10 +41443,10 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
// The new masked load has an undef pass-through operand. The select uses the
// original pass-through operand.
- SDValue NewML = DAG.getMaskedLoad(VT, DL, ML->getChain(), ML->getBasePtr(),
- ML->getMask(), DAG.getUNDEF(VT),
- ML->getMemoryVT(), ML->getMemOperand(),
- ML->getExtensionType());
+ SDValue NewML = DAG.getMaskedLoad(
+ VT, DL, ML->getChain(), ML->getBasePtr(), ML->getOffset(), ML->getMask(),
+ DAG.getUNDEF(VT), ML->getMemoryVT(), ML->getMemOperand(),
+ ML->getAddressingMode(), ML->getExtensionType());
SDValue Blend = DAG.getSelect(DL, VT, ML->getMask(), NewML,
ML->getPassThru());
@@ -40403,8 +41532,9 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
TLI.isTruncStoreLegal(Value.getOperand(0).getValueType(),
Mst->getMemoryVT())) {
return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Value.getOperand(0),
- Mst->getBasePtr(), Mask,
- Mst->getMemoryVT(), Mst->getMemOperand(), true);
+ Mst->getBasePtr(), Mst->getOffset(), Mask,
+ Mst->getMemoryVT(), Mst->getMemOperand(),
+ Mst->getAddressingMode(), true);
}
return SDValue();
@@ -40593,59 +41723,24 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
cast<LoadSDNode>(St->getValue())->isSimple() &&
St->getChain().hasOneUse() && St->isSimple()) {
LoadSDNode *Ld = cast<LoadSDNode>(St->getValue().getNode());
- SmallVector<SDValue, 8> Ops;
if (!ISD::isNormalLoad(Ld))
return SDValue();
- // If this is not the MMX case, i.e. we are just turning i64 load/store
- // into f64 load/store, avoid the transformation if there are multiple
- // uses of the loaded value.
- if (!VT.isVector() && !Ld->hasNUsesOfValue(1, 0))
+ // Avoid the transformation if there are multiple uses of the loaded value.
+ if (!Ld->hasNUsesOfValue(1, 0))
return SDValue();
SDLoc LdDL(Ld);
SDLoc StDL(N);
- // If we are a 64-bit capable x86, lower to a single movq load/store pair.
- // Otherwise, if it's legal to use f64 SSE instructions, use f64 load/store
- // pair instead.
- if (Subtarget.is64Bit() || F64IsLegal) {
- MVT LdVT = Subtarget.is64Bit() ? MVT::i64 : MVT::f64;
- SDValue NewLd = DAG.getLoad(LdVT, LdDL, Ld->getChain(), Ld->getBasePtr(),
- Ld->getMemOperand());
-
- // Make sure new load is placed in same chain order.
- DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
- return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
- St->getMemOperand());
- }
-
- // Otherwise, lower to two pairs of 32-bit loads / stores.
- SDValue LoAddr = Ld->getBasePtr();
- SDValue HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, LdDL);
-
- SDValue LoLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), LoAddr,
- Ld->getPointerInfo(), Ld->getAlignment(),
- Ld->getMemOperand()->getFlags());
- SDValue HiLd = DAG.getLoad(MVT::i32, LdDL, Ld->getChain(), HiAddr,
- Ld->getPointerInfo().getWithOffset(4),
- MinAlign(Ld->getAlignment(), 4),
- Ld->getMemOperand()->getFlags());
- // Make sure new loads are placed in same chain order.
- DAG.makeEquivalentMemoryOrdering(Ld, LoLd);
- DAG.makeEquivalentMemoryOrdering(Ld, HiLd);
-
- LoAddr = St->getBasePtr();
- HiAddr = DAG.getMemBasePlusOffset(LoAddr, 4, StDL);
-
- SDValue LoSt =
- DAG.getStore(St->getChain(), StDL, LoLd, LoAddr, St->getPointerInfo(),
- St->getAlignment(), St->getMemOperand()->getFlags());
- SDValue HiSt = DAG.getStore(St->getChain(), StDL, HiLd, HiAddr,
- St->getPointerInfo().getWithOffset(4),
- MinAlign(St->getAlignment(), 4),
- St->getMemOperand()->getFlags());
- return DAG.getNode(ISD::TokenFactor, StDL, MVT::Other, LoSt, HiSt);
+ // Lower to a single movq load/store pair.
+ SDValue NewLd = DAG.getLoad(MVT::f64, LdDL, Ld->getChain(),
+ Ld->getBasePtr(), Ld->getMemOperand());
+
+ // Make sure new load is placed in same chain order.
+ DAG.makeEquivalentMemoryOrdering(Ld, NewLd);
+ return DAG.getStore(St->getChain(), StDL, NewLd, St->getBasePtr(),
+ St->getMemOperand());
}
// This is similar to the above case, but here we handle a scalar 64-bit
@@ -41351,23 +42446,25 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
SDValue Op = peekThroughBitcasts(SDValue(N, 0));
EVT VT = Op->getValueType(0);
- // Make sure the element size does't change.
+
+ // Make sure the element size doesn't change.
if (VT.getScalarSizeInBits() != ScalarSize)
return SDValue();
- if (auto SVOp = dyn_cast<ShuffleVectorSDNode>(Op.getNode())) {
+ unsigned Opc = Op.getOpcode();
+ switch (Opc) {
+ case ISD::VECTOR_SHUFFLE: {
// For a VECTOR_SHUFFLE(VEC1, VEC2), if the VEC2 is undef, then the negate
// of this is VECTOR_SHUFFLE(-VEC1, UNDEF). The mask can be anything here.
- if (!SVOp->getOperand(1).isUndef())
+ if (!Op.getOperand(1).isUndef())
return SDValue();
- if (SDValue NegOp0 = isFNEG(DAG, SVOp->getOperand(0).getNode(), Depth + 1))
+ if (SDValue NegOp0 = isFNEG(DAG, Op.getOperand(0).getNode(), Depth + 1))
if (NegOp0.getValueType() == VT) // FIXME: Can we do better?
- return DAG.getVectorShuffle(VT, SDLoc(SVOp), NegOp0, DAG.getUNDEF(VT),
- SVOp->getMask());
- return SDValue();
+ return DAG.getVectorShuffle(VT, SDLoc(Op), NegOp0, DAG.getUNDEF(VT),
+ cast<ShuffleVectorSDNode>(Op)->getMask());
+ break;
}
- unsigned Opc = Op.getOpcode();
- if (Opc == ISD::INSERT_VECTOR_ELT) {
+ case ISD::INSERT_VECTOR_ELT: {
// Negate of INSERT_VECTOR_ELT(UNDEF, V, INDEX) is INSERT_VECTOR_ELT(UNDEF,
// -V, INDEX).
SDValue InsVector = Op.getOperand(0);
@@ -41378,34 +42475,35 @@ static SDValue isFNEG(SelectionDAG &DAG, SDNode *N, unsigned Depth = 0) {
if (NegInsVal.getValueType() == VT.getVectorElementType()) // FIXME
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Op), VT, InsVector,
NegInsVal, Op.getOperand(2));
- return SDValue();
+ break;
}
+ case ISD::FSUB:
+ case ISD::XOR:
+ case X86ISD::FXOR: {
+ SDValue Op1 = Op.getOperand(1);
+ SDValue Op0 = Op.getOperand(0);
- if (Opc != X86ISD::FXOR && Opc != ISD::XOR && Opc != ISD::FSUB)
- return SDValue();
-
- SDValue Op1 = Op.getOperand(1);
- SDValue Op0 = Op.getOperand(0);
-
- // For XOR and FXOR, we want to check if constant bits of Op1 are sign bit
- // masks. For FSUB, we have to check if constant bits of Op0 are sign bit
- // masks and hence we swap the operands.
- if (Opc == ISD::FSUB)
- std::swap(Op0, Op1);
+ // For XOR and FXOR, we want to check if constant
+ // bits of Op1 are sign bit masks. For FSUB, we
+ // have to check if constant bits of Op0 are sign
+ // bit masks and hence we swap the operands.
+ if (Opc == ISD::FSUB)
+ std::swap(Op0, Op1);
- APInt UndefElts;
- SmallVector<APInt, 16> EltBits;
- // Extract constant bits and see if they are all sign bit masks. Ignore the
- // undef elements.
- if (getTargetConstantBitsFromNode(Op1, ScalarSize,
- UndefElts, EltBits,
- /* AllowWholeUndefs */ true,
- /* AllowPartialUndefs */ false)) {
- for (unsigned I = 0, E = EltBits.size(); I < E; I++)
- if (!UndefElts[I] && !EltBits[I].isSignMask())
- return SDValue();
+ APInt UndefElts;
+ SmallVector<APInt, 16> EltBits;
+ // Extract constant bits and see if they are all
+ // sign bit masks. Ignore the undef elements.
+ if (getTargetConstantBitsFromNode(Op1, ScalarSize, UndefElts, EltBits,
+ /* AllowWholeUndefs */ true,
+ /* AllowPartialUndefs */ false)) {
+ for (unsigned I = 0, E = EltBits.size(); I < E; I++)
+ if (!UndefElts[I] && !EltBits[I].isSignMask())
+ return SDValue();
- return peekThroughBitcasts(Op0);
+ return peekThroughBitcasts(Op0);
+ }
+ }
}
return SDValue();
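A minimal sketch of why an XOR with a sign-bit-mask constant is recognized as FNEG (xorSignBit is an illustrative helper, not an LLVM API):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // Flipping only the sign bit of the IEEE-754 encoding negates the value.
    inline float xorSignBit(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof(bits));
      bits ^= 0x80000000u;                 // the sign-bit mask the code looks for
      std::memcpy(&f, &bits, sizeof(bits));
      return f;
    }

    int main() {
      assert(xorSignBit(1.5f) == -1.5f);
      assert(xorSignBit(-2.25f) == 2.25f);
      return 0;
    }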
@@ -41642,8 +42740,7 @@ static SDValue foldXor1SetCC(SDNode *N, SelectionDAG &DAG) {
return SDValue();
SDValue LHS = N->getOperand(0);
- auto *RHSC = dyn_cast<ConstantSDNode>(N->getOperand(1));
- if (!RHSC || RHSC->getZExtValue() != 1 || LHS->getOpcode() != X86ISD::SETCC)
+ if (!isOneConstant(N->getOperand(1)) || LHS->getOpcode() != X86ISD::SETCC)
return SDValue();
X86::CondCode NewCC = X86::GetOppositeBranchCondition(
@@ -41817,8 +42914,9 @@ static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
assert(N->getOpcode() == X86ISD::FMIN || N->getOpcode() == X86ISD::FMAX);
- // Only perform optimizations if UnsafeMath is used.
- if (!DAG.getTarget().Options.UnsafeFPMath)
+ // FMIN/FMAX are commutative if no NaNs and no negative zeros are allowed.
+ if (!DAG.getTarget().Options.NoNaNsFPMath ||
+ !DAG.getTarget().Options.NoSignedZerosFPMath)
return SDValue();
// If we run in unsafe-math mode, then convert the FMAX and FMIN nodes
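A scalar model of the MINSS selection rule that makes the operation non-commutative for NaNs and signed zeros (x86MinSS is an illustrative helper; the asserts assume IEEE-754 floats):

    #include <cassert>
    #include <cmath>

    // MINSS returns the second operand whenever the compare is false,
    // including the unordered NaN case and the +0.0/-0.0 case, so the
    // operation only commutes once NaNs and signed zeros are ruled out.
    inline float x86MinSS(float a, float b) { return a < b ? a : b; }

    int main() {
      assert(x86MinSS(NAN, 1.0f) == 1.0f && std::isnan(x86MinSS(1.0f, NAN)));
      assert(std::signbit(x86MinSS(0.0f, -0.0f)) && !std::signbit(x86MinSS(-0.0f, 0.0f)));
      return 0;
    }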
@@ -41943,6 +43041,7 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
+ // FIXME: Handle strict fp nodes.
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
@@ -41951,7 +43050,7 @@ static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
- LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
+ LoadSDNode *LN = cast<LoadSDNode>(In);
// Unless the load is volatile or atomic.
if (LN->isSimple()) {
SDLoc dl(N);
@@ -42569,6 +43668,44 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
+/// recognizable memcmp expansion.
+static bool isOrXorXorTree(SDValue X, bool Root = true) {
+ if (X.getOpcode() == ISD::OR)
+ return isOrXorXorTree(X.getOperand(0), false) &&
+ isOrXorXorTree(X.getOperand(1), false);
+ if (Root)
+ return false;
+ return X.getOpcode() == ISD::XOR;
+}
+
+/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
+/// expansion.
+template<typename F>
+static SDValue emitOrXorXorTree(SDValue X, SDLoc &DL, SelectionDAG &DAG,
+ EVT VecVT, EVT CmpVT, bool HasPT, F SToV) {
+ SDValue Op0 = X.getOperand(0);
+ SDValue Op1 = X.getOperand(1);
+ if (X.getOpcode() == ISD::OR) {
+ SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT, CmpVT, HasPT, SToV);
+ SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT, CmpVT, HasPT, SToV);
+ if (VecVT != CmpVT)
+ return DAG.getNode(ISD::OR, DL, CmpVT, A, B);
+ if (HasPT)
+ return DAG.getNode(ISD::OR, DL, VecVT, A, B);
+ return DAG.getNode(ISD::AND, DL, CmpVT, A, B);
+ } else if (X.getOpcode() == ISD::XOR) {
+ SDValue A = SToV(Op0);
+ SDValue B = SToV(Op1);
+ if (VecVT != CmpVT)
+ return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETNE);
+ if (HasPT)
+ return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
+ return DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
+ }
+ llvm_unreachable("Impossible");
+}
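A scalar sketch of the equality the OR/XOR tree encodes (chunksEqualViaOrXor is an illustrative helper):

    #include <cstdint>

    // OR-ing the XORs of the chunk pairs is zero exactly when every pair of
    // chunks is equal, so one compare of the whole tree against zero answers
    // the memcmp-style equality.
    constexpr bool chunksEqualViaOrXor(uint64_t a0, uint64_t b0,
                                       uint64_t a1, uint64_t b1) {
      return ((a0 ^ b0) | (a1 ^ b1)) == 0;
    }
    static_assert(chunksEqualViaOrXor(1, 1, 2, 2), "");
    static_assert(!chunksEqualViaOrXor(1, 1, 2, 3), "");
    static_assert(!chunksEqualViaOrXor(0, 4, 5, 5), "");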
+
/// Try to map a 128-bit or larger integer comparison to vector instructions
/// before type legalization splits it up into chunks.
static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
@@ -42589,10 +43726,8 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
// logically-combined vector-sized operands compared to zero. This pattern may
// be generated by the memcmp expansion pass with oversized integer compares
// (see PR33325).
- bool IsOrXorXorCCZero = isNullConstant(Y) && X.getOpcode() == ISD::OR &&
- X.getOperand(0).getOpcode() == ISD::XOR &&
- X.getOperand(1).getOpcode() == ISD::XOR;
- if (isNullConstant(Y) && !IsOrXorXorCCZero)
+ bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
+ if (isNullConstant(Y) && !IsOrXorXorTreeCCZero)
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
@@ -42602,66 +43737,102 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
X.getOpcode() == ISD::LOAD;
};
if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
- !IsOrXorXorCCZero)
+ !IsOrXorXorTreeCCZero)
return SDValue();
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
bool HasAVX = Subtarget.hasAVX();
- // Use XOR (plus OR) and PTEST after SSE4.1 and before AVX512.
+ // Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
+ // Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
(OpSize == 256 && HasAVX) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
+
+    // PTEST and MOVMSK are slow on Knights Landing and Knights Mill, and
+    // widened vector registers are essentially free. (Technically, widening
+    // registers prevents load folding, but the tradeoff is worth it.)
+ bool PreferKOT = Subtarget.preferMaskRegisters();
+ bool NeedZExt = PreferKOT && !Subtarget.hasVLX() && OpSize != 512;
+
EVT VecVT = MVT::v16i8;
- EVT CmpVT = MVT::v16i8;
- if (OpSize == 256)
- VecVT = CmpVT = MVT::v32i8;
- if (OpSize == 512) {
+ EVT CmpVT = PreferKOT ? MVT::v16i1 : VecVT;
+ if (OpSize == 256) {
+ VecVT = MVT::v32i8;
+ CmpVT = PreferKOT ? MVT::v32i1 : VecVT;
+ }
+ EVT CastVT = VecVT;
+ bool NeedsAVX512FCast = false;
+ if (OpSize == 512 || NeedZExt) {
if (Subtarget.hasBWI()) {
VecVT = MVT::v64i8;
CmpVT = MVT::v64i1;
+ if (OpSize == 512)
+ CastVT = VecVT;
} else {
VecVT = MVT::v16i32;
CmpVT = MVT::v16i1;
+ CastVT = OpSize == 512 ? VecVT :
+ OpSize == 256 ? MVT::v8i32 : MVT::v4i32;
+ NeedsAVX512FCast = true;
+ }
+ }
+
+ auto ScalarToVector = [&](SDValue X) -> SDValue {
+ bool TmpZext = false;
+ EVT TmpCastVT = CastVT;
+ if (X.getOpcode() == ISD::ZERO_EXTEND) {
+ SDValue OrigX = X.getOperand(0);
+ unsigned OrigSize = OrigX.getScalarValueSizeInBits();
+ if (OrigSize < OpSize) {
+ if (OrigSize == 128) {
+ TmpCastVT = NeedsAVX512FCast ? MVT::v4i32 : MVT::v16i8;
+ X = OrigX;
+ TmpZext = true;
+ } else if (OrigSize == 256) {
+ TmpCastVT = NeedsAVX512FCast ? MVT::v8i32 : MVT::v32i8;
+ X = OrigX;
+ TmpZext = true;
+ }
+ }
}
- }
+ X = DAG.getBitcast(TmpCastVT, X);
+ if (!NeedZExt && !TmpZext)
+ return X;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
+ DAG.getConstant(0, DL, VecVT), X,
+ DAG.getConstant(0, DL, VecIdxVT));
+ };
SDValue Cmp;
- if (IsOrXorXorCCZero) {
+ if (IsOrXorXorTreeCCZero) {
// This is a bitwise-combined equality comparison of 2 pairs of vectors:
// setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
// Use 2 vector equality compares and 'and' the results before doing a
// MOVMSK.
- SDValue A = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(0));
- SDValue B = DAG.getBitcast(VecVT, X.getOperand(0).getOperand(1));
- SDValue C = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(0));
- SDValue D = DAG.getBitcast(VecVT, X.getOperand(1).getOperand(1));
- if (VecVT == CmpVT && HasPT) {
- SDValue Cmp1 = DAG.getNode(ISD::XOR, DL, VecVT, A, B);
- SDValue Cmp2 = DAG.getNode(ISD::XOR, DL, VecVT, C, D);
- Cmp = DAG.getNode(ISD::OR, DL, VecVT, Cmp1, Cmp2);
- } else {
- SDValue Cmp1 = DAG.getSetCC(DL, CmpVT, A, B, ISD::SETEQ);
- SDValue Cmp2 = DAG.getSetCC(DL, CmpVT, C, D, ISD::SETEQ);
- Cmp = DAG.getNode(ISD::AND, DL, CmpVT, Cmp1, Cmp2);
- }
+ Cmp = emitOrXorXorTree(X, DL, DAG, VecVT, CmpVT, HasPT, ScalarToVector);
} else {
- SDValue VecX = DAG.getBitcast(VecVT, X);
- SDValue VecY = DAG.getBitcast(VecVT, Y);
- if (VecVT == CmpVT && HasPT) {
+ SDValue VecX = ScalarToVector(X);
+ SDValue VecY = ScalarToVector(Y);
+ if (VecVT != CmpVT) {
+ Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETNE);
+ } else if (HasPT) {
Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
} else {
Cmp = DAG.getSetCC(DL, CmpVT, VecX, VecY, ISD::SETEQ);
}
}
- // For 512-bits we want to emit a setcc that will lower to kortest.
+ // AVX512 should emit a setcc that will lower to kortest.
if (VecVT != CmpVT) {
- EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 : MVT::i16;
- SDValue Mask = DAG.getAllOnesConstant(DL, KRegVT);
- return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp), Mask, CC);
+ EVT KRegVT = CmpVT == MVT::v64i1 ? MVT::i64 :
+ CmpVT == MVT::v32i1 ? MVT::i32 : MVT::i16;
+ return DAG.getSetCC(DL, VT, DAG.getBitcast(KRegVT, Cmp),
+ DAG.getConstant(0, DL, KRegVT), CC);
}
if (HasPT) {
SDValue BCCmp = DAG.getBitcast(OpSize == 256 ? MVT::v4i64 : MVT::v2i64,
@@ -42687,9 +43858,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
- SDValue LHS = N->getOperand(0);
- SDValue RHS = N->getOperand(1);
+ const ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ const SDValue LHS = N->getOperand(0);
+ const SDValue RHS = N->getOperand(1);
EVT VT = N->getValueType(0);
EVT OpVT = LHS.getValueType();
SDLoc DL(N);
@@ -42716,30 +43887,35 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
(CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
- // Put build_vectors on the right.
- if (LHS.getOpcode() == ISD::BUILD_VECTOR) {
- std::swap(LHS, RHS);
- CC = ISD::getSetCCSwappedOperands(CC);
+ // Using temporaries to avoid messing up operand ordering for later
+ // transformations if this doesn't work.
+ SDValue Op0 = LHS;
+ SDValue Op1 = RHS;
+ ISD::CondCode TmpCC = CC;
+ // Put build_vector on the right.
+ if (Op0.getOpcode() == ISD::BUILD_VECTOR) {
+ std::swap(Op0, Op1);
+ TmpCC = ISD::getSetCCSwappedOperands(TmpCC);
}
bool IsSEXT0 =
- (LHS.getOpcode() == ISD::SIGN_EXTEND) &&
- (LHS.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
- bool IsVZero1 = ISD::isBuildVectorAllZeros(RHS.getNode());
+ (Op0.getOpcode() == ISD::SIGN_EXTEND) &&
+ (Op0.getOperand(0).getValueType().getVectorElementType() == MVT::i1);
+ bool IsVZero1 = ISD::isBuildVectorAllZeros(Op1.getNode());
if (IsSEXT0 && IsVZero1) {
- assert(VT == LHS.getOperand(0).getValueType() &&
+ assert(VT == Op0.getOperand(0).getValueType() &&
"Uexpected operand type");
- if (CC == ISD::SETGT)
+ if (TmpCC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
- if (CC == ISD::SETLE)
+ if (TmpCC == ISD::SETLE)
return DAG.getConstant(1, DL, VT);
- if (CC == ISD::SETEQ || CC == ISD::SETGE)
- return DAG.getNOT(DL, LHS.getOperand(0), VT);
+ if (TmpCC == ISD::SETEQ || TmpCC == ISD::SETGE)
+ return DAG.getNOT(DL, Op0.getOperand(0), VT);
- assert((CC == ISD::SETNE || CC == ISD::SETLT) &&
+ assert((TmpCC == ISD::SETNE || TmpCC == ISD::SETLT) &&
"Unexpected condition code!");
- return LHS.getOperand(0);
+ return Op0.getOperand(0);
}
}
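A one-lane check of the vXi1 folds performed above (sextSetccFolds is an illustrative helper):

    #include <cstdint>

    // With b in {0, 1}, sign_extend(b) is 0 or -1, so comparing it against a
    // zero vector collapses to a constant or to NOT of the original lane.
    constexpr bool sextSetccFolds(bool b) {
      int32_t s = b ? -1 : 0;     // sign_extend of the i1 lane
      bool gt = s > 0;            // SETGT -> always false
      bool le = s <= 0;           // SETLE -> always true
      bool eq = s == 0;           // SETEQ -> !b
      bool ge = s >= 0;           // SETGE -> !b
      bool ne = s != 0;           // SETNE -> b
      bool lt = s < 0;            // SETLT -> b
      return !gt && le && eq == !b && ge == !b && ne == b && lt == b;
    }
    static_assert(sextSetccFolds(false) && sextSetccFolds(true), "");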
@@ -42752,8 +43928,7 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
VT.getVectorElementType() == MVT::i1 &&
(OpVT.getVectorElementType() == MVT::i8 ||
OpVT.getVectorElementType() == MVT::i16)) {
- SDValue Setcc = DAG.getNode(ISD::SETCC, DL, OpVT, LHS, RHS,
- N->getOperand(2));
+ SDValue Setcc = DAG.getSetCC(DL, OpVT, LHS, RHS, CC);
return DAG.getNode(ISD::TRUNCATE, DL, VT, Setcc);
}
@@ -42985,16 +44160,18 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
// unary operation isn't a bitwise AND, or if the sizes of the operations
// aren't the same.
EVT VT = N->getValueType(0);
- if (!VT.isVector() || N->getOperand(0)->getOpcode() != ISD::AND ||
- N->getOperand(0)->getOperand(0)->getOpcode() != ISD::SETCC ||
- VT.getSizeInBits() != N->getOperand(0).getValueSizeInBits())
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
+ if (!VT.isVector() || Op0->getOpcode() != ISD::AND ||
+ Op0->getOperand(0)->getOpcode() != ISD::SETCC ||
+ VT.getSizeInBits() != Op0.getValueSizeInBits())
return SDValue();
// Now check that the other operand of the AND is a constant. We could
// make the transformation for non-constant splats as well, but it's unclear
// that would be a benefit as it would not eliminate any operations, just
// perform one more step in scalar code before moving to the vector unit.
- if (auto *BV = dyn_cast<BuildVectorSDNode>(N->getOperand(0).getOperand(1))) {
+ if (auto *BV = dyn_cast<BuildVectorSDNode>(Op0.getOperand(1))) {
// Bail out if the vector isn't a constant.
if (!BV->isConstant())
return SDValue();
@@ -43004,12 +44181,19 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
EVT IntVT = BV->getValueType(0);
// Create a new constant of the appropriate type for the transformed
// DAG.
- SDValue SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
+ SDValue SourceConst;
+ if (IsStrict)
+ SourceConst = DAG.getNode(N->getOpcode(), DL, {VT, MVT::Other},
+ {N->getOperand(0), SDValue(BV, 0)});
+ else
+ SourceConst = DAG.getNode(N->getOpcode(), DL, VT, SDValue(BV, 0));
// The AND node needs bitcasts to/from an integer vector type around it.
SDValue MaskConst = DAG.getBitcast(IntVT, SourceConst);
- SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT,
- N->getOperand(0)->getOperand(0), MaskConst);
+ SDValue NewAnd = DAG.getNode(ISD::AND, DL, IntVT, Op0->getOperand(0),
+ MaskConst);
SDValue Res = DAG.getBitcast(VT, NewAnd);
+ if (IsStrict)
+ return DAG.getMergeValues({Res, SourceConst.getValue(1)}, DL);
return Res;
}
@@ -43053,7 +44237,8 @@ static SDValue combineToFPTruncExtElt(SDNode *N, SelectionDAG &DAG) {
static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
- SDValue Op0 = N->getOperand(0);
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
@@ -43067,14 +44252,21 @@ static SDValue combineUIntToFP(SDNode *N, SelectionDAG &DAG,
SDValue P = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Op0);
// UINT_TO_FP isn't legal without AVX512 so use SINT_TO_FP.
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
// Since UINT_TO_FP is legal (it's marked custom), dag combiner won't
// optimize it to a SINT_TO_FP when the sign bit is known zero. Perform
// the optimization here.
- if (DAG.SignBitIsZero(Op0))
+ if (DAG.SignBitIsZero(Op0)) {
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, SDLoc(N), {VT, MVT::Other},
+ {N->getOperand(0), Op0});
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, Op0);
+ }
return SDValue();
}
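A standalone check of the sign-bit-zero equivalence the combine relies on (uintSintAgree is an illustrative helper):

    #include <cstdint>

    // When the sign bit of the source integer is known to be zero, the signed
    // and unsigned conversions produce the same value, which is why UINT_TO_FP
    // can be rewritten as the cheaper SINT_TO_FP.
    constexpr bool uintSintAgree(uint32_t v) {
      return v <= static_cast<uint32_t>(INT32_MAX) &&
             static_cast<double>(v) == static_cast<double>(static_cast<int32_t>(v));
    }
    static_assert(uintSintAgree(0) && uintSintAgree(123456789u), "");
    static_assert(uintSintAgree(static_cast<uint32_t>(INT32_MAX)), "");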
@@ -43084,11 +44276,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
// First try to optimize away the conversion entirely when it's
// conditionally from a constant. Vectors only.
+ bool IsStrict = N->isStrictFPOpcode();
if (SDValue Res = combineVectorCompareAndMaskUnaryOp(N, DAG))
return Res;
// Now move on to more general possibilities.
- SDValue Op0 = N->getOperand(0);
+ SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
EVT VT = N->getValueType(0);
EVT InVT = Op0.getValueType();
@@ -43100,6 +44293,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
EVT DstVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements());
SDValue P = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Op0);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), P});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, P);
}
@@ -43117,6 +44313,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
SDLoc dl(N);
if (DCI.isBeforeLegalize() || TruncVT != MVT::v2i32) {
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, Op0);
+ if (IsStrict)
+ return DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {VT, MVT::Other},
+ {N->getOperand(0), Trunc});
return DAG.getNode(ISD::SINT_TO_FP, dl, VT, Trunc);
}
// If we're after legalize and the type is v2i32 we need to shuffle and
@@ -43125,6 +44324,9 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
SDValue Cast = DAG.getBitcast(MVT::v4i32, Op0);
SDValue Shuf = DAG.getVectorShuffle(MVT::v4i32, dl, Cast, Cast,
{ 0, 2, -1, -1 });
+ if (IsStrict)
+ return DAG.getNode(X86ISD::STRICT_CVTSI2P, dl, {VT, MVT::Other},
+ {N->getOperand(0), Shuf});
return DAG.getNode(X86ISD::CVTSI2P, dl, VT, Shuf);
}
}
@@ -43148,13 +44350,16 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (Ld->isSimple() && !VT.isVector() &&
ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
!Subtarget.is64Bit() && LdVT == MVT::i64) {
- SDValue FILDChain = Subtarget.getTargetLowering()->BuildFILD(
+ std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD(
SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
- DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), FILDChain.getValue(1));
- return FILDChain;
+ DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
+ return Tmp.first;
}
}
+ if (IsStrict)
+ return SDValue();
+
if (SDValue V = combineToFPTruncExtElt(N, DAG))
return V;
@@ -43579,7 +44784,8 @@ static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
auto UsePMADDWD = [&](SDValue Op) {
ShrinkMode Mode;
return Op.getOpcode() == ISD::MUL &&
- canReduceVMulWidth(Op.getNode(), DAG, Mode) && Mode != MULU16 &&
+ canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
+ Mode != ShrinkMode::MULU16 &&
(!Subtarget.hasSSE41() ||
(Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
Op->isOnlyUserOf(Op.getOperand(1).getNode())));
@@ -43784,7 +44990,8 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
// Check if the Mul source can be safely shrunk.
ShrinkMode Mode;
- if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) || Mode == MULU16)
+ if (!canReduceVMulWidth(Mul.getNode(), DAG, Mode) ||
+ Mode == ShrinkMode::MULU16)
return SDValue();
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
@@ -44468,7 +45675,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
SDValue InVec = N->getOperand(0);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
- EVT InVecBCVT = InVecBC.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (Subtarget.hasAVX() && !Subtarget.hasAVX2() &&
@@ -44512,31 +45718,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
VT, SDLoc(N),
InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements()));
- // Try to move vector bitcast after extract_subv by scaling extraction index:
- // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
- // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR
- if (InVec != InVecBC && InVecBCVT.isVector()) {
- unsigned SrcNumElts = InVecBCVT.getVectorNumElements();
- unsigned DestNumElts = InVecVT.getVectorNumElements();
- if ((DestNumElts % SrcNumElts) == 0) {
- unsigned DestSrcRatio = DestNumElts / SrcNumElts;
- if ((VT.getVectorNumElements() % DestSrcRatio) == 0) {
- unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio;
- EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
- InVecBCVT.getScalarType(), NewExtNumElts);
- if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
- TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
- unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
- SDLoc DL(N);
- SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
- SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
- InVecBC, NewIndex);
- return DAG.getBitcast(VT, NewExtract);
- }
- }
- }
- }
-
// If we are extracting from an insert into a zero vector, replace with a
// smaller insert into zero if we don't access less than the original
// subvector. Don't do this for i1 vectors.
@@ -44583,7 +45764,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(X86ISD::CVTSI2P, SDLoc(N), VT, InVec.getOperand(0));
}
// v2f64 CVTUDQ2PD(v4i32).
- if (InOpcode == ISD::UINT_TO_FP &&
+ if (InOpcode == ISD::UINT_TO_FP && Subtarget.hasVLX() &&
InVec.getOperand(0).getValueType() == MVT::v4i32) {
return DAG.getNode(X86ISD::CVTUI2P, SDLoc(N), VT, InVec.getOperand(0));
}
@@ -44751,6 +45932,9 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
+ if (ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()))
+ return DAG.getConstant(0, SDLoc(N), VT);
+
APInt KnownUndef, KnownZero;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
@@ -44802,8 +45986,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
- case ISD::SINT_TO_FP: return combineSIntToFP(N, DAG, DCI, Subtarget);
- case ISD::UINT_TO_FP: return combineUIntToFP(N, DAG, Subtarget);
+ case ISD::SINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ return combineSIntToFP(N, DAG, DCI, Subtarget);
+ case ISD::UINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
+ return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 6f7e90008de4..3a17099da38f 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -598,6 +598,34 @@ namespace llvm {
// For avx512-vp2intersect
VP2INTERSECT,
+ /// X86 strict FP compare instructions.
+ STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
+ STRICT_FCMPS,
+
+ // Vector packed double/float comparison.
+ STRICT_CMPP,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ STRICT_CMPM,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ STRICT_CVTTP2SI, STRICT_CVTTP2UI,
+
+ // Vector FP extend.
+ STRICT_VFPEXT,
+
+ // Vector FP round.
+ STRICT_VFPROUND,
+
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+ // Also used by the legacy (V)ROUND intrinsics where we mask out the
+ // scaling part of the immediate.
+ STRICT_VRNDSCALE,
+
+ // Vector signed/unsigned integer to float/double.
+ STRICT_CVTSI2P, STRICT_CVTUI2P,
+
// Compare and swap.
LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
LCMPXCHG8_DAG,
@@ -969,9 +997,7 @@ namespace llvm {
unsigned
getInlineAsmMemConstraint(StringRef ConstraintCode) const override {
- if (ConstraintCode == "i")
- return InlineAsm::Constraint_i;
- else if (ConstraintCode == "o")
+ if (ConstraintCode == "o")
return InlineAsm::Constraint_o;
else if (ConstraintCode == "v")
return InlineAsm::Constraint_v;
@@ -1056,7 +1082,8 @@ namespace llvm {
/// Return true if an FMA operation is faster than a pair of fmul and fadd
/// instructions. fmuladd intrinsics will be expanded to FMAs when this
/// method returns true, otherwise fmuladd is expanded to fmul + fadd.
- bool isFMAFasterThanFMulAndFAdd(EVT VT) const override;
+ bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
+ EVT VT) const override;
/// Return true if it's profitable to narrow
/// operations of type VT1 to VT2. e.g. on x86, it's profitable to narrow
@@ -1125,9 +1152,6 @@ namespace llvm {
bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
SDValue C) const override;
- bool shouldUseStrictFP_TO_INT(EVT FpVT, EVT IntVT,
- bool IsSigned) const override;
-
/// Return true if EXTRACT_SUBVECTOR is cheap for this result type
/// with this index.
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
@@ -1165,7 +1189,7 @@ namespace llvm {
return nullptr; // nothing to do, move along.
}
- Register getRegisterByName(const char* RegName, EVT VT,
+ Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
/// If a physical register, this returns the register that receives the
@@ -1203,8 +1227,9 @@ namespace llvm {
/// offset as appropriate.
Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
- SDValue BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain, SDValue StackSlot,
- SelectionDAG &DAG) const;
+ std::pair<SDValue, SDValue> BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
+ SDValue StackSlot,
+ SelectionDAG &DAG) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
@@ -1315,7 +1340,8 @@ namespace llvm {
unsigned getAddressSpace(void) const;
- SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned) const;
+ SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned,
+ SDValue &Chain) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -1340,6 +1366,7 @@ namespace llvm {
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerSTRICT_FSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -1358,8 +1385,7 @@ namespace llvm {
SDValue LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLT_ROUNDS_(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGC_TRANSITION_START(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerGC_TRANSITION_END(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGC_TRANSITION(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
@@ -1477,20 +1503,15 @@ namespace llvm {
MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- /// Emit nodes that will be selected as "cmp Op0,Op1", or something
- /// equivalent, for use with the given x86 condition code.
- SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC, const SDLoc &dl,
- SelectionDAG &DAG) const;
-
/// Convert a comparison if required by the subtarget.
SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
- SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1,
- ISD::CondCode CC, const SDLoc &dl,
- SelectionDAG &DAG,
- SDValue &X86CC) const;
+ SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ SDValue &X86CC, SDValue &Chain,
+ bool IsSignaling) const;
/// Check if replacement of SQRT with RSQRT should be disabled.
bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
index cc0f59ab329d..48d0d8a35704 100644
--- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -48,12 +48,12 @@ private:
static char ID;
/// Machine instruction info used throughout the class.
- const X86InstrInfo *TII;
+ const X86InstrInfo *TII = nullptr;
/// Endbr opcode for the current machine function.
- unsigned int EndbrOpcode;
+ unsigned int EndbrOpcode = 0;
- /// Adds a new ENDBR instruction to the begining of the MBB.
+ /// Adds a new ENDBR instruction to the beginning of the MBB.
  /// The function will not add it if one already exists.
  /// It will add the ENDBR32 or ENDBR64 opcode, depending on the target.
/// \returns true if the ENDBR was added and false otherwise.
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 9b5de59430a5..32f012033fb0 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -2078,7 +2078,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
"$cc, $src2, $src1", "$src1, $src2, $cc",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
(OpNode_su (_.VT _.RC:$src1), (_.VT _.RC:$src2),
- timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>;
+ timm:$cc)>, EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
defm rm_Int : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
(outs _.KRC:$dst),
@@ -2089,8 +2089,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
timm:$cc),
(OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ let Uses = [MXCSR] in
defm rrb_Int : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
@@ -2111,7 +2112,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
[(set _.KRC:$dst, (OpNode _.FRC:$src1,
_.FRC:$src2,
timm:$cc))]>,
- EVEX_4V, VEX_LIG, Sched<[sched]>;
+ EVEX_4V, VEX_LIG, Sched<[sched]>, SIMD_EXC;
def rm : AVX512Ii8<0xC2, MRMSrcMem,
(outs _.KRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2, u8imm:$cc),
@@ -2121,7 +2122,7 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
(_.ScalarLdFrag addr:$src2),
timm:$cc))]>,
EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
@@ -2522,11 +2523,12 @@ def X86cmpm_imm_commute : SDNodeXForm<timm, [{
multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
string Name> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rri : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst), (ins _.RC:$src1, _.RC:$src2,u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
- (X86cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
+ (X86any_cmpm (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
(X86cmpm_su (_.VT _.RC:$src1), (_.VT _.RC:$src2), timm:$cc),
1>, Sched<[sched]>;
@@ -2534,8 +2536,8 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.MemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
- (X86cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
- timm:$cc),
+ (X86any_cmpm (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
+ timm:$cc),
(X86cmpm_su (_.VT _.RC:$src1), (_.VT (_.LdFrag addr:$src2)),
timm:$cc)>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -2546,17 +2548,18 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
"vcmp"#_.Suffix,
"$cc, ${src2}"#_.BroadcastStr#", $src1",
"$src1, ${src2}"#_.BroadcastStr#", $cc",
- (X86cmpm (_.VT _.RC:$src1),
- (_.VT (_.BroadcastLdFrag addr:$src2)),
- timm:$cc),
+ (X86any_cmpm (_.VT _.RC:$src1),
+ (_.VT (_.BroadcastLdFrag addr:$src2)),
+ timm:$cc),
(X86cmpm_su (_.VT _.RC:$src1),
(_.VT (_.BroadcastLdFrag addr:$src2)),
timm:$cc)>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
// Patterns for selecting with loads in other operand.
- def : Pat<(X86cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1),
- timm:$cc),
+ def : Pat<(X86any_cmpm (_.LdFrag addr:$src2), (_.VT _.RC:$src1),
+ timm:$cc),
(!cast<Instruction>(Name#_.ZSuffix#"rmi") _.RC:$src1, addr:$src2,
(X86cmpm_imm_commute timm:$cc))>;
@@ -2567,8 +2570,8 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
_.RC:$src1, addr:$src2,
(X86cmpm_imm_commute timm:$cc))>;
- def : Pat<(X86cmpm (_.BroadcastLdFrag addr:$src2),
- (_.VT _.RC:$src1), timm:$cc),
+ def : Pat<(X86any_cmpm (_.BroadcastLdFrag addr:$src2),
+ (_.VT _.RC:$src1), timm:$cc),
(!cast<Instruction>(Name#_.ZSuffix#"rmbi") _.RC:$src1, addr:$src2,
(X86cmpm_imm_commute timm:$cc))>;
@@ -2582,6 +2585,7 @@ multiclass avx512_vcmp_common<X86FoldableSchedWrite sched, X86VectorVTInfo _,
multiclass avx512_vcmp_sae<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
// comparison code form (VCMP[EQ/LT/LE/...]
+ let Uses = [MXCSR] in
defm rrib : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
(outs _.KRC:$dst),(ins _.RC:$src1, _.RC:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
@@ -2639,7 +2643,7 @@ def X86Vfpclass_su : PatFrag<(ops node:$src1, node:$src2),
multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
Predicate prd> {
- let Predicates = [prd], ExeDomain = _.ExeDomain in {
+ let Predicates = [prd], ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -2679,7 +2683,7 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
string mem>{
- let ExeDomain = _.ExeDomain in {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -3197,8 +3201,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
-def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2), timm:$cc)),
+def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrri")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3215,8 +3219,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
timm:$cc), Narrow.KRC)>;
// Broadcast load.
-def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
+def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrmbi")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3231,8 +3235,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
addr:$src2, timm:$cc), Narrow.KRC)>;
// Commuted with broadcast load.
-def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
- (Narrow.VT Narrow.RC:$src1), timm:$cc)),
+def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
+ (Narrow.VT Narrow.RC:$src1), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrmbi")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3928,6 +3932,17 @@ def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
def : InstAlias<"vmovd\t{$src, $dst|$dst, $src}",
(VMOVPQIto64Zrr GR64:$dst, VR128X:$src), 0>;
+// Conversions between masks and scalar fp.
+def : Pat<(v32i1 (bitconvert FR32X:$src)),
+ (KMOVDkr (VMOVSS2DIZrr FR32X:$src))>;
+def : Pat<(f32 (bitconvert VK32:$src)),
+ (VMOVDI2SSZrr (KMOVDrk VK32:$src))>;
+
+def : Pat<(v64i1 (bitconvert FR64X:$src)),
+ (KMOVQkr (VMOVSDto64Zrr FR64X:$src))>;
+def : Pat<(f64 (bitconvert VK64:$src)),
+ (VMOV64toSDZrr (KMOVQrk VK64:$src))>;
+
//===----------------------------------------------------------------------===//
// AVX-512 MOVSS, MOVSD
//===----------------------------------------------------------------------===//
@@ -5278,7 +5293,7 @@ defm : avx512_logical_lowering_types<"VPANDN", X86andnp>;
multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode VecNode,
X86FoldableSchedWrite sched, bit IsCommutable> {
- let ExeDomain = _.ExeDomain in {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -5312,7 +5327,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode VecNode, X86FoldableSchedWrite sched,
bit IsCommutable = 0> {
- let ExeDomain = _.ExeDomain in
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
@@ -5329,16 +5344,17 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1, _.RC:$src2))>,
- Sched<[sched]>;
+ Sched<[sched]>, SIMD_EXC;
defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
_.ScalarIntMemCPat:$src2))>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
- let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+ let isCodeGenOnly = 1, Predicates = [HasAVX512],
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -5356,6 +5372,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
EVEX2VEXOverride<EVEX2VexOvrd#"rm">;
}
+ let Uses = [MXCSR] in
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
@@ -5391,13 +5408,13 @@ multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
NAME#"SD">,
XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
}
-defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86fadds, X86faddRnds,
+defm VADD : avx512_binop_s_round<0x58, "vadd", any_fadd, X86fadds, X86faddRnds,
SchedWriteFAddSizes, 1>;
-defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmuls, X86fmulRnds,
+defm VMUL : avx512_binop_s_round<0x59, "vmul", any_fmul, X86fmuls, X86fmulRnds,
SchedWriteFMulSizes, 1>;
-defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubs, X86fsubRnds,
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", any_fsub, X86fsubs, X86fsubRnds,
SchedWriteFAddSizes, 0>;
-defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivs, X86fdivRnds,
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", any_fdiv, X86fdivs, X86fdivRnds,
SchedWriteFDivSizes, 0>;
defm VMIN : avx512_binop_s_sae<0x5D, "vmin", X86fmin, X86fmins, X86fminSAEs,
SchedWriteFCmpSizes, 0>;
@@ -5429,27 +5446,28 @@ multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
}
defm VMINCSSZ : avx512_comutable_binop_s<0x5D, "vminss", f32x_info, X86fminc,
SchedWriteFCmp.Scl, "VMINCSS">, XS,
- EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
defm VMINCSDZ : avx512_comutable_binop_s<0x5D, "vminsd", f64x_info, X86fminc,
SchedWriteFCmp.Scl, "VMINCSD">, XD,
VEX_W, EVEX_4V, VEX_LIG,
- EVEX_CD8<64, CD8VT1>;
+ EVEX_CD8<64, CD8VT1>, SIMD_EXC;
defm VMAXCSSZ : avx512_comutable_binop_s<0x5F, "vmaxss", f32x_info, X86fmaxc,
SchedWriteFCmp.Scl, "VMAXCSS">, XS,
- EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
+ EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>, SIMD_EXC;
defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
SchedWriteFCmp.Scl, "VMAXCSD">, XD,
VEX_W, EVEX_4V, VEX_LIG,
- EVEX_CD8<64, CD8VT1>;
+ EVEX_CD8<64, CD8VT1>, SIMD_EXC;
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable,
bit IsKCommutable = IsCommutable> {
- let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
+ let ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
@@ -5476,7 +5494,7 @@ multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpN
multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNodeRnd,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
@@ -5487,7 +5505,7 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNodeSAE,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
@@ -5526,6 +5544,7 @@ multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator Op
}
}
+let Uses = [MXCSR] in
multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
X86SchedWriteSizes sched> {
defm PSZ : avx512_fp_round_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
@@ -5536,6 +5555,7 @@ multiclass avx512_fp_binop_p_round<bits<8> opc, string OpcodeStr, SDNode OpNodeR
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
+let Uses = [MXCSR] in
multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd,
X86SchedWriteSizes sched> {
defm PSZ : avx512_fp_sae_packed<opc, OpcodeStr, OpNodeRnd, sched.PS.ZMM,
@@ -5546,16 +5566,16 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
-defm VADD : avx512_fp_binop_p<0x58, "vadd", fadd, HasAVX512,
+defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, HasAVX512,
SchedWriteFAddSizes, 1>,
avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>;
-defm VMUL : avx512_fp_binop_p<0x59, "vmul", fmul, HasAVX512,
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, HasAVX512,
SchedWriteFMulSizes, 1>,
avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>;
-defm VSUB : avx512_fp_binop_p<0x5C, "vsub", fsub, HasAVX512,
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, HasAVX512,
SchedWriteFAddSizes>,
avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>;
-defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", fdiv, HasAVX512,
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, HasAVX512,
SchedWriteFDivSizes>,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>;
defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
@@ -5570,6 +5590,7 @@ let isCodeGenOnly = 1 in {
defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512,
SchedWriteFCmpSizes, 1>;
}
+let Uses = []<Register>, mayRaiseFPException = 0 in {
defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI,
@@ -5578,10 +5599,11 @@ defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
+}
multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
@@ -5603,7 +5625,7 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
"$src2, $src1", "$src1, $src2",
@@ -6399,7 +6421,8 @@ let Predicates = [HasAVX512] in {
multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
@@ -6425,7 +6448,8 @@ multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR] in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
@@ -6462,7 +6486,7 @@ multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
VEX_W;
}
-defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86Fmadd, X86FmaddRnd>;
+defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd, X86FmaddRnd>;
defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>;
@@ -6473,7 +6497,8 @@ defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubR
multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
@@ -6500,7 +6525,8 @@ multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR] in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
@@ -6538,7 +6564,7 @@ multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
VEX_W;
}
-defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86Fmadd, X86FmaddRnd>;
+defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd, X86FmaddRnd>;
defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>;
@@ -6548,7 +6574,8 @@ defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubR
multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
@@ -6578,7 +6605,8 @@ multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0 in
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
+ Uses = [MXCSR] in
defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
@@ -6616,7 +6644,7 @@ multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
VEX_W;
}
-defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86Fmadd, X86FmaddRnd>;
+defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd, X86FmaddRnd>;
defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>;
defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>;
defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>;
@@ -6630,14 +6658,15 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
defm r_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
- AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>;
+ AVX512FMA3Base, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
let mayLoad = 1 in
defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
"$src3, $src2", "$src2, $src3", (null_frag), 1, 1>,
- AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>;
+ AVX512FMA3Base, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
+ let Uses = [MXCSR] in
defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc", (null_frag), 1, 1>,
@@ -6648,13 +6677,14 @@ let Constraints = "$src1 = $dst", hasSideEffects = 0 in {
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>;
+ !if(MaskOnlyReg, [], [RHS_r])>, Sched<[SchedWriteFMA.Scl]>, SIMD_EXC;
def m : AVX512FMA3S<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.ScalarMemOp:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>;
+ [RHS_m]>, Sched<[SchedWriteFMA.Scl.Folded, SchedWriteFMA.Scl.ReadAfterFold]>, SIMD_EXC;
+ let Uses = [MXCSR] in
def rb : AVX512FMA3S<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, _.FRC:$src3, AVX512RC:$rc),
!strconcat(OpcodeStr,
@@ -6711,7 +6741,7 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
}
}
-defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86Fmadd, X86FmaddRnd>;
+defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86any_Fmadd, X86FmaddRnd>;
defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
@@ -6918,7 +6948,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
}
}
-defm : avx512_scalar_fma_patterns<X86Fmadd, X86FmaddRnd, "VFMADD", "SS",
+defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SS",
X86Movss, v4f32x_info, fp32imm0>;
defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SS",
X86Movss, v4f32x_info, fp32imm0>;
@@ -6927,7 +6957,7 @@ defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SS",
defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS",
X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86Fmadd, X86FmaddRnd, "VFMADD", "SD",
+defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SD",
X86Movsd, v2f64x_info, fp64imm0>;
defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SD",
X86Movsd, v2f64x_info, fp64imm0>;
@@ -6997,7 +7027,10 @@ defm VPMADD52HUQ : avx512_pmadd52_common<0xb5, "vpmadd52huq", x86vpmadd52h,
multiclass avx512_vcvtsi<bits<8> opc, SDPatternOperator OpNode, X86FoldableSchedWrite sched,
RegisterClass SrcRC, X86VectorVTInfo DstVT,
X86MemOperand x86memop, PatFrag ld_frag, string asm,
- string mem> {
+ string mem, list<Register> _Uses = [MXCSR],
+ bit _mayRaiseFPException = 1> {
+let ExeDomain = DstVT.ExeDomain, Uses = _Uses,
+ mayRaiseFPException = _mayRaiseFPException in {
let hasSideEffects = 0, isCodeGenOnly = 1 in {
def rr : SI<opc, MRMSrcReg, (outs DstVT.FRC:$dst),
(ins DstVT.FRC:$src1, SrcRC:$src),
@@ -7023,6 +7056,7 @@ multiclass avx512_vcvtsi<bits<8> opc, SDPatternOperator OpNode, X86FoldableSched
(OpNode (DstVT.VT DstVT.RC:$src1),
(ld_frag addr:$src2)))]>,
EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
+}
def : InstAlias<"v"#asm#mem#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(!cast<Instruction>(NAME#"rr_Int") DstVT.RC:$dst,
DstVT.RC:$src1, SrcRC:$src2), 0, "att">;
@@ -7032,6 +7066,7 @@ multiclass avx512_vcvtsi_round<bits<8> opc, SDNode OpNode,
X86FoldableSchedWrite sched, RegisterClass SrcRC,
X86VectorVTInfo DstVT, string asm,
string mem> {
+ let ExeDomain = DstVT.ExeDomain, Uses = [MXCSR] in
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst),
(ins DstVT.RC:$src1, SrcRC:$src2, AVX512RC:$rc),
!strconcat(asm,
@@ -7066,7 +7101,7 @@ defm VCVTSI642SSZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
v4f32x_info, i64mem, loadi64, "cvtsi2ss", "q">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTSI2SDZ : avx512_vcvtsi<0x2A, null_frag, WriteCvtI2SD, GR32,
- v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l">,
+ v2f64x_info, i32mem, loadi32, "cvtsi2sd", "l", [], 0>,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VCVTSI642SDZ: avx512_vcvtsi_common<0x2A, X86SintToFp, X86SintToFpRnd,
WriteCvtI2SD, GR64,
@@ -7078,22 +7113,22 @@ def : InstAlias<"vcvtsi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
def : InstAlias<"vcvtsi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
-def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
-def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
(VCVTSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
-def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
-def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
(VCVTSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
-def : Pat<(f32 (sint_to_fp GR32:$src)),
+def : Pat<(f32 (any_sint_to_fp GR32:$src)),
(VCVTSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
-def : Pat<(f32 (sint_to_fp GR64:$src)),
+def : Pat<(f32 (any_sint_to_fp GR64:$src)),
(VCVTSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
-def : Pat<(f64 (sint_to_fp GR32:$src)),
+def : Pat<(f64 (any_sint_to_fp GR32:$src)),
(VCVTSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
-def : Pat<(f64 (sint_to_fp GR64:$src)),
+def : Pat<(f64 (any_sint_to_fp GR64:$src)),
(VCVTSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
defm VCVTUSI2SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
@@ -7105,7 +7140,7 @@ defm VCVTUSI642SSZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
v4f32x_info, i64mem, loadi64, "cvtusi2ss", "q">,
XS, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VCVTUSI2SDZ : avx512_vcvtsi<0x7B, null_frag, WriteCvtI2SD, GR32, v2f64x_info,
- i32mem, loadi32, "cvtusi2sd", "l">,
+ i32mem, loadi32, "cvtusi2sd", "l", [], 0>,
XD, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm VCVTUSI642SDZ : avx512_vcvtsi_common<0x7B, X86UintToFp, X86UintToFpRnd,
WriteCvtI2SD, GR64,
@@ -7117,22 +7152,22 @@ def : InstAlias<"vcvtusi2ss\t{$src, $src1, $dst|$dst, $src1, $src}",
def : InstAlias<"vcvtusi2sd\t{$src, $src1, $dst|$dst, $src1, $src}",
(VCVTUSI2SDZrm_Int VR128X:$dst, VR128X:$src1, i32mem:$src), 0, "att">;
-def : Pat<(f32 (uint_to_fp (loadi32 addr:$src))),
+def : Pat<(f32 (any_uint_to_fp (loadi32 addr:$src))),
(VCVTUSI2SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
-def : Pat<(f32 (uint_to_fp (loadi64 addr:$src))),
+def : Pat<(f32 (any_uint_to_fp (loadi64 addr:$src))),
(VCVTUSI642SSZrm (f32 (IMPLICIT_DEF)), addr:$src)>;
-def : Pat<(f64 (uint_to_fp (loadi32 addr:$src))),
+def : Pat<(f64 (any_uint_to_fp (loadi32 addr:$src))),
(VCVTUSI2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
-def : Pat<(f64 (uint_to_fp (loadi64 addr:$src))),
+def : Pat<(f64 (any_uint_to_fp (loadi64 addr:$src))),
(VCVTUSI642SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>;
-def : Pat<(f32 (uint_to_fp GR32:$src)),
+def : Pat<(f32 (any_uint_to_fp GR32:$src)),
(VCVTUSI2SSZrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
-def : Pat<(f32 (uint_to_fp GR64:$src)),
+def : Pat<(f32 (any_uint_to_fp GR64:$src)),
(VCVTUSI642SSZrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
-def : Pat<(f64 (uint_to_fp GR32:$src)),
+def : Pat<(f64 (any_uint_to_fp GR32:$src)),
(VCVTUSI2SDZrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
-def : Pat<(f64 (uint_to_fp GR64:$src)),
+def : Pat<(f64 (any_uint_to_fp GR64:$src)),
(VCVTUSI642SDZrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
@@ -7145,11 +7180,12 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
SDNode OpNodeRnd,
X86FoldableSchedWrite sched, string asm,
string aliasStr> {
- let Predicates = [HasAVX512] in {
+ let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src)))]>,
- EVEX, VEX_LIG, Sched<[sched]>;
+ EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ let Uses = [MXCSR] in
def rrb_Int : SI<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.RC:$src, AVX512RC:$rc),
!strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
[(set DstVT.RC:$dst, (OpNodeRnd (SrcVT.VT SrcVT.RC:$src),(i32 timm:$rc)))]>,
@@ -7159,7 +7195,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
(SrcVT.VT SrcVT.ScalarIntMemCPat:$src)))]>,
- EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
} // Predicates = [HasAVX512]
def : InstAlias<"v" # asm # aliasStr # "\t{$src, $dst|$dst, $src}",
@@ -7202,82 +7238,82 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2u
let Predicates = [HasAVX512] in {
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
(VCVTSI642SSZrr_Int VR128X:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
(VCVTSI642SSZrm_Int VR128X:$dst, addr:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
(VCVTSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
(VCVTSI2SSZrm_Int VR128X:$dst, addr:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
(VCVTSI642SDZrr_Int VR128X:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
(VCVTSI642SDZrm_Int VR128X:$dst, addr:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
(VCVTSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
(VCVTSI2SDZrm_Int VR128X:$dst, addr:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
- (v4f32 (scalar_to_vector (f32 (uint_to_fp GR64:$src)))))),
+ (v4f32 (scalar_to_vector (f32 (any_uint_to_fp GR64:$src)))))),
(VCVTUSI642SSZrr_Int VR128X:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
- (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi64 addr:$src))))))),
+ (v4f32 (scalar_to_vector (f32 (any_uint_to_fp (loadi64 addr:$src))))))),
(VCVTUSI642SSZrm_Int VR128X:$dst, addr:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
- (v4f32 (scalar_to_vector (f32 (uint_to_fp GR32:$src)))))),
+ (v4f32 (scalar_to_vector (f32 (any_uint_to_fp GR32:$src)))))),
(VCVTUSI2SSZrr_Int VR128X:$dst, GR32:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
- (v4f32 (scalar_to_vector (f32 (uint_to_fp (loadi32 addr:$src))))))),
+ (v4f32 (scalar_to_vector (f32 (any_uint_to_fp (loadi32 addr:$src))))))),
(VCVTUSI2SSZrm_Int VR128X:$dst, addr:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
- (v2f64 (scalar_to_vector (f64 (uint_to_fp GR64:$src)))))),
+ (v2f64 (scalar_to_vector (f64 (any_uint_to_fp GR64:$src)))))),
(VCVTUSI642SDZrr_Int VR128X:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
- (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi64 addr:$src))))))),
+ (v2f64 (scalar_to_vector (f64 (any_uint_to_fp (loadi64 addr:$src))))))),
(VCVTUSI642SDZrm_Int VR128X:$dst, addr:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
- (v2f64 (scalar_to_vector (f64 (uint_to_fp GR32:$src)))))),
+ (v2f64 (scalar_to_vector (f64 (any_uint_to_fp GR32:$src)))))),
(VCVTUSI2SDZrr_Int VR128X:$dst, GR32:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
- (v2f64 (scalar_to_vector (f64 (uint_to_fp (loadi32 addr:$src))))))),
+ (v2f64 (scalar_to_vector (f64 (any_uint_to_fp (loadi32 addr:$src))))))),
(VCVTUSI2SDZrm_Int VR128X:$dst, addr:$src)>;
} // Predicates = [HasAVX512]
@@ -7286,22 +7322,23 @@ multiclass avx512_cvt_s_all<bits<8> opc, string asm, X86VectorVTInfo _SrcRC,
X86VectorVTInfo _DstRC, SDNode OpNode,
SDNode OpNodeInt, SDNode OpNodeSAE,
X86FoldableSchedWrite sched, string aliasStr>{
-let Predicates = [HasAVX512] in {
+let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in {
let isCodeGenOnly = 1 in {
def rr : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.FRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNode _SrcRC.FRC:$src))]>,
- EVEX, VEX_LIG, Sched<[sched]>;
+ EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
def rm : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst), (ins _SrcRC.ScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNode (_SrcRC.ScalarLdFrag addr:$src)))]>,
- EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
def rr_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst, (OpNodeInt (_SrcRC.VT _SrcRC.RC:$src)))]>,
- EVEX, VEX_LIG, Sched<[sched]>;
+ EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ let Uses = [MXCSR] in
def rrb_Int : AVX512<opc, MRMSrcReg, (outs _DstRC.RC:$dst), (ins _SrcRC.RC:$src),
!strconcat(asm,"\t{{sae}, $src, $dst|$dst, $src, {sae}}"),
[(set _DstRC.RC:$dst, (OpNodeSAE (_SrcRC.VT _SrcRC.RC:$src)))]>,
@@ -7311,7 +7348,7 @@ let Predicates = [HasAVX512] in {
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst,
(OpNodeInt (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src)))]>,
- EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
} //HasAVX512
def : InstAlias<asm # aliasStr # "\t{$src, $dst|$dst, $src}",
@@ -7324,35 +7361,36 @@ let Predicates = [HasAVX512] in {
}
defm VCVTTSS2SIZ: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i32x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
+ any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
"{l}">, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2SI64Z: avx512_cvt_s_all<0x2C, "vcvttss2si", f32x_info, i64x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
+ any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSS2I,
"{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2SIZ: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i32x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
+ any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
"{l}">, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2SI64Z: avx512_cvt_s_all<0x2C, "vcvttsd2si", f64x_info, i64x_info,
- fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
+ any_fp_to_sint, X86cvtts2Int, X86cvtts2IntSAE, WriteCvtSD2I,
"{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSS2USIZ: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i32x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
+ any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
"{l}">, XS, EVEX_CD8<32, CD8VT1>;
defm VCVTTSS2USI64Z: avx512_cvt_s_all<0x78, "vcvttss2usi", f32x_info, i64x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
+ any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSS2I,
"{q}">, XS,VEX_W, EVEX_CD8<32, CD8VT1>;
defm VCVTTSD2USIZ: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i32x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
+ any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
"{l}">, XD, EVEX_CD8<64, CD8VT1>;
defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
- fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
+ any_fp_to_uint, X86cvtts2UInt, X86cvtts2UIntSAE, WriteCvtSD2I,
"{q}">, XD, VEX_W, EVEX_CD8<64, CD8VT1>;
//===----------------------------------------------------------------------===//
// AVX-512 Convert from float to double and back
//===----------------------------------------------------------------------===//
+let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNode,
X86FoldableSchedWrite sched> {
@@ -7387,6 +7425,7 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
+ let Uses = [MXCSR] in
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
@@ -7399,6 +7438,7 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn
multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd,
X86FoldableSchedWrite sched> {
+ let Uses = [MXCSR] in
defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
@@ -7435,28 +7475,28 @@ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd", X86fpexts,
X86fpextsSAE, WriteCvtSS2SD, f32x_info,
f64x_info>;
-def : Pat<(f64 (fpextend FR32X:$src)),
+def : Pat<(f64 (any_fpextend FR32X:$src)),
(VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), FR32X:$src)>,
Requires<[HasAVX512]>;
-def : Pat<(f64 (fpextend (loadf32 addr:$src))),
+def : Pat<(f64 (any_fpextend (loadf32 addr:$src))),
(VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
Requires<[HasAVX512, OptForSize]>;
-def : Pat<(f32 (fpround FR64X:$src)),
+def : Pat<(f32 (any_fpround FR64X:$src)),
(VCVTSD2SSZrr (f32 (IMPLICIT_DEF)), FR64X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128X:$dst),
(v4f32 (scalar_to_vector
- (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
+ (f32 (any_fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
(VCVTSD2SSZrr_Int VR128X:$dst, VR128X:$src)>,
Requires<[HasAVX512]>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128X:$dst),
(v2f64 (scalar_to_vector
- (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
+ (f64 (any_fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
(VCVTSS2SDZrr_Int VR128X:$dst, VR128X:$src)>,
Requires<[HasAVX512]>;
@@ -7472,7 +7512,7 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
string Alias = "", X86MemOperand MemOp = _Src.MemOp,
RegisterClass MaskRC = _.KRCWM,
dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src))))> {
-
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rr : AVX512_maskable_common<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src),
(ins _.RC:$src0, MaskRC:$mask, _Src.RC:$src),
@@ -7512,11 +7552,13 @@ multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
_.RC:$src0),
vselect, "$src0 = $dst">,
EVEX, EVEX_B, Sched<[sched.Folded]>;
+ }
}
// Conversion with SAE - suppress all exceptions
multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
+ let Uses = [MXCSR] in
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
@@ -7528,6 +7570,7 @@ multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeRnd,
X86FoldableSchedWrite sched> {
+ let Uses = [MXCSR] in
defm rrb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src, AVX512RC:$rc), OpcodeStr,
"$rc, $src", "$src, $rc",
@@ -7551,14 +7594,14 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f32x_info,
- fpextend, sched.ZMM>,
+ any_fpextend, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
X86vfpextSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v4f32x_info,
- X86vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
- defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, fpextend,
+ X86any_vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, any_fpextend,
sched.YMM>, EVEX_V256;
}
}
@@ -7566,7 +7609,7 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
// Truncate Double to Float
multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, X86vfpround, sched.ZMM>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, X86any_vfpround, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
X86vfproundRnd, sched.ZMM>, EVEX_V512;
}
@@ -7574,7 +7617,7 @@ multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sc
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>,
EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, X86vfpround,
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, X86any_vfpround,
sched.YMM, "{1to4}", "{y}">, EVEX_V256;
}
@@ -7624,70 +7667,10 @@ defm VCVTPD2PS : avx512_cvtpd2ps<0x5A, "vcvtpd2ps", SchedWriteCvtPD2PS>,
defm VCVTPS2PD : avx512_cvtps2pd<0x5A, "vcvtps2pd", SchedWriteCvtPS2PD>,
PS, EVEX_CD8<32, CD8VH>;
-let Predicates = [HasAVX512] in {
- def : Pat<(v8f32 (fpround (v8f64 VR512:$src))),
- (VCVTPD2PSZrr VR512:$src)>;
- def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))),
- VR256X:$src0),
- (VCVTPD2PSZrrk VR256X:$src0, VK8WM:$mask, VR512:$src)>;
- def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (v8f64 VR512:$src))),
- v8f32x_info.ImmAllZerosV),
- (VCVTPD2PSZrrkz VK8WM:$mask, VR512:$src)>;
-
- def : Pat<(v8f32 (fpround (loadv8f64 addr:$src))),
- (VCVTPD2PSZrm addr:$src)>;
- def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))),
- VR256X:$src0),
- (VCVTPD2PSZrmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
- def : Pat<(vselect VK8WM:$mask, (v8f32 (fpround (loadv8f64 addr:$src))),
- v8f32x_info.ImmAllZerosV),
- (VCVTPD2PSZrmkz VK8WM:$mask, addr:$src)>;
-
- def : Pat<(v8f32 (fpround (v8f64 (X86VBroadcastld64 addr:$src)))),
- (VCVTPD2PSZrmb addr:$src)>;
- def : Pat<(vselect VK8WM:$mask,
- (fpround (v8f64 (X86VBroadcastld64 addr:$src))),
- (v8f32 VR256X:$src0)),
- (VCVTPD2PSZrmbk VR256X:$src0, VK8WM:$mask, addr:$src)>;
- def : Pat<(vselect VK8WM:$mask,
- (fpround (v8f64 (X86VBroadcastld64 addr:$src))),
- v8f32x_info.ImmAllZerosV),
- (VCVTPD2PSZrmbkz VK8WM:$mask, addr:$src)>;
-}
-
let Predicates = [HasVLX] in {
- def : Pat<(v4f32 (fpround (v4f64 VR256X:$src))),
- (VCVTPD2PSZ256rr VR256X:$src)>;
- def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))),
- VR128X:$src0),
- (VCVTPD2PSZ256rrk VR128X:$src0, VK4WM:$mask, VR256X:$src)>;
- def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (v4f64 VR256X:$src))),
- v4f32x_info.ImmAllZerosV),
- (VCVTPD2PSZ256rrkz VK4WM:$mask, VR256X:$src)>;
-
- def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
- (VCVTPD2PSZ256rm addr:$src)>;
- def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))),
- VR128X:$src0),
- (VCVTPD2PSZ256rmk VR128X:$src0, VK4WM:$mask, addr:$src)>;
- def : Pat<(vselect VK4WM:$mask, (v4f32 (fpround (loadv4f64 addr:$src))),
- v4f32x_info.ImmAllZerosV),
- (VCVTPD2PSZ256rmkz VK4WM:$mask, addr:$src)>;
-
- def : Pat<(v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))),
- (VCVTPD2PSZ256rmb addr:$src)>;
- def : Pat<(vselect VK4WM:$mask,
- (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))),
- VR128X:$src0),
- (VCVTPD2PSZ256rmbk VR128X:$src0, VK4WM:$mask, addr:$src)>;
- def : Pat<(vselect VK4WM:$mask,
- (v4f32 (fpround (v4f64 (X86VBroadcastld64 addr:$src)))),
- v4f32x_info.ImmAllZerosV),
- (VCVTPD2PSZ256rmbkz VK4WM:$mask, addr:$src)>;
-
// Special patterns to allow use of X86vmfpround for masking. Instruction
// patterns have been disabled with null_frag.
- def : Pat<(X86vfpround (v2f64 VR128X:$src)),
+ def : Pat<(X86any_vfpround (v2f64 VR128X:$src)),
(VCVTPD2PSZ128rr VR128X:$src)>;
def : Pat<(X86vmfpround (v2f64 VR128X:$src), (v4f32 VR128X:$src0),
VK2WM:$mask),
@@ -7696,7 +7679,7 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTPD2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
- def : Pat<(X86vfpround (loadv2f64 addr:$src)),
+ def : Pat<(X86any_vfpround (loadv2f64 addr:$src)),
(VCVTPD2PSZ128rm addr:$src)>;
def : Pat<(X86vmfpround (loadv2f64 addr:$src), (v4f32 VR128X:$src0),
VK2WM:$mask),
@@ -7705,7 +7688,7 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTPD2PSZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(X86vfpround (v2f64 (X86VBroadcastld64 addr:$src))),
+ def : Pat<(X86any_vfpround (v2f64 (X86VBroadcastld64 addr:$src))),
(VCVTPD2PSZ128rmb addr:$src)>;
def : Pat<(X86vmfpround (v2f64 (X86VBroadcastld64 addr:$src)),
(v4f32 VR128X:$src0), VK2WM:$mask),
@@ -7716,6 +7699,7 @@ let Predicates = [HasVLX] in {
}
// Convert Signed/Unsigned Doubleword to Double
+let Uses = []<Register>, mayRaiseFPException = 0 in
multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDNode OpNode128, X86SchedWriteWidths sched> {
// No rounding in this op
@@ -8075,34 +8059,34 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
VK4WM:$mask, i64mem:$src), 0, "att">;
}
-defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", sint_to_fp, X86VSintToFP,
+defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, X86any_VSintToFP,
SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
-defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", sint_to_fp,
+defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp,
X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
PS, EVEX_CD8<32, CD8VF>;
-defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86cvttp2si,
+defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86any_cvttp2si,
X86cvttp2siSAE, SchedWriteCvtPS2DQ>,
XS, EVEX_CD8<32, CD8VF>;
-defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86cvttp2si,
+defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86any_cvttp2si,
X86cvttp2siSAE, SchedWriteCvtPD2DQ>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86cvttp2ui,
+defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86any_cvttp2ui,
X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PS,
EVEX_CD8<32, CD8VF>;
-defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86cvttp2ui,
+defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86any_cvttp2ui,
X86cvttp2uiSAE, SchedWriteCvtPD2DQ>,
PS, VEX_W, EVEX_CD8<64, CD8VF>;
-defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", uint_to_fp,
- X86VUintToFP, SchedWriteCvtDQ2PD>, XS,
+defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", any_uint_to_fp,
+ X86any_VUintToFP, SchedWriteCvtDQ2PD>, XS,
EVEX_CD8<32, CD8VH>;
-defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", uint_to_fp,
+defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", any_uint_to_fp,
X86VUintToFpRnd, SchedWriteCvtDQ2PS>, XD,
EVEX_CD8<32, CD8VF>;
@@ -8138,35 +8122,35 @@ defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
-defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86cvttp2si,
+defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86any_cvttp2si,
X86cvttp2siSAE, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86cvttp2si,
+defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86any_cvttp2si,
X86cvttp2siSAE, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
-defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86cvttp2ui,
+defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86any_cvttp2ui,
X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86cvttp2ui,
+defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86any_cvttp2ui,
X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
-defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", sint_to_fp,
+defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", any_sint_to_fp,
X86VSintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
EVEX_CD8<64, CD8VF>;
-defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", uint_to_fp,
+defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", any_uint_to_fp,
X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
EVEX_CD8<64, CD8VF>;
-defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", sint_to_fp,
+defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", any_sint_to_fp,
X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS,
EVEX_CD8<64, CD8VF>;
-defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", uint_to_fp,
+defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", any_uint_to_fp,
X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD,
EVEX_CD8<64, CD8VF>;
@@ -8202,7 +8186,7 @@ let Predicates = [HasVLX] in {
// Special patterns to allow use of X86mcvttp2si for masking. Instruction
// patterns have been disabled with null_frag.
- def : Pat<(v4i32 (X86cvttp2si (v2f64 VR128X:$src))),
+ def : Pat<(v4i32 (X86any_cvttp2si (v2f64 VR128X:$src))),
(VCVTTPD2DQZ128rr VR128X:$src)>;
def : Pat<(X86mcvttp2si (v2f64 VR128X:$src), (v4i32 VR128X:$src0),
VK2WM:$mask),
@@ -8211,7 +8195,7 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTTPD2DQZ128rrkz VK2WM:$mask, VR128X:$src)>;
- def : Pat<(v4i32 (X86cvttp2si (loadv2f64 addr:$src))),
+ def : Pat<(v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))),
(VCVTTPD2DQZ128rm addr:$src)>;
def : Pat<(X86mcvttp2si (loadv2f64 addr:$src), (v4i32 VR128X:$src0),
VK2WM:$mask),
@@ -8220,7 +8204,7 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTTPD2DQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4i32 (X86cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))),
+ def : Pat<(v4i32 (X86any_cvttp2si (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTTPD2DQZ128rmb addr:$src)>;
def : Pat<(X86mcvttp2si (v2f64 (X86VBroadcastld64 addr:$src)),
(v4i32 VR128X:$src0), VK2WM:$mask),
@@ -8260,7 +8244,7 @@ let Predicates = [HasVLX] in {
// Special patterns to allow use of X86mcvtp2UInt for masking. Instruction
// patterns have been disabled with null_frag.
- def : Pat<(v4i32 (X86cvttp2ui (v2f64 VR128X:$src))),
+ def : Pat<(v4i32 (X86any_cvttp2ui (v2f64 VR128X:$src))),
(VCVTTPD2UDQZ128rr VR128X:$src)>;
def : Pat<(X86mcvttp2ui (v2f64 VR128X:$src), (v4i32 VR128X:$src0),
VK2WM:$mask),
@@ -8269,7 +8253,7 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTTPD2UDQZ128rrkz VK2WM:$mask, VR128X:$src)>;
- def : Pat<(v4i32 (X86cvttp2ui (loadv2f64 addr:$src))),
+ def : Pat<(v4i32 (X86any_cvttp2ui (loadv2f64 addr:$src))),
(VCVTTPD2UDQZ128rm addr:$src)>;
def : Pat<(X86mcvttp2ui (loadv2f64 addr:$src), (v4i32 VR128X:$src0),
VK2WM:$mask),
@@ -8278,7 +8262,7 @@ let Predicates = [HasVLX] in {
VK2WM:$mask),
(VCVTTPD2UDQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4i32 (X86cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))),
+ def : Pat<(v4i32 (X86any_cvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)))),
(VCVTTPD2UDQZ128rmb addr:$src)>;
def : Pat<(X86mcvttp2ui (v2f64 (X86VBroadcastld64 addr:$src)),
(v4i32 VR128X:$src0), VK2WM:$mask),
@@ -8311,7 +8295,7 @@ let Predicates = [HasDQI, HasVLX] in {
v2i64x_info.ImmAllZerosV)),
(VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ def : Pat<(v2i64 (X86any_cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTTPS2QQZ128rm addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
(X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
@@ -8322,7 +8306,7 @@ let Predicates = [HasDQI, HasVLX] in {
v2i64x_info.ImmAllZerosV)),
(VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
+ def : Pat<(v2i64 (X86any_cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTTPS2UQQZ128rm addr:$src)>;
def : Pat<(v2i64 (vselect VK2WM:$mask,
(X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
@@ -8334,63 +8318,26 @@ let Predicates = [HasDQI, HasVLX] in {
(VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
}
-let Predicates = [HasAVX512, NoVLX] in {
-def : Pat<(v8i32 (X86cvttp2ui (v8f32 VR256X:$src1))),
- (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
- VR256X:$src1, sub_ymm)))), sub_ymm)>;
-
-def : Pat<(v4i32 (X86cvttp2ui (v4f32 VR128X:$src1))),
- (EXTRACT_SUBREG (v16i32 (VCVTTPS2UDQZrr
- (v16f32 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src1, sub_xmm)))), sub_xmm)>;
-
-def : Pat<(v4i32 (X86cvttp2ui (v4f64 VR256X:$src1))),
- (EXTRACT_SUBREG (v8i32 (VCVTTPD2UDQZrr
- (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR256X:$src1, sub_ymm)))), sub_xmm)>;
-
-def : Pat<(v8f32 (uint_to_fp (v8i32 VR256X:$src1))),
- (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
- VR256X:$src1, sub_ymm)))), sub_ymm)>;
-
-def : Pat<(v4f32 (uint_to_fp (v4i32 VR128X:$src1))),
- (EXTRACT_SUBREG (v16f32 (VCVTUDQ2PSZrr
- (v16i32 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src1, sub_xmm)))), sub_xmm)>;
-
-def : Pat<(v4f64 (uint_to_fp (v4i32 VR128X:$src1))),
- (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
- (v8i32 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src1, sub_xmm)))), sub_ymm)>;
-
-def : Pat<(v2f64 (X86VUintToFP (v4i32 VR128X:$src1))),
- (EXTRACT_SUBREG (v8f64 (VCVTUDQ2PDZrr
- (v8i32 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src1, sub_xmm)))), sub_xmm)>;
-}
-
let Predicates = [HasVLX] in {
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
+ def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDZ128rm addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
v2f64x_info.ImmAllZerosV)),
(VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v2f64 (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
+ def : Pat<(v2f64 (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTUDQ2PDZ128rm addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
VR128X:$src0)),
(VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
v2f64x_info.ImmAllZerosV)),
(VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
}
@@ -8398,7 +8345,7 @@ let Predicates = [HasVLX] in {
let Predicates = [HasDQI, HasVLX] in {
// Special patterns to allow use of X86VMSintToFP for masking. Instruction
// patterns have been disabled with null_frag.
- def : Pat<(v4f32 (X86VSintToFP (v2i64 VR128X:$src))),
+ def : Pat<(v4f32 (X86any_VSintToFP (v2i64 VR128X:$src))),
(VCVTQQ2PSZ128rr VR128X:$src)>;
def : Pat<(X86VMSintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0),
VK2WM:$mask),
@@ -8407,7 +8354,7 @@ let Predicates = [HasDQI, HasVLX] in {
VK2WM:$mask),
(VCVTQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
- def : Pat<(v4f32 (X86VSintToFP (loadv2i64 addr:$src))),
+ def : Pat<(v4f32 (X86any_VSintToFP (loadv2i64 addr:$src))),
(VCVTQQ2PSZ128rm addr:$src)>;
def : Pat<(X86VMSintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0),
VK2WM:$mask),
@@ -8416,7 +8363,7 @@ let Predicates = [HasDQI, HasVLX] in {
VK2WM:$mask),
(VCVTQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4f32 (X86VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
+ def : Pat<(v4f32 (X86any_VSintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
(VCVTQQ2PSZ128rmb addr:$src)>;
def : Pat<(X86VMSintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
(v4f32 VR128X:$src0), VK2WM:$mask),
@@ -8427,7 +8374,7 @@ let Predicates = [HasDQI, HasVLX] in {
// Special patterns to allow use of X86VMUintToFP for masking. Instruction
// patterns have been disabled with null_frag.
- def : Pat<(v4f32 (X86VUintToFP (v2i64 VR128X:$src))),
+ def : Pat<(v4f32 (X86any_VUintToFP (v2i64 VR128X:$src))),
(VCVTUQQ2PSZ128rr VR128X:$src)>;
def : Pat<(X86VMUintToFP (v2i64 VR128X:$src), (v4f32 VR128X:$src0),
VK2WM:$mask),
@@ -8436,7 +8383,7 @@ let Predicates = [HasDQI, HasVLX] in {
VK2WM:$mask),
(VCVTUQQ2PSZ128rrkz VK2WM:$mask, VR128X:$src)>;
- def : Pat<(v4f32 (X86VUintToFP (loadv2i64 addr:$src))),
+ def : Pat<(v4f32 (X86any_VUintToFP (loadv2i64 addr:$src))),
(VCVTUQQ2PSZ128rm addr:$src)>;
def : Pat<(X86VMUintToFP (loadv2i64 addr:$src), (v4f32 VR128X:$src0),
VK2WM:$mask),
@@ -8445,7 +8392,7 @@ let Predicates = [HasDQI, HasVLX] in {
VK2WM:$mask),
(VCVTUQQ2PSZ128rmkz VK2WM:$mask, addr:$src)>;
- def : Pat<(v4f32 (X86VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
+ def : Pat<(v4f32 (X86any_VUintToFP (v2i64 (X86VBroadcastld64 addr:$src)))),
(VCVTUQQ2PSZ128rmb addr:$src)>;
def : Pat<(X86VMUintToFP (v2i64 (X86VBroadcastld64 addr:$src)),
(v4f32 VR128X:$src0), VK2WM:$mask),
@@ -8455,72 +8402,11 @@ let Predicates = [HasDQI, HasVLX] in {
(VCVTUQQ2PSZ128rmbkz VK2WM:$mask, addr:$src)>;
}
-let Predicates = [HasDQI, NoVLX] in {
-def : Pat<(v2i64 (X86cvttp2si (v2f64 VR128X:$src1))),
- (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
- (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src1, sub_xmm)))), sub_xmm)>;
-
-def : Pat<(v4i64 (X86cvttp2si (v4f32 VR128X:$src1))),
- (EXTRACT_SUBREG (v8i64 (VCVTTPS2QQZrr
- (v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src1, sub_xmm)))), sub_ymm)>;
-
-def : Pat<(v4i64 (X86cvttp2si (v4f64 VR256X:$src1))),
- (EXTRACT_SUBREG (v8i64 (VCVTTPD2QQZrr
- (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR256X:$src1, sub_ymm)))), sub_ymm)>;
-
-def : Pat<(v2i64 (X86cvttp2ui (v2f64 VR128X:$src1))),
- (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
- (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src1, sub_xmm)))), sub_xmm)>;
-
-def : Pat<(v4i64 (X86cvttp2ui (v4f32 VR128X:$src1))),
- (EXTRACT_SUBREG (v8i64 (VCVTTPS2UQQZrr
- (v8f32 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src1, sub_xmm)))), sub_ymm)>;
-
-def : Pat<(v4i64 (X86cvttp2ui (v4f64 VR256X:$src1))),
- (EXTRACT_SUBREG (v8i64 (VCVTTPD2UQQZrr
- (v8f64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR256X:$src1, sub_ymm)))), sub_ymm)>;
-
-def : Pat<(v4f32 (sint_to_fp (v4i64 VR256X:$src1))),
- (EXTRACT_SUBREG (v8f32 (VCVTQQ2PSZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR256X:$src1, sub_ymm)))), sub_xmm)>;
-
-def : Pat<(v2f64 (sint_to_fp (v2i64 VR128X:$src1))),
- (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src1, sub_xmm)))), sub_xmm)>;
-
-def : Pat<(v4f64 (sint_to_fp (v4i64 VR256X:$src1))),
- (EXTRACT_SUBREG (v8f64 (VCVTQQ2PDZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR256X:$src1, sub_ymm)))), sub_ymm)>;
-
-def : Pat<(v4f32 (uint_to_fp (v4i64 VR256X:$src1))),
- (EXTRACT_SUBREG (v8f32 (VCVTUQQ2PSZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR256X:$src1, sub_ymm)))), sub_xmm)>;
-
-def : Pat<(v2f64 (uint_to_fp (v2i64 VR128X:$src1))),
- (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR128X:$src1, sub_xmm)))), sub_xmm)>;
-
-def : Pat<(v4f64 (uint_to_fp (v4i64 VR256X:$src1))),
- (EXTRACT_SUBREG (v8f64 (VCVTUQQ2PDZrr
- (v8i64 (INSERT_SUBREG (IMPLICIT_DEF),
- VR256X:$src1, sub_ymm)))), sub_ymm)>;
-}
-
//===----------------------------------------------------------------------===//
// Half precision conversion instructions
//===----------------------------------------------------------------------===//
+let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86MemOperand x86memop, PatFrag ld_frag,
X86FoldableSchedWrite sched> {
@@ -8537,6 +8423,7 @@ multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86FoldableSchedWrite sched> {
+ let Uses = [MXCSR] in
defm rrb : AVX512_maskable<0x13, MRMSrcReg, _dest, (outs _dest.RC:$dst),
(ins _src.RC:$src), "vcvtph2ps",
"{sae}, $src", "$src, {sae}",
@@ -8568,7 +8455,7 @@ let Predicates = [HasVLX] in {
multiclass avx512_cvtps2ph<X86VectorVTInfo _dest, X86VectorVTInfo _src,
X86MemOperand x86memop, SchedWrite RR, SchedWrite MR> {
-let ExeDomain = GenericDomain in {
+let ExeDomain = GenericDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
def rr : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -8605,7 +8492,7 @@ let ExeDomain = GenericDomain in {
multiclass avx512_cvtps2ph_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
SchedWrite Sched> {
- let hasSideEffects = 0 in
+ let hasSideEffects = 0, Uses = [MXCSR] in
defm rrb : AVX512_maskable_in_asm<0x1D, MRMDestReg, _dest,
(outs _dest.RC:$dst),
(ins _src.RC:$src1, i32u8imm:$src2),
@@ -8664,52 +8551,51 @@ let Predicates = [HasVLX] in {
// Unordered/Ordered scalar fp compare with Sae and set EFLAGS
multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
- string OpcodeStr, X86FoldableSchedWrite sched> {
- let hasSideEffects = 0 in
+ string OpcodeStr, Domain d,
+ X86FoldableSchedWrite sched = WriteFCom> {
+ let hasSideEffects = 0, Uses = [MXCSR] in
def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), []>,
EVEX, EVEX_B, VEX_LIG, EVEX_V128, Sched<[sched]>;
}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
- defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", WriteFCom>,
+ defm VUCOMISSZ : avx512_ord_cmp_sae<0x2E, v4f32x_info, "vucomiss", SSEPackedSingle>,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
- defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", WriteFCom>,
+ defm VUCOMISDZ : avx512_ord_cmp_sae<0x2E, v2f64x_info, "vucomisd", SSEPackedDouble>,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
- defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", WriteFCom>,
+ defm VCOMISSZ : avx512_ord_cmp_sae<0x2F, v4f32x_info, "vcomiss", SSEPackedSingle>,
AVX512PSIi8Base, EVEX_CD8<32, CD8VT1>;
- defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", WriteFCom>,
+ defm VCOMISDZ : avx512_ord_cmp_sae<0x2F, v2f64x_info, "vcomisd", SSEPackedDouble>,
AVX512PDIi8Base, VEX_W, EVEX_CD8<64, CD8VT1>;
}
let Defs = [EFLAGS], Predicates = [HasAVX512] in {
- defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86cmp, f32, f32mem, loadf32,
- "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG,
+ defm VUCOMISSZ : sse12_ord_cmp<0x2E, FR32X, X86any_fcmp, f32, f32mem, loadf32,
+ "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
- defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86cmp, f64, f64mem, loadf64,
- "ucomisd", WriteFCom>, PD, EVEX,
+ defm VUCOMISDZ : sse12_ord_cmp<0x2E, FR64X, X86any_fcmp, f64, f64mem, loadf64,
+ "ucomisd", SSEPackedDouble>, PD, EVEX,
+ VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
+ defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, X86strict_fcmps, f32, f32mem, loadf32,
+ "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
+ EVEX_CD8<32, CD8VT1>;
+ defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, X86strict_fcmps, f64, f64mem, loadf64,
+ "comisd", SSEPackedDouble>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
- let Pattern = []<dag> in {
- defm VCOMISSZ : sse12_ord_cmp<0x2F, FR32X, undef, f32, f32mem, loadf32,
- "comiss", WriteFCom>, PS, EVEX, VEX_LIG,
- EVEX_CD8<32, CD8VT1>;
- defm VCOMISDZ : sse12_ord_cmp<0x2F, FR64X, undef, f64, f64mem, loadf64,
- "comisd", WriteFCom>, PD, EVEX,
- VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
- }
let isCodeGenOnly = 1 in {
defm VUCOMISSZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", WriteFCom>, PS, EVEX, VEX_LIG,
+ sse_load_f32, "ucomiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VUCOMISDZ : sse12_ord_cmp_int<0x2E, VR128X, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", WriteFCom>, PD, EVEX,
+ sse_load_f64, "ucomisd", SSEPackedDouble>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
defm VCOMISSZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", WriteFCom>, PS, EVEX, VEX_LIG,
+ sse_load_f32, "comiss", SSEPackedSingle>, PS, EVEX, VEX_LIG,
EVEX_CD8<32, CD8VT1>;
defm VCOMISDZ : sse12_ord_cmp_int<0x2F, VR128X, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", WriteFCom>, PD, EVEX,
+ sse_load_f64, "comisd", SSEPackedDouble>, PD, EVEX,
VEX_LIG, VEX_W, EVEX_CD8<64, CD8VT1>;
}
}
@@ -8717,7 +8603,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
/// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
+ let Predicates = [HasAVX512], ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
@@ -8767,6 +8653,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
+let Uses = [MXCSR] in
multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched> {
defm PSZ : avx512_fp14_p<opc, !strconcat(OpcodeStr, "ps"), OpNode, sched.ZMM,
@@ -8798,12 +8685,12 @@ defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86rcp14, SchedWriteFRcp>;
multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
SDNode OpNode, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
- let ExeDomain = _.ExeDomain in {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), (_.VT _.RC:$src2))>,
- Sched<[sched]>;
+ Sched<[sched]>, SIMD_EXC;
defm rb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -8815,7 +8702,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
@@ -8840,7 +8727,7 @@ defm VGETEXP : avx512_eri_s<0x43, "vgetexp", X86fgetexps, X86fgetexpSAEs,
multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode, X86FoldableSchedWrite sched> {
- let ExeDomain = _.ExeDomain in {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
(OpNode (_.VT _.RC:$src))>,
@@ -8862,7 +8749,7 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
}
multiclass avx512_fp28_p_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
SDNode OpNode, X86FoldableSchedWrite sched> {
- let ExeDomain = _.ExeDomain in
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr,
"{sae}, $src", "$src, {sae}",
@@ -8923,25 +8810,26 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _>{
- let ExeDomain = _.ExeDomain in {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (fsqrt _.RC:$src))>, EVEX,
+ (_.VT (any_fsqrt _.RC:$src))>, EVEX,
Sched<[sched]>;
defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (fsqrt (_.VT
+ (any_fsqrt (_.VT
(bitconvert (_.LdFrag addr:$src))))>, EVEX,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
"${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (fsqrt (_.VT
+ (any_fsqrt (_.VT
(_.BroadcastLdFrag addr:$src)))>,
EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
+let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
defm PSZ : avx512_sqrt_packed<opc, !strconcat(OpcodeStr, "ps"),
@@ -8967,6 +8855,7 @@ multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
}
}
+let Uses = [MXCSR] in
multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
X86SchedWriteSizes sched> {
defm PSZ : avx512_sqrt_packed_round<opc, !strconcat(OpcodeStr, "ps"),
@@ -8985,13 +8874,14 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
"$src2, $src1", "$src1, $src2",
(X86fsqrts (_.VT _.RC:$src1),
(_.VT _.RC:$src2))>,
- Sched<[sched]>;
+ Sched<[sched]>, SIMD_EXC;
defm m_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(X86fsqrts (_.VT _.RC:$src1),
_.ScalarIntMemCPat:$src2)>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ let Uses = [MXCSR] in
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr,
"$rc, $src2, $src1", "$src1, $src2, $rc",
@@ -9004,23 +8894,23 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- Sched<[sched]>;
+ Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2),
OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
let Predicates = [HasAVX512] in {
- def : Pat<(_.EltVT (fsqrt _.FRC:$src)),
+ def : Pat<(_.EltVT (any_fsqrt _.FRC:$src)),
(!cast<Instruction>(Name#Zr)
(_.EltVT (IMPLICIT_DEF)), _.FRC:$src)>;
}
let Predicates = [HasAVX512, OptForSize] in {
- def : Pat<(_.EltVT (fsqrt (load addr:$src))),
+ def : Pat<(_.EltVT (any_fsqrt (load addr:$src))),
(!cast<Instruction>(Name#Zm)
(_.EltVT (IMPLICIT_DEF)), addr:$src)>;
}
@@ -9047,8 +8937,9 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales (_.VT _.RC:$src1), (_.VT _.RC:$src2),
(i32 timm:$src3)))>,
- Sched<[sched]>;
+ Sched<[sched]>, SIMD_EXC;
+ let Uses = [MXCSR] in
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3), OpcodeStr,
"$src3, {sae}, $src2, $src1", "$src1, $src2, {sae}, $src3",
@@ -9062,30 +8953,30 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales _.RC:$src1,
_.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in {
def r : I<opc, MRMSrcReg, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.FRC:$src2, i32u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[sched]>;
+ []>, Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
def m : I<opc, MRMSrcMem, (outs _.FRC:$dst),
(ins _.FRC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
- []>, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ []>, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
let Predicates = [HasAVX512] in {
- def : Pat<(X86VRndScale _.FRC:$src1, timm:$src2),
+ def : Pat<(X86any_VRndScale _.FRC:$src1, timm:$src2),
(_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
_.FRC:$src1, timm:$src2))>;
}
let Predicates = [HasAVX512, OptForSize] in {
- def : Pat<(X86VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2),
+ def : Pat<(X86any_VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2),
(_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
addr:$src1, timm:$src2))>;
}
@@ -9681,7 +9572,7 @@ defm : AVX512_pmovx_patterns<"VPMOVSX", sext, sext_invec>;
defm : AVX512_pmovx_patterns<"VPMOVZX", zext, zext_invec>;
// Without BWI we can't do a trunc from v16i16 to v16i8. DAG combine can merge
-// ext+trunc aggresively making it impossible to legalize the DAG to this
+// ext+trunc aggressively making it impossible to legalize the DAG to this
// pattern directly.
let Predicates = [HasAVX512, NoBWI] in {
def: Pat<(v16i8 (trunc (v16i16 VR256X:$src))),
@@ -10101,7 +9992,7 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", WriteVarShuffle256,
// all instructions created with FROUND_CURRENT
multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
@@ -10127,7 +10018,7 @@ multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNo
multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
@@ -10160,7 +10051,7 @@ multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
// all instructions created with FROUND_CURRENT
multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _>{
- let ExeDomain = _.ExeDomain in {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
@@ -10232,7 +10123,7 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
// op(reg_vec2,mem_scalar,imm)
multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in {
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rri : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
@@ -10254,7 +10145,7 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, {sae}, $src2, $src1",
@@ -10268,7 +10159,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
//handle scalar instruction reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
- let ExeDomain = _.ExeDomain in
+ let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, {sae}, $src2, $src1",
@@ -10350,7 +10241,7 @@ defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56
X86VReduce, X86VReduceSAE, SchedWriteFRnd, HasDQI>,
AVX512AIi8Base, EVEX;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
- X86VRndScale, X86VRndScaleSAE, SchedWriteFRnd, HasAVX512>,
+ X86any_VRndScale, X86VRndScaleSAE, SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
X86VGetMant, X86VGetMantSAE, SchedWriteFRnd, HasAVX512>,
@@ -10892,10 +10783,12 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load
// AVX-512 - Unpack Instructions
//===----------------------------------------------------------------------===//
+let Uses = []<Register>, mayRaiseFPException = 0 in {
defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
SchedWriteFShuffleSizes, 0, 1>;
defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
SchedWriteFShuffleSizes>;
+}
defm VPUNPCKLBW : avx512_binop_rm_vl_b<0x60, "vpunpcklbw", X86Unpckl,
SchedWriteShuffle, HasBWI>;
@@ -11587,7 +11480,8 @@ let Predicates = [HasVLX] in {
multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
X86VectorVTInfo TblVT>{
- let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+ let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
@@ -11619,7 +11513,7 @@ multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched,
X86VectorVTInfo _, X86VectorVTInfo TblVT>
: avx512_fixupimm_packed<opc, OpcodeStr, sched, _, TblVT> {
-let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
+let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
@@ -11643,7 +11537,8 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
(X86VFixupimms (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
- (i32 timm:$src4))>, Sched<[sched]>;
+ (i32 timm:$src4))>, Sched<[sched]>, SIMD_EXC;
+ let Uses = [MXCSR] in
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
@@ -11661,7 +11556,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
(_src3VT.VT (scalar_to_vector
(_src3VT.ScalarLdFrag addr:$src3))),
(i32 timm:$src4))>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
@@ -11978,6 +11873,7 @@ let Constraints = "$src1 = $dst" in
multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo VTI,
bit IsCommutable> {
+ let ExeDomain = VTI.ExeDomain in {
defm r : AVX512_maskable_3src<Op, MRMSrcReg, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.RC:$src3), OpStr,
"$src3, $src2", "$src2, $src3",
@@ -12000,6 +11896,7 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
T8PD, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ }
}
multiclass VNNI_common<bits<8> Op, string OpStr, SDNode OpNode,
@@ -12164,7 +12061,7 @@ defm VGF2P8AFFINEQB : GF2P8AFFINE_avx512_common<0xCE, "vgf2p8affineqb",
//===----------------------------------------------------------------------===//
let hasSideEffects = 0, mayLoad = 1, ExeDomain = SSEPackedSingle,
- Constraints = "$src1 = $dst" in {
+ Constraints = "$src1 = $dst", Uses = [MXCSR], mayRaiseFPException = 1 in {
defm V4FMADDPSrm : AVX512_maskable_3src_in_asm<0x9A, MRMSrcMem, v16f32_info,
(outs VR512:$dst), (ins VR512:$src2, f128mem:$src3),
"v4fmaddps", "$src3, $src2", "$src2, $src3",
@@ -12210,9 +12107,9 @@ defm VP4DPWSSDSrm : AVX512_maskable_3src_in_asm<0x53, MRMSrcMem, v16i32_info,
}
let hasSideEffects = 0 in {
- let mayStore = 1 in
+ let mayStore = 1, SchedRW = [WriteFStoreX] in
def MASKPAIR16STORE : PseudoI<(outs), (ins anymem:$dst, VK16PAIR:$src), []>;
- let mayLoad = 1 in
+ let mayLoad = 1, SchedRW = [WriteFLoadX] in
def MASKPAIR16LOAD : PseudoI<(outs VK16PAIR:$dst), (ins anymem:$src), []>;
}
@@ -12220,7 +12117,7 @@ let hasSideEffects = 0 in {
// VP2INTERSECT
//===----------------------------------------------------------------------===//
-multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> {
+multiclass avx512_vp2intersect_modes<X86FoldableSchedWrite sched, X86VectorVTInfo _> {
def rr : I<0x68, MRMSrcReg,
(outs _.KRPC:$dst),
(ins _.RC:$src1, _.RC:$src2),
@@ -12228,7 +12125,7 @@ multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRPC:$dst, (X86vp2intersect
_.RC:$src1, (_.VT _.RC:$src2)))]>,
- EVEX_4V, T8XD;
+ EVEX_4V, T8XD, Sched<[sched]>;
def rm : I<0x68, MRMSrcMem,
(outs _.KRPC:$dst),
@@ -12237,7 +12134,8 @@ multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> {
"\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.KRPC:$dst, (X86vp2intersect
_.RC:$src1, (_.VT (bitconvert (_.LdFrag addr:$src2)))))]>,
- EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>;
+ EVEX_4V, T8XD, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : I<0x68, MRMSrcMem,
(outs _.KRPC:$dst),
@@ -12246,21 +12144,22 @@ multiclass avx512_vp2intersect_modes<X86VectorVTInfo _> {
", $src1, $dst|$dst, $src1, ${src2}", _.BroadcastStr ,"}"),
[(set _.KRPC:$dst, (X86vp2intersect
_.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))]>,
- EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>;
+ EVEX_4V, T8XD, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
}
-multiclass avx512_vp2intersect<AVX512VLVectorVTInfo _> {
+multiclass avx512_vp2intersect<X86SchedWriteWidths sched, AVX512VLVectorVTInfo _> {
let Predicates = [HasAVX512, HasVP2INTERSECT] in
- defm Z : avx512_vp2intersect_modes<_.info512>, EVEX_V512;
+ defm Z : avx512_vp2intersect_modes<sched.ZMM, _.info512>, EVEX_V512;
let Predicates = [HasAVX512, HasVP2INTERSECT, HasVLX] in {
- defm Z256 : avx512_vp2intersect_modes<_.info256>, EVEX_V256;
- defm Z128 : avx512_vp2intersect_modes<_.info128>, EVEX_V128;
+ defm Z256 : avx512_vp2intersect_modes<sched.YMM, _.info256>, EVEX_V256;
+ defm Z128 : avx512_vp2intersect_modes<sched.XMM, _.info128>, EVEX_V128;
}
}
-defm VP2INTERSECTD : avx512_vp2intersect<avx512vl_i32_info>;
-defm VP2INTERSECTQ : avx512_vp2intersect<avx512vl_i64_info>, VEX_W;
+defm VP2INTERSECTD : avx512_vp2intersect<SchedWriteVecALU, avx512vl_i32_info>;
+defm VP2INTERSECTQ : avx512_vp2intersect<SchedWriteVecALU, avx512vl_i64_info>, VEX_W;
multiclass avx512_binop_all2<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched,
@@ -12293,17 +12192,19 @@ defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16",
// Truncate Float to BFloat16
multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
- let Predicates = [HasBF16] in {
+ let Predicates = [HasBF16], Uses = []<Register>, mayRaiseFPException = 0 in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i16x_info, v16f32_info,
X86cvtneps2bf16, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasBF16, HasVLX] in {
+ let Uses = []<Register>, mayRaiseFPException = 0 in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v4f32x_info,
null_frag, sched.XMM, "{1to4}", "{x}", f128mem,
VK4WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v8f32x_info,
X86cvtneps2bf16,
sched.YMM, "{1to8}", "{y}">, EVEX_V256;
+ }
def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
@@ -12358,19 +12259,21 @@ let Predicates = [HasBF16, HasVLX] in {
let Constraints = "$src1 = $dst" in {
multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ X86FoldableSchedWrite sched,
X86VectorVTInfo _, X86VectorVTInfo src_v> {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
- EVEX_4V;
+ EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
(_.VT (OpNode _.RC:$src1, _.RC:$src2,
(src_v.VT (bitconvert
- (src_v.LdFrag addr:$src3)))))>, EVEX_4V;
+ (src_v.LdFrag addr:$src3)))))>, EVEX_4V,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
@@ -12379,26 +12282,26 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
!strconcat("$src2, ${src3}", _.BroadcastStr),
(_.VT (OpNode _.RC:$src1, _.RC:$src2,
(src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>,
- EVEX_B, EVEX_4V;
+ EVEX_B, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
} // Constraints = "$src1 = $dst"
multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
- AVX512VLVectorVTInfo _,
+ X86SchedWriteWidths sched, AVX512VLVectorVTInfo _,
AVX512VLVectorVTInfo src_v, Predicate prd> {
let Predicates = [prd] in {
- defm Z : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info512,
+ defm Z : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.ZMM, _.info512,
src_v.info512>, EVEX_V512;
}
let Predicates = [HasVLX, prd] in {
- defm Z256 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info256,
+ defm Z256 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.YMM, _.info256,
src_v.info256>, EVEX_V256;
- defm Z128 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, _.info128,
+ defm Z128 : avx512_dpbf16ps_rm<opc, OpcodeStr, OpNode, sched.XMM, _.info128,
src_v.info128>, EVEX_V128;
}
}
-defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps,
+defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA,
avx512vl_f32_info, avx512vl_i32_info,
HasBF16>, T8XS, EVEX_CD8<32, CD8VF>;
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index e1e6eea59884..32faeb1a86f2 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -220,12 +220,12 @@ let isCall = 1 in
// registers are added manually.
let Uses = [ESP, SSP] in {
def CALLpcrel32 : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i32imm_pcrel:$dst),
+ (outs), (ins i32imm_brtarget:$dst),
"call{l}\t$dst", []>, OpSize32,
Requires<[Not64BitMode]>, Sched<[WriteJump]>;
let hasSideEffects = 0 in
def CALLpcrel16 : Ii16PCRel<0xE8, RawFrm,
- (outs), (ins i16imm_pcrel:$dst),
+ (outs), (ins i16imm_brtarget:$dst),
"call{w}\t$dst", []>, OpSize16,
Sched<[WriteJump]>;
def CALL16r : I<0xFF, MRM2r, (outs), (ins GR16:$dst),
@@ -285,7 +285,7 @@ let isCall = 1 in
// Tail call stuff.
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, Uses = [ESP, SSP] in {
- def TCRETURNdi : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$offset),
+ def TCRETURNdi : PseudoI<(outs), (ins i32imm_brtarget:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>, NotMemoryFoldable;
def TCRETURNri : PseudoI<(outs), (ins ptr_rc_tailcall:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>, NotMemoryFoldable;
@@ -293,7 +293,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
def TCRETURNmi : PseudoI<(outs), (ins i32mem_TC:$dst, i32imm:$offset),
[]>, Sched<[WriteJumpLd]>;
- def TAILJMPd : PseudoI<(outs), (ins i32imm_pcrel:$dst),
+ def TAILJMPd : PseudoI<(outs), (ins i32imm_brtarget:$dst),
[]>, Sched<[WriteJump]>;
def TAILJMPr : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
@@ -309,10 +309,11 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
isCodeGenOnly = 1, SchedRW = [WriteJump] in
let Uses = [ESP, EFLAGS, SSP] in {
def TCRETURNdicc : PseudoI<(outs),
- (ins i32imm_pcrel:$dst, i32imm:$offset, i32imm:$cond), []>;
+ (ins i32imm_brtarget:$dst, i32imm:$offset, i32imm:$cond),
+ []>;
// This gets substituted to a conditional jump instruction in MC lowering.
- def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_pcrel:$dst, i32imm:$cond), []>;
+ def TAILJMPd_CC : PseudoI<(outs), (ins i32imm_brtarget:$dst, i32imm:$cond), []>;
}
@@ -328,7 +329,7 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
// that the offset between an arbitrary immediate and the call will fit in
// the 32-bit pcrel field that we have.
def CALL64pcrel32 : Ii32PCRel<0xE8, RawFrm,
- (outs), (ins i64i32imm_pcrel:$dst),
+ (outs), (ins i64i32imm_brtarget:$dst),
"call{q}\t$dst", []>, OpSize32,
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
@@ -357,7 +358,7 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
isCodeGenOnly = 1, Uses = [RSP, SSP] in {
def TCRETURNdi64 : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$offset),
+ (ins i64i32imm_brtarget:$dst, i32imm:$offset),
[]>, Sched<[WriteJump]>;
def TCRETURNri64 : PseudoI<(outs),
(ins ptr_rc_tailcall:$dst, i32imm:$offset),
@@ -367,7 +368,7 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBarrier = 1,
(ins i64mem_TC:$dst, i32imm:$offset),
[]>, Sched<[WriteJumpLd]>, NotMemoryFoldable;
- def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_pcrel:$dst),
+ def TAILJMPd64 : PseudoI<(outs), (ins i64i32imm_brtarget:$dst),
[]>, Sched<[WriteJump]>;
def TAILJMPr64 : PseudoI<(outs), (ins ptr_rc_tailcall:$dst),
@@ -415,10 +416,10 @@ let isCall = 1, isTerminator = 1, isReturn = 1, isBranch = 1,
isCodeGenOnly = 1, SchedRW = [WriteJump] in
let Uses = [RSP, EFLAGS, SSP] in {
def TCRETURNdi64cc : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$offset,
+ (ins i64i32imm_brtarget:$dst, i32imm:$offset,
i32imm:$cond), []>;
// This gets substituted to a conditional jump instruction in MC lowering.
def TAILJMPd64_CC : PseudoI<(outs),
- (ins i64i32imm_pcrel:$dst, i32imm:$cond), []>;
+ (ins i64i32imm_brtarget:$dst, i32imm:$cond), []>;
}
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 0cca71bdc431..9e43a532a3f8 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -95,7 +95,8 @@ multiclass fma3p_rm_132<bits<8> opc, string OpcodeStr, RegisterClass RC,
Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
-let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1 in
+let Constraints = "$src1 = $dst", hasSideEffects = 0, isCommutable = 1,
+ Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpcodeStr, string PackTy, string Suff,
PatFrag MemFrag128, PatFrag MemFrag256,
@@ -122,7 +123,7 @@ multiclass fma3p_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// Fused Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmadd, v4f32, v8f32,
+ loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32,
SchedWriteFMA>;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32,
@@ -137,7 +138,7 @@ let ExeDomain = SSEPackedSingle in {
let ExeDomain = SSEPackedDouble in {
defm VFMADD : fma3p_forms<0x98, 0xA8, 0xB8, "vfmadd", "pd", "PD",
- loadv2f64, loadv4f64, X86Fmadd, v2f64,
+ loadv2f64, loadv4f64, X86any_Fmadd, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
loadv2f64, loadv4f64, X86Fmsub, v2f64,
@@ -237,7 +238,7 @@ multiclass fma3s_rm_132<bits<8> opc, string OpcodeStr,
}
let Constraints = "$src1 = $dst", isCommutable = 1, isCodeGenOnly = 1,
- hasSideEffects = 0 in
+ hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
string OpStr, string PackTy, string Suff,
SDNode OpNode, RegisterClass RC,
@@ -263,7 +264,8 @@ multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
// the lowest element of the FMA*_Int instruction. Even though such analysis
// may not be implemented yet, we allow the routines doing the actual commute
// transformation to decide if one or another instruction is commutable or not.
-let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0 in
+let Constraints = "$src1 = $dst", isCommutable = 1, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
Operand memopr, RegisterClass RC,
X86FoldableSchedWrite sched> {
@@ -317,7 +319,7 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
VR128, sdmem, sched>, VEX_W;
}
-defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86Fmadd,
+defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd,
SchedWriteFMA.Scl>, VEX_LIG;
defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub,
SchedWriteFMA.Scl>, VEX_LIG;
@@ -370,12 +372,12 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
}
}
-defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
@@ -384,6 +386,7 @@ defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR6
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
//===----------------------------------------------------------------------===//
+let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma4s<bits<8> opc, string OpcodeStr, RegisterClass RC,
X86MemOperand x86memop, ValueType OpVT, SDNode OpNode,
PatFrag mem_frag, X86FoldableSchedWrite sched> {
@@ -425,7 +428,8 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
multiclass fma4s_int<bits<8> opc, string OpcodeStr, Operand memop,
ValueType VT, X86FoldableSchedWrite sched> {
-let isCodeGenOnly = 1, hasSideEffects = 0 in {
+let isCodeGenOnly = 1, hasSideEffects = 0,
+ Uses = [MXCSR], mayRaiseFPException = 1 in {
def rr_Int : FMA4S_Int<opc, MRMSrcRegOp4, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src2, VR128:$src3),
!strconcat(OpcodeStr,
@@ -458,6 +462,7 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in {
} // isCodeGenOnly = 1
}
+let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass fma4p<bits<8> opc, string OpcodeStr, SDNode OpNode,
ValueType OpVT128, ValueType OpVT256,
PatFrag ld_frag128, PatFrag ld_frag256,
@@ -533,7 +538,7 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in {
let ExeDomain = SSEPackedSingle in {
// Scalar Instructions
- defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86Fmadd, loadf32,
+ defm VFMADDSS4 : fma4s<0x6A, "vfmaddss", FR32, f32mem, f32, X86any_Fmadd, loadf32,
SchedWriteFMA.Scl>,
fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
@@ -550,7 +555,7 @@ let ExeDomain = SSEPackedSingle in {
fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
// Packed Instructions
- defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86Fmadd, v4f32, v8f32,
+ defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86any_Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
@@ -566,7 +571,7 @@ let ExeDomain = SSEPackedSingle in {
let ExeDomain = SSEPackedDouble in {
// Scalar Instructions
- defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86Fmadd, loadf64,
+ defm VFMADDSD4 : fma4s<0x6B, "vfmaddsd", FR64, f64mem, f64, X86any_Fmadd, loadf64,
SchedWriteFMA.Scl>,
fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
@@ -583,7 +588,7 @@ let ExeDomain = SSEPackedDouble in {
fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
// Packed Instructions
- defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86Fmadd, v2f64, v4f64,
+ defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
@@ -624,12 +629,12 @@ multiclass scalar_fma4_patterns<SDNode Op, string Name,
}
}
-defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
index 2ec6d50f9702..1830262205c6 100644
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -29,7 +29,7 @@ def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
- [SDNPHasChain, SDNPInGlue, SDNPMayStore,
+ [SDNPHasChain, SDNPOptInGlue, SDNPMayStore,
SDNPMemOperand]>;
def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -37,7 +37,7 @@ def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
[SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
SDNPMemOperand]>;
def X86fist : SDNode<"X86ISD::FIST", SDTX86Fist,
- [SDNPHasChain, SDNPInGlue, SDNPMayStore,
+ [SDNPHasChain, SDNPOptInGlue, SDNPMayStore,
SDNPMemOperand]>;
def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst,
@@ -282,32 +282,32 @@ def _FI32m : FPI<0xDA, fp, (outs), (ins i32mem:$src),
!strconcat("fi", asmstring, "{l}\t$src")>;
}
-let Defs = [FPSW], Uses = [FPCW] in {
+let Uses = [FPCW], mayRaiseFPException = 1 in {
// FPBinary_rr just defines pseudo-instructions, no need to set scheduling
// resources.
let hasNoSchedulingInfo = 1 in {
-defm ADD : FPBinary_rr<fadd>;
-defm SUB : FPBinary_rr<fsub>;
-defm MUL : FPBinary_rr<fmul>;
-defm DIV : FPBinary_rr<fdiv>;
+defm ADD : FPBinary_rr<any_fadd>;
+defm SUB : FPBinary_rr<any_fsub>;
+defm MUL : FPBinary_rr<any_fmul>;
+defm DIV : FPBinary_rr<any_fdiv>;
}
// Sets the scheduling resources for the actual NAME#_F<size>m definitions.
let SchedRW = [WriteFAddLd] in {
-defm ADD : FPBinary<fadd, MRM0m, "add">;
-defm SUB : FPBinary<fsub, MRM4m, "sub">;
-defm SUBR: FPBinary<fsub ,MRM5m, "subr", 0>;
+defm ADD : FPBinary<any_fadd, MRM0m, "add">;
+defm SUB : FPBinary<any_fsub, MRM4m, "sub">;
+defm SUBR: FPBinary<any_fsub ,MRM5m, "subr", 0>;
}
let SchedRW = [WriteFMulLd] in {
-defm MUL : FPBinary<fmul, MRM1m, "mul">;
+defm MUL : FPBinary<any_fmul, MRM1m, "mul">;
}
let SchedRW = [WriteFDivLd] in {
-defm DIV : FPBinary<fdiv, MRM6m, "div">;
-defm DIVR: FPBinary<fdiv, MRM7m, "divr", 0>;
+defm DIV : FPBinary<any_fdiv, MRM6m, "div">;
+defm DIVR: FPBinary<any_fdiv, MRM7m, "divr", 0>;
}
-} // Defs = [FPSW]
+} // Uses = [FPCW], mayRaiseFPException = 1
class FPST0rInst<Format fp, string asm>
: FPI<0xD8, fp, (outs), (ins RSTi:$op), asm>;
@@ -319,7 +319,7 @@ class FPrST0PInst<Format fp, string asm>
// NOTE: GAS and apparently all other AT&T style assemblers have a broken notion
// of some of the 'reverse' forms of the fsub and fdiv instructions. As such,
// we have to put some 'r's in and take them out of weird places.
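// For illustration of the note above (under LLVM's usual convention that in a
// multi-variant asm string the text before '|' is the AT&T form and the text
// after it is the Intel form), a string such as
//     "fsub{|r}\t{%st, $op|$op, st}"    with $op = %st(2)
// prints as
//     AT&T output:  fsub %st, %st(2)
//     Intel output: fsubr st(2), st
// The 'r' that appears only in the Intel spelling is exactly the swap the
// note describes.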
-let SchedRW = [WriteFAdd], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFAdd], Uses = [FPCW], mayRaiseFPException = 1 in {
def ADD_FST0r : FPST0rInst <MRM0r, "fadd\t{$op, %st|st, $op}">;
def ADD_FrST0 : FPrST0Inst <MRM0r, "fadd\t{%st, $op|$op, st}">;
def ADD_FPrST0 : FPrST0PInst<MRM0r, "faddp\t{%st, $op|$op, st}">;
@@ -330,16 +330,16 @@ def SUB_FST0r : FPST0rInst <MRM4r, "fsub\t{$op, %st|st, $op}">;
def SUBR_FrST0 : FPrST0Inst <MRM4r, "fsub{|r}\t{%st, $op|$op, st}">;
def SUBR_FPrST0 : FPrST0PInst<MRM4r, "fsub{|r}p\t{%st, $op|$op, st}">;
} // SchedRW
-let SchedRW = [WriteFCom], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFCom], Uses = [FPCW], mayRaiseFPException = 1 in {
def COM_FST0r : FPST0rInst <MRM2r, "fcom\t$op">;
def COMP_FST0r : FPST0rInst <MRM3r, "fcomp\t$op">;
} // SchedRW
-let SchedRW = [WriteFMul], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFMul], Uses = [FPCW], mayRaiseFPException = 1 in {
def MUL_FST0r : FPST0rInst <MRM1r, "fmul\t{$op, %st|st, $op}">;
def MUL_FrST0 : FPrST0Inst <MRM1r, "fmul\t{%st, $op|$op, st}">;
def MUL_FPrST0 : FPrST0PInst<MRM1r, "fmulp\t{%st, $op|$op, st}">;
} // SchedRW
-let SchedRW = [WriteFDiv], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFDiv], Uses = [FPCW], mayRaiseFPException = 1 in {
def DIVR_FST0r : FPST0rInst <MRM7r, "fdivr\t{$op, %st|st, $op}">;
def DIV_FrST0 : FPrST0Inst <MRM7r, "fdiv{r}\t{%st, $op|$op, st}">;
def DIV_FPrST0 : FPrST0PInst<MRM7r, "fdiv{r}p\t{%st, $op|$op, st}">;
@@ -359,20 +359,14 @@ def _Fp80 : FpI_<(outs RFP80:$dst), (ins RFP80:$src), OneArgFPRW,
def _F : FPI<0xD9, fp, (outs), (ins), asmstring>;
}
-let Defs = [FPSW], Uses = [FPCW] in {
-
let SchedRW = [WriteFSign] in {
defm CHS : FPUnary<fneg, MRM_E0, "fchs">;
defm ABS : FPUnary<fabs, MRM_E1, "fabs">;
}
+let Uses = [FPCW], mayRaiseFPException = 1 in {
let SchedRW = [WriteFSqrt80] in
-defm SQRT: FPUnary<fsqrt,MRM_FA, "fsqrt">;
-
-let SchedRW = [WriteMicrocoded] in {
-defm SIN : FPUnary<fsin, MRM_FE, "fsin">;
-defm COS : FPUnary<fcos, MRM_FF, "fcos">;
-}
+defm SQRT: FPUnary<any_fsqrt,MRM_FA, "fsqrt">;
let SchedRW = [WriteFCom] in {
let hasSideEffects = 0 in {
@@ -383,11 +377,11 @@ def TST_Fp80 : FpI_<(outs), (ins RFP80:$src), OneArgFP, []>;
def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
} // SchedRW
-} // Defs = [FPSW]
+} // Uses = [FPCW], mayRaiseFPException = 1
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
-let SchedRW = [WriteFComLd], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFComLd], Uses = [FPCW], mayRaiseFPException = 1 in {
def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
@@ -402,14 +396,21 @@ def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
} // SchedRW
let SchedRW = [WriteMicrocoded] in {
+let Defs = [FPSW, FPCW] in {
def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
-def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">;
-
def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">;
+}
+
+let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW] in {
+def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">;
def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">;
+}
+
+let Uses = [FPSW] in
def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">;
def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
+let Uses = [FPCW] ,mayRaiseFPException = 1 in
def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
} // SchedRW
@@ -435,7 +436,6 @@ multiclass FPCMov<PatLeaf cc> {
Requires<[HasCMov]>;
}
-let Defs = [FPSW] in {
let SchedRW = [WriteFCMOV] in {
let Uses = [EFLAGS], Constraints = "$src1 = $dst" in {
defm CMOVB : FPCMov<X86_COND_B>;
@@ -469,6 +469,7 @@ def CMOVNP_F : FPI<0xDB, MRM3r, (outs), (ins RSTi:$op),
} // Predicates = [HasCMov]
} // SchedRW
+let mayRaiseFPException = 1 in {
// Floating point loads & stores.
let SchedRW = [WriteLoad], Uses = [FPCW] in {
let canFoldAsLoad = 1 in {
@@ -485,6 +486,7 @@ def LD_Fp64m80 : FpI_<(outs RFP80:$dst), (ins f64mem:$src), ZeroArgFP,
[(set RFP80:$dst, (f80 (extloadf64 addr:$src)))]>;
def LD_Fp32m80 : FpI_<(outs RFP80:$dst), (ins f32mem:$src), ZeroArgFP,
[(set RFP80:$dst, (f80 (extloadf32 addr:$src)))]>;
+let mayRaiseFPException = 0 in {
def ILD_Fp16m32: FpIf32<(outs RFP32:$dst), (ins i16mem:$src), ZeroArgFP,
[(set RFP32:$dst, (X86fild16 addr:$src))]>;
def ILD_Fp32m32: FpIf32<(outs RFP32:$dst), (ins i32mem:$src), ZeroArgFP,
@@ -503,6 +505,7 @@ def ILD_Fp32m80: FpI_<(outs RFP80:$dst), (ins i32mem:$src), ZeroArgFP,
[(set RFP80:$dst, (X86fild32 addr:$src))]>;
def ILD_Fp64m80: FpI_<(outs RFP80:$dst), (ins i64mem:$src), ZeroArgFP,
[(set RFP80:$dst, (X86fild64 addr:$src))]>;
+} // mayRaiseFPException = 0
} // SchedRW
let SchedRW = [WriteStore], Uses = [FPCW] in {
@@ -546,10 +549,12 @@ let mayLoad = 1, SchedRW = [WriteLoad], Uses = [FPCW] in {
def LD_F32m : FPI<0xD9, MRM0m, (outs), (ins f32mem:$src), "fld{s}\t$src">;
def LD_F64m : FPI<0xDD, MRM0m, (outs), (ins f64mem:$src), "fld{l}\t$src">;
def LD_F80m : FPI<0xDB, MRM5m, (outs), (ins f80mem:$src), "fld{t}\t$src">;
+let mayRaiseFPException = 0 in {
def ILD_F16m : FPI<0xDF, MRM0m, (outs), (ins i16mem:$src), "fild{s}\t$src">;
def ILD_F32m : FPI<0xDB, MRM0m, (outs), (ins i32mem:$src), "fild{l}\t$src">;
def ILD_F64m : FPI<0xDF, MRM5m, (outs), (ins i64mem:$src), "fild{ll}\t$src">;
}
+}
let mayStore = 1, SchedRW = [WriteStore], Uses = [FPCW] in {
def ST_F32m : FPI<0xD9, MRM2m, (outs), (ins f32mem:$dst), "fst{s}\t$dst">;
def ST_F64m : FPI<0xDD, MRM2m, (outs), (ins f64mem:$dst), "fst{l}\t$dst">;
@@ -621,7 +626,7 @@ def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">;
let SchedRW = [WriteFLD1], Uses = [FPCW] in
def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">;
-let SchedRW = [WriteFLDC], Uses = [FPCW] in {
+let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW] in {
def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>;
def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>;
def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>;
@@ -632,29 +637,44 @@ def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>;
// Floating point compares.
let SchedRW = [WriteFCom], Uses = [FPCW] in {
def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
- [(set FPSW, (trunc (X86cmp RFP32:$lhs, RFP32:$rhs)))]>;
+ [(set FPSW, (trunc (X86any_fcmp RFP32:$lhs, RFP32:$rhs)))]>;
def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
- [(set FPSW, (trunc (X86cmp RFP64:$lhs, RFP64:$rhs)))]>;
+ [(set FPSW, (trunc (X86any_fcmp RFP64:$lhs, RFP64:$rhs)))]>;
def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
- [(set FPSW, (trunc (X86cmp RFP80:$lhs, RFP80:$rhs)))]>;
+ [(set FPSW, (trunc (X86any_fcmp RFP80:$lhs, RFP80:$rhs)))]>;
+def COM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86strict_fcmps RFP32:$lhs, RFP32:$rhs)))]>;
+def COM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86strict_fcmps RFP64:$lhs, RFP64:$rhs)))]>;
+def COM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set FPSW, (trunc (X86strict_fcmps RFP80:$lhs, RFP80:$rhs)))]>;
} // SchedRW
-} // Defs = [FPSW]
+} // mayRaiseFPException = 1
-let SchedRW = [WriteFCom] in {
+let SchedRW = [WriteFCom], mayRaiseFPException = 1 in {
// CC = ST(0) cmp ST(i)
-let Defs = [EFLAGS, FPSW], Uses = [FPCW] in {
+let Defs = [EFLAGS, FPCW], Uses = [FPCW] in {
def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
- [(set EFLAGS, (X86cmp RFP32:$lhs, RFP32:$rhs))]>,
+ [(set EFLAGS, (X86any_fcmp RFP32:$lhs, RFP32:$rhs))]>,
Requires<[FPStackf32, HasCMov]>;
def UCOM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
- [(set EFLAGS, (X86cmp RFP64:$lhs, RFP64:$rhs))]>,
+ [(set EFLAGS, (X86any_fcmp RFP64:$lhs, RFP64:$rhs))]>,
Requires<[FPStackf64, HasCMov]>;
def UCOM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
- [(set EFLAGS, (X86cmp RFP80:$lhs, RFP80:$rhs))]>,
+ [(set EFLAGS, (X86any_fcmp RFP80:$lhs, RFP80:$rhs))]>,
+ Requires<[HasCMov]>;
+def COM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
+ [(set EFLAGS, (X86strict_fcmps RFP32:$lhs, RFP32:$rhs))]>,
+ Requires<[FPStackf32, HasCMov]>;
+def COM_FpIr64: FpI_<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
+ [(set EFLAGS, (X86strict_fcmps RFP64:$lhs, RFP64:$rhs))]>,
+ Requires<[FPStackf64, HasCMov]>;
+def COM_FpIr80: FpI_<(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
+ [(set EFLAGS, (X86strict_fcmps RFP80:$lhs, RFP80:$rhs))]>,
Requires<[HasCMov]>;
}
-let Defs = [FPSW], Uses = [ST0, FPCW] in {
+let Uses = [ST0, FPCW] in {
def UCOM_Fr : FPI<0xDD, MRM4r, // FPSW = cmp ST(0) with ST(i)
(outs), (ins RSTi:$reg), "fucom\t$reg">;
def UCOM_FPr : FPI<0xDD, MRM5r, // FPSW = cmp ST(0) with ST(i), pop
@@ -678,7 +698,7 @@ def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RSTi:$reg),
// Floating point flag ops.
let SchedRW = [WriteALU] in {
-let Defs = [AX], Uses = [FPSW] in
+let Defs = [AX, FPSW], Uses = [FPSW] in
def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
(outs), (ins), "fnstsw\t{%ax|ax}",
[(set AX, (X86fp_stsw FPSW))]>;
def FLDCW16m : I<0xD9, MRM5m, // X87 control word = [mem16]
// FPU control instructions
let SchedRW = [WriteMicrocoded] in {
-let Defs = [FPSW] in {
-def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>;
def FFREE : FPI<0xDD, MRM0r, (outs), (ins RSTi:$reg), "ffree\t$reg">;
def FFREEP : FPI<0xDF, MRM0r, (outs), (ins RSTi:$reg), "ffreep\t$reg">;
+let Defs = [FPSW, FPCW] in
+def FNINIT : I<0xDB, MRM_E3, (outs), (ins), "fninit", []>;
// Clear exceptions
+let Defs = [FPSW] in
def FNCLEX : I<0xDB, MRM_E2, (outs), (ins), "fnclex", []>;
-} // Defs = [FPSW]
} // SchedRW
// Operand-less floating-point instructions for the disassembler.
+let Defs = [FPSW] in
def FNOP : I<0xD9, MRM_D0, (outs), (ins), "fnop", []>, Sched<[WriteNop]>;
let SchedRW = [WriteMicrocoded] in {
let Defs = [FPSW] in {
def WAIT : I<0x9B, RawFrm, (outs), (ins), "wait", []>;
def FXAM : I<0xD9, MRM_E5, (outs), (ins), "fxam", []>;
+def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", []>;
+def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", []>;
+let Uses = [FPCW], mayRaiseFPException = 1 in {
def F2XM1 : I<0xD9, MRM_F0, (outs), (ins), "f2xm1", []>;
def FYL2X : I<0xD9, MRM_F1, (outs), (ins), "fyl2x", []>;
def FPTAN : I<0xD9, MRM_F2, (outs), (ins), "fptan", []>;
def FPATAN : I<0xD9, MRM_F3, (outs), (ins), "fpatan", []>;
def FXTRACT : I<0xD9, MRM_F4, (outs), (ins), "fxtract", []>;
def FPREM1 : I<0xD9, MRM_F5, (outs), (ins), "fprem1", []>;
-def FDECSTP : I<0xD9, MRM_F6, (outs), (ins), "fdecstp", []>;
-def FINCSTP : I<0xD9, MRM_F7, (outs), (ins), "fincstp", []>;
def FPREM : I<0xD9, MRM_F8, (outs), (ins), "fprem", []>;
def FYL2XP1 : I<0xD9, MRM_F9, (outs), (ins), "fyl2xp1", []>;
+def FSIN : I<0xD9, MRM_FE, (outs), (ins), "fsin", []>;
+def FCOS : I<0xD9, MRM_FF, (outs), (ins), "fcos", []>;
def FSINCOS : I<0xD9, MRM_FB, (outs), (ins), "fsincos", []>;
def FRNDINT : I<0xD9, MRM_FC, (outs), (ins), "frndint", []>;
def FSCALE : I<0xD9, MRM_FD, (outs), (ins), "fscale", []>;
def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>;
+} // Uses = [FPCW], mayRaiseFPException = 1
} // Defs = [FPSW]
+let Uses = [FPSW, FPCW] in {
def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
"fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB,
Requires<[HasFXSR]>;
def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
"fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>,
TB, Requires<[HasFXSR, In64BitMode]>;
+} // Uses = [FPSW, FPCW]
+
+let Defs = [FPSW, FPCW] in {
def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src),
"fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>,
TB, Requires<[HasFXSR]>;
def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
"fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>,
TB, Requires<[HasFXSR, In64BitMode]>;
+} // Defs = [FPSW, FPCW]
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -747,7 +777,10 @@ def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
// Required for RET of f32 / f64 / f80 values.
def : Pat<(X86fldf32 addr:$src), (LD_Fp32m addr:$src)>;
+def : Pat<(X86fldf32 addr:$src), (LD_Fp32m64 addr:$src)>;
def : Pat<(X86fldf64 addr:$src), (LD_Fp64m addr:$src)>;
+def : Pat<(X86fldf32 addr:$src), (LD_Fp32m80 addr:$src)>;
+def : Pat<(X86fldf64 addr:$src), (LD_Fp64m80 addr:$src)>;
def : Pat<(X86fldf80 addr:$src), (LD_Fp80m addr:$src)>;
// Required for CALLs which return f32 / f64 / f80 values.
@@ -775,19 +808,19 @@ def : Pat<(X86fist64 RFP80:$src, addr:$op), (IST_Fp64m80 addr:$op, RFP80:$src)>;
// FP extensions map onto simple pseudo-value conversions if they are to/from
// the FP stack.
-def : Pat<(f64 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
+def : Pat<(f64 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
Requires<[FPStackf32]>;
-def : Pat<(f80 (fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
+def : Pat<(f80 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP80)>,
Requires<[FPStackf32]>;
-def : Pat<(f80 (fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
+def : Pat<(f80 (any_fpextend RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP80)>,
Requires<[FPStackf64]>;
// FP truncations map onto simple pseudo-value conversions if they are to/from
// the FP stack. We have validated that only value-preserving truncations make
// it through isel.
-def : Pat<(f32 (fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
+def : Pat<(f32 (any_fpround RFP64:$src)), (COPY_TO_REGCLASS RFP64:$src, RFP32)>,
Requires<[FPStackf32]>;
-def : Pat<(f32 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
+def : Pat<(f32 (any_fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP32)>,
Requires<[FPStackf32]>;
-def : Pat<(f64 (fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
+def : Pat<(f64 (any_fpround RFP80:$src)), (COPY_TO_REGCLASS RFP80:$src, RFP64)>,
Requires<[FPStackf64]>;
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index e8f0d937dff4..2f797fcfb8de 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -227,6 +227,7 @@ class EVEX_V512 { bit hasEVEX_L2 = 1; bit hasVEX_L = 0; }
class EVEX_V256 { bit hasEVEX_L2 = 0; bit hasVEX_L = 1; }
class EVEX_V128 { bit hasEVEX_L2 = 0; bit hasVEX_L = 0; }
class NOTRACK { bit hasNoTrackPrefix = 1; }
+class SIMD_EXC { list<Register> Uses = [MXCSR]; bit mayRaiseFPException = 1; }
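// A minimal sketch of how the new SIMD_EXC mixin is used (the name, opcode and
// operands below are hypothetical, not part of this patch): appending SIMD_EXC
// to a def has roughly the same effect as wrapping it in
// "let Uses = [MXCSR], mayRaiseFPException = 1 in", the form used on whole
// multiclasses elsewhere in this change.
//
//   def FOOrr_sketch : I<0x00, MRMSrcReg, (outs VR128:$dst),
//                        (ins VR128:$src1, VR128:$src2),
//                        "foo\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
//                      Sched<[WriteFAdd]>, SIMD_EXC;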
// Specify AVX512 8-bit compressed displacement encoding based on the vector
// element size in bits (8, 16, 32, 64) and the CDisp8 form.
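// Worked example of the compressed-displacement scaling this comment refers to
// (the standard EVEX disp8*N rule): with the full-vector form (CD8VF) on a
// 512-bit memory operand, N is the vector width in bytes, i.e. 64, so an
// encoded 8-bit displacement of 2 stands for a byte offset of 2 * 64 = 128;
// in a {1to16} broadcast of 32-bit elements, N would instead be the element
// size, 4.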
@@ -441,12 +442,15 @@ class Ii32PCRel<bits<8> o, Format f, dag outs, dag ins, string asm,
// FPStack Instruction Templates:
// FPI - Floating Point Instruction template.
class FPI<bits<8> o, Format F, dag outs, dag ins, string asm>
- : I<o, F, outs, ins, asm, []> {}
+ : I<o, F, outs, ins, asm, []> {
+ let Defs = [FPSW];
+}
// FpI_ - Floating Point Pseudo Instruction template. Not Predicated.
class FpI_<dag outs, dag ins, FPFormat fp, list<dag> pattern>
: PseudoI<outs, ins, pattern> {
let FPForm = fp;
+ let Defs = [FPSW];
}
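// In other words (sketch with a hypothetical record name), any record derived
// from FPI or FpI_ now implicitly defines the x87 status word, which is why
// the explicit "let Defs = [FPSW] in" wrappers are dropped from
// X86InstrFPStack.td above:
//
//   def FBAR32m_sketch : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src),
//                            "fbar{s}\t$src">;   // carries Defs = [FPSW]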
// Templates for instructions that use a 16- or 32-bit segmented address as
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index de6f8a81dff6..3250123e5aa6 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -127,11 +127,32 @@ def X86vfpext : SDNode<"X86ISD::VFPEXT",
SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
SDTCVecEltisVT<1, f32>,
SDTCisSameSizeAs<0, 1>]>>;
+
+def X86strict_vfpext : SDNode<"X86ISD::STRICT_VFPEXT",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f64>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameSizeAs<0, 1>]>,
+ [SDNPHasChain]>;
+
+def X86any_vfpext : PatFrags<(ops node:$src),
+ [(X86strict_vfpext node:$src),
+ (X86vfpext node:$src)]>;
+
def X86vfpround: SDNode<"X86ISD::VFPROUND",
SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
SDTCVecEltisVT<1, f64>,
SDTCisOpSmallerThanOp<0, 1>]>>;
+def X86strict_vfpround: SDNode<"X86ISD::STRICT_VFPROUND",
+ SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, f64>,
+ SDTCisOpSmallerThanOp<0, 1>]>,
+ [SDNPHasChain]>;
+
+def X86any_vfpround : PatFrags<(ops node:$src),
+ [(X86strict_vfpround node:$src),
+ (X86vfpround node:$src)]>;
+
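+// The STRICT_* nodes carry a chain (SDNPHasChain) so constrained-FP operations
+// stay ordered, and each X86any_* PatFrags lets a single selection pattern
+// cover both the strict and the non-strict node. A minimal sketch of what that
+// buys (FOOrr is a hypothetical instruction, not part of this patch): the
+// single pattern
+//
+//   def : Pat<(v2f64 (X86any_vfpext (v4f32 VR128:$src))), (FOOrr VR128:$src)>;
+//
+// is expanded by TableGen as if both of these had been written:
+//
+//   def : Pat<(v2f64 (X86vfpext (v4f32 VR128:$src))), (FOOrr VR128:$src)>;
+//   def : Pat<(v2f64 (X86strict_vfpext (v4f32 VR128:$src))), (FOOrr VR128:$src)>;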
def X86frounds : SDNode<"X86ISD::VFPROUNDS",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, f32>,
SDTCisSameAs<0, 1>,
@@ -169,10 +190,15 @@ def X86vshiftimm : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
def X86vshldq : SDNode<"X86ISD::VSHLDQ", X86vshiftimm>;
def X86vshrdq : SDNode<"X86ISD::VSRLDQ", X86vshiftimm>;
-def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
def X86pcmpeq : SDNode<"X86ISD::PCMPEQ", SDTIntBinOp, [SDNPCommutative]>;
def X86pcmpgt : SDNode<"X86ISD::PCMPGT", SDTIntBinOp>;
+def X86cmpp : SDNode<"X86ISD::CMPP", SDTX86VFCMP>;
+def X86strict_cmpp : SDNode<"X86ISD::STRICT_CMPP", SDTX86VFCMP, [SDNPHasChain]>;
+def X86any_cmpp : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_cmpp node:$src1, node:$src2, node:$src3),
+ (X86cmpp node:$src1, node:$src2, node:$src3)]>;
+
def X86CmpMaskCC :
SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
SDTCisVec<1>, SDTCisSameAs<2, 1>,
@@ -182,6 +208,10 @@ def X86CmpMaskCCScalar :
SDTCisVT<3, i8>]>;
def X86cmpm : SDNode<"X86ISD::CMPM", X86CmpMaskCC>;
+def X86strict_cmpm : SDNode<"X86ISD::STRICT_CMPM", X86CmpMaskCC, [SDNPHasChain]>;
+def X86any_cmpm : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_cmpm node:$src1, node:$src2, node:$src3),
+ (X86cmpm node:$src1, node:$src2, node:$src3)]>;
def X86cmpmSAE : SDNode<"X86ISD::CMPM_SAE", X86CmpMaskCC>;
def X86cmpms : SDNode<"X86ISD::FSETCCM", X86CmpMaskCCScalar>;
def X86cmpmsSAE : SDNode<"X86ISD::FSETCCM_SAE", X86CmpMaskCCScalar>;
@@ -436,6 +466,12 @@ def X86VRangeSAE : SDNode<"X86ISD::VRANGE_SAE", SDTFPBinOpImm>;
def X86VReduce : SDNode<"X86ISD::VREDUCE", SDTFPUnaryOpImm>;
def X86VReduceSAE : SDNode<"X86ISD::VREDUCE_SAE", SDTFPUnaryOpImm>;
def X86VRndScale : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImm>;
+def X86strict_VRndScale : SDNode<"X86ISD::STRICT_VRNDSCALE", SDTFPUnaryOpImm,
+ [SDNPHasChain]>;
+def X86any_VRndScale : PatFrags<(ops node:$src1, node:$src2),
+ [(X86strict_VRndScale node:$src1, node:$src2),
+ (X86VRndScale node:$src1, node:$src2)]>;
+
def X86VRndScaleSAE: SDNode<"X86ISD::VRNDSCALE_SAE", SDTFPUnaryOpImm>;
def X86VGetMant : SDNode<"X86ISD::VGETMANT", SDTFPUnaryOpImm>;
def X86VGetMantSAE : SDNode<"X86ISD::VGETMANT_SAE", SDTFPUnaryOpImm>;
@@ -493,7 +529,11 @@ def X86fgetexpSAE : SDNode<"X86ISD::FGETEXP_SAE", SDTFPUnaryOp>;
def X86fgetexps : SDNode<"X86ISD::FGETEXPS", SDTFPBinOp>;
def X86fgetexpSAEs : SDNode<"X86ISD::FGETEXPS_SAE", SDTFPBinOp>;
-def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86Fmadd : SDNode<"ISD::FMA", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fmadd : SDNode<"ISD::STRICT_FMA", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fmadd node:$src1, node:$src2, node:$src3),
+ (X86Fmadd node:$src1, node:$src2, node:$src3)]>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
@@ -621,9 +661,26 @@ def X86cvtp2UIntRnd : SDNode<"X86ISD::CVTP2UI_RND", SDTFloatToIntRnd>;
// cvtt fp-to-int stuff
def X86cvttp2si : SDNode<"X86ISD::CVTTP2SI", SDTFloatToInt>;
def X86cvttp2ui : SDNode<"X86ISD::CVTTP2UI", SDTFloatToInt>;
+def X86strict_cvttp2si : SDNode<"X86ISD::STRICT_CVTTP2SI", SDTFloatToInt, [SDNPHasChain]>;
+def X86strict_cvttp2ui : SDNode<"X86ISD::STRICT_CVTTP2UI", SDTFloatToInt, [SDNPHasChain]>;
+def X86any_cvttp2si : PatFrags<(ops node:$src),
+ [(X86strict_cvttp2si node:$src),
+ (X86cvttp2si node:$src)]>;
+def X86any_cvttp2ui : PatFrags<(ops node:$src),
+ [(X86strict_cvttp2ui node:$src),
+ (X86cvttp2ui node:$src)]>;
def X86VSintToFP : SDNode<"X86ISD::CVTSI2P", SDTVintToFP>;
def X86VUintToFP : SDNode<"X86ISD::CVTUI2P", SDTVintToFP>;
+def X86strict_VSintToFP : SDNode<"X86ISD::STRICT_CVTSI2P", SDTVintToFP, [SDNPHasChain]>;
+def X86strict_VUintToFP : SDNode<"X86ISD::STRICT_CVTUI2P", SDTVintToFP, [SDNPHasChain]>;
+def X86any_VSintToFP : PatFrags<(ops node:$src),
+ [(X86strict_VSintToFP node:$src),
+ (X86VSintToFP node:$src)]>;
+def X86any_VUintToFP : PatFrags<(ops node:$src),
+ [(X86strict_VUintToFP node:$src),
+ (X86VUintToFP node:$src)]>;
+
// cvt int-to-fp stuff
def X86cvtp2Int : SDNode<"X86ISD::CVTP2SI", SDTFloatToInt>;
@@ -706,6 +763,10 @@ def X86GF2P8affineinvqb : SDNode<"X86ISD::GF2P8AFFINEINVQB", SDTBlend>;
def X86GF2P8affineqb : SDNode<"X86ISD::GF2P8AFFINEQB", SDTBlend>;
def X86GF2P8mulb : SDNode<"X86ISD::GF2P8MULB", SDTIntBinOp>;
+def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store
+ SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>, SDTCisSameNumEltsAs<0, 2>
+]>;
+
//===----------------------------------------------------------------------===//
// SSE Complex Patterns
//===----------------------------------------------------------------------===//
@@ -1040,9 +1101,10 @@ def vinsert256_insert : PatFrag<(ops node:$bigvec, node:$smallvec,
INSERT_get_vinsert256_imm>;
def masked_load : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_ld node:$src1, node:$src2, node:$src3), [{
+ (masked_ld node:$src1, undef, node:$src2, node:$src3), [{
return !cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
- cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD;
+ cast<MaskedLoadSDNode>(N)->getExtensionType() == ISD::NON_EXTLOAD &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
}]>;
def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1055,17 +1117,19 @@ def masked_load_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
}]>;
def X86mExpandingLoad : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_ld node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedLoadSDNode>(N)->isExpandingLoad();
+ (masked_ld node:$src1, undef, node:$src2, node:$src3), [{
+ return cast<MaskedLoadSDNode>(N)->isExpandingLoad() &&
+ cast<MaskedLoadSDNode>(N)->isUnindexed();
}]>;
// Masked store fragments.
// X86mstore can't be implemented in core DAG files because some targets
// do not support vector types (llvm-tblgen will fail).
def masked_store : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_st node:$src1, node:$src2, node:$src3), [{
- return (!cast<MaskedStoreSDNode>(N)->isTruncatingStore()) &&
- (!cast<MaskedStoreSDNode>(N)->isCompressingStore());
+ (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+ return !cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ !cast<MaskedStoreSDNode>(N)->isCompressingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
}]>;
def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1078,16 +1142,18 @@ def masked_store_aligned : PatFrag<(ops node:$src1, node:$src2, node:$src3),
}]>;
def X86mCompressingStore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_st node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedStoreSDNode>(N)->isCompressingStore();
+ (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isCompressingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
}]>;
// masked truncstore fragments
// X86mtruncstore can't be implemented in core DAG files because some targets
// do not support vector types (llvm-tblgen will fail)
def X86mtruncstore : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (masked_st node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedStoreSDNode>(N)->isTruncatingStore();
+ (masked_st node:$src1, node:$src2, undef, node:$src3), [{
+ return cast<MaskedStoreSDNode>(N)->isTruncatingStore() &&
+ cast<MaskedStoreSDNode>(N)->isUnindexed();
}]>;
def masked_truncstorevi8 :
PatFrag<(ops node:$src1, node:$src2, node:$src3),
@@ -1111,10 +1177,10 @@ def X86TruncSStore : SDNode<"X86ISD::VTRUNCSTORES", SDTStore,
def X86TruncUSStore : SDNode<"X86ISD::VTRUNCSTOREUS", SDTStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTMaskedStore,
+def X86MTruncSStore : SDNode<"X86ISD::VMTRUNCSTORES", SDTX86MaskedStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTMaskedStore,
+def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTX86MaskedStore,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index c29029daeec9..245346d82731 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -1761,10 +1761,11 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
case X86::VCMPPSZ128rrik:
case X86::VCMPPDZ256rrik:
case X86::VCMPPSZ256rrik: {
- unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x1f;
+ unsigned Imm =
+ MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f;
Imm = X86::getSwappedVCMPImm(Imm);
auto &WorkingMI = cloneIfNew(MI);
- WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm);
+ WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm);
return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
OpIdx1, OpIdx2);
}
@@ -2304,7 +2305,7 @@ unsigned X86::getCMovOpcode(unsigned RegBytes, bool HasMemoryOperand) {
default: llvm_unreachable("Illegal register size!");
case 2: return HasMemoryOperand ? X86::CMOV16rm : X86::CMOV16rr;
case 4: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV32rr;
- case 8: return HasMemoryOperand ? X86::CMOV32rm : X86::CMOV64rr;
+ case 8: return HasMemoryOperand ? X86::CMOV64rm : X86::CMOV64rr;
}
}
@@ -2963,8 +2964,8 @@ static unsigned CopyToFromAsymmetricReg(unsigned DestReg, unsigned SrcReg,
void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const DebugLoc &DL, unsigned DestReg,
- unsigned SrcReg, bool KillSrc) const {
+ const DebugLoc &DL, MCRegister DestReg,
+ MCRegister SrcReg, bool KillSrc) const {
// First deal with the normal symmetric copies.
bool HasAVX = Subtarget.hasAVX();
bool HasVLX = Subtarget.hasVLX();
@@ -3046,15 +3047,11 @@ void X86InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
report_fatal_error("Cannot emit physreg copy instruction");
}
-bool X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI,
- const MachineOperand *&Src,
- const MachineOperand *&Dest) const {
- if (MI.isMoveReg()) {
- Dest = &MI.getOperand(0);
- Src = &MI.getOperand(1);
- return true;
- }
- return false;
+Optional<DestSourcePair>
+X86InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
+ if (MI.isMoveReg())
+ return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
+ return None;
}
static unsigned getLoadStoreRegOpcode(unsigned Reg,
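The Optional<DestSourcePair> form of the copy hook lets a caller fold the "is this a copy?" test and the operand access into a single expression. A minimal caller sketch, assuming the generic TargetInstrInfo::isCopyInstr wrapper that forwards to this hook; the helper name getCopyRegs is illustrative only, not part of the change:
#include <utility>
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;
// Illustrative only: return the (destination, source) registers of a
// register-to-register move, or a pair of invalid registers otherwise.
static std::pair<Register, Register> getCopyRegs(const TargetInstrInfo &TII,
                                                 const MachineInstr &MI) {
  if (Optional<DestSourcePair> DestSrc = TII.isCopyInstr(MI))
    return {DestSrc->Destination->getReg(), DestSrc->Source->getReg()};
  return {Register(), Register()}; // not a simple register-to-register move
}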
@@ -3221,8 +3218,9 @@ bool X86InstrInfo::getMemOperandWithOffset(
Offset = DispMO.getImm();
- assert(BaseOp->isReg() && "getMemOperandWithOffset only supports base "
- "operands of type register.");
+ if (!BaseOp->isReg())
+ return false;
+
return true;
}
@@ -3963,9 +3961,7 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
MachineFunction &MF = *MBB.getParent();
const X86FrameLowering *TFL = Subtarget.getFrameLowering();
bool IsWin64Prologue = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
- bool NeedsDwarfCFI =
- !IsWin64Prologue &&
- (MF.getMMI().hasDebugInfo() || MF.getFunction().needsUnwindTableEntry());
+ bool NeedsDwarfCFI = !IsWin64Prologue && MF.needsFrameMoves();
bool EmitCFI = !TFL->hasFP(MF) && NeedsDwarfCFI;
if (EmitCFI) {
TFL->BuildCFI(MBB, I, DL,
@@ -4708,6 +4704,10 @@ static MachineInstr *FuseInst(MachineFunction &MF, unsigned Opcode,
updateOperandRegConstraints(MF, *NewMI, TII);
+ // Copy the NoFPExcept flag from the instruction we're fusing.
+ if (MI.getFlag(MachineInstr::MIFlag::NoFPExcept))
+ NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept);
+
MachineBasicBlock *MBB = InsertPt->getParent();
MBB->insert(InsertPt, NewMI);
@@ -7233,8 +7233,8 @@ bool X86InstrInfo::hasHighOperandLatency(const TargetSchedModel &SchedModel,
bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
const MachineBasicBlock *MBB) const {
- assert((Inst.getNumOperands() == 3 || Inst.getNumOperands() == 4) &&
- "Reassociation needs binary operators");
+ assert(Inst.getNumExplicitOperands() == 3 && Inst.getNumExplicitDefs() == 1 &&
+ Inst.getNumDefs() <= 2 && "Reassociation needs binary operators");
// Integer binary math/logic instructions have a third source operand:
// the EFLAGS register. That operand must be both defined here and never
@@ -7242,13 +7242,11 @@ bool X86InstrInfo::hasReassociableOperands(const MachineInstr &Inst,
// not change anything because rearranging the operands could affect other
// instructions that depend on the exact status flags (zero, sign, etc.)
// that are set by using these particular operands with this operation.
- if (Inst.getNumOperands() == 4) {
- assert(Inst.getOperand(3).isReg() &&
- Inst.getOperand(3).getReg() == X86::EFLAGS &&
- "Unexpected operand in reassociable instruction");
- if (!Inst.getOperand(3).isDead())
- return false;
- }
+ const MachineOperand *FlagDef = Inst.findRegisterDefOperand(X86::EFLAGS);
+ assert((Inst.getNumDefs() == 1 || FlagDef) &&
+ "Implicit def isn't flags?");
+ if (FlagDef && !FlagDef->isDead())
+ return false;
return TargetInstrInfo::hasReassociableOperands(Inst, MBB);
}
@@ -7558,15 +7556,57 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
}
}
+/// If \p DescribedReg overlaps with the MOVrr instruction's destination
+/// register then, if possible, describe the value in terms of the source
+/// register.
+static Optional<ParamLoadedValue>
+describeMOVrrLoadedValue(const MachineInstr &MI, Register DescribedReg,
+ const TargetRegisterInfo *TRI) {
+ Register DestReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ auto Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
+
+ // If the described register is the destination, just return the source.
+ if (DestReg == DescribedReg)
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
+
+ // If the described register is a sub-register of the destination register,
+ // then pick out the source register's corresponding sub-register.
+ if (unsigned SubRegIdx = TRI->getSubRegIndex(DestReg, DescribedReg)) {
+ unsigned SrcSubReg = TRI->getSubReg(SrcReg, SubRegIdx);
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcSubReg, false), Expr);
+ }
+
+ // The remaining case to consider is when the described register is a
+ // super-register of the destination register. MOV8rr and MOV16rr do not
+ // write to any of the other bytes in the register, meaning that we'd have to
+ // describe the value using a combination of the source register and the
+ // non-overlapping bits in the described register, which is not currently
+ // possible.
+ if (MI.getOpcode() == X86::MOV8rr || MI.getOpcode() == X86::MOV16rr ||
+ !TRI->isSuperRegister(DestReg, DescribedReg))
+ return None;
+
+ assert(MI.getOpcode() == X86::MOV32rr && "Unexpected super-register case");
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
+}
+
Optional<ParamLoadedValue>
-X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const {
+X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const {
const MachineOperand *Op = nullptr;
DIExpression *Expr = nullptr;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
switch (MI.getOpcode()) {
case X86::LEA32r:
case X86::LEA64r:
case X86::LEA64_32r: {
+ // We may need to describe a 64-bit parameter with a 32-bit LEA.
+ if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
+ return None;
+
// Operand 4 could be a global address. For now we do not support
// such a situation.
if (!MI.getOperand(4).isImm() || !MI.getOperand(2).isImm())
@@ -7574,7 +7614,6 @@ X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const {
const MachineOperand &Op1 = MI.getOperand(1);
const MachineOperand &Op2 = MI.getOperand(3);
- const TargetRegisterInfo *TRI = &getRegisterInfo();
assert(Op2.isReg() && (Op2.getReg() == X86::NoRegister ||
Register::isPhysicalRegister(Op2.getReg())));
@@ -7638,13 +7677,56 @@ X86InstrInfo::describeLoadedValue(const MachineInstr &MI) const {
return ParamLoadedValue(*Op, Expr);
}
+ case X86::MOV32ri:
+ case X86::MOV64ri:
+ case X86::MOV64ri32:
+ // MOV32ri may be used for producing zero-extended 32-bit immediates in
+ // 64-bit parameters, so we need to consider super-registers.
+ if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
+ return None;
+ return ParamLoadedValue(MI.getOperand(1), Expr);
+ case X86::MOV8rr:
+ case X86::MOV16rr:
+ case X86::MOV32rr:
+ case X86::MOV64rr:
+ return describeMOVrrLoadedValue(MI, Reg, TRI);
case X86::XOR32rr: {
+ // 64-bit parameters are zero-materialized using XOR32rr, so also consider
+ // super-registers.
+ if (!TRI->isSuperRegisterEq(MI.getOperand(0).getReg(), Reg))
+ return None;
if (MI.getOperand(1).getReg() == MI.getOperand(2).getReg())
return ParamLoadedValue(MachineOperand::CreateImm(0), Expr);
return None;
}
+ case X86::MOVSX64rr32: {
+ // We may need to describe the lower 32 bits of the MOVSX; for example, in
+ // cases like this:
+ //
+ // $ebx = [...]
+ // $rdi = MOVSX64rr32 $ebx
+ // $esi = MOV32rr $edi
+ if (!TRI->isSubRegisterEq(MI.getOperand(0).getReg(), Reg))
+ return None;
+
+ Expr = DIExpression::get(MI.getMF()->getFunction().getContext(), {});
+
+ // If the described register is the destination register, we need to
+ // sign-extend the source register from 32 bits. The other case we handle
+ // is when the described register is the 32-bit sub-register of the
+ // destination register, in which case we just need to return the source
+ // register.
+ if (Reg == MI.getOperand(0).getReg())
+ Expr = DIExpression::appendExt(Expr, 32, 64, true);
+ else
+ assert(X86MCRegisterClasses[X86::GR32RegClassID].contains(Reg) &&
+ "Unhandled sub-register case for MOVSX64rr32");
+
+ return ParamLoadedValue(MI.getOperand(1), Expr);
+ }
default:
- return TargetInstrInfo::describeLoadedValue(MI);
+ assert(!MI.isMoveImmediate() && "Unexpected MoveImm instruction");
+ return TargetInstrInfo::describeLoadedValue(MI, Reg);
}
}
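Threading the queried register through describeLoadedValue lets the caller ask about super- and sub-registers of the value an instruction defines, which the MOV32ri, MOVrr and MOVSX64rr32 cases above rely on. A minimal query sketch against the MOVSX64rr32 case, assuming the TargetInstrInfo interface shown in the X86InstrInfo.h hunk below; the helper name inspectLoadedValue and the concrete registers are illustrative only:
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;
// Illustrative only: for "$rdi = MOVSX64rr32 $ebx", querying RDI should yield
// the $ebx operand plus a 32-to-64 sign-extension expression, while querying
// EDI (the 32-bit sub-register of the destination) should yield $ebx with an
// empty expression.
static void inspectLoadedValue(const TargetInstrInfo &TII,
                               const MachineInstr &MovSX, Register Reg) {
  if (Optional<ParamLoadedValue> LV = TII.describeLoadedValue(MovSX, Reg)) {
    const MachineOperand &Loc = LV->first;  // the source register operand
    const DIExpression *Expr = LV->second;  // empty or sign-extension expression
    (void)Loc;
    (void)Expr;
  }
}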
@@ -7654,38 +7736,31 @@ void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
MachineInstr &OldMI2,
MachineInstr &NewMI1,
MachineInstr &NewMI2) const {
- // Integer instructions define an implicit EFLAGS source register operand as
- // the third source (fourth total) operand.
- if (OldMI1.getNumOperands() != 4 || OldMI2.getNumOperands() != 4)
- return;
+ // Integer instructions may define an implicit EFLAGS dest register operand.
+ MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS);
+ MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS);
- assert(NewMI1.getNumOperands() == 4 && NewMI2.getNumOperands() == 4 &&
+ assert(!OldFlagDef1 == !OldFlagDef2 &&
"Unexpected instruction type for reassociation");
- MachineOperand &OldOp1 = OldMI1.getOperand(3);
- MachineOperand &OldOp2 = OldMI2.getOperand(3);
- MachineOperand &NewOp1 = NewMI1.getOperand(3);
- MachineOperand &NewOp2 = NewMI2.getOperand(3);
+ if (!OldFlagDef1 || !OldFlagDef2)
+ return;
- assert(OldOp1.isReg() && OldOp1.getReg() == X86::EFLAGS && OldOp1.isDead() &&
- "Must have dead EFLAGS operand in reassociable instruction");
- assert(OldOp2.isReg() && OldOp2.getReg() == X86::EFLAGS && OldOp2.isDead() &&
+ assert(OldFlagDef1->isDead() && OldFlagDef2->isDead() &&
"Must have dead EFLAGS operand in reassociable instruction");
- (void)OldOp1;
- (void)OldOp2;
+ MachineOperand *NewFlagDef1 = NewMI1.findRegisterDefOperand(X86::EFLAGS);
+ MachineOperand *NewFlagDef2 = NewMI2.findRegisterDefOperand(X86::EFLAGS);
- assert(NewOp1.isReg() && NewOp1.getReg() == X86::EFLAGS &&
- "Unexpected operand in reassociable instruction");
- assert(NewOp2.isReg() && NewOp2.getReg() == X86::EFLAGS &&
+ assert(NewFlagDef1 && NewFlagDef2 &&
"Unexpected operand in reassociable instruction");
// Mark the new EFLAGS operands as dead to be helpful to subsequent iterations
// of this pass or other passes. The EFLAGS operands must be dead in these new
// instructions because the EFLAGS operands in the original instructions must
// be dead in order for reassociation to occur.
- NewOp1.setIsDead();
- NewOp2.setIsDead();
+ NewFlagDef1->setIsDead();
+ NewFlagDef2->setIsDead();
}
std::pair<unsigned, unsigned>
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 22b7b1d4cb19..1d2da5305357 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -312,7 +312,7 @@ public:
ArrayRef<MachineOperand> Cond, unsigned TrueReg,
unsigned FalseReg) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+ const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI, unsigned SrcReg,
@@ -522,8 +522,8 @@ public:
return MI.getDesc().TSFlags & X86II::LOCK;
}
- Optional<ParamLoadedValue>
- describeLoadedValue(const MachineInstr &MI) const override;
+ Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI,
+ Register Reg) const override;
protected:
/// Commutes the operands in the given instruction by changing the operands
@@ -542,10 +542,10 @@ protected:
unsigned CommuteOpIdx2) const override;
/// If the specific machine instruction is an instruction that moves/copies
- /// value from one register to another register return true along with
- /// @Source machine operand and @Destination machine operand.
- bool isCopyInstrImpl(const MachineInstr &MI, const MachineOperand *&Source,
- const MachineOperand *&Destination) const override;
+ /// value from one register to another register, return the destination and source
+ /// registers as machine operands.
+ Optional<DestSourcePair>
+ isCopyInstrImpl(const MachineInstr &MI) const override;
private:
/// This is a helper for convertToThreeAddress for 8 and 16-bit instructions.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index e452145f3b65..ca5425e8b89f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -142,6 +142,8 @@ def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
+def X86strict_fcmp : SDNode<"X86ISD::STRICT_FCMP", SDTX86CmpTest, [SDNPHasChain]>;
+def X86strict_fcmps : SDNode<"X86ISD::STRICT_FCMPS", SDTX86CmpTest, [SDNPHasChain]>;
def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
@@ -375,6 +377,9 @@ class X86VMemOperand<RegisterClass RC, string printMethod,
}
def anymem : X86MemOperand<"printanymem">;
+def X86any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
+ [(X86strict_fcmp node:$lhs, node:$rhs),
+ (X86cmp node:$lhs, node:$rhs)]>;
// FIXME: Right now we allow any size during parsing, but we might want to
// restrict to only unsized memory.
@@ -449,18 +454,6 @@ def i64mem_TC : Operand<i64> {
let OperandType = "OPERAND_MEMORY";
}
-let OperandType = "OPERAND_PCREL",
- ParserMatchClass = X86AbsMemAsmOperand,
- PrintMethod = "printPCRelImm" in {
-def i32imm_pcrel : Operand<i32>;
-def i16imm_pcrel : Operand<i16>;
-
-// Branch targets have OtherVT type and print as pc-relative values.
-def brtarget : Operand<OtherVT>;
-def brtarget8 : Operand<OtherVT>;
-
-}
-
// Special parser to detect 16-bit mode to select 16-bit displacement.
def X86AbsMem16AsmOperand : AsmOperandClass {
let Name = "AbsMem16";
@@ -468,15 +461,27 @@ def X86AbsMem16AsmOperand : AsmOperandClass {
let SuperClasses = [X86AbsMemAsmOperand];
}
-// Branch targets have OtherVT type and print as pc-relative values.
-let OperandType = "OPERAND_PCREL",
- PrintMethod = "printPCRelImm" in {
-let ParserMatchClass = X86AbsMem16AsmOperand in
- def brtarget16 : Operand<OtherVT>;
-let ParserMatchClass = X86AbsMemAsmOperand in
- def brtarget32 : Operand<OtherVT>;
+// Branch targets print as pc-relative values.
+class BranchTargetOperand<ValueType ty> : Operand<ty> {
+ let OperandType = "OPERAND_PCREL";
+ let PrintMethod = "printPCRelImm";
+ let ParserMatchClass = X86AbsMemAsmOperand;
}
+def i32imm_brtarget : BranchTargetOperand<i32>;
+def i16imm_brtarget : BranchTargetOperand<i16>;
+
+// 64-bits but only 32 bits are significant, and those bits are treated as being
+// pc relative.
+def i64i32imm_brtarget : BranchTargetOperand<i64>;
+
+def brtarget : BranchTargetOperand<OtherVT>;
+def brtarget8 : BranchTargetOperand<OtherVT>;
+def brtarget16 : BranchTargetOperand<OtherVT> {
+ let ParserMatchClass = X86AbsMem16AsmOperand;
+}
+def brtarget32 : BranchTargetOperand<OtherVT>;
+
let RenderMethod = "addSrcIdxOperands" in {
def X86SrcIdx8Operand : AsmOperandClass {
let Name = "SrcIdx8";
@@ -751,14 +756,6 @@ def i64u8imm : Operand<i64> {
let OperandType = "OPERAND_IMMEDIATE";
}
-// 64-bits but only 32 bits are significant, and those bits are treated as being
-// pc relative.
-def i64i32imm_pcrel : Operand<i64> {
- let PrintMethod = "printPCRelImm";
- let ParserMatchClass = X86AbsMemAsmOperand;
- let OperandType = "OPERAND_PCREL";
-}
-
def lea64_32mem : Operand<i32> {
let PrintMethod = "printanymem";
let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
@@ -983,12 +980,12 @@ def IsNotPIC : Predicate<"!TM.isPositionIndependent()">;
// the Function object through the <Target>Subtarget and objections were raised
// to that (see post-commit review comments for r301750).
let RecomputePerFunction = 1 in {
- def OptForSize : Predicate<"MF->getFunction().hasOptSize()">;
+ def OptForSize : Predicate<"shouldOptForSize(MF)">;
def OptForMinSize : Predicate<"MF->getFunction().hasMinSize()">;
- def OptForSpeed : Predicate<"!MF->getFunction().hasOptSize()">;
+ def OptForSpeed : Predicate<"!shouldOptForSize(MF)">;
def UseIncDec : Predicate<"!Subtarget->slowIncDec() || "
- "MF->getFunction().hasOptSize()">;
- def NoSSE41_Or_OptForSize : Predicate<"MF->getFunction().hasOptSize() || "
+ "shouldOptForSize(MF)">;
+ def NoSSE41_Or_OptForSize : Predicate<"shouldOptForSize(MF) || "
"!Subtarget->hasSSE41()">;
}
@@ -2846,7 +2843,7 @@ let SchedRW = [WriteStore], Defs = [EFLAGS] in {
//===----------------------------------------------------------------------===//
// CLZERO Instruction
//
-let SchedRW = [WriteSystem] in {
+let SchedRW = [WriteLoad] in {
let Uses = [EAX] in
def CLZERO32r : I<0x01, MRM_FC, (outs), (ins), "clzero", []>,
TB, Requires<[HasCLZERO, Not64BitMode]>;
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index cd9a866c91cb..0f4d4d764cc9 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -508,16 +508,16 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
// -- Conversion Instructions
defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
- WriteCvtPS2I, SSEPackedSingle>, PS;
+ WriteCvtPS2I, SSEPackedSingle>, PS, SIMD_EXC;
defm MMX_CVTPD2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtpd2pi,
f128mem, memop, "cvtpd2pi\t{$src, $dst|$dst, $src}",
- WriteCvtPD2I, SSEPackedDouble>, PD;
+ WriteCvtPD2I, SSEPackedDouble>, PD, SIMD_EXC;
defm MMX_CVTTPS2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttps2pi,
f64mem, load, "cvttps2pi\t{$src, $dst|$dst, $src}",
- WriteCvtPS2I, SSEPackedSingle>, PS;
+ WriteCvtPS2I, SSEPackedSingle>, PS, SIMD_EXC;
defm MMX_CVTTPD2PI : sse12_cvt_pint<0x2C, VR128, VR64, int_x86_sse_cvttpd2pi,
f128mem, memop, "cvttpd2pi\t{$src, $dst|$dst, $src}",
- WriteCvtPD2I, SSEPackedDouble>, PD;
+ WriteCvtPD2I, SSEPackedDouble>, PD, SIMD_EXC;
defm MMX_CVTPI2PD : sse12_cvt_pint<0x2A, VR64, VR128, int_x86_sse_cvtpi2pd,
i64mem, load, "cvtpi2pd\t{$src, $dst|$dst, $src}",
WriteCvtI2PD, SSEPackedDouble>, PD;
@@ -525,7 +525,7 @@ let Constraints = "$src1 = $dst" in {
defm MMX_CVTPI2PS : sse12_cvt_pint_3addr<0x2A, VR64, VR128,
int_x86_sse_cvtpi2ps,
i64mem, load, "cvtpi2ps\t{$src2, $dst|$dst, $src2}",
- SSEPackedSingle>, PS;
+ SSEPackedSingle>, PS, SIMD_EXC;
}
// Extract / Insert
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index 09a04c0338b4..c45f342ed75b 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -823,7 +823,9 @@ let Constraints = "$src1 = $dst" in {
multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
SDNode OpNode, X86MemOperand x86memop, PatFrag ld_frag,
string asm, string mem, X86FoldableSchedWrite sched,
+ Domain d,
SchedRead Int2Fpu = ReadDefault> {
+ let ExeDomain = d in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (OpNode SrcRC:$src))]>,
@@ -832,18 +834,19 @@ multiclass sse12_cvt_s<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
mem#"\t{$src, $dst|$dst, $src}",
[(set DstRC:$dst, (OpNode (ld_frag addr:$src)))]>,
Sched<[sched.Folded]>;
+ }
}
multiclass sse12_cvt_p<bits<8> opc, RegisterClass RC, X86MemOperand x86memop,
ValueType DstTy, ValueType SrcTy, PatFrag ld_frag,
string asm, Domain d, X86FoldableSchedWrite sched> {
-let hasSideEffects = 0 in {
+let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
def rr : I<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src), asm,
- [(set RC:$dst, (DstTy (sint_to_fp (SrcTy RC:$src))))], d>,
+ [(set RC:$dst, (DstTy (any_sint_to_fp (SrcTy RC:$src))))], d>,
Sched<[sched]>;
let mayLoad = 1 in
def rm : I<opc, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src), asm,
- [(set RC:$dst, (DstTy (sint_to_fp
+ [(set RC:$dst, (DstTy (any_sint_to_fp
(SrcTy (ld_frag addr:$src)))))], d>,
Sched<[sched.Folded]>;
}
@@ -851,8 +854,8 @@ let hasSideEffects = 0 in {
multiclass sse12_vcvt_avx<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
X86MemOperand x86memop, string asm, string mem,
- X86FoldableSchedWrite sched> {
-let hasSideEffects = 0, Predicates = [UseAVX] in {
+ X86FoldableSchedWrite sched, Domain d> {
+let hasSideEffects = 0, Predicates = [UseAVX], ExeDomain = d in {
def rr : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src),
!strconcat(asm,"\t{$src, $src1, $dst|$dst, $src1, $src}"), []>,
Sched<[sched, ReadDefault, ReadInt2Fpu]>;
@@ -864,22 +867,22 @@ let hasSideEffects = 0, Predicates = [UseAVX] in {
} // hasSideEffects = 0
}
-let isCodeGenOnly = 1, Predicates = [UseAVX] in {
-defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+let isCodeGenOnly = 1, Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
+defm VCVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
- WriteCvtSS2I>,
+ WriteCvtSS2I, SSEPackedSingle>,
XS, VEX, VEX_LIG;
-defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+defm VCVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
- WriteCvtSS2I>,
+ WriteCvtSS2I, SSEPackedSingle>,
XS, VEX, VEX_W, VEX_LIG;
-defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+defm VCVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
- WriteCvtSD2I>,
+ WriteCvtSD2I, SSEPackedDouble>,
XD, VEX, VEX_LIG;
-defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
- WriteCvtSD2I>,
+ WriteCvtSD2I, SSEPackedDouble>,
XD, VEX, VEX_W, VEX_LIG;
}
@@ -889,60 +892,64 @@ defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
// where appropriate to do so.
let isCodeGenOnly = 1 in {
defm VCVTSI2SS : sse12_vcvt_avx<0x2A, GR32, FR32, i32mem, "cvtsi2ss", "l",
- WriteCvtI2SS>, XS, VEX_4V, VEX_LIG;
+ WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
+ VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_vcvt_avx<0x2A, GR64, FR32, i64mem, "cvtsi2ss", "q",
- WriteCvtI2SS>, XS, VEX_4V, VEX_W, VEX_LIG;
+ WriteCvtI2SS, SSEPackedSingle>, XS, VEX_4V,
+ VEX_W, VEX_LIG, SIMD_EXC;
defm VCVTSI2SD : sse12_vcvt_avx<0x2A, GR32, FR64, i32mem, "cvtsi2sd", "l",
- WriteCvtI2SD>, XD, VEX_4V, VEX_LIG;
+ WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
+ VEX_LIG;
defm VCVTSI642SD : sse12_vcvt_avx<0x2A, GR64, FR64, i64mem, "cvtsi2sd", "q",
- WriteCvtI2SD>, XD, VEX_4V, VEX_W, VEX_LIG;
+ WriteCvtI2SD, SSEPackedDouble>, XD, VEX_4V,
+ VEX_W, VEX_LIG, SIMD_EXC;
} // isCodeGenOnly = 1
let Predicates = [UseAVX] in {
- def : Pat<(f32 (sint_to_fp (loadi32 addr:$src))),
+ def : Pat<(f32 (any_sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
- def : Pat<(f32 (sint_to_fp (loadi64 addr:$src))),
+ def : Pat<(f32 (any_sint_to_fp (loadi64 addr:$src))),
(VCVTSI642SSrm (f32 (IMPLICIT_DEF)), addr:$src)>;
- def : Pat<(f64 (sint_to_fp (loadi32 addr:$src))),
+ def : Pat<(f64 (any_sint_to_fp (loadi32 addr:$src))),
(VCVTSI2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
- def : Pat<(f64 (sint_to_fp (loadi64 addr:$src))),
+ def : Pat<(f64 (any_sint_to_fp (loadi64 addr:$src))),
(VCVTSI642SDrm (f64 (IMPLICIT_DEF)), addr:$src)>;
- def : Pat<(f32 (sint_to_fp GR32:$src)),
+ def : Pat<(f32 (any_sint_to_fp GR32:$src)),
(VCVTSI2SSrr (f32 (IMPLICIT_DEF)), GR32:$src)>;
- def : Pat<(f32 (sint_to_fp GR64:$src)),
+ def : Pat<(f32 (any_sint_to_fp GR64:$src)),
(VCVTSI642SSrr (f32 (IMPLICIT_DEF)), GR64:$src)>;
- def : Pat<(f64 (sint_to_fp GR32:$src)),
+ def : Pat<(f64 (any_sint_to_fp GR32:$src)),
(VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
- def : Pat<(f64 (sint_to_fp GR64:$src)),
+ def : Pat<(f64 (any_sint_to_fp GR64:$src)),
(VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
}
let isCodeGenOnly = 1 in {
-defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, fp_to_sint, f32mem, loadf32,
+defm CVTTSS2SI : sse12_cvt_s<0x2C, FR32, GR32, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
- WriteCvtSS2I>, XS;
-defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, fp_to_sint, f32mem, loadf32,
+ WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
+defm CVTTSS2SI64 : sse12_cvt_s<0x2C, FR32, GR64, any_fp_to_sint, f32mem, loadf32,
"cvttss2si", "cvttss2si",
- WriteCvtSS2I>, XS, REX_W;
-defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, fp_to_sint, f64mem, loadf64,
+ WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
+defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
- WriteCvtSD2I>, XD;
-defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, fp_to_sint, f64mem, loadf64,
+ WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
+defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
- WriteCvtSD2I>, XD, REX_W;
-defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, sint_to_fp, i32mem, loadi32,
+ WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
+defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
"cvtsi2ss", "cvtsi2ss{l}",
- WriteCvtI2SS, ReadInt2Fpu>, XS;
-defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, sint_to_fp, i64mem, loadi64,
+ WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
+defm CVTSI642SS : sse12_cvt_s<0x2A, GR64, FR32, any_sint_to_fp, i64mem, loadi64,
"cvtsi2ss", "cvtsi2ss{q}",
- WriteCvtI2SS, ReadInt2Fpu>, XS, REX_W;
-defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, sint_to_fp, i32mem, loadi32,
+ WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, REX_W, SIMD_EXC;
+defm CVTSI2SD : sse12_cvt_s<0x2A, GR32, FR64, any_sint_to_fp, i32mem, loadi32,
"cvtsi2sd", "cvtsi2sd{l}",
- WriteCvtI2SD, ReadInt2Fpu>, XD;
-defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
+ WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD;
+defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
"cvtsi2sd", "cvtsi2sd{q}",
- WriteCvtI2SD, ReadInt2Fpu>, XD, REX_W;
+ WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
@@ -951,7 +958,8 @@ defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, sint_to_fp, i64mem, loadi64,
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
ValueType DstVT, ValueType SrcVT, SDNode OpNode,
Operand memop, ComplexPattern mem_cpat, string asm,
- X86FoldableSchedWrite sched> {
+ X86FoldableSchedWrite sched, Domain d> {
+let ExeDomain = d in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
[(set DstRC:$dst, (DstVT (OpNode (SrcVT SrcRC:$src))))]>,
@@ -961,12 +969,13 @@ multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
[(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
Sched<[sched.Folded]>;
}
+}
multiclass sse12_cvt_sint_3addr<bits<8> opc, RegisterClass SrcRC,
RegisterClass DstRC, X86MemOperand x86memop,
string asm, string mem, X86FoldableSchedWrite sched,
- bit Is2Addr = 1> {
-let hasSideEffects = 0 in {
+ Domain d, bit Is2Addr = 1> {
+let hasSideEffects = 0, ExeDomain = d in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins DstRC:$src1, SrcRC:$src2),
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
@@ -982,39 +991,50 @@ let hasSideEffects = 0 in {
}
}
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [UseAVX] in {
defm VCVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64,
X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
- WriteCvtSD2I>, XD, VEX, VEX_LIG;
+ WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64,
X86cvts2si, sdmem, sse_load_f64, "cvtsd2si",
- WriteCvtSD2I>, XD, VEX, VEX_W, VEX_LIG;
+ WriteCvtSD2I, SSEPackedDouble>, XD, VEX, VEX_W, VEX_LIG;
}
defm CVTSD2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v2f64, X86cvts2si,
- sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD;
+ sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
+ SSEPackedDouble>, XD;
defm CVTSD2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v2f64, X86cvts2si,
- sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I>, XD, REX_W;
-
+ sdmem, sse_load_f64, "cvtsd2si", WriteCvtSD2I,
+ SSEPackedDouble>, XD, REX_W;
+}
let Predicates = [UseAVX] in {
defm VCVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2ss", "l", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG;
+ i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle, 0>,
+ XS, VEX_4V, VEX_LIG, SIMD_EXC;
defm VCVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2ss", "q", WriteCvtI2SS, 0>, XS, VEX_4V, VEX_LIG, VEX_W;
+ i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle, 0>,
+ XS, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
defm VCVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2sd", "l", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG;
+ i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble, 0>,
+ XD, VEX_4V, VEX_LIG;
defm VCVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2sd", "q", WriteCvtI2SD, 0>, XD, VEX_4V, VEX_LIG, VEX_W;
+ i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble, 0>,
+ XD, VEX_4V, VEX_LIG, VEX_W, SIMD_EXC;
}
let Constraints = "$src1 = $dst" in {
defm CVTSI2SS : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2ss", "l", WriteCvtI2SS>, XS;
+ i32mem, "cvtsi2ss", "l", WriteCvtI2SS, SSEPackedSingle>,
+ XS, SIMD_EXC;
defm CVTSI642SS : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2ss", "q", WriteCvtI2SS>, XS, REX_W;
+ i64mem, "cvtsi2ss", "q", WriteCvtI2SS, SSEPackedSingle>,
+ XS, REX_W, SIMD_EXC;
defm CVTSI2SD : sse12_cvt_sint_3addr<0x2A, GR32, VR128,
- i32mem, "cvtsi2sd", "l", WriteCvtI2SD>, XD;
+ i32mem, "cvtsi2sd", "l", WriteCvtI2SD, SSEPackedDouble>,
+ XD;
defm CVTSI642SD : sse12_cvt_sint_3addr<0x2A, GR64, VR128,
- i64mem, "cvtsi2sd", "q", WriteCvtI2SD>, XD, REX_W;
+ i64mem, "cvtsi2sd", "q", WriteCvtI2SD, SSEPackedDouble>,
+ XD, REX_W, SIMD_EXC;
}
def : InstAlias<"vcvtsi2ss{l}\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1048,34 +1068,38 @@ def : InstAlias<"cvtsi2sd\t{$src, $dst|$dst, $src}",
/// SSE 1 Only
// Aliases for intrinsics
-let Predicates = [UseAVX] in {
+let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
ssmem, sse_load_f32, "cvttss2si",
- WriteCvtSS2I>, XS, VEX, VEX_LIG;
+ WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
X86cvtts2Int, ssmem, sse_load_f32,
- "cvttss2si", WriteCvtSS2I>,
+ "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
XS, VEX, VEX_LIG, VEX_W;
defm VCVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
sdmem, sse_load_f64, "cvttsd2si",
- WriteCvtSS2I>, XD, VEX, VEX_LIG;
+ WriteCvtSS2I, SSEPackedDouble>, XD, VEX, VEX_LIG;
defm VCVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
X86cvtts2Int, sdmem, sse_load_f64,
- "cvttsd2si", WriteCvtSS2I>,
+ "cvttsd2si", WriteCvtSS2I, SSEPackedDouble>,
XD, VEX, VEX_LIG, VEX_W;
}
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTTSS2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v4f32, X86cvtts2Int,
ssmem, sse_load_f32, "cvttss2si",
- WriteCvtSS2I>, XS;
+ WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTTSS2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v4f32,
X86cvtts2Int, ssmem, sse_load_f32,
- "cvttss2si", WriteCvtSS2I>, XS, REX_W;
+ "cvttss2si", WriteCvtSS2I, SSEPackedSingle>,
+ XS, REX_W;
defm CVTTSD2SI : sse12_cvt_sint<0x2C, VR128, GR32, i32, v2f64, X86cvtts2Int,
sdmem, sse_load_f64, "cvttsd2si",
- WriteCvtSD2I>, XD;
+ WriteCvtSD2I, SSEPackedDouble>, XD;
defm CVTTSD2SI64 : sse12_cvt_sint<0x2C, VR128, GR64, i64, v2f64,
X86cvtts2Int, sdmem, sse_load_f64,
- "cvttsd2si", WriteCvtSD2I>, XD, REX_W;
+ "cvttsd2si", WriteCvtSD2I, SSEPackedDouble>,
+ XD, REX_W;
+}
def : InstAlias<"vcvttss2si{l}\t{$src, $dst|$dst, $src}",
(VCVTTSS2SIrr_Int GR32:$dst, VR128:$src), 0, "att">;
@@ -1111,20 +1135,21 @@ def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
def : InstAlias<"cvttsd2si{q}\t{$src, $dst|$dst, $src}",
(CVTTSD2SI64rm_Int GR64:$dst, f64mem:$src), 0, "att">;
-let Predicates = [UseAVX] in {
+let Predicates = [UseAVX], Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VCVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
- WriteCvtSS2I>, XS, VEX, VEX_LIG;
+ WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_LIG;
defm VCVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
- WriteCvtSS2I>, XS, VEX, VEX_W, VEX_LIG;
+ WriteCvtSS2I, SSEPackedSingle>, XS, VEX, VEX_W, VEX_LIG;
}
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm CVTSS2SI : sse12_cvt_sint<0x2D, VR128, GR32, i32, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
- WriteCvtSS2I>, XS;
+ WriteCvtSS2I, SSEPackedSingle>, XS;
defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, i64, v4f32, X86cvts2si,
ssmem, sse_load_f32, "cvtss2si",
- WriteCvtSS2I>, XS, REX_W;
+ WriteCvtSS2I, SSEPackedSingle>, XS, REX_W;
defm VCVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, load,
"vcvtdq2ps\t{$src, $dst|$dst, $src}",
@@ -1139,6 +1164,7 @@ defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memop,
"cvtdq2ps\t{$src, $dst|$dst, $src}",
SSEPackedSingle, WriteCvtI2PS>,
PS, Requires<[UseSSE2]>;
+}
// AVX aliases
def : InstAlias<"vcvtss2si{l}\t{$src, $dst|$dst, $src}",
@@ -1184,31 +1210,32 @@ def VCVTSD2SSrr : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
(ins FR32:$src1, FR64:$src2),
"cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
VEX_4V, VEX_LIG, VEX_WIG,
- Sched<[WriteCvtSD2SS]>;
+ Sched<[WriteCvtSD2SS]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst),
(ins FR32:$src1, f64mem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
XD, VEX_4V, VEX_LIG, VEX_WIG,
- Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
+ Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>, SIMD_EXC;
}
-def : Pat<(f32 (fpround FR64:$src)),
+def : Pat<(f32 (any_fpround FR64:$src)),
(VCVTSD2SSrr (f32 (IMPLICIT_DEF)), FR64:$src)>,
Requires<[UseAVX]>;
let isCodeGenOnly = 1 in {
def CVTSD2SSrr : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fpround FR64:$src))]>,
- Sched<[WriteCvtSD2SS]>;
+ [(set FR32:$dst, (any_fpround FR64:$src))]>,
+ Sched<[WriteCvtSD2SS]>, SIMD_EXC;
def CVTSD2SSrm : I<0x5A, MRMSrcMem, (outs FR32:$dst), (ins f64mem:$src),
"cvtsd2ss\t{$src, $dst|$dst, $src}",
- [(set FR32:$dst, (fpround (loadf64 addr:$src)))]>,
+ [(set FR32:$dst, (any_fpround (loadf64 addr:$src)))]>,
XD, Requires<[UseSSE2, OptForSize]>,
- Sched<[WriteCvtSD2SS.Folded]>;
+ Sched<[WriteCvtSD2SS.Folded]>, SIMD_EXC;
}
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
def VCVTSD2SSrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1238,6 +1265,7 @@ def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
XD, Requires<[UseSSE2]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
+}
// Convert scalar single to scalar double
// SSE2 instructions with XS prefix
@@ -1246,34 +1274,34 @@ def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
(ins FR64:$src1, FR32:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
XS, VEX_4V, VEX_LIG, VEX_WIG,
- Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>;
+ Sched<[WriteCvtSS2SD]>, Requires<[UseAVX]>, SIMD_EXC;
let mayLoad = 1 in
def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
(ins FR64:$src1, f32mem:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
XS, VEX_4V, VEX_LIG, VEX_WIG,
Sched<[WriteCvtSS2SD.Folded, WriteCvtSS2SD.ReadAfterFold]>,
- Requires<[UseAVX, OptForSize]>;
+ Requires<[UseAVX, OptForSize]>, SIMD_EXC;
} // isCodeGenOnly = 1, hasSideEffects = 0
-def : Pat<(f64 (fpextend FR32:$src)),
+def : Pat<(f64 (any_fpextend FR32:$src)),
(VCVTSS2SDrr (f64 (IMPLICIT_DEF)), FR32:$src)>, Requires<[UseAVX]>;
-def : Pat<(fpextend (loadf32 addr:$src)),
+def : Pat<(any_fpextend (loadf32 addr:$src)),
(VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX, OptForSize]>;
let isCodeGenOnly = 1 in {
def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (fpextend FR32:$src))]>,
- XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>;
+ [(set FR64:$dst, (any_fpextend FR32:$src))]>,
+ XS, Requires<[UseSSE2]>, Sched<[WriteCvtSS2SD]>, SIMD_EXC;
def CVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst), (ins f32mem:$src),
"cvtss2sd\t{$src, $dst|$dst, $src}",
- [(set FR64:$dst, (fpextend (loadf32 addr:$src)))]>,
+ [(set FR64:$dst, (any_fpextend (loadf32 addr:$src)))]>,
XS, Requires<[UseSSE2, OptForSize]>,
- Sched<[WriteCvtSS2SD.Folded]>;
+ Sched<[WriteCvtSS2SD.Folded]>, SIMD_EXC;
} // isCodeGenOnly = 1
-let hasSideEffects = 0 in {
+let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1 in {
def VCVTSS2SDrr_Int: I<0x5A, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
"vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -1307,53 +1335,53 @@ let Predicates = [UseAVX] in {
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector
- (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
(VCVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector
- (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
(VCVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
(VCVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
(VCVTSI642SSrm_Int VR128:$dst, addr:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
(VCVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
(VCVTSI2SSrm_Int VR128:$dst, addr:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
(VCVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
(VCVTSI642SDrm_Int VR128:$dst, addr:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
(VCVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
(VCVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseAVX]
@@ -1361,55 +1389,55 @@ let Predicates = [UseSSE2] in {
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
(v4f32 (scalar_to_vector
- (f32 (fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
+ (f32 (any_fpround (f64 (extractelt VR128:$src, (iPTR 0))))))))),
(CVTSD2SSrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
(v2f64 (scalar_to_vector
- (f64 (fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
+ (f64 (any_fpextend (f32 (extractelt VR128:$src, (iPTR 0))))))))),
(CVTSS2SDrr_Int VR128:$dst, VR128:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp GR64:$src)))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR64:$src)))))),
(CVTSI642SDrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi64 addr:$src))))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi64 addr:$src))))))),
(CVTSI642SDrm_Int VR128:$dst, addr:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp GR32:$src)))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp GR32:$src)))))),
(CVTSI2SDrr_Int VR128:$dst, GR32:$src)>;
def : Pat<(v2f64 (X86Movsd
(v2f64 VR128:$dst),
- (v2f64 (scalar_to_vector (f64 (sint_to_fp (loadi32 addr:$src))))))),
+ (v2f64 (scalar_to_vector (f64 (any_sint_to_fp (loadi32 addr:$src))))))),
(CVTSI2SDrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE2]
let Predicates = [UseSSE1] in {
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp GR64:$src)))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR64:$src)))))),
(CVTSI642SSrr_Int VR128:$dst, GR64:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi64 addr:$src))))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi64 addr:$src))))))),
(CVTSI642SSrm_Int VR128:$dst, addr:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp GR32:$src)))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp GR32:$src)))))),
(CVTSI2SSrr_Int VR128:$dst, GR32:$src)>;
def : Pat<(v4f32 (X86Movss
(v4f32 VR128:$dst),
- (v4f32 (scalar_to_vector (f32 (sint_to_fp (loadi32 addr:$src))))))),
+ (v4f32 (scalar_to_vector (f32 (any_sint_to_fp (loadi32 addr:$src))))))),
(CVTSI2SSrm_Int VR128:$dst, addr:$src)>;
} // Predicates = [UseSSE1]
@@ -1418,36 +1446,36 @@ let Predicates = [HasAVX, NoVLX] in {
def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
- VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
+ VEX, Sched<[WriteCvtPS2I]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (loadv4f32 addr:$src))))]>,
- VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
+ VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v8i32 (X86cvtp2Int (v8f32 VR256:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
+ VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG, SIMD_EXC;
def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
(v8i32 (X86cvtp2Int (loadv8f32 addr:$src))))]>,
- VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
+ VEX, VEX_L, Sched<[WriteCvtPS2IYLd]>, VEX_WIG, SIMD_EXC;
}
def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst, (v4i32 (X86cvtp2Int (v4f32 VR128:$src))))]>,
- Sched<[WriteCvtPS2I]>;
+ Sched<[WriteCvtPS2I]>, SIMD_EXC;
def CVTPS2DQrm : PDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (memopv4f32 addr:$src))))]>,
- Sched<[WriteCvtPS2ILd]>;
+ Sched<[WriteCvtPS2ILd]>, SIMD_EXC;
// Convert Packed Double FP to Packed DW Integers
-let Predicates = [HasAVX, NoVLX] in {
+let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
@@ -1486,35 +1514,36 @@ def CVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (memopv2f64 addr:$src))))]>,
- Sched<[WriteCvtPD2ILd]>;
+ Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
def CVTPD2DQrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
(v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
- Sched<[WriteCvtPD2I]>;
+ Sched<[WriteCvtPD2I]>, SIMD_EXC;
// Convert with truncation packed single/double fp to doubleword
// SSE2 packed instructions with XS prefix
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
+ (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
VEX, Sched<[WriteCvtPS2I]>, VEX_WIG;
def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (loadv4f32 addr:$src))))]>,
+ (v4i32 (X86any_cvttp2si (loadv4f32 addr:$src))))]>,
VEX, Sched<[WriteCvtPS2ILd]>, VEX_WIG;
def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (v8i32 (X86cvttp2si (v8f32 VR256:$src))))]>,
+ (v8i32 (X86any_cvttp2si (v8f32 VR256:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPS2IY]>, VEX_WIG;
def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (v8i32 (X86cvttp2si (loadv8f32 addr:$src))))]>,
+ (v8i32 (X86any_cvttp2si (loadv8f32 addr:$src))))]>,
VEX, VEX_L,
Sched<[WriteCvtPS2IYLd]>, VEX_WIG;
}
@@ -1522,40 +1551,41 @@ def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src)
def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (v4f32 VR128:$src))))]>,
+ (v4i32 (X86any_cvttp2si (v4f32 VR128:$src))))]>,
Sched<[WriteCvtPS2I]>;
def CVTTPS2DQrm : S2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttps2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (memopv4f32 addr:$src))))]>,
+ (v4i32 (X86any_cvttp2si (memopv4f32 addr:$src))))]>,
Sched<[WriteCvtPS2ILd]>;
+}
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
-let Predicates = [HasAVX, NoVLX] in {
+let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
+ (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
VEX, Sched<[WriteCvtPD2I]>, VEX_WIG;
def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))]>,
+ (v4i32 (X86any_cvttp2si (loadv2f64 addr:$src))))]>,
VEX, Sched<[WriteCvtPD2ILd]>, VEX_WIG;
// YMM only
def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (v4f64 VR256:$src))))]>,
+ (v4i32 (X86any_cvttp2si (v4f64 VR256:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2IY]>, VEX_WIG;
def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (loadv4f64 addr:$src))))]>,
+ (v4i32 (X86any_cvttp2si (loadv4f64 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtPD2IYLd]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]
@@ -1565,29 +1595,29 @@ def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
(VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0, "att">;
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4i32 (fp_to_sint (v4f64 VR256:$src))),
+ def : Pat<(v4i32 (any_fp_to_sint (v4f64 VR256:$src))),
(VCVTTPD2DQYrr VR256:$src)>;
- def : Pat<(v4i32 (fp_to_sint (loadv4f64 addr:$src))),
+ def : Pat<(v4i32 (any_fp_to_sint (loadv4f64 addr:$src))),
(VCVTTPD2DQYrm addr:$src)>;
}
def CVTTPD2DQrr : PDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (v2f64 VR128:$src))))]>,
- Sched<[WriteCvtPD2I]>;
+ (v4i32 (X86any_cvttp2si (v2f64 VR128:$src))))]>,
+ Sched<[WriteCvtPD2I]>, SIMD_EXC;
def CVTTPD2DQrm : PDI<0xE6, MRMSrcMem, (outs VR128:$dst),(ins f128mem:$src),
"cvttpd2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v4i32 (X86cvttp2si (memopv2f64 addr:$src))))]>,
- Sched<[WriteCvtPD2ILd]>;
+ (v4i32 (X86any_cvttp2si (memopv2f64 addr:$src))))]>,
+ Sched<[WriteCvtPD2ILd]>, SIMD_EXC;
// Convert packed single to packed double
-let Predicates = [HasAVX, NoVLX] in {
+let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// SSE2 instructions without OpSize prefix
def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
+ [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
PS, VEX, Sched<[WriteCvtPS2PD]>, VEX_WIG;
def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
@@ -1595,7 +1625,7 @@ def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
PS, VEX, Sched<[WriteCvtPS2PD.Folded]>, VEX_WIG;
def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))]>,
+ [(set VR256:$dst, (v4f64 (any_fpextend (v4f32 VR128:$src))))]>,
PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY]>, VEX_WIG;
def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
"vcvtps2pd\t{$src, $dst|$dst, $src}",
@@ -1603,10 +1633,10 @@ def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
PS, VEX, VEX_L, Sched<[WriteCvtPS2PDY.Folded]>, VEX_WIG;
}
-let Predicates = [UseSSE2] in {
+let Predicates = [UseSSE2], Uses = [MXCSR], mayRaiseFPException = 1 in {
def CVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))]>,
+ [(set VR128:$dst, (v2f64 (X86any_vfpext (v4f32 VR128:$src))))]>,
PS, Sched<[WriteCvtPS2PD]>;
def CVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
"cvtps2pd\t{$src, $dst|$dst, $src}",
@@ -1620,7 +1650,7 @@ let hasSideEffects = 0, mayLoad = 1 in
def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP
+ (v2f64 (X86any_VSintToFP
(bc_v4i32
(v2i64 (scalar_to_vector
(loadi64 addr:$src)))))))]>,
@@ -1628,18 +1658,18 @@ def VCVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
def VCVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
+ (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
VEX, Sched<[WriteCvtI2PD]>, VEX_WIG;
def VCVTDQ2PDYrm : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (v4f64 (sint_to_fp (loadv4i32 addr:$src))))]>,
+ (v4f64 (any_sint_to_fp (loadv4i32 addr:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtI2PDYLd]>,
VEX_WIG;
def VCVTDQ2PDYrr : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
"vcvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR256:$dst,
- (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
+ (v4f64 (any_sint_to_fp (v4i32 VR128:$src))))]>,
VEX, VEX_L, Sched<[WriteCvtI2PDY]>, VEX_WIG;
}
@@ -1647,7 +1677,7 @@ let hasSideEffects = 0, mayLoad = 1 in
def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP
+ (v2f64 (X86any_VSintToFP
(bc_v4i32
(v2i64 (scalar_to_vector
(loadi64 addr:$src)))))))]>,
@@ -1655,18 +1685,18 @@ def CVTDQ2PDrm : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
def CVTDQ2PDrr : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtdq2pd\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
+ (v2f64 (X86any_VSintToFP (v4i32 VR128:$src))))]>,
Sched<[WriteCvtI2PD]>;
// AVX register conversion intrinsics
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
+ def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDrm addr:$src)>;
} // Predicates = [HasAVX, NoVLX]
// SSE2 register conversion intrinsics
let Predicates = [UseSSE2] in {
- def : Pat<(v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
+ def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(CVTDQ2PDrm addr:$src)>;
} // Predicates = [UseSSE2]
@@ -1674,24 +1704,24 @@ let Predicates = [UseSSE2] in {
// The assembler can recognize rr 256-bit instructions by seeing a ymm
// register, but the same isn't true when using memory operands instead.
// Provide other assembly rr and rm forms to address this explicitly.
-let Predicates = [HasAVX, NoVLX] in {
+let Predicates = [HasAVX, NoVLX], Uses = [MXCSR], mayRaiseFPException = 1 in {
// XMM only
def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
+ [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
VEX, Sched<[WriteCvtPD2PS]>, VEX_WIG;
def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))]>,
+ [(set VR128:$dst, (X86any_vfpround (loadv2f64 addr:$src)))]>,
VEX, Sched<[WriteCvtPD2PS.Folded]>, VEX_WIG;
def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround VR256:$src))]>,
+ [(set VR128:$dst, (X86any_vfpround VR256:$src))]>,
VEX, VEX_L, Sched<[WriteCvtPD2PSY]>, VEX_WIG;
def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
"cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (loadv4f64 addr:$src)))]>,
+ [(set VR128:$dst, (X86any_vfpround (loadv4f64 addr:$src)))]>,
VEX, VEX_L, Sched<[WriteCvtPD2PSY.Folded]>, VEX_WIG;
} // Predicates = [HasAVX, NoVLX]
@@ -1702,19 +1732,12 @@ def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
def CVTPD2PSrr : PDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))]>,
- Sched<[WriteCvtPD2PS]>;
+ [(set VR128:$dst, (X86any_vfpround (v2f64 VR128:$src)))]>,
+ Sched<[WriteCvtPD2PS]>, SIMD_EXC;
def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
"cvtpd2ps\t{$src, $dst|$dst, $src}",
- [(set VR128:$dst, (X86vfpround (memopv2f64 addr:$src)))]>,
- Sched<[WriteCvtPD2PS.Folded]>;
-
-let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(v4f32 (fpround (v4f64 VR256:$src))),
- (VCVTPD2PSYrr VR256:$src)>;
- def : Pat<(v4f32 (fpround (loadv4f64 addr:$src))),
- (VCVTPD2PSYrm addr:$src)>;
-}
+ [(set VR128:$dst, (X86any_vfpround (memopv2f64 addr:$src)))]>,
+ Sched<[WriteCvtPD2PS.Folded]>, SIMD_EXC;
//===----------------------------------------------------------------------===//
// SSE 1 & 2 - Compare Instructions
@@ -1725,6 +1748,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm,
X86FoldableSchedWrite sched> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
let isCommutable = 1 in
def rr : SIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
@@ -1736,6 +1760,7 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
(ld_frag addr:$src2), timm:$cc))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+}
let isCodeGenOnly = 1 in {
let ExeDomain = SSEPackedSingle in
@@ -1763,6 +1788,7 @@ let isCodeGenOnly = 1 in {
multiclass sse12_cmp_scalar_int<Operand memop,
Intrinsic Int, string asm, X86FoldableSchedWrite sched,
ComplexPattern mem_cpat> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
(ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
[(set VR128:$dst, (Int VR128:$src1,
@@ -1775,6 +1801,7 @@ let mayLoad = 1 in
mem_cpat:$src, timm:$cc))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+}
// Aliases to match intrinsics which expect XMM operand(s).
let ExeDomain = SSEPackedSingle in
@@ -1802,9 +1829,10 @@ let Constraints = "$src1 = $dst" in {
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, X86MemOperand x86memop,
- PatFrag ld_frag, string OpcodeStr,
- X86FoldableSchedWrite sched> {
-let hasSideEffects = 0 in {
+ PatFrag ld_frag, string OpcodeStr, Domain d,
+ X86FoldableSchedWrite sched = WriteFCom> {
+let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
+ ExeDomain = d in {
def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
@@ -1822,7 +1850,9 @@ let mayLoad = 1 in
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, Operand memop,
ComplexPattern mem_cpat, string OpcodeStr,
- X86FoldableSchedWrite sched> {
+ Domain d,
+ X86FoldableSchedWrite sched = WriteFCom> {
+let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = d in {
def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
@@ -1834,52 +1864,48 @@ let mayLoad = 1 in
mem_cpat:$src2))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+}
let Defs = [EFLAGS] in {
- defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
- defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
- let Pattern = []<dag> in {
- defm VCOMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
- "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
- defm VCOMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
- "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
- }
+ defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
+ "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
+ defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
+ "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
+ defm VCOMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
+ "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
+ defm VCOMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
+ "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
let isCodeGenOnly = 1 in {
defm VUCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
+ sse_load_f32, "ucomiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
defm VUCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
+ sse_load_f64, "ucomisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
defm VCOMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", WriteFCom>, PS, VEX, VEX_LIG, VEX_WIG;
+ sse_load_f32, "comiss", SSEPackedSingle>, PS, VEX, VEX_LIG, VEX_WIG;
defm VCOMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", WriteFCom>, PD, VEX, VEX_LIG, VEX_WIG;
- }
- defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
- "ucomiss", WriteFCom>, PS;
- defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
- "ucomisd", WriteFCom>, PD;
-
- let Pattern = []<dag> in {
- defm COMISS : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
- "comiss", WriteFCom>, PS;
- defm COMISD : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
- "comisd", WriteFCom>, PD;
+ sse_load_f64, "comisd", SSEPackedDouble>, PD, VEX, VEX_LIG, VEX_WIG;
}
+ defm UCOMISS : sse12_ord_cmp<0x2E, FR32, X86any_fcmp, f32, f32mem, loadf32,
+ "ucomiss", SSEPackedSingle>, PS;
+ defm UCOMISD : sse12_ord_cmp<0x2E, FR64, X86any_fcmp, f64, f64mem, loadf64,
+ "ucomisd", SSEPackedDouble>, PD;
+ defm COMISS : sse12_ord_cmp<0x2F, FR32, X86strict_fcmps, f32, f32mem, loadf32,
+ "comiss", SSEPackedSingle>, PS;
+ defm COMISD : sse12_ord_cmp<0x2F, FR64, X86strict_fcmps, f64, f64mem, loadf64,
+ "comisd", SSEPackedDouble>, PD;
let isCodeGenOnly = 1 in {
defm UCOMISS : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
- sse_load_f32, "ucomiss", WriteFCom>, PS;
+ sse_load_f32, "ucomiss", SSEPackedSingle>, PS;
defm UCOMISD : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
- sse_load_f64, "ucomisd", WriteFCom>, PD;
+ sse_load_f64, "ucomisd", SSEPackedDouble>, PD;
defm COMISS : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
- sse_load_f32, "comiss", WriteFCom>, PS;
+ sse_load_f32, "comiss", SSEPackedSingle>, PS;
defm COMISD : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
- sse_load_f64, "comisd", WriteFCom>, PD;
+ sse_load_f64, "comisd", SSEPackedDouble>, PD;
}
} // Defs = [EFLAGS]
@@ -1888,17 +1914,19 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
ValueType VT, string asm,
X86FoldableSchedWrite sched,
Domain d, PatFrag ld_frag> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
- [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
+ [(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
Sched<[sched]>;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst,
- (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
+ (VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+}
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
"cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
@@ -1928,20 +1956,20 @@ def CommutableCMPCC : PatLeaf<(timm), [{
// Patterns to select compares with loads in first operand.
let Predicates = [HasAVX] in {
- def : Pat<(v4f64 (X86cmpp (loadv4f64 addr:$src2), VR256:$src1,
- CommutableCMPCC:$cc)),
+ def : Pat<(v4f64 (X86any_cmpp (loadv4f64 addr:$src2), VR256:$src1,
+ CommutableCMPCC:$cc)),
(VCMPPDYrmi VR256:$src1, addr:$src2, timm:$cc)>;
- def : Pat<(v8f32 (X86cmpp (loadv8f32 addr:$src2), VR256:$src1,
- CommutableCMPCC:$cc)),
+ def : Pat<(v8f32 (X86any_cmpp (loadv8f32 addr:$src2), VR256:$src1,
+ CommutableCMPCC:$cc)),
(VCMPPSYrmi VR256:$src1, addr:$src2, timm:$cc)>;
- def : Pat<(v2f64 (X86cmpp (loadv2f64 addr:$src2), VR128:$src1,
- CommutableCMPCC:$cc)),
+ def : Pat<(v2f64 (X86any_cmpp (loadv2f64 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
(VCMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
- def : Pat<(v4f32 (X86cmpp (loadv4f32 addr:$src2), VR128:$src1,
- CommutableCMPCC:$cc)),
+ def : Pat<(v4f32 (X86any_cmpp (loadv4f32 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
(VCMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
@@ -1954,8 +1982,8 @@ let Predicates = [HasAVX] in {
}
let Predicates = [UseSSE2] in {
- def : Pat<(v2f64 (X86cmpp (memopv2f64 addr:$src2), VR128:$src1,
- CommutableCMPCC:$cc)),
+ def : Pat<(v2f64 (X86any_cmpp (memopv2f64 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
(CMPPDrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f64 (X86cmps (loadf64 addr:$src2), FR64:$src1,
@@ -1964,8 +1992,8 @@ let Predicates = [UseSSE2] in {
}
let Predicates = [UseSSE1] in {
- def : Pat<(v4f32 (X86cmpp (memopv4f32 addr:$src2), VR128:$src1,
- CommutableCMPCC:$cc)),
+ def : Pat<(v4f32 (X86any_cmpp (memopv4f32 addr:$src2), VR128:$src1,
+ CommutableCMPCC:$cc)),
(CMPPSrmi VR128:$src1, addr:$src2, timm:$cc)>;
def : Pat<(f32 (X86cmps (loadf32 addr:$src2), FR32:$src1,
@@ -2555,6 +2583,7 @@ def : Pat<(X86fandn VR128:$src1, (memopv4f32 addr:$src2)),
/// classes below
multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
SDNode OpNode, X86SchedWriteSizes sched> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
let Predicates = [HasAVX, NoVLX] in {
defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
VR128, v4f32, f128mem, loadv4f32,
@@ -2580,9 +2609,11 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
sched.PD.XMM>, PD;
}
}
+}
multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteSizes sched> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
OpNode, FR32, f32mem, SSEPackedSingle, sched.PS.Scl, 0>,
XS, VEX_4V, VEX_LIG, VEX_WIG;
@@ -2599,10 +2630,12 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
sched.PD.Scl>, XD;
}
}
+}
multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode,
X86SchedWriteSizes sched> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
!strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
SSEPackedSingle, sched.PS.Scl, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
@@ -2619,20 +2652,21 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
SSEPackedDouble, sched.PD.Scl>, XD;
}
}
+}
// Binary Arithmetic instructions
-defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SchedWriteFAddSizes>,
- basic_sse12_fp_binop_s<0x58, "add", fadd, SchedWriteFAddSizes>,
+defm ADD : basic_sse12_fp_binop_p<0x58, "add", any_fadd, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s<0x58, "add", any_fadd, SchedWriteFAddSizes>,
basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SchedWriteFAddSizes>;
-defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SchedWriteFMulSizes>,
- basic_sse12_fp_binop_s<0x59, "mul", fmul, SchedWriteFMulSizes>,
+defm MUL : basic_sse12_fp_binop_p<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
+ basic_sse12_fp_binop_s<0x59, "mul", any_fmul, SchedWriteFMulSizes>,
basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SchedWriteFMulSizes>;
let isCommutable = 0 in {
- defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SchedWriteFAddSizes>,
- basic_sse12_fp_binop_s<0x5C, "sub", fsub, SchedWriteFAddSizes>,
+ defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
+ basic_sse12_fp_binop_s<0x5C, "sub", any_fsub, SchedWriteFAddSizes>,
basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, SchedWriteFAddSizes>;
- defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SchedWriteFDivSizes>,
- basic_sse12_fp_binop_s<0x5E, "div", fdiv, SchedWriteFDivSizes>,
+ defm DIV : basic_sse12_fp_binop_p<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
+ basic_sse12_fp_binop_s<0x5E, "div", any_fdiv, SchedWriteFDivSizes>,
basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, SchedWriteFDivSizes>;
defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SchedWriteFCmpSizes>,
@@ -2727,15 +2761,15 @@ multiclass scalar_math_patterns<SDNode Op, string OpcPrefix, SDNode Move,
}
}
-defm : scalar_math_patterns<fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
-defm : scalar_math_patterns<fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
-defm : scalar_math_patterns<fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
-defm : scalar_math_patterns<fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<any_fadd, "ADDSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<any_fsub, "SUBSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<any_fmul, "MULSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
+defm : scalar_math_patterns<any_fdiv, "DIVSS", X86Movss, v4f32, f32, FR32, loadf32, UseSSE1>;
-defm : scalar_math_patterns<fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
-defm : scalar_math_patterns<fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
-defm : scalar_math_patterns<fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
-defm : scalar_math_patterns<fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<any_fadd, "ADDSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<any_fsub, "SUBSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<any_fmul, "MULSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
+defm : scalar_math_patterns<any_fdiv, "DIVSD", X86Movsd, v2f64, f64, FR64, loadf64, UseSSE2>;
/// Unop Arithmetic
/// In addition, we also have a special variant of the scalar form here to
@@ -2961,10 +2995,10 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
// Square root.
-defm SQRT : sse1_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt, UseAVX>,
- sse1_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
- sse2_fp_unop_s<0x51, "sqrt", fsqrt, SchedWriteFSqrt64, UseAVX>,
- sse2_fp_unop_p<0x51, "sqrt", fsqrt, SchedWriteFSqrt64>;
+defm SQRT : sse1_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, UseAVX>,
+ sse1_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt, [HasAVX, NoVLX]>,
+ sse2_fp_unop_s<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64, UseAVX>,
+ sse2_fp_unop_p<0x51, "sqrt", any_fsqrt, SchedWriteFSqrt64>, SIMD_EXC;
// Reciprocal approximations. Note that these typically require refinement
// in order to obtain suitable precision.
@@ -2993,8 +3027,8 @@ multiclass scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix, SDNode Mo
}
}
-defm : scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
-defm : scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
+defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32, UseSSE1>;
+defm : scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64, UseSSE2>;
multiclass scalar_unary_math_intr_patterns<Intrinsic Intr, string OpcPrefix,
SDNode Move, ValueType VT,
@@ -4436,6 +4470,7 @@ def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
X86MemOperand x86memop, X86FoldableSchedWrite sched,
PatFrag ld_frag, bit Is2Addr = 1> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
def rr : I<0xD0, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
@@ -4451,6 +4486,7 @@ multiclass sse3_addsub<string OpcodeStr, ValueType vt, RegisterClass RC,
[(set RC:$dst, (vt (X86Addsub RC:$src1, (ld_frag addr:$src2))))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
@@ -4488,6 +4524,7 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
X86MemOperand x86memop, SDNode OpNode,
X86FoldableSchedWrite sched, PatFrag ld_frag,
bit Is2Addr = 1> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
def rr : S3DI<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
@@ -4502,10 +4539,12 @@ multiclass S3D_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
[(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+}
multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
X86MemOperand x86memop, SDNode OpNode,
X86FoldableSchedWrite sched, PatFrag ld_frag,
bit Is2Addr = 1> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
def rr : S3I<o, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!if(Is2Addr,
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
@@ -4520,6 +4559,7 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
[(set RC:$dst, (vt (OpNode RC:$src1, (ld_frag addr:$src2))))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
+}
let Predicates = [HasAVX] in {
let ExeDomain = SSEPackedSingle in {
@@ -5348,6 +5388,7 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched> {
// Intrinsic operation, reg.
// Vector intrinsic operation, reg
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
def r : SS4AIi8<opc, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, i32u8imm:$src2),
!strconcat(OpcodeStr,
@@ -5364,6 +5405,7 @@ multiclass sse41_fp_unop_p<bits<8> opc, string OpcodeStr,
(VT (OpNode (mem_frag addr:$src1), timm:$src2)))]>,
Sched<[sched.Folded]>;
}
+}
multiclass avx_fp_unop_rm<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
@@ -5400,6 +5442,7 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
multiclass sse41_fp_unop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle, hasSideEffects = 0, isCodeGenOnly = 1 in {
def SSr : SS4AIi8<opcss, MRMSrcReg,
(outs FR32:$dst), (ins FR32:$src1, i32u8imm:$src2),
@@ -5430,11 +5473,13 @@ let ExeDomain = SSEPackedDouble, hasSideEffects = 0, isCodeGenOnly = 1 in {
[]>, Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, hasSideEffects = 0
}
+}
multiclass sse41_fp_binop_s<bits<8> opcss, bits<8> opcsd,
string OpcodeStr, X86FoldableSchedWrite sched,
ValueType VT32, ValueType VT64,
SDNode OpNode, bit Is2Addr = 1> {
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in {
def SSr_Int : SS4AIi8<opcss, MRMSrcReg,
(outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32u8imm:$src3),
@@ -5481,56 +5526,57 @@ let ExeDomain = SSEPackedDouble in {
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
+}
// FP round - roundss, roundps, roundsd, roundpd
let Predicates = [HasAVX, NoVLX] in {
- let ExeDomain = SSEPackedSingle in {
+ let ExeDomain = SSEPackedSingle, Uses = [MXCSR], mayRaiseFPException = 1 in {
// Intrinsic form
defm VROUNDPS : sse41_fp_unop_p<0x08, "vroundps", f128mem, VR128, v4f32,
- loadv4f32, X86VRndScale, SchedWriteFRnd.XMM>,
+ loadv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>,
VEX, VEX_WIG;
defm VROUNDPSY : sse41_fp_unop_p<0x08, "vroundps", f256mem, VR256, v8f32,
- loadv8f32, X86VRndScale, SchedWriteFRnd.YMM>,
+ loadv8f32, X86any_VRndScale, SchedWriteFRnd.YMM>,
VEX, VEX_L, VEX_WIG;
}
- let ExeDomain = SSEPackedDouble in {
+ let ExeDomain = SSEPackedDouble, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm VROUNDPD : sse41_fp_unop_p<0x09, "vroundpd", f128mem, VR128, v2f64,
- loadv2f64, X86VRndScale, SchedWriteFRnd.XMM>,
+ loadv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>,
VEX, VEX_WIG;
defm VROUNDPDY : sse41_fp_unop_p<0x09, "vroundpd", f256mem, VR256, v4f64,
- loadv4f64, X86VRndScale, SchedWriteFRnd.YMM>,
+ loadv4f64, X86any_VRndScale, SchedWriteFRnd.YMM>,
VEX, VEX_L, VEX_WIG;
}
}
let Predicates = [UseAVX] in {
defm VROUND : sse41_fp_binop_s<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales, 0>,
- VEX_4V, VEX_LIG, VEX_WIG;
+ VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
defm VROUND : avx_fp_unop_rm<0x0A, 0x0B, "vround", SchedWriteFRnd.Scl>,
- VEX_4V, VEX_LIG, VEX_WIG;
+ VEX_4V, VEX_LIG, VEX_WIG, SIMD_EXC;
}
let Predicates = [UseAVX] in {
- def : Pat<(X86VRndScale FR32:$src1, timm:$src2),
+ def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
(VROUNDSSr (f32 (IMPLICIT_DEF)), FR32:$src1, timm:$src2)>;
- def : Pat<(X86VRndScale FR64:$src1, timm:$src2),
+ def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
(VROUNDSDr (f64 (IMPLICIT_DEF)), FR64:$src1, timm:$src2)>;
}
let Predicates = [UseAVX, OptForSize] in {
- def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2),
+ def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
(VROUNDSSm (f32 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
- def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2),
+ def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
(VROUNDSDm (f64 (IMPLICIT_DEF)), addr:$src1, timm:$src2)>;
}
let ExeDomain = SSEPackedSingle in
defm ROUNDPS : sse41_fp_unop_p<0x08, "roundps", f128mem, VR128, v4f32,
- memopv4f32, X86VRndScale, SchedWriteFRnd.XMM>;
+ memopv4f32, X86any_VRndScale, SchedWriteFRnd.XMM>;
let ExeDomain = SSEPackedDouble in
defm ROUNDPD : sse41_fp_unop_p<0x09, "roundpd", f128mem, VR128, v2f64,
- memopv2f64, X86VRndScale, SchedWriteFRnd.XMM>;
+ memopv2f64, X86any_VRndScale, SchedWriteFRnd.XMM>;
defm ROUND : sse41_fp_unop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl>;
@@ -5539,16 +5585,16 @@ defm ROUND : sse41_fp_binop_s<0x0A, 0x0B, "round", SchedWriteFRnd.Scl,
v4f32, v2f64, X86RndScales>;
let Predicates = [UseSSE41] in {
- def : Pat<(X86VRndScale FR32:$src1, timm:$src2),
+ def : Pat<(X86any_VRndScale FR32:$src1, timm:$src2),
(ROUNDSSr FR32:$src1, timm:$src2)>;
- def : Pat<(X86VRndScale FR64:$src1, timm:$src2),
+ def : Pat<(X86any_VRndScale FR64:$src1, timm:$src2),
(ROUNDSDr FR64:$src1, timm:$src2)>;
}
let Predicates = [UseSSE41, OptForSize] in {
- def : Pat<(X86VRndScale (loadf32 addr:$src1), timm:$src2),
+ def : Pat<(X86any_VRndScale (loadf32 addr:$src1), timm:$src2),
(ROUNDSSm addr:$src1, timm:$src2)>;
- def : Pat<(X86VRndScale (loadf64 addr:$src1), timm:$src2),
+ def : Pat<(X86any_VRndScale (loadf64 addr:$src1), timm:$src2),
(ROUNDSDm addr:$src1, timm:$src2)>;
}
@@ -5959,6 +6005,7 @@ let Predicates = [HasAVX] in {
SchedWriteMPSAD.XMM>, VEX_4V, VEX_WIG;
}
+let Uses = [MXCSR], mayRaiseFPException = 1 in {
let ExeDomain = SSEPackedSingle in
defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
VR128, load, f128mem, 0,
@@ -5972,6 +6019,7 @@ let Predicates = [HasAVX] in {
VR256, load, i256mem, 0,
SchedWriteDPPS.YMM>, VEX_4V, VEX_L, VEX_WIG;
}
+}
let Predicates = [HasAVX2] in {
let isCommutable = 0 in {
@@ -5991,11 +6039,11 @@ let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps,
VR128, memop, f128mem, 1,
- SchedWriteDPPS.XMM>;
+ SchedWriteDPPS.XMM>, SIMD_EXC;
let ExeDomain = SSEPackedDouble in
defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd,
VR128, memop, f128mem, 1,
- SchedWriteDPPD.XMM>;
+ SchedWriteDPPD.XMM>, SIMD_EXC;
}
/// SS41I_blend_rmi - SSE 4.1 blend with 8-bit immediate
@@ -7266,12 +7314,12 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
}
let Predicates = [HasF16C, NoVLX] in {
- defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>;
- defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L;
+ defm VCVTPH2PS : f16c_ph2ps<VR128, f64mem, WriteCvtPH2PS>, SIMD_EXC;
+ defm VCVTPH2PSY : f16c_ph2ps<VR256, f128mem, WriteCvtPH2PSY>, VEX_L, SIMD_EXC;
defm VCVTPS2PH : f16c_ps2ph<VR128, f64mem, WriteCvtPS2PH,
- WriteCvtPS2PHSt>;
+ WriteCvtPS2PHSt>, SIMD_EXC;
defm VCVTPS2PHY : f16c_ps2ph<VR256, f128mem, WriteCvtPS2PHY,
- WriteCvtPS2PHYSt>, VEX_L;
+ WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
// Pattern match vcvtph2ps of a scalar i64 load.
def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
diff --git a/llvm/lib/Target/X86/X86InstrTSX.td b/llvm/lib/Target/X86/X86InstrTSX.td
index 3a1212342a13..41b839425ccd 100644
--- a/llvm/lib/Target/X86/X86InstrTSX.td
+++ b/llvm/lib/Target/X86/X86InstrTSX.td
@@ -31,7 +31,7 @@ def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
"xbegin\t$dst", []>, OpSize32;
}
-// Psuedo instruction to fake the definition of EAX on the fallback code path.
+// Pseudo instruction to fake the definition of EAX on the fallback code path.
let isPseudo = 1, Defs = [EAX] in {
def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>;
}
diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp
index 01620b7b64c9..3f9d626ff912 100644
--- a/llvm/lib/Target/X86/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -34,6 +34,7 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/Support/AtomicOrdering.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/Debug.h"
@@ -111,8 +112,6 @@ private:
bool materializeFP(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectImplicitDefOrPHI(MachineInstr &I, MachineRegisterInfo &MRI) const;
- bool selectShift(MachineInstr &I, MachineRegisterInfo &MRI,
- MachineFunction &MF) const;
bool selectDivRem(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
bool selectIntrinsicWSideEffects(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -342,7 +341,7 @@ bool X86InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_STORE:
case TargetOpcode::G_LOAD:
return selectLoadStoreOp(I, MRI, MF);
- case TargetOpcode::G_GEP:
+ case TargetOpcode::G_PTR_ADD:
case TargetOpcode::G_FRAME_INDEX:
return selectFrameIndexOrGep(I, MRI, MF);
case TargetOpcode::G_GLOBAL_VALUE:
@@ -380,10 +379,6 @@ bool X86InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_IMPLICIT_DEF:
case TargetOpcode::G_PHI:
return selectImplicitDefOrPHI(I, MRI);
- case TargetOpcode::G_SHL:
- case TargetOpcode::G_ASHR:
- case TargetOpcode::G_LSHR:
- return selectShift(I, MRI, MF);
case TargetOpcode::G_SDIV:
case TargetOpcode::G_UDIV:
case TargetOpcode::G_SREM:
@@ -482,7 +477,7 @@ static void X86SelectAddress(const MachineInstr &I,
assert(MRI.getType(I.getOperand(0).getReg()).isPointer() &&
"unsupported type.");
- if (I.getOpcode() == TargetOpcode::G_GEP) {
+ if (I.getOpcode() == TargetOpcode::G_PTR_ADD) {
if (auto COff = getConstantVRegVal(I.getOperand(2).getReg(), MRI)) {
int64_t Imm = *COff;
if (isInt<32>(Imm)) { // Check for displacement overflow.
@@ -566,7 +561,7 @@ bool X86InstructionSelector::selectFrameIndexOrGep(MachineInstr &I,
MachineFunction &MF) const {
unsigned Opc = I.getOpcode();
- assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_GEP) &&
+ assert((Opc == TargetOpcode::G_FRAME_INDEX || Opc == TargetOpcode::G_PTR_ADD) &&
"unexpected instruction");
const Register DefReg = I.getOperand(0).getReg();
@@ -1225,7 +1220,7 @@ bool X86InstructionSelector::emitExtractSubreg(unsigned DstReg, unsigned SrcReg,
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, MRI)) {
- LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ LLVM_DEBUG(dbgs() << "Failed to constrain EXTRACT_SUBREG\n");
return false;
}
@@ -1519,78 +1514,6 @@ bool X86InstructionSelector::selectImplicitDefOrPHI(
return true;
}
-// Currently GlobalIsel TableGen generates patterns for shift imm and shift 1,
-// but with shiftCount i8. In G_LSHR/G_ASHR/G_SHL like LLVM-IR both arguments
-// has the same type, so for now only shift i8 can use auto generated
-// TableGen patterns.
-bool X86InstructionSelector::selectShift(MachineInstr &I,
- MachineRegisterInfo &MRI,
- MachineFunction &MF) const {
-
- assert((I.getOpcode() == TargetOpcode::G_SHL ||
- I.getOpcode() == TargetOpcode::G_ASHR ||
- I.getOpcode() == TargetOpcode::G_LSHR) &&
- "unexpected instruction");
-
- Register DstReg = I.getOperand(0).getReg();
- const LLT DstTy = MRI.getType(DstReg);
- const RegisterBank &DstRB = *RBI.getRegBank(DstReg, MRI, TRI);
-
- const static struct ShiftEntry {
- unsigned SizeInBits;
- unsigned OpLSHR;
- unsigned OpASHR;
- unsigned OpSHL;
- } OpTable[] = {
- {8, X86::SHR8rCL, X86::SAR8rCL, X86::SHL8rCL}, // i8
- {16, X86::SHR16rCL, X86::SAR16rCL, X86::SHL16rCL}, // i16
- {32, X86::SHR32rCL, X86::SAR32rCL, X86::SHL32rCL}, // i32
- {64, X86::SHR64rCL, X86::SAR64rCL, X86::SHL64rCL} // i64
- };
-
- if (DstRB.getID() != X86::GPRRegBankID)
- return false;
-
- auto ShiftEntryIt = std::find_if(
- std::begin(OpTable), std::end(OpTable), [DstTy](const ShiftEntry &El) {
- return El.SizeInBits == DstTy.getSizeInBits();
- });
- if (ShiftEntryIt == std::end(OpTable))
- return false;
-
- unsigned Opcode = 0;
- switch (I.getOpcode()) {
- case TargetOpcode::G_SHL:
- Opcode = ShiftEntryIt->OpSHL;
- break;
- case TargetOpcode::G_ASHR:
- Opcode = ShiftEntryIt->OpASHR;
- break;
- case TargetOpcode::G_LSHR:
- Opcode = ShiftEntryIt->OpLSHR;
- break;
- default:
- return false;
- }
-
- Register Op0Reg = I.getOperand(1).getReg();
- Register Op1Reg = I.getOperand(2).getReg();
-
- assert(MRI.getType(Op1Reg).getSizeInBits() == 8);
-
- BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(TargetOpcode::COPY),
- X86::CL)
- .addReg(Op1Reg);
-
- MachineInstr &ShiftInst =
- *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opcode), DstReg)
- .addReg(Op0Reg);
-
- constrainSelectedInstRegOperands(ShiftInst, TII, TRI, RBI);
- I.eraseFromParent();
- return true;
-}
-
bool X86InstructionSelector::selectDivRem(MachineInstr &I,
MachineRegisterInfo &MRI,
MachineFunction &MF) const {
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 1d7adbaa9e99..40bf28df3b90 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -15,6 +15,7 @@
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
+#include "llvm/IR/IntrinsicsX86.h"
namespace llvm {
diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp
index 04121f863c89..da53d6420021 100644
--- a/llvm/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/X86LegalizerInfo.cpp
@@ -77,7 +77,7 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
setLegalizeScalarToDifferentSizeStrategy(MemOp, 0,
narrowToSmallerAndWidenToSmallest);
setLegalizeScalarToDifferentSizeStrategy(
- G_GEP, 1, widenToLargerTypesUnsupportedOtherwise);
+ G_PTR_ADD, 1, widenToLargerTypesUnsupportedOtherwise);
setLegalizeScalarToDifferentSizeStrategy(
G_CONSTANT, 0, widenToLargerTypesAndNarrowToLargest);
@@ -140,8 +140,8 @@ void X86LegalizerInfo::setLegalizerInfo32bit() {
setAction({G_FRAME_INDEX, p0}, Legal);
setAction({G_GLOBAL_VALUE, p0}, Legal);
- setAction({G_GEP, p0}, Legal);
- setAction({G_GEP, 1, s32}, Legal);
+ setAction({G_PTR_ADD, p0}, Legal);
+ setAction({G_PTR_ADD, 1, s32}, Legal);
if (!Subtarget.is64Bit()) {
getActionDefinitionsBuilder(G_PTRTOINT)
@@ -223,7 +223,7 @@ void X86LegalizerInfo::setLegalizerInfo64bit() {
setAction({MemOp, s64}, Legal);
// Pointer-handling
- setAction({G_GEP, 1, s64}, Legal);
+ setAction({G_PTR_ADD, 1, s64}, Legal);
getActionDefinitionsBuilder(G_PTRTOINT)
.legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
.maxScalar(0, s64)
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 78098fd6262f..2fc9a2af01d7 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -569,6 +569,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 0) {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPEQBZ128rm; break;
case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPEQBZ128rmk; break;
case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPEQBZ128rr; break;
@@ -640,6 +641,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
if (OutMI.getOperand(OutMI.getNumOperands() - 1).getImm() == 6) {
unsigned NewOpc;
switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
case X86::VPCMPBZ128rmi: NewOpc = X86::VPCMPGTBZ128rm; break;
case X86::VPCMPBZ128rmik: NewOpc = X86::VPCMPGTBZ128rmk; break;
case X86::VPCMPBZ128rri: NewOpc = X86::VPCMPGTBZ128rr; break;
@@ -876,6 +878,52 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
case X86::MOVSX64rr32:
SimplifyMOVSX(OutMI);
break;
+
+ case X86::VCMPPDrri:
+ case X86::VCMPPDYrri:
+ case X86::VCMPPSrri:
+ case X86::VCMPPSYrri:
+ case X86::VCMPSDrr:
+ case X86::VCMPSSrr: {
+ // Swap the operands if it will enable a 2 byte VEX encoding.
+ // FIXME: Change the immediate to improve opportunities?
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg())) {
+ unsigned Imm = MI->getOperand(3).getImm() & 0x7;
+ switch (Imm) {
+ default: break;
+ case 0x00: // EQUAL
+ case 0x03: // UNORDERED
+ case 0x04: // NOT EQUAL
+ case 0x07: // ORDERED
+ std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
+ break;
+ }
+ }
+ break;
+ }
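// Editorial sketch (not part of this patch): a hypothetical helper making the
// predicate check above explicit. Only the order-insensitive compares
// (EQ, UNORD, NEQ, ORD) give the same result with their sources swapped, which
// is what allows moving the extended register out of the ModRM.rm position.
static bool isOrderInsensitiveCmpImm(unsigned Imm) {
  Imm &= 0x7;
  return Imm == 0x00 || Imm == 0x03 || Imm == 0x04 || Imm == 0x07;
}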
+
+ case X86::VMOVHLPSrr:
+ case X86::VUNPCKHPDrr:
+ // These are not truly commutable so hide them from the default case.
+ break;
+
+ default: {
+ // If the instruction is a commutable arithmetic instruction we might be
+ // able to commute the operands to get a 2 byte VEX prefix.
+ uint64_t TSFlags = MI->getDesc().TSFlags;
+ if (MI->getDesc().isCommutable() &&
+ (TSFlags & X86II::EncodingMask) == X86II::VEX &&
+ (TSFlags & X86II::OpMapMask) == X86II::TB &&
+ (TSFlags & X86II::FormMask) == X86II::MRMSrcReg &&
+ !(TSFlags & X86II::VEX_W) && (TSFlags & X86II::VEX_4V) &&
+ OutMI.getNumOperands() == 3) {
+ if (!X86II::isX86_64ExtendedReg(OutMI.getOperand(1).getReg()) &&
+ X86II::isX86_64ExtendedReg(OutMI.getOperand(2).getReg()))
+ std::swap(OutMI.getOperand(1), OutMI.getOperand(2));
+ }
+ break;
+ }
}
}
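// Editorial sketch (not part of this patch) of the encoder rule both swaps
// above exploit: the 2-byte C5 VEX prefix carries only R, vvvv, L and pp, so
// it is usable only when the opcode map is 0F, VEX.W is 0, and neither VEX.X
// nor VEX.B is needed. Swapping puts the extended register into the operand
// encoded in vvvv instead of ModRM.rm, so VEX.B is no longer required.
static bool canUseTwoByteVexSketch(bool MapIs0F, bool VEX_W, bool NeedVEX_X,
                                   bool NeedVEX_B) {
  return MapIs0F && !VEX_W && !NeedVEX_X && !NeedVEX_B;
}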
@@ -983,13 +1031,32 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
}
}
+/// Return the longest nop which can be efficiently decoded for the given
+/// target CPU. 15 bytes is the longest single NOP instruction, but some
+/// platforms can't decode the longest forms efficiently.
+static unsigned MaxLongNopLength(const MCSubtargetInfo &STI) {
+ uint64_t MaxNopLength = 10;
+ if (STI.getFeatureBits()[X86::ProcIntelSLM])
+ MaxNopLength = 7;
+ else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
+ MaxNopLength = 15;
+ else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
+ MaxNopLength = 11;
+ return MaxNopLength;
+}
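// Editorial sketch (hypothetical helper name, not part of this patch): how a
// padding request larger than the per-CPU cap gets satisfied. Each EmitNop()
// call emits at most MaxLongNopLength(STI) bytes, so a caller such as
// EmitNops() simply loops until the request is exhausted.
static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
                        const MCSubtargetInfo &STI); // defined just below

static void emitNopPaddingSketch(MCStreamer &OS, unsigned NumBytes,
                                 const MCSubtargetInfo &STI) {
  while (NumBytes) {
    unsigned Emitted = EmitNop(OS, NumBytes, /*Is64Bit=*/true, STI);
    assert(Emitted <= NumBytes && "EmitNop must not emit more than requested");
    NumBytes -= Emitted;
  }
}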
+
/// Emit the largest nop instruction smaller than or equal to \p NumBytes
/// bytes. Return the size of nop emitted.
static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
const MCSubtargetInfo &STI) {
- // This works only for 64bit. For 32bit we have to do additional checking if
- // the CPU supports multi-byte nops.
- assert(Is64Bit && "EmitNops only supports X86-64");
+ if (!Is64Bit) {
+ // TODO Do additional checking if the CPU supports multi-byte nops.
+ OS.EmitInstruction(MCInstBuilder(X86::NOOP), STI);
+ return 1;
+ }
+
+ // Cap a single nop emission at the profitable value for the target
+ NumBytes = std::min(NumBytes, MaxLongNopLength(STI));
unsigned NopSize;
unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
@@ -1094,10 +1161,35 @@ static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
}
}
+/// A RAII helper that defines a region of instructions which can't have
+/// padding added between them for correctness.
+struct NoAutoPaddingScope {
+ MCStreamer &OS;
+ const bool OldAllowAutoPadding;
+ NoAutoPaddingScope(MCStreamer &OS)
+ : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) {
+ changeAndComment(false);
+ }
+ ~NoAutoPaddingScope() {
+ changeAndComment(OldAllowAutoPadding);
+ }
+ void changeAndComment(bool b) {
+ if (b == OS.getAllowAutoPadding())
+ return;
+ OS.setAllowAutoPadding(b);
+ if (b)
+ OS.emitRawComment("autopadding");
+ else
+ OS.emitRawComment("noautopadding");
+ }
+};
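// Editorial usage sketch (hypothetical caller, not part of this patch):
// auto-padding is switched off on construction and the previous setting is
// restored when the scope ends, so everything emitted in between stays
// contiguous.
static void lowerContiguousSequenceSketch(MCStreamer &OS) {
  NoAutoPaddingScope NoPadScope(OS); // emits the "noautopadding" comment
  // ... emit the instructions that must not have padding inserted ...
}                                    // destructor restores the old setting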
+
void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
StatepointOpers SOpers(&MI);
if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
@@ -1148,7 +1240,10 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
// Record our statepoint node in the same section used by STACKMAP
// and PATCHPOINT
- SM.recordStatepoint(MI);
+ auto &Ctx = OutStreamer->getContext();
+ MCSymbol *MILabel = Ctx.createTempSymbol();
+ OutStreamer->EmitLabel(MILabel);
+ SM.recordStatepoint(*MILabel, MI);
}
void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
@@ -1156,6 +1251,8 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
  // FAULTING_LOAD_OP <def>, <faulting type>, <MBB handler>,
// <opcode>, <operands>
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
Register DefRegister = FaultingMI.getOperand(0).getReg();
FaultMaps::FaultKind FK =
static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
@@ -1163,8 +1260,12 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
unsigned Opcode = FaultingMI.getOperand(3).getImm();
unsigned OperandsBeginIdx = 4;
+ auto &Ctx = OutStreamer->getContext();
+ MCSymbol *FaultingLabel = Ctx.createTempSymbol();
+ OutStreamer->EmitLabel(FaultingLabel);
+
assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
- FM.recordFaultingOp(FK, HandlerLabel);
+ FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel);
MCInst MI;
MI.setOpcode(Opcode);
@@ -1199,6 +1300,8 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
X86MCInstLower &MCIL) {
// PATCHABLE_OP minsize, opcode, operands
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
unsigned MinSize = MI.getOperand(0).getImm();
unsigned Opcode = MI.getOperand(1).getImm();
@@ -1236,7 +1339,12 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
// <id>, <shadowBytes>, ...
void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
- SM.recordStackMap(MI);
+
+ auto &Ctx = OutStreamer->getContext();
+ MCSymbol *MILabel = Ctx.createTempSymbol();
+ OutStreamer->EmitLabel(MILabel);
+
+ SM.recordStackMap(*MILabel, MI);
unsigned NumShadowBytes = MI.getOperand(1).getImm();
SMShadowTracker.reset(NumShadowBytes);
}
@@ -1249,7 +1357,12 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
- SM.recordPatchPoint(MI);
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ auto &Ctx = OutStreamer->getContext();
+ MCSymbol *MILabel = Ctx.createTempSymbol();
+ OutStreamer->EmitLabel(MILabel);
+ SM.recordPatchPoint(*MILabel, MI);
PatchPointOpers opers(&MI);
unsigned ScratchIdx = opers.getNextScratchIdx();
@@ -1305,6 +1418,8 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "XRay custom events only supports X86-64");
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
// We want to emit the following pattern, which follows the x86 calling
// convention to prepare for the trampoline call to be patched in.
//
@@ -1337,10 +1452,10 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// The default C calling convention will place two arguments into %rcx and
// %rdx -- so we only work with those.
- unsigned DestRegs[] = {X86::RDI, X86::RSI};
+ const Register DestRegs[] = {X86::RDI, X86::RSI};
bool UsedMask[] = {false, false};
// Filled out in loop.
- unsigned SrcRegs[] = {0, 0};
+ Register SrcRegs[] = {0, 0};
// Then we put the operands in the %rdi and %rsi registers. We spill the
// values in the register before we clobber them, and mark them as used in
@@ -1350,7 +1465,7 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
assert(Op->isReg() && "Only support arguments in registers");
- SrcRegs[I] = Op->getReg();
+ SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64);
if (SrcRegs[I] != DestRegs[I]) {
UsedMask[I] = true;
EmitAndCountInstruction(
@@ -1361,6 +1476,9 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
}
// Now that the register values are stashed, mov arguments into place.
+ // FIXME: This doesn't work if one of the later SrcRegs is equal to an
+  // earlier DestReg. We will have already overwritten the register before
+ // we can copy from it.
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
if (SrcRegs[I] != DestRegs[I])
EmitAndCountInstruction(
@@ -1396,6 +1514,8 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "XRay typed events only supports X86-64");
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
// We want to emit the following pattern, which follows the x86 calling
// convention to prepare for the trampoline call to be patched in.
//
@@ -1429,11 +1549,11 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
// An x86-64 convention may place three arguments into %rcx, %rdx, and R8,
// so we'll work with those. Or we may be called via SystemV, in which case
// we don't have to do any translation.
- unsigned DestRegs[] = {X86::RDI, X86::RSI, X86::RDX};
+ const Register DestRegs[] = {X86::RDI, X86::RSI, X86::RDX};
bool UsedMask[] = {false, false, false};
// Will fill out src regs in the loop.
- unsigned SrcRegs[] = {0, 0, 0};
+ Register SrcRegs[] = {0, 0, 0};
// Then we put the operands in the SystemV registers. We spill the values in
// the registers before we clobber them, and mark them as used in UsedMask.
@@ -1443,7 +1563,7 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
if (auto Op = MCIL.LowerMachineOperand(&MI, MI.getOperand(I))) {
// TODO: Is register only support adequate?
assert(Op->isReg() && "Only supports arguments in registers");
- SrcRegs[I] = Op->getReg();
+ SrcRegs[I] = getX86SubSuperRegister(Op->getReg(), 64);
if (SrcRegs[I] != DestRegs[I]) {
UsedMask[I] = true;
EmitAndCountInstruction(
@@ -1459,6 +1579,9 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
// is clobbers. We've already added nops to account for the size of mov and
// push if the register is in the right place, so we only have to worry about
// emitting movs.
+ // FIXME: This doesn't work if one of the later SrcRegs is equal to an
+  // earlier DestReg. We will have already overwritten the register before
+ // we can copy from it.
for (unsigned I = 0; I < MI.getNumOperands(); ++I)
if (UsedMask[I])
EmitAndCountInstruction(
@@ -1490,6 +1613,19 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
X86MCInstLower &MCIL) {
+
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
+ const Function &F = MF->getFunction();
+ if (F.hasFnAttribute("patchable-function-entry")) {
+ unsigned Num;
+ if (F.getFnAttribute("patchable-function-entry")
+ .getValueAsString()
+ .getAsInteger(10, Num))
+ return;
+ EmitNops(*OutStreamer, Num, Subtarget->is64Bit(), getSubtargetInfo());
+ return;
+ }
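// Note: StringRef::getAsInteger() returns true on *failure*, so a malformed
// attribute value takes the early return above and no entry nops are emitted.
// Editorial sketch (hypothetical helper, not part of this patch) of attaching
// the attribute this lowering consumes; assumes an llvm::Function is in scope.
static void requestEntryPaddingSketch(Function &F, unsigned NumBytes) {
  // Stored as a decimal string; parsed back with base 10 above.
  F.addFnAttr("patchable-function-entry", std::to_string(NumBytes));
}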
// We want to emit the following pattern:
//
// .p2align 1, ...
@@ -1517,6 +1653,8 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
X86MCInstLower &MCIL) {
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
// Since PATCHABLE_RET takes the opcode of the return statement as an
// argument, we use that to emit the correct form of the RET that we want.
// i.e. when we see this:
@@ -1547,6 +1685,8 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
X86MCInstLower &MCIL) {
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
+
// Like PATCHABLE_RET, we have the actual instruction in the operands to this
// instruction so we lower that particular instruction and its operands.
// Unlike PATCHABLE_RET though, we put the sled before the JMP, much like how
diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp
index c6da4b09dd60..b19d1263e0c9 100644
--- a/llvm/lib/Target/X86/X86MacroFusion.cpp
+++ b/llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86MacroFusion.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MacroFusion.h"
@@ -18,160 +19,13 @@
using namespace llvm;
-namespace {
-
-// The classification for the first instruction.
-enum class FirstInstrKind { Test, Cmp, And, ALU, IncDec, Invalid };
-
-// The classification for the second instruction (jump).
-enum class JumpKind {
- // JE, JL, JG and variants.
- ELG,
- // JA, JB and variants.
- AB,
- // JS, JP, JO and variants.
- SPO,
- // Not a fusable jump.
- Invalid,
-};
-
-} // namespace
-
-static FirstInstrKind classifyFirst(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- return FirstInstrKind::Invalid;
- case X86::TEST8rr:
- case X86::TEST16rr:
- case X86::TEST32rr:
- case X86::TEST64rr:
- case X86::TEST8ri:
- case X86::TEST16ri:
- case X86::TEST32ri:
- case X86::TEST64ri32:
- case X86::TEST8mr:
- case X86::TEST16mr:
- case X86::TEST32mr:
- case X86::TEST64mr:
- return FirstInstrKind::Test;
- case X86::AND16ri:
- case X86::AND16ri8:
- case X86::AND16rm:
- case X86::AND16rr:
- case X86::AND32ri:
- case X86::AND32ri8:
- case X86::AND32rm:
- case X86::AND32rr:
- case X86::AND64ri32:
- case X86::AND64ri8:
- case X86::AND64rm:
- case X86::AND64rr:
- case X86::AND8ri:
- case X86::AND8rm:
- case X86::AND8rr:
- return FirstInstrKind::And;
- case X86::CMP16ri:
- case X86::CMP16ri8:
- case X86::CMP16rm:
- case X86::CMP16rr:
- case X86::CMP16mr:
- case X86::CMP32ri:
- case X86::CMP32ri8:
- case X86::CMP32rm:
- case X86::CMP32rr:
- case X86::CMP32mr:
- case X86::CMP64ri32:
- case X86::CMP64ri8:
- case X86::CMP64rm:
- case X86::CMP64rr:
- case X86::CMP64mr:
- case X86::CMP8ri:
- case X86::CMP8rm:
- case X86::CMP8rr:
- case X86::CMP8mr:
- return FirstInstrKind::Cmp;
- case X86::ADD16ri:
- case X86::ADD16ri8:
- case X86::ADD16ri8_DB:
- case X86::ADD16ri_DB:
- case X86::ADD16rm:
- case X86::ADD16rr:
- case X86::ADD16rr_DB:
- case X86::ADD32ri:
- case X86::ADD32ri8:
- case X86::ADD32ri8_DB:
- case X86::ADD32ri_DB:
- case X86::ADD32rm:
- case X86::ADD32rr:
- case X86::ADD32rr_DB:
- case X86::ADD64ri32:
- case X86::ADD64ri32_DB:
- case X86::ADD64ri8:
- case X86::ADD64ri8_DB:
- case X86::ADD64rm:
- case X86::ADD64rr:
- case X86::ADD64rr_DB:
- case X86::ADD8ri:
- case X86::ADD8ri_DB:
- case X86::ADD8rm:
- case X86::ADD8rr:
- case X86::ADD8rr_DB:
- case X86::SUB16ri:
- case X86::SUB16ri8:
- case X86::SUB16rm:
- case X86::SUB16rr:
- case X86::SUB32ri:
- case X86::SUB32ri8:
- case X86::SUB32rm:
- case X86::SUB32rr:
- case X86::SUB64ri32:
- case X86::SUB64ri8:
- case X86::SUB64rm:
- case X86::SUB64rr:
- case X86::SUB8ri:
- case X86::SUB8rm:
- case X86::SUB8rr:
- return FirstInstrKind::ALU;
- case X86::INC16r:
- case X86::INC32r:
- case X86::INC64r:
- case X86::INC8r:
- case X86::DEC16r:
- case X86::DEC32r:
- case X86::DEC64r:
- case X86::DEC8r:
- return FirstInstrKind::IncDec;
- }
+static X86::FirstMacroFusionInstKind classifyFirst(const MachineInstr &MI) {
+ return X86::classifyFirstOpcodeInMacroFusion(MI.getOpcode());
}
-static JumpKind classifySecond(const MachineInstr &MI) {
+static X86::SecondMacroFusionInstKind classifySecond(const MachineInstr &MI) {
X86::CondCode CC = X86::getCondFromBranch(MI);
- if (CC == X86::COND_INVALID)
- return JumpKind::Invalid;
-
- switch (CC) {
- default:
- return JumpKind::Invalid;
- case X86::COND_E:
- case X86::COND_NE:
- case X86::COND_L:
- case X86::COND_LE:
- case X86::COND_G:
- case X86::COND_GE:
- return JumpKind::ELG;
- case X86::COND_B:
- case X86::COND_BE:
- case X86::COND_A:
- case X86::COND_AE:
- return JumpKind::AB;
- case X86::COND_S:
- case X86::COND_NS:
- case X86::COND_P:
- case X86::COND_NP:
- case X86::COND_O:
- case X86::COND_NO:
- return JumpKind::SPO;
- }
+ return X86::classifySecondCondCodeInMacroFusion(CC);
}
/// Check if the instr pair, FirstMI and SecondMI, should be fused
@@ -187,40 +41,27 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
if (!(ST.hasBranchFusion() || ST.hasMacroFusion()))
return false;
- const JumpKind BranchKind = classifySecond(SecondMI);
+ const X86::SecondMacroFusionInstKind BranchKind = classifySecond(SecondMI);
- if (BranchKind == JumpKind::Invalid)
+ if (BranchKind == X86::SecondMacroFusionInstKind::Invalid)
return false; // Second cannot be fused with anything.
if (FirstMI == nullptr)
return true; // We're only checking whether Second can be fused at all.
- const FirstInstrKind TestKind = classifyFirst(*FirstMI);
+ const X86::FirstMacroFusionInstKind TestKind = classifyFirst(*FirstMI);
if (ST.hasBranchFusion()) {
// Branch fusion can merge CMP and TEST with all conditional jumps.
- return (TestKind == FirstInstrKind::Cmp ||
- TestKind == FirstInstrKind::Test);
+ return (TestKind == X86::FirstMacroFusionInstKind::Cmp ||
+ TestKind == X86::FirstMacroFusionInstKind::Test);
}
if (ST.hasMacroFusion()) {
- // Macro Fusion rules are a bit more complex. See Agner Fog's
- // Microarchitecture table 9.2 "Instruction Fusion".
- switch (TestKind) {
- case FirstInstrKind::Test:
- case FirstInstrKind::And:
- return true;
- case FirstInstrKind::Cmp:
- case FirstInstrKind::ALU:
- return BranchKind == JumpKind::ELG || BranchKind == JumpKind::AB;
- case FirstInstrKind::IncDec:
- return BranchKind == JumpKind::ELG;
- case FirstInstrKind::Invalid:
- return false;
- }
+ return X86::isMacroFused(TestKind, BranchKind);
}
- llvm_unreachable("unknown branch fusion type");
+ llvm_unreachable("unknown fusion type");
}
namespace llvm {
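For readers following the refactor in the hunk above: the deleted switch encoded a fusion table (TEST/AND fuse with any conditional jump; CMP/ALU only with the E/L/G and A/B groups; INC/DEC only with E/L/G), and the new X86::classifyFirstOpcodeInMacroFusion, X86::classifySecondCondCodeInMacroFusion and X86::isMacroFused helpers called above are expected to centralize exactly that table. A minimal standalone sketch of the table follows; the enum and function names in it are illustrative stand-ins, not LLVM code, and only the X86-namespace names quoted in the hunk come from the patch.

// Illustrative sketch of the fusion table removed from shouldScheduleAdjacent.
#include <cassert>

enum class FirstKind { Test, And, Cmp, ALU, IncDec, Invalid }; // cf. old FirstInstrKind
enum class SecondKind { ELG, AB, SPO, Invalid };               // cf. old JumpKind

static bool isMacroFusedSketch(FirstKind First, SecondKind Second) {
  if (Second == SecondKind::Invalid)
    return false; // The jump cannot be fused with anything.
  switch (First) {
  case FirstKind::Test:
  case FirstKind::And:
    return true; // TEST/AND fuse with every conditional jump.
  case FirstKind::Cmp:
  case FirstKind::ALU:
    return Second == SecondKind::ELG || Second == SecondKind::AB;
  case FirstKind::IncDec:
    return Second == SecondKind::ELG;
  case FirstKind::Invalid:
    return false;
  }
  return false;
}

int main() {
  assert(isMacroFusedSketch(FirstKind::Test, SecondKind::SPO));   // TEST + JS fuses
  assert(!isMacroFusedSketch(FirstKind::IncDec, SecondKind::AB)); // INC + JA does not
  return 0;
}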
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index 1aee01563c4b..0c791b6674dc 100644
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -25,6 +25,8 @@
#include "llvm/ADT/Hashing.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -32,6 +34,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -247,6 +250,12 @@ public:
static char ID;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
private:
using MemOpMap = DenseMap<MemOpKey, SmallVector<MachineInstr *, 16>>;
@@ -294,9 +303,9 @@ private:
DenseMap<const MachineInstr *, unsigned> InstrPos;
- MachineRegisterInfo *MRI;
- const X86InstrInfo *TII;
- const X86RegisterInfo *TRI;
+ MachineRegisterInfo *MRI = nullptr;
+ const X86InstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
};
} // end anonymous namespace
@@ -681,6 +690,11 @@ bool X86OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
TRI = MF.getSubtarget<X86Subtarget>().getRegisterInfo();
+ auto *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
// Process all basic blocks.
for (auto &MBB : MF) {
@@ -699,7 +713,9 @@ bool X86OptimizeLEAPass::runOnMachineFunction(MachineFunction &MF) {
// Remove redundant address calculations. Do it only for -Os/-Oz since only
// a code size gain is expected from this part of the pass.
- if (MF.getFunction().hasOptSize())
+ bool OptForSize = MF.getFunction().hasOptSize() ||
+ llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
+ if (OptForSize)
Changed |= removeRedundantAddrCalc(LEAs);
}
diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp
index af974c805c36..4c6bd0ccc2cd 100644
--- a/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -17,8 +17,11 @@
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/IR/Function.h"
@@ -52,6 +55,12 @@ namespace {
bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
MachineFunctionProperties getRequiredProperties() const override {
return MachineFunctionProperties().set(
MachineFunctionProperties::Property::NoVRegs);
@@ -105,6 +114,12 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
TSM.init(&MF.getSubtarget());
+ auto *PSI =
+ &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
+ &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
+ nullptr;
+
// Search through basic blocks and mark the ones that have early returns
ReturnBBs.clear();
VisitedBBs.clear();
@@ -118,6 +133,11 @@ bool PadShortFunc::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock *MBB = I->first;
unsigned Cycles = I->second;
+ // Function::hasOptSize is already checked above.
+ bool OptForSize = llvm::shouldOptimizeForSize(MBB, PSI, MBFI);
+ if (OptForSize)
+ continue;
+
if (Cycles < Threshold) {
// BB ends in a return. Skip over any DBG_VALUE instructions
// trailing the terminator.
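The X86OptimizeLEAs and X86PadShortFunction hunks above wire the same profile-aware size check into their MachineFunctionPasses: require ProfileSummaryInfo, lazily compute block frequencies only when a profile summary exists, and gate the size-only transform on hasOptSize() or a cold block. A rough sketch of that shared pattern is below, assuming it is built against the LLVM headers added in these hunks; "SizeAwarePass" is a placeholder name, not part of the patch.

// Sketch of the shared profile-guided size-optimization gating pattern.
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
using namespace llvm;

namespace {
struct SizeAwarePass : public MachineFunctionPass {
  static char ID;
  SizeAwarePass() : MachineFunctionPass(ID) {}

  // Declare the profile analyses so getAnalysis<> below is legal.
  void getAnalysisUsage(AnalysisUsage &AU) const override {
    AU.addRequired<ProfileSummaryInfoWrapperPass>();
    AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
    MachineFunctionPass::getAnalysisUsage(AU);
  }

  bool runOnMachineFunction(MachineFunction &MF) override {
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
    // Only pay for block frequencies when a profile is actually present.
    auto *MBFI = (PSI && PSI->hasProfileSummary())
                     ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
                     : nullptr;
    bool Changed = false;
    for (MachineBasicBlock &MBB : MF) {
      // Size-only rewrites fire for -Os/-Oz functions or for blocks the
      // profile considers cold.
      bool OptForSize = MF.getFunction().hasOptSize() ||
                        shouldOptimizeForSize(&MBB, PSI, MBFI);
      if (!OptForSize)
        continue;
      // ... perform the size-saving rewrite of MBB and set Changed ...
    }
    return Changed;
  }
};
char SizeAwarePass::ID = 0;
} // end anonymous namespace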
diff --git a/llvm/lib/Target/X86/X86PfmCounters.td b/llvm/lib/Target/X86/X86PfmCounters.td
index 5610f4bc8873..93238983afa2 100644
--- a/llvm/lib/Target/X86/X86PfmCounters.td
+++ b/llvm/lib/Target/X86/X86PfmCounters.td
@@ -81,14 +81,14 @@ def HaswellPfmCounters : ProcPfmCounters {
let CycleCounter = UnhaltedCoreCyclesPfmCounter;
let UopsCounter = UopsIssuedPfmCounter;
let IssueCounters = [
- PfmIssueCounter<"HWPort0", "uops_dispatched_port:port_0">,
- PfmIssueCounter<"HWPort1", "uops_dispatched_port:port_1">,
- PfmIssueCounter<"HWPort2", "uops_dispatched_port:port_2">,
- PfmIssueCounter<"HWPort3", "uops_dispatched_port:port_3">,
- PfmIssueCounter<"HWPort4", "uops_dispatched_port:port_4">,
- PfmIssueCounter<"HWPort5", "uops_dispatched_port:port_5">,
- PfmIssueCounter<"HWPort6", "uops_dispatched_port:port_6">,
- PfmIssueCounter<"HWPort7", "uops_dispatched_port:port_7">
+ PfmIssueCounter<"HWPort0", "uops_executed_port:port_0">,
+ PfmIssueCounter<"HWPort1", "uops_executed_port:port_1">,
+ PfmIssueCounter<"HWPort2", "uops_executed_port:port_2">,
+ PfmIssueCounter<"HWPort3", "uops_executed_port:port_3">,
+ PfmIssueCounter<"HWPort4", "uops_executed_port:port_4">,
+ PfmIssueCounter<"HWPort5", "uops_executed_port:port_5">,
+ PfmIssueCounter<"HWPort6", "uops_executed_port:port_6">,
+ PfmIssueCounter<"HWPort7", "uops_executed_port:port_7">
];
}
def : PfmCountersBinding<"haswell", HaswellPfmCounters>;
diff --git a/llvm/lib/Target/X86/X86RegisterBankInfo.cpp b/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
index daddf4231897..9c076d2d6769 100644
--- a/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -40,8 +40,9 @@ X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI)
assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
}
-const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass(
- const TargetRegisterClass &RC) const {
+const RegisterBank &
+X86RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
+ LLT) const {
if (X86::GR8RegClass.hasSubClassEq(&RC) ||
X86::GR16RegClass.hasSubClassEq(&RC) ||
diff --git a/llvm/lib/Target/X86/X86RegisterBankInfo.h b/llvm/lib/Target/X86/X86RegisterBankInfo.h
index c1f3001c6180..d5afd2cae761 100644
--- a/llvm/lib/Target/X86/X86RegisterBankInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterBankInfo.h
@@ -64,8 +64,8 @@ private:
public:
X86RegisterBankInfo(const TargetRegisterInfo &TRI);
- const RegisterBank &
- getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+ const RegisterBank &getRegBankFromRegClass(const TargetRegisterClass &RC,
+ LLT) const override;
InstructionMappings
getInstrAlternativeMappings(const MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index ff625325b4c9..f69626b2622e 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -341,6 +341,10 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return (HasSSE ? CSR_32_RegCall_SaveList :
CSR_32_RegCall_NoSSE_SaveList);
}
+ case CallingConv::CFGuard_Check:
+ assert(!Is64Bit && "CFGuard check mechanism only used on 32-bit X86");
+ return (HasSSE ? CSR_Win32_CFGuard_Check_SaveList
+ : CSR_Win32_CFGuard_Check_NoSSE_SaveList);
case CallingConv::Cold:
if (Is64Bit)
return CSR_64_MostRegs_SaveList;
@@ -455,6 +459,10 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return (HasSSE ? CSR_32_RegCall_RegMask :
CSR_32_RegCall_NoSSE_RegMask);
}
+ case CallingConv::CFGuard_Check:
+ assert(!Is64Bit && "CFGuard check mechanism only used on 32-bit X86");
+ return (HasSSE ? CSR_Win32_CFGuard_Check_RegMask
+ : CSR_Win32_CFGuard_Check_NoSSE_RegMask);
case CallingConv::Cold:
if (Is64Bit)
return CSR_64_MostRegs_RegMask;
@@ -515,24 +523,27 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Set the floating point control register as reserved.
Reserved.set(X86::FPCW);
+ // Set the floating point status register as reserved.
+ Reserved.set(X86::FPSW);
+
+ // Set the SIMD floating point control register as reserved.
+ Reserved.set(X86::MXCSR);
+
// Set the stack-pointer register and its aliases as reserved.
- for (MCSubRegIterator I(X86::RSP, this, /*IncludeSelf=*/true); I.isValid();
- ++I)
- Reserved.set(*I);
+ for (const MCPhysReg &SubReg : subregs_inclusive(X86::RSP))
+ Reserved.set(SubReg);
// Set the Shadow Stack Pointer as reserved.
Reserved.set(X86::SSP);
// Set the instruction pointer register and its aliases as reserved.
- for (MCSubRegIterator I(X86::RIP, this, /*IncludeSelf=*/true); I.isValid();
- ++I)
- Reserved.set(*I);
+ for (const MCPhysReg &SubReg : subregs_inclusive(X86::RIP))
+ Reserved.set(SubReg);
// Set the frame-pointer register and its aliases as reserved if needed.
if (TFI->hasFP(MF)) {
- for (MCSubRegIterator I(X86::RBP, this, /*IncludeSelf=*/true); I.isValid();
- ++I)
- Reserved.set(*I);
+ for (const MCPhysReg &SubReg : subregs_inclusive(X86::RBP))
+ Reserved.set(SubReg);
}
// Set the base-pointer register and its aliases as reserved if needed.
@@ -545,9 +556,8 @@ BitVector X86RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
"this calling convention.");
Register BasePtr = getX86SubSuperRegister(getBaseRegister(), 64);
- for (MCSubRegIterator I(BasePtr, this, /*IncludeSelf=*/true);
- I.isValid(); ++I)
- Reserved.set(*I);
+ for (const MCPhysReg &SubReg : subregs_inclusive(BasePtr))
+ Reserved.set(SubReg);
}
// Mark the segment registers as reserved.
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 0528b90c1fd5..3cfaf714e93e 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -294,6 +294,11 @@ def FPSW : X86Reg<"fpsr", 0>;
// Floating-point control word
def FPCW : X86Reg<"fpcr", 0>;
+// SIMD Floating-point control register.
+// Note: We only model the "Uses" of the control bits: current rounding modes,
+// DAZ, FTZ and exception masks. We don't model the "Defs" of flag bits.
+def MXCSR : X86Reg<"mxcsr", 0>;
+
// Status flags register.
//
// Note that some flags that are commonly thought of as part of the status
diff --git a/llvm/lib/Target/X86/X86RetpolineThunks.cpp b/llvm/lib/Target/X86/X86RetpolineThunks.cpp
index f8464c7e8298..9085d7f068ac 100644
--- a/llvm/lib/Target/X86/X86RetpolineThunks.cpp
+++ b/llvm/lib/Target/X86/X86RetpolineThunks.cpp
@@ -63,13 +63,13 @@ public:
}
private:
- MachineModuleInfo *MMI;
- const TargetMachine *TM;
- bool Is64Bit;
- const X86Subtarget *STI;
- const X86InstrInfo *TII;
+ MachineModuleInfo *MMI = nullptr;
+ const TargetMachine *TM = nullptr;
+ bool Is64Bit = false;
+ const X86Subtarget *STI = nullptr;
+ const X86InstrInfo *TII = nullptr;
- bool InsertedThunks;
+ bool InsertedThunks = false;
void createThunkFunction(Module &M, StringRef Name);
void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td
index 78acb1065ec8..b0153ca9da36 100644
--- a/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -888,8 +888,7 @@ def AtomWrite01_174 : SchedWriteRes<[AtomPort01]> {
let Latency = 174;
let ResourceCycles = [174];
}
-def : InstRW<[AtomWrite01_174], (instrs FSINCOS)>;
-def : InstRW<[AtomWrite01_174], (instregex "(COS|SIN)_F")>;
+def : InstRW<[AtomWrite01_174], (instrs FSINCOS, FSIN, FCOS)>;
def AtomWrite01_183 : SchedWriteRes<[AtomPort01]> {
let Latency = 183;
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index 8e3ce721f1a1..dcd155ea0e0e 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -202,8 +202,8 @@ defm : SLMWriteResPair<WriteFAddX, [SLM_FPC_RSV1], 3>;
defm : SLMWriteResPair<WriteFAddY, [SLM_FPC_RSV1], 3>;
defm : X86WriteResPairUnsupported<WriteFAddZ>;
defm : SLMWriteResPair<WriteFAdd64, [SLM_FPC_RSV1], 3>;
-defm : SLMWriteResPair<WriteFAdd64X, [SLM_FPC_RSV1], 3>;
-defm : SLMWriteResPair<WriteFAdd64Y, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFAdd64X, [SLM_FPC_RSV1], 4, [2]>;
+defm : SLMWriteResPair<WriteFAdd64Y, [SLM_FPC_RSV1], 4, [2]>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
defm : SLMWriteResPair<WriteFCmp, [SLM_FPC_RSV1], 3>;
defm : SLMWriteResPair<WriteFCmpX, [SLM_FPC_RSV1], 3>;
@@ -219,8 +219,8 @@ defm : SLMWriteResPair<WriteFMulX, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>
defm : SLMWriteResPair<WriteFMulY, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : SLMWriteResPair<WriteFMul64, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
-defm : SLMWriteResPair<WriteFMul64X, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
-defm : SLMWriteResPair<WriteFMul64Y, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
+defm : SLMWriteResPair<WriteFMul64X, [SLM_FPC_RSV0, SLMFPMultiplier], 7, [1,4]>;
+defm : SLMWriteResPair<WriteFMul64Y, [SLM_FPC_RSV0, SLMFPMultiplier], 7, [1,4]>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
defm : SLMWriteResPair<WriteFDiv, [SLM_FPC_RSV0, SLMFPDivider], 19, [1,17]>;
defm : SLMWriteResPair<WriteFDivX, [SLM_FPC_RSV0, SLMFPDivider], 39, [1,39]>;
@@ -380,8 +380,8 @@ def : WriteRes<WriteVecExtractSt, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
// Horizontal add/sub instructions.
////////////////////////////////////////////////////////////////////////////////
-defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV01], 3, [2]>;
-defm : SLMWriteResPair<WriteFHAddY, [SLM_FPC_RSV01], 3, [2]>;
+defm : SLMWriteResPair<WriteFHAdd, [SLM_FPC_RSV01], 6, [6], 4>;
+defm : SLMWriteResPair<WriteFHAddY, [SLM_FPC_RSV01], 6, [6], 4>;
defm : X86WriteResPairUnsupported<WriteFHAddZ>;
defm : SLMWriteResPair<WritePHAdd, [SLM_FPC_RSV01], 1>;
defm : SLMWriteResPair<WritePHAddX, [SLM_FPC_RSV01], 1>;
@@ -486,7 +486,7 @@ defm : X86WriteResPairUnsupported<WriteFBlendZ>;
defm : SLMWriteResPair<WriteVarBlend, [SLM_FPC_RSV0], 1>;
defm : X86WriteResPairUnsupported<WriteVarBlendY>;
defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
-defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 1>;
+defm : SLMWriteResPair<WriteFVarBlend, [SLM_FPC_RSV0], 4, [4], 3>;
defm : X86WriteResPairUnsupported<WriteFVarBlendY>;
defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
defm : X86WriteResPairUnsupported<WriteFShuffle256>;
@@ -511,4 +511,20 @@ defm : X86WriteResUnsupported<WriteCvtPS2PHSt>;
defm : X86WriteResUnsupported<WriteCvtPS2PHYSt>;
defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+// Remaining SLM instrs.
+
+def SLMWriteResGroup1rr : SchedWriteRes<[SLM_FPC_RSV01]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [4];
+}
+def: InstRW<[SLMWriteResGroup1rr], (instrs PADDQrr, PSUBQrr, PCMPEQQrr)>;
+
+def SLMWriteResGroup1rm : SchedWriteRes<[SLM_MEC_RSV,SLM_FPC_RSV01]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+ let ResourceCycles = [1,4];
+}
+def: InstRW<[SLMWriteResGroup1rm], (instrs PADDQrm, PSUBQrm, PCMPEQQrm)>;
+
} // SchedModel
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td
new file mode 100644
index 000000000000..4537d9cc7956
--- /dev/null
+++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -0,0 +1,1548 @@
+//=- X86ScheduleZnver2.td - X86 Znver2 Scheduling -------------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the machine model for Znver2 to support instruction
+// scheduling and other instruction cost heuristics.
+//
+//===----------------------------------------------------------------------===//
+
+def Znver2Model : SchedMachineModel {
+ // Zen can decode 4 instructions per cycle.
+ let IssueWidth = 4;
+  // MicroOpBufferSize is based on the size of the reorder buffer.
+ let MicroOpBufferSize = 224;
+ let LoadLatency = 4;
+ let MispredictPenalty = 17;
+ let HighLatency = 25;
+ let PostRAScheduler = 1;
+
+  // FIXME: Not all instructions are covered by the model yet,
+  // so mark it as incomplete.
+ let CompleteModel = 0;
+}
+
+let SchedModel = Znver2Model in {
+
+// Zen can issue micro-ops to 11 different units in one cycle.
+// These are:
+//  * Four integer ALU units (Zn2ALU0, Zn2ALU1, Zn2ALU2, Zn2ALU3)
+//  * Three AGU units (Zn2AGU0, Zn2AGU1, Zn2AGU2)
+//  * Four FPU units (Zn2FPU0, Zn2FPU1, Zn2FPU2, Zn2FPU3)
+// The AGUs feed the load/store queues, which sustain two loads and one store
+// per cycle.
+
+// Four ALU units are defined below
+def Zn2ALU0 : ProcResource<1>;
+def Zn2ALU1 : ProcResource<1>;
+def Zn2ALU2 : ProcResource<1>;
+def Zn2ALU3 : ProcResource<1>;
+
+// Three AGU units are defined below
+def Zn2AGU0 : ProcResource<1>;
+def Zn2AGU1 : ProcResource<1>;
+def Zn2AGU2 : ProcResource<1>;
+
+// Four FPU units are defined below
+def Zn2FPU0 : ProcResource<1>;
+def Zn2FPU1 : ProcResource<1>;
+def Zn2FPU2 : ProcResource<1>;
+def Zn2FPU3 : ProcResource<1>;
+
+// FPU grouping
+def Zn2FPU013 : ProcResGroup<[Zn2FPU0, Zn2FPU1, Zn2FPU3]>;
+def Zn2FPU01 : ProcResGroup<[Zn2FPU0, Zn2FPU1]>;
+def Zn2FPU12 : ProcResGroup<[Zn2FPU1, Zn2FPU2]>;
+def Zn2FPU13 : ProcResGroup<[Zn2FPU1, Zn2FPU3]>;
+def Zn2FPU23 : ProcResGroup<[Zn2FPU2, Zn2FPU3]>;
+def Zn2FPU02 : ProcResGroup<[Zn2FPU0, Zn2FPU2]>;
+def Zn2FPU03 : ProcResGroup<[Zn2FPU0, Zn2FPU3]>;
+
+// Below are the groupings of the units.
+// Micro-ops that can be issued to more than one unit are modeled with these
+// groups.
+
+// ALU grouping
+// Zn2ALU03 - 0,3 grouping
+def Zn2ALU03: ProcResGroup<[Zn2ALU0, Zn2ALU3]>;
+
+// 64 Entry (16x4 entries) Int Scheduler
+def Zn2ALU : ProcResGroup<[Zn2ALU0, Zn2ALU1, Zn2ALU2, Zn2ALU3]> {
+ let BufferSize=64;
+}
+
+// 28 Entry (14x2) AGU group. AGUs can't be used for all ALU operations
+// but are relevant for some instructions
+def Zn2AGU : ProcResGroup<[Zn2AGU0, Zn2AGU1, Zn2AGU2]> {
+ let BufferSize=28;
+}
+
+// Integer Multiplication issued on ALU1.
+def Zn2Multiplier : ProcResource<1>;
+
+// Integer division issued on ALU2.
+def Zn2Divider : ProcResource<1>;
+
+// The 4-cycle load-to-use latency is captured here.
+def : ReadAdvance<ReadAfterLd, 4>;
+
+// The 7-cycle vector load-to-use latency is captured here.
+def : ReadAdvance<ReadAfterVecLd, 7>;
+def : ReadAdvance<ReadAfterVecXLd, 7>;
+def : ReadAdvance<ReadAfterVecYLd, 7>;
+
+def : ReadAdvance<ReadInt2Fpu, 0>;
+
+// The Integer PRF for Zen is 168 entries, and it holds the architectural and
+// speculative version of the 64-bit integer registers.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def Zn2IntegerPRF : RegisterFile<168, [GR64, CCR]>;
+
+// 36 Entry (9x4 entries) floating-point Scheduler
+def Zn2FPU : ProcResGroup<[Zn2FPU0, Zn2FPU1, Zn2FPU2, Zn2FPU3]> {
+ let BufferSize=36;
+}
+
+// The Zen FP Retire Queue renames SIMD and FP uOps onto a pool of 160 128-bit
+// registers. Operations on 256-bit data types are cracked into two COPs.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+def Zn2FpuPRF: RegisterFile<160, [VR64, VR128, VR256], [1, 1, 2]>;
+
+// The retire control unit can track up to 192 macro ops in flight and
+// handles in-order commit of up to 8 macro ops per cycle.
+// Reference: "Software Optimization Guide for AMD Family 17h Processors"
+// Note that the retire unit is shared between integer and FP ops.
+// In SMT mode it is 96 entries per thread, but we do not use that conservative
+// value here because there is currently no way to fully model SMT.
+def Zn2RCU : RetireControlUnit<192, 8>;
+
+// A folded load is an instruction that loads and then performs an operation.
+// Example: ADDPD xmm, [mem] decomposes into two micro-ops:
+//   a. the load, and
+//   b. the addpd.
+// Instructions with folded loads are usually micro-fused, so they only appear
+// as these two micro-ops.
+// This multiclass handles folded loads for the integer units.
+multiclass Zn2WriteResPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadLat = 4, int LoadUOps = 1> {
+ // Register variant takes 1-cycle on Execution Port.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+  // Memory variant also uses a cycle on Zn2AGU and adds LoadLat cycles to
+  // the latency (default = 4).
+ def : WriteRes<SchedRW.Folded, !listconcat([Zn2AGU], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
+ }
+}
+
+// This multiclass is for folded loads for floating point units.
+multiclass Zn2WriteResFpuPair<X86FoldableSchedWrite SchedRW,
+ list<ProcResourceKind> ExePorts,
+ int Lat, list<int> Res = [], int UOps = 1,
+ int LoadLat = 7, int LoadUOps = 0> {
+ // Register variant takes 1-cycle on Execution Port.
+ def : WriteRes<SchedRW, ExePorts> {
+ let Latency = Lat;
+ let ResourceCycles = Res;
+ let NumMicroOps = UOps;
+ }
+
+  // Memory variant also uses a cycle on Zn2AGU and adds LoadLat cycles to
+  // the latency (default = 7).
+ def : WriteRes<SchedRW.Folded, !listconcat([Zn2AGU], ExePorts)> {
+ let Latency = !add(Lat, LoadLat);
+ let ResourceCycles = !if(!empty(Res), [], !listconcat([1], Res));
+ let NumMicroOps = !add(UOps, LoadUOps);
+ }
+}
+
+// WriteRMW is set for instructions with a memory-write operation in codegen.
+def : WriteRes<WriteRMW, [Zn2AGU]>;
+
+def : WriteRes<WriteStore, [Zn2AGU]>;
+def : WriteRes<WriteStoreNT, [Zn2AGU]>;
+def : WriteRes<WriteMove, [Zn2ALU]>;
+def : WriteRes<WriteLoad, [Zn2AGU]> { let Latency = 8; }
+
+def : WriteRes<WriteZero, []>;
+def : WriteRes<WriteLEA, [Zn2ALU]>;
+defm : Zn2WriteResPair<WriteALU, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteADC, [Zn2ALU], 1>;
+
+defm : Zn2WriteResPair<WriteIMul8, [Zn2ALU1, Zn2Multiplier], 4>;
+
+defm : X86WriteRes<WriteBSWAP32, [Zn2ALU], 1, [4], 1>;
+defm : X86WriteRes<WriteBSWAP64, [Zn2ALU], 1, [4], 1>;
+defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHGRMW,[Zn2ALU,Zn2AGU], 8, [1,1], 5>;
+defm : X86WriteRes<WriteXCHG, [Zn2ALU], 1, [2], 2>;
+
+defm : Zn2WriteResPair<WriteShift, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteShiftCL, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteRotate, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteRotateCL, [Zn2ALU], 1>;
+
+defm : X86WriteRes<WriteSHDrri, [Zn2ALU], 1, [1], 1>;
+defm : X86WriteResUnsupported<WriteSHDrrcl>;
+defm : X86WriteResUnsupported<WriteSHDmri>;
+defm : X86WriteResUnsupported<WriteSHDmrcl>;
+
+defm : Zn2WriteResPair<WriteJump, [Zn2ALU], 1>;
+defm : Zn2WriteResFpuPair<WriteCRC32, [Zn2FPU0], 3>;
+
+defm : Zn2WriteResPair<WriteCMOV, [Zn2ALU], 1>;
+def : WriteRes<WriteSETCC, [Zn2ALU]>;
+def : WriteRes<WriteSETCCStore, [Zn2ALU, Zn2AGU]>;
+defm : X86WriteRes<WriteLAHFSAHF, [Zn2ALU], 2, [1], 2>;
+
+defm : X86WriteRes<WriteBitTest, [Zn2ALU], 1, [1], 1>;
+defm : X86WriteRes<WriteBitTestImmLd, [Zn2ALU,Zn2AGU], 5, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestRegLd, [Zn2ALU,Zn2AGU], 5, [1,1], 2>;
+defm : X86WriteRes<WriteBitTestSet, [Zn2ALU], 2, [1], 2>;
+
+// Bit counts.
+defm : Zn2WriteResPair<WriteBSF, [Zn2ALU], 3>;
+defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 3>;
+defm : Zn2WriteResPair<WriteLZCNT, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteTZCNT, [Zn2ALU], 2>;
+defm : Zn2WriteResPair<WritePOPCNT, [Zn2ALU], 1>;
+
+// Treat misc copies as a move.
+def : InstRW<[WriteMove], (instrs COPY)>;
+
+// BMI1 BEXTR, BMI2 BZHI
+defm : Zn2WriteResPair<WriteBEXTR, [Zn2ALU], 1>;
+defm : Zn2WriteResPair<WriteBZHI, [Zn2ALU], 1>;
+
+// IDIV
+defm : Zn2WriteResPair<WriteDiv8, [Zn2ALU2, Zn2Divider], 15, [1,15], 1>;
+defm : Zn2WriteResPair<WriteDiv16, [Zn2ALU2, Zn2Divider], 17, [1,17], 2>;
+defm : Zn2WriteResPair<WriteDiv32, [Zn2ALU2, Zn2Divider], 25, [1,25], 2>;
+defm : Zn2WriteResPair<WriteDiv64, [Zn2ALU2, Zn2Divider], 41, [1,41], 2>;
+defm : Zn2WriteResPair<WriteIDiv8, [Zn2ALU2, Zn2Divider], 15, [1,15], 1>;
+defm : Zn2WriteResPair<WriteIDiv16, [Zn2ALU2, Zn2Divider], 17, [1,17], 2>;
+defm : Zn2WriteResPair<WriteIDiv32, [Zn2ALU2, Zn2Divider], 25, [1,25], 2>;
+defm : Zn2WriteResPair<WriteIDiv64, [Zn2ALU2, Zn2Divider], 41, [1,41], 2>;
+
+// IMULH
+def : WriteRes<WriteIMulH, [Zn2ALU1, Zn2Multiplier]>{
+ let Latency = 4;
+}
+
+// Floating point operations
+defm : X86WriteRes<WriteFLoad, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFLoadX, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFLoadY, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteFMaskedLoad, [Zn2AGU,Zn2FPU01], 8, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedLoadY, [Zn2AGU,Zn2FPU01], 8, [1,1], 2>;
+defm : X86WriteRes<WriteFMaskedStore32, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore32Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteFMaskedStore64, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteFMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
+
+defm : X86WriteRes<WriteFStore, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreX, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreY, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreNT, [Zn2AGU,Zn2FPU2], 8, [1,1], 1>;
+defm : X86WriteRes<WriteFStoreNTX, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFStoreNTY, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMove, [Zn2FPU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveX, [Zn2FPU], 1, [1], 1>;
+defm : X86WriteRes<WriteFMoveY, [Zn2FPU], 1, [1], 1>;
+
+defm : Zn2WriteResFpuPair<WriteFAdd, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFAddX, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFAddY, [Zn2FPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFAddZ>;
+defm : Zn2WriteResFpuPair<WriteFAdd64, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFAdd64X, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFAdd64Y, [Zn2FPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
+defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmpZ>;
+defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU0], 3>;
+defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
+defm : Zn2WriteResFpuPair<WriteFCom, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFBlend, [Zn2FPU01], 1>;
+defm : Zn2WriteResFpuPair<WriteFBlendY, [Zn2FPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteFBlendZ>;
+defm : Zn2WriteResFpuPair<WriteFVarBlend, [Zn2FPU01], 1>;
+defm : Zn2WriteResFpuPair<WriteFVarBlendY,[Zn2FPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarBlendZ>;
+defm : Zn2WriteResFpuPair<WriteVarBlend, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteVarBlendY, [Zn2FPU0], 1>;
+defm : X86WriteResPairUnsupported<WriteVarBlendZ>;
+defm : Zn2WriteResFpuPair<WriteCvtSS2I, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtPS2I, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtPS2IY, [Zn2FPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtPS2IZ>;
+defm : Zn2WriteResFpuPair<WriteCvtSD2I, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtPD2I, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtPD2IY, [Zn2FPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtPD2IZ>;
+defm : Zn2WriteResFpuPair<WriteCvtI2SS, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtI2PS, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtI2PSY, [Zn2FPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PSZ>;
+defm : Zn2WriteResFpuPair<WriteCvtI2SD, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtI2PD, [Zn2FPU3], 5>;
+defm : Zn2WriteResFpuPair<WriteCvtI2PDY, [Zn2FPU3], 5>;
+defm : X86WriteResPairUnsupported<WriteCvtI2PDZ>;
+defm : Zn2WriteResFpuPair<WriteFDiv, [Zn2FPU3], 15>;
+defm : Zn2WriteResFpuPair<WriteFDivX, [Zn2FPU3], 15>;
+defm : X86WriteResPairUnsupported<WriteFDivZ>;
+defm : Zn2WriteResFpuPair<WriteFDiv64, [Zn2FPU3], 15>;
+defm : Zn2WriteResFpuPair<WriteFDiv64X, [Zn2FPU3], 15>;
+defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
+defm : Zn2WriteResFpuPair<WriteFSign, [Zn2FPU3], 2>;
+defm : Zn2WriteResFpuPair<WriteFRnd, [Zn2FPU3], 4, [1], 1, 7, 0>;
+defm : Zn2WriteResFpuPair<WriteFRndY, [Zn2FPU3], 4, [1], 1, 7, 0>;
+defm : X86WriteResPairUnsupported<WriteFRndZ>;
+defm : Zn2WriteResFpuPair<WriteFLogic, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteFLogicY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteFLogicZ>;
+defm : Zn2WriteResFpuPair<WriteFTest, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteFTestY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteFTestZ>;
+defm : Zn2WriteResFpuPair<WriteFShuffle, [Zn2FPU12], 1>;
+defm : Zn2WriteResFpuPair<WriteFShuffleY, [Zn2FPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 1>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
+defm : Zn2WriteResFpuPair<WriteFMul, [Zn2FPU01], 3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMulX, [Zn2FPU01], 3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMulY, [Zn2FPU01], 4, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFMulZ>;
+defm : Zn2WriteResFpuPair<WriteFMul64, [Zn2FPU01], 3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMul64X, [Zn2FPU01], 3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMul64Y, [Zn2FPU01], 4, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFMul64Z>;
+defm : Zn2WriteResFpuPair<WriteFMA, [Zn2FPU03], 5>;
+defm : Zn2WriteResFpuPair<WriteFMAX, [Zn2FPU03], 5>;
+defm : Zn2WriteResFpuPair<WriteFMAY, [Zn2FPU03], 5>;
+defm : X86WriteResPairUnsupported<WriteFMAZ>;
+defm : Zn2WriteResFpuPair<WriteFRcp, [Zn2FPU01], 5>;
+defm : Zn2WriteResFpuPair<WriteFRcpX, [Zn2FPU01], 5>;
+defm : Zn2WriteResFpuPair<WriteFRcpY, [Zn2FPU01], 5, [1], 1, 7, 2>;
+defm : X86WriteResPairUnsupported<WriteFRcpZ>;
+defm : Zn2WriteResFpuPair<WriteFRsqrtX, [Zn2FPU01], 5, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFRsqrtZ>;
+defm : Zn2WriteResFpuPair<WriteFSqrt, [Zn2FPU3], 20, [20]>;
+defm : Zn2WriteResFpuPair<WriteFSqrtX, [Zn2FPU3], 20, [20]>;
+defm : Zn2WriteResFpuPair<WriteFSqrtY, [Zn2FPU3], 28, [28], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrtZ>;
+defm : Zn2WriteResFpuPair<WriteFSqrt64, [Zn2FPU3], 20, [20]>;
+defm : Zn2WriteResFpuPair<WriteFSqrt64X, [Zn2FPU3], 20, [20]>;
+defm : Zn2WriteResFpuPair<WriteFSqrt64Y, [Zn2FPU3], 20, [20], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteFSqrt64Z>;
+defm : Zn2WriteResFpuPair<WriteFSqrt80, [Zn2FPU3], 20, [20]>;
+
+// Vector integer operations, which use the FPU units.
+defm : X86WriteRes<WriteVecLoad, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadX, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadY, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNT, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecLoadNTY, [Zn2AGU], 8, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedLoad, [Zn2AGU,Zn2FPU01], 8, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedLoadY, [Zn2AGU,Zn2FPU01], 8, [1,2], 2>;
+defm : X86WriteRes<WriteVecStore, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreX, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreY, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreNT, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecStoreNTY, [Zn2AGU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStoreY, [Zn2AGU,Zn2FPU01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteVecMove, [Zn2FPU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveX, [Zn2FPU], 1, [1], 1>;
+defm : X86WriteRes<WriteVecMoveY, [Zn2FPU], 2, [1], 2>;
+defm : X86WriteRes<WriteVecMoveToGpr, [Zn2FPU2], 2, [1], 1>;
+defm : X86WriteRes<WriteVecMoveFromGpr, [Zn2FPU2], 3, [1], 1>;
+defm : X86WriteRes<WriteEMMS, [Zn2FPU], 2, [1], 1>;
+
+defm : Zn2WriteResFpuPair<WriteVecShift, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftX, [Zn2FPU2], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftY, [Zn2FPU2], 2>;
+defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
+defm : Zn2WriteResFpuPair<WriteVecShiftImm, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftImmX, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecShiftImmY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecShiftImmZ>;
+defm : Zn2WriteResFpuPair<WriteVecLogic, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecLogicX, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecLogicY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
+defm : Zn2WriteResFpuPair<WriteVecTest, [Zn2FPU12], 1, [2], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteVecTestY, [Zn2FPU12], 1, [2], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WriteVecTestZ>;
+defm : Zn2WriteResFpuPair<WriteVecALU, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecALUX, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVecALUY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVecALUZ>;
+defm : Zn2WriteResFpuPair<WriteVecIMul, [Zn2FPU0], 4>;
+defm : Zn2WriteResFpuPair<WriteVecIMulX, [Zn2FPU0], 4>;
+defm : Zn2WriteResFpuPair<WriteVecIMulY, [Zn2FPU0], 4>;
+defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
+defm : Zn2WriteResFpuPair<WritePMULLD, [Zn2FPU0], 4, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WritePMULLDY, [Zn2FPU0], 3, [1], 1, 7, 1>;
+defm : X86WriteResPairUnsupported<WritePMULLDZ>;
+defm : Zn2WriteResFpuPair<WriteShuffle, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteShuffleX, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteShuffleY, [Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteShuffleZ>;
+defm : Zn2WriteResFpuPair<WriteVarShuffle, [Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVarShuffleX,[Zn2FPU], 1>;
+defm : Zn2WriteResFpuPair<WriteVarShuffleY,[Zn2FPU], 1>;
+defm : X86WriteResPairUnsupported<WriteVarShuffleZ>;
+defm : Zn2WriteResFpuPair<WriteBlend, [Zn2FPU01], 1>;
+defm : Zn2WriteResFpuPair<WriteBlendY, [Zn2FPU01], 1>;
+defm : X86WriteResPairUnsupported<WriteBlendZ>;
+defm : Zn2WriteResFpuPair<WriteShuffle256, [Zn2FPU], 2>;
+defm : Zn2WriteResFpuPair<WriteVarShuffle256, [Zn2FPU], 2>;
+defm : Zn2WriteResFpuPair<WritePSADBW, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WritePSADBWX, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WritePSADBWY, [Zn2FPU0], 3>;
+defm : X86WriteResPairUnsupported<WritePSADBWZ>;
+defm : Zn2WriteResFpuPair<WritePHMINPOS, [Zn2FPU0], 4>;
+
+// Vector Shift Operations
+defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU12], 1>;
+defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 1>;
+defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
+
+// Vector insert/extract operations.
+defm : Zn2WriteResFpuPair<WriteVecInsert, [Zn2FPU], 1>;
+
+def : WriteRes<WriteVecExtract, [Zn2FPU12, Zn2FPU2]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 2];
+}
+def : WriteRes<WriteVecExtractSt, [Zn2AGU, Zn2FPU12, Zn2FPU2]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2, 3];
+}
+
+// MOVMSK Instructions.
+def : WriteRes<WriteFMOVMSK, [Zn2FPU2]>;
+def : WriteRes<WriteMMXMOVMSK, [Zn2FPU2]>;
+def : WriteRes<WriteVecMOVMSK, [Zn2FPU2]>;
+
+def : WriteRes<WriteVecMOVMSKY, [Zn2FPU2]> {
+ let NumMicroOps = 2;
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// AES Instructions.
+defm : Zn2WriteResFpuPair<WriteAESDecEnc, [Zn2FPU01], 4>;
+defm : Zn2WriteResFpuPair<WriteAESIMC, [Zn2FPU01], 4>;
+defm : Zn2WriteResFpuPair<WriteAESKeyGen, [Zn2FPU01], 4>;
+
+def : WriteRes<WriteFence, [Zn2AGU]>;
+def : WriteRes<WriteNop, []>;
+
+// The following instructions are microcoded; we assign them a long latency
+// (100 cycles) to model them effectively blocking the pipeline.
+defm : Zn2WriteResFpuPair<WriteFShuffle256, [Zn2FPU], 100>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>;
+
+// Microcoded Instructions
+def Zn2WriteMicrocoded : SchedWriteRes<[]> {
+ let Latency = 100;
+}
+
+def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteSystem, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteMPSAD, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteMPSADY, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteMPSADLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteMPSADYLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCLMul, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCLMulLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrM, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrMLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrI, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrILd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrM, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpEStrMLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrI, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePCmpIStrILd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteLDMXCSR, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteSTMXCSR, Zn2WriteMicrocoded>;
+
+//=== Regex based InstRW ===//
+// Notation:
+// - r: register.
+// - m: memory.
+// - i: immediate.
+// - mm: 64-bit MMX register.
+// - x: 128-bit XMM register.
+// - (x)mm: MMX or XMM register.
+// - y: 256-bit YMM register.
+// - v: any vector register.
+
+//=== Integer Instructions ===//
+//-- Move instructions --//
+// MOV.
+// r16,m.
+def : InstRW<[WriteALULd, ReadAfterLd], (instregex "MOV16rm")>;
+
+// MOVSX, MOVZX.
+// r,m.
+def : InstRW<[WriteLoad], (instregex "MOV(S|Z)X32rm(8|16)")>;
+
+// XCHG.
+// r,r.
+def Zn2WriteXCHG : SchedWriteRes<[Zn2ALU]> {
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[Zn2WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
+
+// r,m.
+def Zn2WriteXCHGrm : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>;
+
+def : InstRW<[WriteMicrocoded], (instrs XLAT)>;
+
+// POP16.
+// r.
+def Zn2WritePop16r : SchedWriteRes<[Zn2AGU]>{
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WritePop16r], (instregex "POP16rmm")>;
+def : InstRW<[WriteMicrocoded], (instregex "POPF(16|32)")>;
+def : InstRW<[WriteMicrocoded], (instregex "POPA(16|32)")>;
+
+
+// PUSH.
+// r. Has default values.
+// m.
+def Zn2WritePUSH : SchedWriteRes<[Zn2AGU]>{
+ let Latency = 4;
+}
+def : InstRW<[Zn2WritePUSH], (instregex "PUSH(16|32)rmm")>;
+
+//PUSHF
+def : InstRW<[WriteMicrocoded], (instregex "PUSHF(16|32)")>;
+
+// PUSHA.
+def Zn2WritePushA : SchedWriteRes<[Zn2AGU]> {
+ let Latency = 8;
+}
+def : InstRW<[Zn2WritePushA], (instregex "PUSHA(16|32)")>;
+
+//LAHF
+def : InstRW<[WriteMicrocoded], (instrs LAHF)>;
+
+// MOVBE.
+// r,m.
+def Zn2WriteMOVBE : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
+ let Latency = 5;
+}
+def : InstRW<[Zn2WriteMOVBE, ReadAfterLd], (instregex "MOVBE(16|32|64)rm")>;
+
+// m16,r16.
+def : InstRW<[Zn2WriteMOVBE], (instregex "MOVBE(16|32|64)mr")>;
+
+//-- Arithmetic instructions --//
+
+// ADD SUB.
+// m,r/i.
+def : InstRW<[WriteALULd], (instregex "(ADD|SUB)(8|16|32|64)m(r|i)",
+ "(ADD|SUB)(8|16|32|64)mi8",
+ "(ADD|SUB)64mi32")>;
+
+// ADC SBB.
+// m,r/i.
+def : InstRW<[WriteALULd],
+ (instregex "(ADC|SBB)(8|16|32|64)m(r|i)",
+ "(ADC|SBB)(16|32|64)mi8",
+ "(ADC|SBB)64mi32")>;
+
+// INC DEC NOT NEG.
+// m.
+def : InstRW<[WriteALULd],
+ (instregex "(INC|DEC|NOT|NEG)(8|16|32|64)m")>;
+
+// MUL IMUL.
+// r16.
+def Zn2WriteMul16 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteIMul16, Zn2WriteMul16>;
+def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16>;
+def : SchedAlias<WriteIMul16Reg, Zn2WriteMul16>;
+
+// m16.
+def Zn2WriteMul16Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 7;
+}
+def : SchedAlias<WriteIMul16Ld, Zn2WriteMul16Ld>;
+def : SchedAlias<WriteIMul16ImmLd, Zn2WriteMul16Ld>;
+def : SchedAlias<WriteIMul16RegLd, Zn2WriteMul16Ld>;
+
+// r32.
+def Zn2WriteMul32 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteIMul32, Zn2WriteMul32>;
+def : SchedAlias<WriteIMul32Imm, Zn2WriteMul32>;
+def : SchedAlias<WriteIMul32Reg, Zn2WriteMul32>;
+
+// m32.
+def Zn2WriteMul32Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 7;
+}
+def : SchedAlias<WriteIMul32Ld, Zn2WriteMul32Ld>;
+def : SchedAlias<WriteIMul32ImmLd, Zn2WriteMul32Ld>;
+def : SchedAlias<WriteIMul32RegLd, Zn2WriteMul32Ld>;
+
+// r64.
+def Zn2WriteMul64 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteIMul64, Zn2WriteMul64>;
+def : SchedAlias<WriteIMul64Imm, Zn2WriteMul64>;
+def : SchedAlias<WriteIMul64Reg, Zn2WriteMul64>;
+
+// m64.
+def Zn2WriteMul64Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteIMul64Ld, Zn2WriteMul64Ld>;
+def : SchedAlias<WriteIMul64ImmLd, Zn2WriteMul64Ld>;
+def : SchedAlias<WriteIMul64RegLd, Zn2WriteMul64Ld>;
+
+// MULX.
+// r32,r32,r32.
+def Zn2WriteMulX32 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 3;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[Zn2WriteMulX32], (instrs MULX32rr)>;
+
+// r32,r32,m32.
+def Zn2WriteMulX32Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 7;
+ let ResourceCycles = [1, 2, 2];
+}
+def : InstRW<[Zn2WriteMulX32Ld, ReadAfterLd], (instrs MULX32rm)>;
+
+// r64,r64,r64.
+def Zn2WriteMulX64 : SchedWriteRes<[Zn2ALU1]> {
+ let Latency = 3;
+}
+def : InstRW<[Zn2WriteMulX64], (instrs MULX64rr)>;
+
+// r64,r64,m64.
+def Zn2WriteMulX64Ld : SchedWriteRes<[Zn2AGU, Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 7;
+}
+def : InstRW<[Zn2WriteMulX64Ld, ReadAfterLd], (instrs MULX64rm)>;
+
+//-- Control transfer instructions --//
+
+// J(E|R)CXZ.
+def Zn2WriteJCXZ : SchedWriteRes<[Zn2ALU03]>;
+def : InstRW<[Zn2WriteJCXZ], (instrs JCXZ, JECXZ, JRCXZ)>;
+
+// INTO
+def : InstRW<[WriteMicrocoded], (instrs INTO)>;
+
+// LOOP.
+def Zn2WriteLOOP : SchedWriteRes<[Zn2ALU03]>;
+def : InstRW<[Zn2WriteLOOP], (instrs LOOP)>;
+
+// LOOP(N)E, LOOP(N)Z
+def Zn2WriteLOOPE : SchedWriteRes<[Zn2ALU03]>;
+def : InstRW<[Zn2WriteLOOPE], (instrs LOOPE, LOOPNE)>;
+
+// CALL.
+// r.
+def Zn2WriteCALLr : SchedWriteRes<[Zn2AGU, Zn2ALU03]>;
+def : InstRW<[Zn2WriteCALLr], (instregex "CALL(16|32)r")>;
+
+def : InstRW<[WriteMicrocoded], (instregex "CALL(16|32)m")>;
+
+// RET.
+def Zn2WriteRET : SchedWriteRes<[Zn2ALU03]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteRET], (instregex "RET(L|Q|W)", "LRET(L|Q|W)",
+ "IRET(16|32|64)")>;
+
+//-- Logic instructions --//
+
+// AND OR XOR.
+// m,r/i.
+def : InstRW<[WriteALULd],
+ (instregex "(AND|OR|XOR)(8|16|32|64)m(r|i)",
+ "(AND|OR|XOR)(8|16|32|64)mi8", "(AND|OR|XOR)64mi32")>;
+
+// Define ALU latency variants
+def Zn2WriteALULat2 : SchedWriteRes<[Zn2ALU]> {
+ let Latency = 2;
+}
+def Zn2WriteALULat2Ld : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
+ let Latency = 6;
+}
+
+// BT.
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "BT(16|32|64)mi8")>;
+
+// BTR BTS BTC.
+// r,r,i.
+def Zn2WriteBTRSC : SchedWriteRes<[Zn2ALU]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteBTRSC], (instregex "BT(R|S|C)(16|32|64)r(r|i8)")>;
+
+// m,r,i.
+def Zn2WriteBTRSCm : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
+ let Latency = 6;
+ let NumMicroOps = 2;
+}
+// m,r,i.
+def : SchedAlias<WriteBitTestSetImmRMW, Zn2WriteBTRSCm>;
+def : SchedAlias<WriteBitTestSetRegRMW, Zn2WriteBTRSCm>;
+
+// BLSI BLSMSK BLSR.
+// r,r.
+def : SchedAlias<WriteBLS, Zn2WriteALULat2>;
+// r,m.
+def : SchedAlias<WriteBLSLd, Zn2WriteALULat2Ld>;
+
+// CLD STD.
+def : InstRW<[WriteALU], (instrs STD, CLD)>;
+
+// PDEP PEXT.
+// r,r,r.
+def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rr", "PEXT(32|64)rr")>;
+// r,r,m.
+def : InstRW<[WriteMicrocoded], (instregex "PDEP(32|64)rm", "PEXT(32|64)rm")>;
+
+// RCR RCL.
+// m,i.
+def : InstRW<[WriteMicrocoded], (instregex "RC(R|L)(8|16|32|64)m(1|i|CL)")>;
+
+// SHR SHL SAR.
+// m,i.
+def : InstRW<[WriteShiftLd], (instregex "S(A|H)(R|L)(8|16|32|64)m(i|1)")>;
+
+// SHRD SHLD.
+// m,r
+def : InstRW<[WriteShiftLd], (instregex "SH(R|L)D(16|32|64)mri8")>;
+
+// r,r,cl.
+def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)rrCL")>;
+
+// m,r,cl.
+def : InstRW<[WriteMicrocoded], (instregex "SH(R|L)D(16|32|64)mrCL")>;
+
+//-- Misc instructions --//
+// CMPXCHG8B.
+def Zn2WriteCMPXCHG8B : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
+ let NumMicroOps = 18;
+}
+def : InstRW<[Zn2WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
+
+def : InstRW<[WriteMicrocoded], (instrs CMPXCHG16B)>;
+
+// LEAVE
+def Zn2WriteLEAVE : SchedWriteRes<[Zn2ALU, Zn2AGU]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteLEAVE], (instregex "LEAVE")>;
+
+// PAUSE.
+def : InstRW<[WriteMicrocoded], (instrs PAUSE)>;
+
+// RDTSC.
+def : InstRW<[WriteMicrocoded], (instregex "RDTSC")>;
+
+// RDPMC.
+def : InstRW<[WriteMicrocoded], (instrs RDPMC)>;
+
+// RDRAND.
+def : InstRW<[WriteMicrocoded], (instregex "RDRAND(16|32|64)r")>;
+
+// XGETBV.
+def : InstRW<[WriteMicrocoded], (instregex "XGETBV")>;
+
+//-- String instructions --//
+// CMPS.
+def : InstRW<[WriteMicrocoded], (instregex "CMPS(B|L|Q|W)")>;
+
+// LODSB/W.
+def : InstRW<[WriteMicrocoded], (instregex "LODS(B|W)")>;
+
+// LODSD/Q.
+def : InstRW<[WriteMicrocoded], (instregex "LODS(L|Q)")>;
+
+// MOVS.
+def : InstRW<[WriteMicrocoded], (instregex "MOVS(B|L|Q|W)")>;
+
+// SCAS.
+def : InstRW<[WriteMicrocoded], (instregex "SCAS(B|W|L|Q)")>;
+
+// STOS
+def : InstRW<[WriteMicrocoded], (instregex "STOS(B|L|Q|W)")>;
+
+// XADD.
+def Zn2XADD : SchedWriteRes<[Zn2ALU]>;
+def : InstRW<[Zn2XADD], (instregex "XADD(8|16|32|64)rr")>;
+def : InstRW<[WriteMicrocoded], (instregex "XADD(8|16|32|64)rm")>;
+
+//=== Floating Point x87 Instructions ===//
+//-- Move instructions --//
+
+def Zn2WriteFLDr : SchedWriteRes<[Zn2FPU13]> ;
+
+def Zn2WriteSTr: SchedWriteRes<[Zn2FPU23]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// LD_F.
+// r.
+def : InstRW<[Zn2WriteFLDr], (instregex "LD_Frr")>;
+
+// m.
+def Zn2WriteLD_F80m : SchedWriteRes<[Zn2AGU, Zn2FPU13]> {
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteLD_F80m], (instregex "LD_F80m")>;
+
+// FBLD.
+def : InstRW<[WriteMicrocoded], (instregex "FBLDm")>;
+
+// FST(P).
+// r.
+def : InstRW<[Zn2WriteSTr], (instregex "ST_(F|FP)rr")>;
+
+// m80.
+def Zn2WriteST_FP80m : SchedWriteRes<[Zn2AGU, Zn2FPU23]> {
+ let Latency = 5;
+}
+def : InstRW<[Zn2WriteST_FP80m], (instregex "ST_FP80m")>;
+
+// FBSTP.
+// m80.
+def : InstRW<[WriteMicrocoded], (instregex "FBSTPm")>;
+
+def Zn2WriteFXCH : SchedWriteRes<[Zn2FPU]>;
+
+// FXCH.
+def : InstRW<[Zn2WriteFXCH], (instrs XCH_F)>;
+
+// FILD.
+def Zn2WriteFILD : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+}
+def : InstRW<[Zn2WriteFILD], (instregex "ILD_F(16|32|64)m")>;
+
+// FIST(P) FISTTP.
+def Zn2WriteFIST : SchedWriteRes<[Zn2AGU, Zn2FPU23]> {
+ let Latency = 12;
+}
+def : InstRW<[Zn2WriteFIST], (instregex "IS(T|TT)_(F|FP)(16|32|64)m")>;
+
+def Zn2WriteFPU13 : SchedWriteRes<[Zn2AGU, Zn2FPU13]> {
+ let Latency = 8;
+}
+
+def Zn2WriteFPU3 : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 11;
+}
+
+// FLDZ.
+def : SchedAlias<WriteFLD0, Zn2WriteFPU13>;
+
+// FLD1.
+def : SchedAlias<WriteFLD1, Zn2WriteFPU3>;
+
+// FLDPI FLDL2E etc.
+def : SchedAlias<WriteFLDC, Zn2WriteFPU3>;
+
+// FNSTSW.
+// AX.
+def : InstRW<[WriteMicrocoded], (instrs FNSTSW16r)>;
+
+// m16.
+def : InstRW<[WriteMicrocoded], (instrs FNSTSWm)>;
+
+// FLDCW.
+def : InstRW<[WriteMicrocoded], (instrs FLDCW16m)>;
+
+// FNSTCW.
+def : InstRW<[WriteMicrocoded], (instrs FNSTCW16m)>;
+
+// FINCSTP FDECSTP.
+def : InstRW<[Zn2WriteFPU3], (instrs FINCSTP, FDECSTP)>;
+
+// FFREE.
+def : InstRW<[Zn2WriteFPU3], (instregex "FFREE")>;
+
+// FNSAVE.
+def : InstRW<[WriteMicrocoded], (instregex "FSAVEm")>;
+
+// FRSTOR.
+def : InstRW<[WriteMicrocoded], (instregex "FRSTORm")>;
+
+//-- Arithmetic instructions --//
+
+def Zn2WriteFPU3Lat1 : SchedWriteRes<[Zn2FPU3]> ;
+
+def Zn2WriteFPU0Lat1 : SchedWriteRes<[Zn2FPU0]> ;
+
+def Zn2WriteFPU0Lat1Ld : SchedWriteRes<[Zn2AGU, Zn2FPU0]> {
+ let Latency = 8;
+}
+
+// FCHS.
+def : InstRW<[Zn2WriteFPU3Lat1], (instregex "CHS_F")>;
+
+// FCOM(P) FUCOM(P).
+// r.
+def : InstRW<[Zn2WriteFPU0Lat1], (instregex "COM(P?)_FST0r", "UCOM_F(P?)r")>;
+// m.
+def : InstRW<[Zn2WriteFPU0Lat1Ld], (instregex "FCOM(P?)(32|64)m")>;
+
+// FCOMPP FUCOMPP.
+// r.
+def : InstRW<[Zn2WriteFPU0Lat1], (instrs FCOMPP, UCOM_FPPr)>;
+
+def Zn2WriteFPU02 : SchedWriteRes<[Zn2AGU, Zn2FPU02]> {
+  let Latency = 9;
+}
+
+// FCOMI(P) FUCOMI(P).
+// m.
+def : InstRW<[Zn2WriteFPU02], (instrs COM_FIPr, COM_FIr, UCOM_FIPr, UCOM_FIr)>;
+
+def Zn2WriteFPU03 : SchedWriteRes<[Zn2AGU, Zn2FPU03]> {
+  let Latency = 12;
+  let NumMicroOps = 2;
+  let ResourceCycles = [1,3];
+}
+
+// FICOM(P).
+def : InstRW<[Zn2WriteFPU03], (instregex "FICOM(P?)(16|32)m")>;
+
+// FTST.
+def : InstRW<[Zn2WriteFPU0Lat1], (instregex "TST_F")>;
+
+// FXAM.
+def : InstRW<[Zn2WriteFPU3Lat1], (instrs FXAM)>;
+
+// FPREM.
+def : InstRW<[WriteMicrocoded], (instrs FPREM)>;
+
+// FPREM1.
+def : InstRW<[WriteMicrocoded], (instrs FPREM1)>;
+
+// FRNDINT.
+def : InstRW<[WriteMicrocoded], (instrs FRNDINT)>;
+
+// FSCALE.
+def : InstRW<[WriteMicrocoded], (instrs FSCALE)>;
+
+// FXTRACT.
+def : InstRW<[WriteMicrocoded], (instrs FXTRACT)>;
+
+// FNOP.
+def : InstRW<[Zn2WriteFPU0Lat1], (instrs FNOP)>;
+
+// WAIT.
+def : InstRW<[Zn2WriteFPU0Lat1], (instrs WAIT)>;
+
+// FNCLEX.
+def : InstRW<[WriteMicrocoded], (instrs FNCLEX)>;
+
+// FNINIT.
+def : InstRW<[WriteMicrocoded], (instrs FNINIT)>;
+
+//=== Integer MMX and XMM Instructions ===//
+
+// PACKSSWB/DW.
+// mm <- mm.
+def Zn2WriteFPU12 : SchedWriteRes<[Zn2FPU12]> ;
+def Zn2WriteFPU12Y : SchedWriteRes<[Zn2FPU12]> {
+ let NumMicroOps = 2;
+}
+def Zn2WriteFPU12m : SchedWriteRes<[Zn2AGU, Zn2FPU12]> ;
+def Zn2WriteFPU12Ym : SchedWriteRes<[Zn2AGU, Zn2FPU12]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[Zn2WriteFPU12], (instrs MMX_PACKSSDWirr,
+ MMX_PACKSSWBirr,
+ MMX_PACKUSWBirr)>;
+def : InstRW<[Zn2WriteFPU12m], (instrs MMX_PACKSSDWirm,
+ MMX_PACKSSWBirm,
+ MMX_PACKUSWBirm)>;
+
+// VPMOVSX/ZX BW BD BQ WD WQ DQ.
+// y <- x.
+def : InstRW<[Zn2WriteFPU12Y], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrr")>;
+def : InstRW<[Zn2WriteFPU12Ym], (instregex "VPMOV(SX|ZX)(BW|BD|BQ|WD|WQ|DQ)Yrm")>;
+
+def Zn2WriteFPU013 : SchedWriteRes<[Zn2FPU013]> ;
+def Zn2WriteFPU013Y : SchedWriteRes<[Zn2FPU013]> ;
+def Zn2WriteFPU013m : SchedWriteRes<[Zn2AGU, Zn2FPU013]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def Zn2WriteFPU013Ld : SchedWriteRes<[Zn2AGU, Zn2FPU013]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+def Zn2WriteFPU013LdY : SchedWriteRes<[Zn2AGU, Zn2FPU013]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+}
+
+// PBLENDW.
+// x,x,i / v,v,v,i
+def : InstRW<[Zn2WriteFPU013], (instregex "(V?)PBLENDWrri")>;
+// ymm
+def : InstRW<[Zn2WriteFPU013Y], (instrs VPBLENDWYrri)>;
+
+// x,m,i / v,v,m,i
+def : InstRW<[Zn2WriteFPU013Ld], (instregex "(V?)PBLENDWrmi")>;
+// y,m,i
+def : InstRW<[Zn2WriteFPU013LdY], (instrs VPBLENDWYrmi)>;
+
+def Zn2WriteFPU01 : SchedWriteRes<[Zn2FPU01]> ;
+def Zn2WriteFPU01Y : SchedWriteRes<[Zn2FPU01]> {
+ let NumMicroOps = 2;
+}
+
+// VPBLENDD.
+// v,v,v,i.
+def : InstRW<[Zn2WriteFPU01], (instrs VPBLENDDrri)>;
+// ymm
+def : InstRW<[Zn2WriteFPU01Y], (instrs VPBLENDDYrri)>;
+
+// v,v,m,i
+def Zn2WriteFPU01Op2 : SchedWriteRes<[Zn2AGU, Zn2FPU01]> {
+ let NumMicroOps = 2;
+ let Latency = 8;
+ let ResourceCycles = [1, 2];
+}
+def Zn2WriteFPU01Op2Y : SchedWriteRes<[Zn2AGU, Zn2FPU01]> {
+ let NumMicroOps = 2;
+ let Latency = 9;
+ let ResourceCycles = [1, 3];
+}
+def : InstRW<[Zn2WriteFPU01Op2], (instrs VPBLENDDrmi)>;
+def : InstRW<[Zn2WriteFPU01Op2Y], (instrs VPBLENDDYrmi)>;
+
+// MASKMOVQ.
+def : InstRW<[WriteMicrocoded], (instregex "MMX_MASKMOVQ(64)?")>;
+
+// MASKMOVDQU.
+def : InstRW<[WriteMicrocoded], (instregex "(V?)MASKMOVDQU(64)?")>;
+
+// VPMASKMOVD.
+// ymm
+def : InstRW<[WriteMicrocoded],
+ (instregex "VPMASKMOVD(Y?)rm")>;
+// m, v,v.
+def : InstRW<[WriteMicrocoded], (instregex "VPMASKMOV(D|Q)(Y?)mr")>;
+
+// VPBROADCAST B/W.
+// x, m8/16.
+def Zn2WriteVPBROADCAST128Ld : SchedWriteRes<[Zn2AGU, Zn2FPU12]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[Zn2WriteVPBROADCAST128Ld],
+ (instregex "VPBROADCAST(B|W)rm")>;
+
+// y, m8/16
+def Zn2WriteVPBROADCAST256Ld : SchedWriteRes<[Zn2AGU, Zn2FPU1]> {
+ let Latency = 8;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[Zn2WriteVPBROADCAST256Ld],
+ (instregex "VPBROADCAST(B|W)Yrm")>;
+
+// VPGATHER.
+def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+// PHADD|PHSUB (S) W/D.
+def : SchedAlias<WritePHAdd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePHAddLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePHAddX, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePHAddXLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePHAddY, Zn2WriteMicrocoded>;
+def : SchedAlias<WritePHAddYLd, Zn2WriteMicrocoded>;
+
+// PCMPGTQ.
+def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>;
+def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
+
+// x <- x,m.
+def Zn2WritePCMPGTQm : SchedWriteRes<[Zn2AGU, Zn2FPU03]> {
+ let Latency = 8;
+}
+// ymm.
+def Zn2WritePCMPGTQYm : SchedWriteRes<[Zn2AGU, Zn2FPU03]> {
+ let Latency = 8;
+}
+def : InstRW<[Zn2WritePCMPGTQm], (instregex "(V?)PCMPGTQrm")>;
+def : InstRW<[Zn2WritePCMPGTQYm], (instrs VPCMPGTQYrm)>;
+
+//-- Logic instructions --//
+
+// PSLL,PSRL,PSRA W/D/Q.
+// x,x / v,v,x.
+def Zn2WritePShift : SchedWriteRes<[Zn2FPU2]> ;
+def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> ;
+
+// PSLL,PSRL DQ.
+def : InstRW<[Zn2WritePShift], (instregex "(V?)PS(R|L)LDQri")>;
+def : InstRW<[Zn2WritePShiftY], (instregex "(V?)PS(R|L)LDQYri")>;
+
+//=== Floating Point XMM and YMM Instructions ===//
+//-- Move instructions --//
+
+// VPERM2F128.
+def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rr)>;
+def : InstRW<[WriteMicrocoded], (instrs VPERM2F128rm)>;
+
+def Zn2WriteBROADCAST : SchedWriteRes<[Zn2AGU, Zn2FPU13]> {
+ let NumMicroOps = 2;
+ let Latency = 8;
+}
+// VBROADCASTF128.
+def : InstRW<[Zn2WriteBROADCAST], (instrs VBROADCASTF128)>;
+
+// EXTRACTPS.
+// r32,x,i.
+def Zn2WriteEXTRACTPSr : SchedWriteRes<[Zn2FPU12, Zn2FPU2]> {
+ let Latency = 2;
+ let ResourceCycles = [1, 2];
+}
+def : InstRW<[Zn2WriteEXTRACTPSr], (instregex "(V?)EXTRACTPSrr")>;
+
+def Zn2WriteEXTRACTPSm : SchedWriteRes<[Zn2AGU,Zn2FPU12, Zn2FPU2]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [5, 1, 2];
+}
+// m32,x,i.
+def : InstRW<[Zn2WriteEXTRACTPSm], (instregex "(V?)EXTRACTPSmr")>;
+
+// VEXTRACTF128.
+// x,y,i.
+def : InstRW<[Zn2WriteFPU013], (instrs VEXTRACTF128rr)>;
+
+// m128,y,i.
+def : InstRW<[Zn2WriteFPU013m], (instrs VEXTRACTF128mr)>;
+
+def Zn2WriteVINSERT128r: SchedWriteRes<[Zn2FPU013]> {
+ let Latency = 2;
+// let ResourceCycles = [2];
+}
+def Zn2WriteVINSERT128Ld: SchedWriteRes<[Zn2AGU,Zn2FPU013]> {
+ let Latency = 9;
+ let NumMicroOps = 2;
+}
+// VINSERTF128.
+// y,y,x,i.
+def : InstRW<[Zn2WriteVINSERT128r], (instrs VINSERTF128rr)>;
+def : InstRW<[Zn2WriteVINSERT128Ld], (instrs VINSERTF128rm)>;
+
+// VGATHER.
+def : InstRW<[WriteMicrocoded], (instregex "VGATHER(Q|D)(PD|PS)(Y?)rm")>;
+
+//-- Conversion instructions --//
+def Zn2WriteCVTPD2PSr: SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+def Zn2WriteCVTPD2PSYr: SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+
+// CVTPD2PS.
+// x,x.
+def : SchedAlias<WriteCvtPD2PS, Zn2WriteCVTPD2PSr>;
+// y,y.
+def : SchedAlias<WriteCvtPD2PSY, Zn2WriteCVTPD2PSYr>;
+// z,z.
+defm : X86WriteResUnsupported<WriteCvtPD2PSZ>;
+
+def Zn2WriteCVTPD2PSLd: SchedWriteRes<[Zn2AGU,Zn2FPU03]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+// x,m128.
+def : SchedAlias<WriteCvtPD2PSLd, Zn2WriteCVTPD2PSLd>;
+
+// x,m256.
+def Zn2WriteCVTPD2PSYLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 10;
+}
+def : SchedAlias<WriteCvtPD2PSYLd, Zn2WriteCVTPD2PSYLd>;
+// z,m512
+defm : X86WriteResUnsupported<WriteCvtPD2PSZLd>;
+
+// CVTSD2SS.
+// x,x.
+// Same as WriteCVTPD2PSr
+def : SchedAlias<WriteCvtSD2SS, Zn2WriteCVTPD2PSr>;
+
+// x,m64.
+def : SchedAlias<WriteCvtSD2SSLd, Zn2WriteCVTPD2PSLd>;
+
+// CVTPS2PD.
+// x,x.
+def Zn2WriteCVTPS2PDr : SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteCvtPS2PD, Zn2WriteCVTPS2PDr>;
+
+// x,m64.
+// y,m128.
+def Zn2WriteCVTPS2PDLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteCvtPS2PDLd, Zn2WriteCVTPS2PDLd>;
+def : SchedAlias<WriteCvtPS2PDYLd, Zn2WriteCVTPS2PDLd>;
+defm : X86WriteResUnsupported<WriteCvtPS2PDZLd>;
+
+// y,x.
+def Zn2WriteVCVTPS2PDY : SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteCvtPS2PDY, Zn2WriteVCVTPS2PDY>;
+defm : X86WriteResUnsupported<WriteCvtPS2PDZ>;
+
+// CVTSS2SD.
+// x,x.
+def Zn2WriteCVTSS2SDr : SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 3;
+}
+def : SchedAlias<WriteCvtSS2SD, Zn2WriteCVTSS2SDr>;
+
+// x,m32.
+def Zn2WriteCVTSS2SDLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 2];
+}
+def : SchedAlias<WriteCvtSS2SDLd, Zn2WriteCVTSS2SDLd>;
+
+def Zn2WriteCVTDQ2PDr: SchedWriteRes<[Zn2FPU12,Zn2FPU3]> {
+ let Latency = 3;
+}
+// CVTDQ2PD.
+// x,x.
+def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>;
+
+// Same as xmm
+// y,x.
+def : InstRW<[Zn2WriteCVTDQ2PDr], (instrs VCVTDQ2PDYrr)>;
+def : InstRW<[Zn2WriteCVTDQ2PDr], (instrs VCVTDQ2PSYrr)>;
+
+def Zn2WriteCVTPD2DQr: SchedWriteRes<[Zn2FPU12, Zn2FPU3]> {
+ let Latency = 3;
+}
+// CVT(T)PD2DQ.
+// x,x.
+def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)PD2DQrr")>;
+
+def Zn2WriteCVTPD2DQLd: SchedWriteRes<[Zn2AGU,Zn2FPU12,Zn2FPU3]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+}
+// x,m128.
+def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "(V?)CVT(T?)PD2DQrm")>;
+// same as xmm handling
+// x,y.
+def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "VCVT(T?)PD2DQYrr")>;
+// x,m256.
+def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "VCVT(T?)PD2DQYrm")>;
+
+def Zn2WriteCVTPS2PIr: SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 4;
+}
+// CVT(T)PS2PI.
+// mm,x.
+def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PS2PIirr")>;
+
+// CVTPI2PD.
+// x,mm.
+def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>;
+
+// CVT(T)PD2PI.
+// mm,x.
+def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>;
+
+def Zn2WriteCVSTSI2SSr: SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 4;
+}
+
+// same as CVTPD2DQr
+// CVT(T)SS2SI.
+// r32,x.
+def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>;
+// same as CVTPD2DQm
+// r32,m32.
+def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>;
+
+def Zn2WriteCVSTSI2SDr: SchedWriteRes<[Zn2FPU013, Zn2FPU3]> {
+ let Latency = 4;
+}
+// CVTSI2SD.
+// x,r32/64.
+def : InstRW<[Zn2WriteCVSTSI2SDr], (instregex "(V?)CVTSI(64)?2SDrr")>;
+
+
+def Zn2WriteCVSTSI2SIr: SchedWriteRes<[Zn2FPU3, Zn2FPU2]> {
+ let Latency = 4;
+}
+def Zn2WriteCVSTSI2SILd: SchedWriteRes<[Zn2AGU, Zn2FPU3, Zn2FPU2]> {
+ let Latency = 11;
+}
+// CVTSD2SI.
+// r32/64
+def : InstRW<[Zn2WriteCVSTSI2SIr], (instregex "(V?)CVT(T?)SD2SI(64)?rr")>;
+// r32,m32.
+def : InstRW<[Zn2WriteCVSTSI2SILd], (instregex "(V?)CVT(T?)SD2SI(64)?rm")>;
+
+// VCVTPS2PH.
+// x,v,i.
+def : SchedAlias<WriteCvtPS2PH, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCvtPS2PHY, Zn2WriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZ>;
+// m,v,i.
+def : SchedAlias<WriteCvtPS2PHSt, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCvtPS2PHYSt, Zn2WriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPS2PHZSt>;
+
+// VCVTPH2PS.
+// v,x.
+def : SchedAlias<WriteCvtPH2PS, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCvtPH2PSY, Zn2WriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZ>;
+// v,m.
+def : SchedAlias<WriteCvtPH2PSLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteCvtPH2PSYLd, Zn2WriteMicrocoded>;
+defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
+
+//-- SSE4A instructions --//
+// EXTRQ
+def Zn2WriteEXTRQ: SchedWriteRes<[Zn2FPU12, Zn2FPU2]> {
+ let Latency = 2;
+}
+def : InstRW<[Zn2WriteEXTRQ], (instregex "EXTRQ")>;
+
+// INSERTQ
+def Zn2WriteINSERTQ: SchedWriteRes<[Zn2FPU03,Zn2FPU1]> {
+ let Latency = 4;
+}
+def : InstRW<[Zn2WriteINSERTQ], (instregex "INSERTQ")>;
+
+//-- SHA instructions --//
+// SHA256MSG2
+def : InstRW<[WriteMicrocoded], (instregex "SHA256MSG2(Y?)r(r|m)")>;
+
+// SHA1MSG1, SHA256MSG1
+// x,x.
+def Zn2WriteSHA1MSG1r : SchedWriteRes<[Zn2FPU12]> {
+ let Latency = 2;
+}
+def : InstRW<[Zn2WriteSHA1MSG1r], (instregex "SHA(1|256)MSG1rr")>;
+// x,m.
+def Zn2WriteSHA1MSG1Ld : SchedWriteRes<[Zn2AGU, Zn2FPU12]> {
+ let Latency = 9;
+}
+def : InstRW<[Zn2WriteSHA1MSG1Ld], (instregex "SHA(1|256)MSG1rm")>;
+
+// SHA1MSG2
+// x,x.
+def Zn2WriteSHA1MSG2r : SchedWriteRes<[Zn2FPU12]> ;
+def : InstRW<[Zn2WriteSHA1MSG2r], (instregex "SHA1MSG2rr")>;
+// x,m.
+def Zn2WriteSHA1MSG2Ld : SchedWriteRes<[Zn2AGU, Zn2FPU12]> {
+ let Latency = 8;
+}
+def : InstRW<[Zn2WriteSHA1MSG2Ld], (instregex "SHA1MSG2rm")>;
+
+// SHA1NEXTE
+// x,x.
+def Zn2WriteSHA1NEXTEr : SchedWriteRes<[Zn2FPU1]> ;
+def : InstRW<[Zn2WriteSHA1NEXTEr], (instregex "SHA1NEXTErr")>;
+// x,m.
+def Zn2WriteSHA1NEXTELd : SchedWriteRes<[Zn2AGU, Zn2FPU1]> {
+ let Latency = 8;
+}
+def : InstRW<[Zn2WriteSHA1NEXTELd], (instregex "SHA1NEXTErm")>;
+
+// SHA1RNDS4
+// x,x.
+def Zn2WriteSHA1RNDS4r : SchedWriteRes<[Zn2FPU1]> {
+ let Latency = 6;
+}
+def : InstRW<[Zn2WriteSHA1RNDS4r], (instregex "SHA1RNDS4rr")>;
+// x,m.
+def Zn2WriteSHA1RNDS4Ld : SchedWriteRes<[Zn2AGU, Zn2FPU1]> {
+ let Latency = 13;
+}
+def : InstRW<[Zn2WriteSHA1RNDS4Ld], (instregex "SHA1RNDS4rm")>;
+
+// SHA256RNDS2
+// x,x.
+def Zn2WriteSHA256RNDS2r : SchedWriteRes<[Zn2FPU1]> {
+ let Latency = 4;
+}
+def : InstRW<[Zn2WriteSHA256RNDS2r], (instregex "SHA256RNDS2rr")>;
+// x,m.
+def Zn2WriteSHA256RNDS2Ld : SchedWriteRes<[Zn2AGU, Zn2FPU1]> {
+ let Latency = 11;
+}
+def : InstRW<[Zn2WriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>;
+
+//-- Arithmetic instructions --//
+
+// HADD, HSUB PS/PD
+def : SchedAlias<WriteFHAdd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteFHAddLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteFHAddY, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteFHAddYLd, Zn2WriteMicrocoded>;
+
+// VDIVPS.
+// TODO - convert to Zn2WriteResFpuPair
+// y,y,y.
+def Zn2WriteVDIVPSYr : SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 10;
+ let ResourceCycles = [10];
+}
+def : SchedAlias<WriteFDivY, Zn2WriteVDIVPSYr>;
+
+// y,y,m256.
+def Zn2WriteVDIVPSYLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 17;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1, 17];
+}
+def : SchedAlias<WriteFDivYLd, Zn2WriteVDIVPSYLd>;
+
+// VDIVPD.
+// TODO - convert to Zn2WriteResFpuPair
+// y,y,y.
+def Zn2WriteVDIVPDY : SchedWriteRes<[Zn2FPU3]> {
+ let Latency = 13;
+ let ResourceCycles = [13];
+}
+def : SchedAlias<WriteFDiv64Y, Zn2WriteVDIVPDY>;
+
+// y,y,m256.
+def Zn2WriteVDIVPDYLd : SchedWriteRes<[Zn2AGU, Zn2FPU3]> {
+ let Latency = 20;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,20];
+}
+def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>;
+
+// DPPS.
+// x,x,i / v,v,v,i.
+def : SchedAlias<WriteDPPS, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteDPPSY, Zn2WriteMicrocoded>;
+
+// x,m,i / v,v,m,i.
+def : SchedAlias<WriteDPPSLd, Zn2WriteMicrocoded>;
+def : SchedAlias<WriteDPPSYLd,Zn2WriteMicrocoded>;
+
+// DPPD.
+// x,x,i.
+def : SchedAlias<WriteDPPD, Zn2WriteMicrocoded>;
+
+// x,m,i.
+def : SchedAlias<WriteDPPDLd, Zn2WriteMicrocoded>;
+
+// RSQRTSS
+// TODO - convert to Zn2WriteResFpuPair
+// x,x.
+def Zn2WriteRSQRTSSr : SchedWriteRes<[Zn2FPU02]> {
+ let Latency = 5;
+}
+def : SchedAlias<WriteFRsqrt, Zn2WriteRSQRTSSr>;
+
+// x,m128.
+def Zn2WriteRSQRTSSLd: SchedWriteRes<[Zn2AGU, Zn2FPU02]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,2];
+}
+def : SchedAlias<WriteFRsqrtLd, Zn2WriteRSQRTSSLd>;
+
+// RSQRTPS
+// TODO - convert to Zn2WriteResFpuPair
+// y,y.
+def Zn2WriteRSQRTPSYr : SchedWriteRes<[Zn2FPU01]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+def : SchedAlias<WriteFRsqrtY, Zn2WriteRSQRTPSYr>;
+
+// y,m256.
+def Zn2WriteRSQRTPSYLd : SchedWriteRes<[Zn2AGU, Zn2FPU01]> {
+ let Latency = 12;
+ let NumMicroOps = 2;
+}
+def : SchedAlias<WriteFRsqrtYLd, Zn2WriteRSQRTPSYLd>;
+
+//-- Other instructions --//
+
+// VZEROUPPER.
+def : InstRW<[WriteALU], (instrs VZEROUPPER)>;
+
+// VZEROALL.
+def : InstRW<[WriteMicrocoded], (instrs VZEROALL)>;
+
+} // SchedModel
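
Each SchedWriteRes record above encodes an instruction's cost on Zen 2 as a latency, a micro-op count, and per-unit ResourceCycles over pipes such as Zn2AGU and Zn2FPU0-3, attached either per instruction with InstRW or per scheduling class with SchedAlias. A minimal standalone sketch (plain C++, not an LLVM API) of the intuition behind ResourceCycles: the unit kept busy longest roughly bounds how often the instruction can repeat, using the [1, 20] cycles of the Zn2WriteVDIVPDYLd record above as the example input.

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  // Toy model: the busiest resource limits the best-case repeat rate.
  static unsigned bottleneckCycles(const std::vector<unsigned> &ResourceCycles) {
    return *std::max_element(ResourceCycles.begin(), ResourceCycles.end());
  }

  int main() {
    // Zn2WriteVDIVPDYLd keeps Zn2AGU busy for 1 cycle and Zn2FPU3 for 20 cycles,
    // so the divider pipe is the bottleneck.
    std::printf("bottleneck = %u cycles\n", bottleneckCycles({1, 20}));
  }
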
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index b8980789258e..9aa47c532e82 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -148,8 +148,8 @@ private:
/// Manages the predicate state traced through the program.
struct PredState {
- unsigned InitialReg;
- unsigned PoisonReg;
+ unsigned InitialReg = 0;
+ unsigned PoisonReg = 0;
const TargetRegisterClass *RC;
MachineSSAUpdater SSA;
@@ -158,10 +158,10 @@ private:
: RC(RC), SSA(MF) {}
};
- const X86Subtarget *Subtarget;
- MachineRegisterInfo *MRI;
- const X86InstrInfo *TII;
- const TargetRegisterInfo *TRI;
+ const X86Subtarget *Subtarget = nullptr;
+ MachineRegisterInfo *MRI = nullptr;
+ const X86InstrInfo *TII = nullptr;
+ const TargetRegisterInfo *TRI = nullptr;
Optional<PredState> PS;
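
The hunks above replace members that were left uninitialized until runOnMachineFunction with in-class default member initializers. A small self-contained illustration of the idiom, using hypothetical names, showing that default construction now yields well-defined values:

  #include <cassert>

  // Default member initializers make the zero/null state explicit instead of
  // relying on every constructor or setup routine to assign the fields.
  struct PredStateLike {
    unsigned InitialReg = 0;
    unsigned PoisonReg = 0;
  };

  int main() {
    PredStateLike S;   // deterministic even before any pass initialization
    assert(S.InitialReg == 0 && S.PoisonReg == 0);
  }
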
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index f8f78da52cc2..75c3a70b430a 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -324,8 +324,8 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
MaybeAlign StackAlignOverride,
unsigned PreferVectorWidthOverride,
unsigned RequiredVectorWidth)
- : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::None), TM(TM),
- TargetTriple(TT), StackAlignOverride(StackAlignOverride),
+ : X86GenSubtargetInfo(TT, CPU, FS), PICStyle(PICStyles::Style::None),
+ TM(TM), TargetTriple(TT), StackAlignOverride(StackAlignOverride),
PreferVectorWidthOverride(PreferVectorWidthOverride),
RequiredVectorWidth(RequiredVectorWidth),
In64BitMode(TargetTriple.getArch() == Triple::x86_64),
@@ -337,15 +337,15 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
FrameLowering(*this, getStackAlignment()) {
// Determine the PICStyle based on the target selected.
if (!isPositionIndependent())
- setPICStyle(PICStyles::None);
+ setPICStyle(PICStyles::Style::None);
else if (is64Bit())
- setPICStyle(PICStyles::RIPRel);
+ setPICStyle(PICStyles::Style::RIPRel);
else if (isTargetCOFF())
- setPICStyle(PICStyles::None);
+ setPICStyle(PICStyles::Style::None);
else if (isTargetDarwin())
- setPICStyle(PICStyles::StubPIC);
+ setPICStyle(PICStyles::Style::StubPIC);
else if (isTargetELF())
- setPICStyle(PICStyles::GOT);
+ setPICStyle(PICStyles::Style::GOT);
CallLoweringInfo.reset(new X86CallLowering(*getTargetLowering()));
Legalizer.reset(new X86LegalizerInfo(*this, TM));
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index e8efe8f2afe5..f4e8d30328ca 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -40,7 +40,7 @@ class GlobalValue;
///
namespace PICStyles {
-enum Style {
+enum class Style {
StubPIC, // Used on i386-darwin in pic mode.
GOT, // Used on 32 bit elf on when in pic mode.
RIPRel, // Used on X86-64 when in pic mode.
@@ -56,10 +56,7 @@ public:
enum X86ProcFamilyEnum {
Others,
IntelAtom,
- IntelSLM,
- IntelGLM,
- IntelGLP,
- IntelTRM
+ IntelSLM
};
protected:
@@ -256,9 +253,9 @@ protected:
/// mask over multiple fixed shuffles.
bool HasFastVariableShuffle = false;
- /// True if there is no performance penalty to writing only the lower parts
- /// of a YMM or ZMM register without clearing the upper part.
- bool HasFastPartialYMMorZMMWrite = false;
+ /// True if vzeroupper instructions should be inserted after code that uses
+ /// ymm or zmm registers.
+ bool InsertVZEROUPPER = false;
/// True if there is no performance penalty for writing NOPs with up to
/// 11 bytes.
@@ -445,9 +442,15 @@ protected:
/// Indicates target prefers 256 bit instructions.
bool Prefer256Bit = false;
+ /// Indicates target prefers AVX512 mask registers.
+ bool PreferMaskRegisters = false;
+
/// Threeway branch is profitable in this subtarget.
bool ThreewayBranchProfitable = false;
+ /// Use Goldmont specific floating point div/sqrt costs.
+ bool UseGLMDivSqrtCosts = false;
+
/// What processor and OS we're targeting.
Triple TargetTriple;
@@ -655,9 +658,7 @@ public:
bool hasFastVariableShuffle() const {
return HasFastVariableShuffle;
}
- bool hasFastPartialYMMorZMMWrite() const {
- return HasFastPartialYMMorZMMWrite;
- }
+ bool insertVZEROUPPER() const { return InsertVZEROUPPER; }
bool hasFastGather() const { return HasFastGather; }
bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
@@ -706,6 +707,8 @@ public:
return UseRetpolineIndirectBranches;
}
bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
+ bool preferMaskRegisters() const { return PreferMaskRegisters; }
+ bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
@@ -738,11 +741,6 @@ public:
/// TODO: to be removed later and replaced with suitable properties
bool isAtom() const { return X86ProcFamily == IntelAtom; }
bool isSLM() const { return X86ProcFamily == IntelSLM; }
- bool isGLM() const {
- return X86ProcFamily == IntelGLM ||
- X86ProcFamily == IntelGLP ||
- X86ProcFamily == IntelTRM;
- }
bool useSoftFloat() const { return UseSoftFloat; }
bool useAA() const override { return UseAA; }
@@ -801,11 +799,11 @@ public:
bool isTargetWin32() const { return !In64BitMode && isOSWindows(); }
- bool isPICStyleGOT() const { return PICStyle == PICStyles::GOT; }
- bool isPICStyleRIPRel() const { return PICStyle == PICStyles::RIPRel; }
+ bool isPICStyleGOT() const { return PICStyle == PICStyles::Style::GOT; }
+ bool isPICStyleRIPRel() const { return PICStyle == PICStyles::Style::RIPRel; }
bool isPICStyleStubPIC() const {
- return PICStyle == PICStyles::StubPIC;
+ return PICStyle == PICStyles::Style::StubPIC;
}
bool isPositionIndependent() const { return TM.isPositionIndependent(); }
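
PICStyles::Style is now a scoped enumeration, which is why every use site in X86Subtarget.cpp and the isPICStyle* helpers spell the full PICStyles::Style::GOT form. A short standalone reminder of what the enum-class switch changes, using placeholder names rather than the real PICStyles members:

  #include <cstdio>

  namespace Old { enum Style { GOT, RIPRel }; }        // unscoped: names leak, converts to int
  namespace New { enum class Style { GOT, RIPRel }; }  // scoped: qualified names, no implicit int

  int main() {
    int a = Old::GOT;                       // implicit conversion still allowed
    New::Style s = New::Style::RIPRel;      // must be qualified with Style::
    // int b = s;                           // would not compile: no implicit conversion
    std::printf("%d %d\n", a, static_cast<int>(s));
  }
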
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index c15297134e4d..8c696e9adbed 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -46,6 +46,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
#include "llvm/Target/TargetOptions.h"
+#include "llvm/Transforms/CFGuard.h"
#include <memory>
#include <string>
@@ -60,7 +61,7 @@ static cl::opt<bool> EnableCondBrFoldingPass("x86-condbr-folding",
"folding pass"),
cl::init(false), cl::Hidden);
-extern "C" void LLVMInitializeX86Target() {
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
// Register the target.
RegisterTargetMachine<X86TargetMachine> X(getTheX86_32Target());
RegisterTargetMachine<X86TargetMachine> Y(getTheX86_64Target());
@@ -229,9 +230,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
this->Options.NoTrapAfterNoreturn = TT.isOSBinFormatMachO();
}
- // Outlining is available for x86-64.
- if (TT.getArch() == Triple::x86_64)
- setMachineOutliner(true);
+ setMachineOutliner(true);
initAsmInfo();
}
@@ -414,6 +413,16 @@ void X86PassConfig::addIRPasses() {
// thunk. These will be a no-op unless a function subtarget has the retpoline
// feature enabled.
addPass(createIndirectBrExpandPass());
+
+ // Add Control Flow Guard checks.
+ const Triple &TT = TM->getTargetTriple();
+ if (TT.isOSWindows()) {
+ if (TT.getArch() == Triple::x86_64) {
+ addPass(createCFGuardDispatchPass());
+ } else {
+ addPass(createCFGuardCheckPass());
+ }
+ }
}
bool X86PassConfig::addInstSelector() {
@@ -530,6 +539,9 @@ void X86PassConfig::addPreEmitPass2() {
(!TT.isOSWindows() ||
MAI->getExceptionHandlingType() == ExceptionHandling::DwarfCFI))
addPass(createCFIInstrInserter());
+ // Identify valid longjmp targets for Windows Control Flow Guard.
+ if (TT.isOSWindows())
+ addPass(createCFGuardLongjmpPass());
}
std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
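
The addIRPasses hunk schedules the Windows Control Flow Guard passes based on the target triple: 64-bit Windows takes the dispatch-style lowering, 32-bit Windows the check-style one. A hedged sketch of how that triple test evaluates for two common Windows triples; the printed labels are descriptive only, not the registered pass names.

  #include "llvm/ADT/Triple.h"
  #include "llvm/Support/raw_ostream.h"

  int main() {
    for (const char *T : {"x86_64-pc-windows-msvc", "i686-pc-windows-msvc"}) {
      llvm::Triple TT(T);
      if (!TT.isOSWindows())
        continue;                            // CFGuard passes are Windows-only
      bool Is64 = TT.getArch() == llvm::Triple::x86_64;
      llvm::outs() << T << " -> " << (Is64 ? "dispatch-style" : "check-style")
                   << " CFGuard lowering\n";
    }
  }
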
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index 70fd857fcf01..b754836ea517 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -169,12 +169,13 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 2;
}
-int X86TTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty,
- TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
- TTI::OperandValueProperties Opd1PropInfo,
- TTI::OperandValueProperties Opd2PropInfo,
- ArrayRef<const Value *> Args) {
+int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::OperandValueKind Op1Info,
+ TTI::OperandValueKind Op2Info,
+ TTI::OperandValueProperties Opd1PropInfo,
+ TTI::OperandValueProperties Opd2PropInfo,
+ ArrayRef<const Value *> Args,
+ const Instruction *CxtI) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -188,7 +189,7 @@ int X86TTIImpl::getArithmeticInstrCost(
{ ISD::FDIV, MVT::v2f64, 65 }, // divpd
};
- if (ST->isGLM())
+ if (ST->useGLMDivSqrtCosts())
if (const auto *Entry = CostTableLookup(GLMCostTable, ISD,
LT.second))
return LT.first * Entry->Cost;
@@ -280,7 +281,7 @@ int X86TTIImpl::getArithmeticInstrCost(
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- if (ISD == ISD::UREM)
+ else // UREM
return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -1389,6 +1390,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
@@ -1397,6 +1399,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 },
{ ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 },
{ ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
@@ -1550,6 +1553,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 6 },
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i64, 1 }, // PSHUFB
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
};
@@ -1576,9 +1580,14 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
+
{ ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
+ { ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
+
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
@@ -2199,7 +2208,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
MVT MTy = LT.second;
// Attempt to lookup cost.
- if (ST->isGLM())
+ if (ST->useGLMDivSqrtCosts())
if (const auto *Entry = CostTableLookup(GLMCostTbl, ISD, MTy))
return LT.first * Entry->Cost;
@@ -2374,6 +2383,13 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
}
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+ static const CostTblEntry SLMCostTbl[] = {
+ { ISD::EXTRACT_VECTOR_ELT, MVT::i8, 4 },
+ { ISD::EXTRACT_VECTOR_ELT, MVT::i16, 4 },
+ { ISD::EXTRACT_VECTOR_ELT, MVT::i32, 4 },
+ { ISD::EXTRACT_VECTOR_ELT, MVT::i64, 7 }
+ };
+
assert(Val->isVectorTy() && "This must be a vector type");
Type *ScalarType = Val->getScalarType();
@@ -2390,9 +2406,22 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
unsigned Width = LT.second.getVectorNumElements();
Index = Index % Width;
- // Floating point scalars are already located in index #0.
- if (ScalarType->isFloatingPointTy() && Index == 0)
- return 0;
+ if (Index == 0) {
+ // Floating point scalars are already located in index #0.
+ if (ScalarType->isFloatingPointTy())
+ return 0;
+
+ // Assume movd/movq XMM <-> GPR is relatively cheap on all targets.
+ if (ScalarType->isIntegerTy())
+ return 1;
+ }
+
+ int ISD = TLI->InstructionOpcodeToISD(Opcode);
+ assert(ISD && "Unexpected vector opcode");
+ MVT MScalarTy = LT.second.getScalarType();
+ if (ST->isSLM())
+ if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
+ return LT.first * Entry->Cost;
}
// Add to the base cost if we know that the extracted element of a vector is
@@ -2404,8 +2433,9 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
-int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace, const Instruction *I) {
+int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+ MaybeAlign Alignment, unsigned AddressSpace,
+ const Instruction *I) {
// Handle non-power-of-two vectors such as <3 x float>
if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
unsigned NumElem = VTy->getVectorNumElements();
@@ -2456,7 +2486,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
if (!SrcVTy)
// To calculate scalar take the regular cost, without mask
- return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace);
+ return getMemoryOpCost(Opcode, SrcTy, MaybeAlign(Alignment), AddressSpace);
unsigned NumElem = SrcVTy->getVectorNumElements();
VectorType *MaskTy =
@@ -2474,7 +2504,7 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- Alignment, AddressSpace);
+ MaybeAlign(Alignment), AddressSpace);
return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}
@@ -2533,6 +2563,11 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
// We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
// and make it as the cost.
+ static const CostTblEntry SLMCostTblPairWise[] = {
+ { ISD::FADD, MVT::v2f64, 3 },
+ { ISD::ADD, MVT::v2i64, 5 },
+ };
+
static const CostTblEntry SSE2CostTblPairWise[] = {
{ ISD::FADD, MVT::v2f64, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
@@ -2558,6 +2593,11 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
{ ISD::ADD, MVT::v32i8, 4 },
};
+ static const CostTblEntry SLMCostTblNoPairWise[] = {
+ { ISD::FADD, MVT::v2f64, 3 },
+ { ISD::ADD, MVT::v2i64, 5 },
+ };
+
static const CostTblEntry SSE2CostTblNoPairWise[] = {
{ ISD::FADD, MVT::v2f64, 2 },
{ ISD::FADD, MVT::v4f32, 4 },
@@ -2594,6 +2634,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
if (VT.isSimple()) {
MVT MTy = VT.getSimpleVT();
if (IsPairwise) {
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy))
+ return Entry->Cost;
+
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
return Entry->Cost;
@@ -2602,6 +2646,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
return Entry->Cost;
} else {
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
+
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
return Entry->Cost;
@@ -2617,6 +2665,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
MVT MTy = LT.second;
if (IsPairwise) {
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
return LT.first * Entry->Cost;
@@ -2625,6 +2677,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
return LT.first * Entry->Cost;
} else {
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+ return LT.first * Entry->Cost;
+
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
return LT.first * Entry->Cost;
@@ -2634,6 +2690,24 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
return LT.first * Entry->Cost;
}
+ // FIXME: These assume a naive kshift+binop lowering, which is probably
+ // conservative in most cases.
+ // FIXME: This doesn't cost large types like v128i1 correctly.
+ static const CostTblEntry AVX512BoolReduction[] = {
+ { ISD::AND, MVT::v2i1, 3 },
+ { ISD::AND, MVT::v4i1, 5 },
+ { ISD::AND, MVT::v8i1, 7 },
+ { ISD::AND, MVT::v16i1, 9 },
+ { ISD::AND, MVT::v32i1, 11 },
+ { ISD::AND, MVT::v64i1, 13 },
+ { ISD::OR, MVT::v2i1, 3 },
+ { ISD::OR, MVT::v4i1, 5 },
+ { ISD::OR, MVT::v8i1, 7 },
+ { ISD::OR, MVT::v16i1, 9 },
+ { ISD::OR, MVT::v32i1, 11 },
+ { ISD::OR, MVT::v64i1, 13 },
+ };
+
static const CostTblEntry AVX2BoolReduction[] = {
{ ISD::AND, MVT::v16i16, 2 }, // vpmovmskb + cmp
{ ISD::AND, MVT::v32i8, 2 }, // vpmovmskb + cmp
@@ -2664,7 +2738,10 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
};
// Handle bool allof/anyof patterns.
- if (ValTy->getVectorElementType()->isIntegerTy(1)) {
+ if (!IsPairwise && ValTy->getVectorElementType()->isIntegerTy(1)) {
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
+ return LT.first * Entry->Cost;
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
return LT.first * Entry->Cost;
@@ -2956,7 +3033,7 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
return std::max(1, Cost);
}
-int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
+int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
Type *Ty) {
assert(Ty->isIntegerTy());
@@ -3053,8 +3130,8 @@ int X86TTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
return X86TTIImpl::getIntImmCost(Imm, Ty);
}
-int X86TTIImpl::getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty) {
+int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3164,7 +3241,7 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
? ST->getGatherOverhead()
: ST->getScatterOverhead();
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- Alignment, AddressSpace);
+ MaybeAlign(Alignment), AddressSpace);
}
/// Return the cost of full scalarization of gather / scatter operation.
@@ -3194,7 +3271,7 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
// The cost of the scalar loads/stores.
int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- Alignment, AddressSpace);
+ MaybeAlign(Alignment), AddressSpace);
int InsertExtractCost = 0;
if (Opcode == Instruction::Load)
@@ -3224,8 +3301,10 @@ int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
unsigned AddressSpace = PtrTy->getAddressSpace();
bool Scalarize = false;
- if ((Opcode == Instruction::Load && !isLegalMaskedGather(SrcVTy)) ||
- (Opcode == Instruction::Store && !isLegalMaskedScatter(SrcVTy)))
+ if ((Opcode == Instruction::Load &&
+ !isLegalMaskedGather(SrcVTy, MaybeAlign(Alignment))) ||
+ (Opcode == Instruction::Store &&
+ !isLegalMaskedScatter(SrcVTy, MaybeAlign(Alignment))))
Scalarize = true;
// Gather / Scatter for vector 2 is not profitable on KNL / SKX
// Vector-4 of gather/scatter instruction does not exist on KNL.
@@ -3348,7 +3427,7 @@ bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
return isLegalMaskedExpandLoad(DataTy);
}
-bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
// Some CPUs have better gather performance than others.
// TODO: Remove the explicit ST->hasAVX512()?, That would mean we would only
// enable gather with a -march.
@@ -3386,11 +3465,11 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy) {
return IntWidth == 32 || IntWidth == 64;
}
-bool X86TTIImpl::isLegalMaskedScatter(Type *DataType) {
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
// AVX2 doesn't support scatter
if (!ST->hasAVX512())
return false;
- return isLegalMaskedGather(DataType);
+ return isLegalMaskedGather(DataType, Alignment);
}
bool X86TTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
@@ -3443,10 +3522,9 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
// version is not as fast for three way compare (see #33329).
const unsigned PreferredWidth = ST->getPreferVectorWidth();
if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
- if (PreferredWidth >= 256 && ST->hasAVX2()) Options.LoadSizes.push_back(32);
+ if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
- // All GPR and vector loads can be unaligned. SIMD compare requires integer
- // vectors (SSE2/AVX2).
+ // All GPR and vector loads can be unaligned.
Options.AllowOverlappingLoads = true;
}
if (ST->is64Bit()) {
@@ -3520,8 +3598,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
// Get the cost of one memory operation.
Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
LegalVT.getVectorNumElements());
- unsigned MemOpCost =
- getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+ unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
+ MaybeAlign(Alignment), AddressSpace);
VectorType *VT = VectorType::get(ScalarTy, VF);
EVT ETy = TLI->getValueType(DL, VT);
@@ -3620,8 +3698,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
// Get the cost of one memory operation.
Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
LegalVT.getVectorNumElements());
- unsigned MemOpCost =
- getMemoryOpCost(Opcode, SingleMemOpTy, Alignment, AddressSpace);
+ unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
+ MaybeAlign(Alignment), AddressSpace);
unsigned VF = VecTy->getVectorNumElements() / Factor;
MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
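
Much of the mechanical churn in this file comes from getMemoryOpCost switching its alignment parameter from unsigned to MaybeAlign, with call sites wrapping the old value as MaybeAlign(Alignment). A small hedged sketch of what that wrapper means, assuming the LLVM 10 llvm/Support/Alignment.h interface; isKnownAligned is an illustrative helper, not an LLVM function.

  #include "llvm/Support/Alignment.h"
  #include <cstdint>

  // MaybeAlign turns the legacy "0 means unknown alignment" convention into an
  // explicitly empty value instead of a magic number.
  static bool isKnownAligned(unsigned LegacyAlign, uint64_t Want) {
    llvm::MaybeAlign MA(LegacyAlign);   // MaybeAlign(0) carries no alignment
    return MA && MA->value() >= Want;   // otherwise behaves like Optional<Align>
  }
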
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index 7581257f41f8..b9c2dbd78058 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -51,7 +51,6 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureFastBEXTR,
X86::FeatureFastHorizontalOps,
X86::FeatureFastLZCNT,
- X86::FeatureFastPartialYMMorZMMWrite,
X86::FeatureFastScalarFSQRT,
X86::FeatureFastSHLDRotate,
X86::FeatureFastScalarShiftMasks,
@@ -77,6 +76,9 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
X86::FeatureSlowSHLD,
X86::FeatureSlowTwoMemOps,
X86::FeatureSlowUAMem16,
+ X86::FeaturePreferMaskRegisters,
+ X86::FeatureInsertVZEROUPPER,
+ X86::FeatureUseGLMDivSqrtCosts,
// Perf-tuning flags.
X86::FeatureHasFastGather,
@@ -88,10 +90,7 @@ class X86TTIImpl : public BasicTTIImplBase<X86TTIImpl> {
// CPU name enums. These just follow CPU string.
X86::ProcIntelAtom,
- X86::ProcIntelGLM,
- X86::ProcIntelGLP,
X86::ProcIntelSLM,
- X86::ProcIntelTRM,
};
public:
@@ -126,14 +125,15 @@ public:
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
- ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+ ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
+ const Instruction *CxtI = nullptr);
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
- int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+ int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace, const Instruction *I = nullptr);
int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
unsigned AddressSpace);
@@ -179,9 +179,9 @@ public:
unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
- int getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
- int getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
+ Type *Ty);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
bool canMacroFuseCmp();
@@ -189,8 +189,8 @@ public:
bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment);
bool isLegalNTLoad(Type *DataType, Align Alignment);
bool isLegalNTStore(Type *DataType, Align Alignment);
- bool isLegalMaskedGather(Type *DataType);
- bool isLegalMaskedScatter(Type *DataType);
+ bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment);
+ bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment);
bool isLegalMaskedExpandLoad(Type *DataType);
bool isLegalMaskedCompressStore(Type *DataType);
bool hasDivRemOp(Type *DataType, bool IsSigned);
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
index 9280d030b5d5..7a8308ef1ba9 100644
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -279,7 +279,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
- if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
+ if (!ST.hasAVX() || !ST.insertVZEROUPPER())
return false;
TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
diff --git a/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
index ae72c6427588..42e8fba2201e 100644
--- a/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -54,14 +54,14 @@ private:
/// Lower a WinAlloca instruction.
void lower(MachineInstr* MI, Lowering L);
- MachineRegisterInfo *MRI;
- const X86Subtarget *STI;
- const TargetInstrInfo *TII;
- const X86RegisterInfo *TRI;
- unsigned StackPtr;
- unsigned SlotSize;
- int64_t StackProbeSize;
- bool NoStackArgProbe;
+ MachineRegisterInfo *MRI = nullptr;
+ const X86Subtarget *STI = nullptr;
+ const TargetInstrInfo *TII = nullptr;
+ const X86RegisterInfo *TRI = nullptr;
+ unsigned StackPtr = 0;
+ unsigned SlotSize = 0;
+ int64_t StackProbeSize = 0;
+ bool NoStackArgProbe = false;
StringRef getPassName() const override { return "X86 WinAlloca Expander"; }
static char ID;
diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp
index d65e1f3ab414..78d3f6460189 100644
--- a/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -23,7 +23,8 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Module.h"
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
@@ -91,7 +92,7 @@ private:
EHPersonality Personality = EHPersonality::Unknown;
Function *PersonalityFn = nullptr;
bool UseStackGuard = false;
- int ParentBaseState;
+ int ParentBaseState = 0;
FunctionCallee SehLongjmpUnwind = nullptr;
Constant *Cookie = nullptr;
@@ -178,11 +179,6 @@ bool WinEHStatePass::runOnFunction(Function &F) {
{Int8PtrType, Type::getInt32Ty(TheModule->getContext())},
/*isVarArg=*/true));
- // Disable frame pointer elimination in this function.
- // FIXME: Do the nested handlers need to keep the parent ebp in ebp, or can we
- // use an arbitrary register?
- F.addFnAttr("no-frame-pointer-elim", "true");
-
emitExceptionRegistrationRecord(&F);
// The state numbers calculated here in IR must agree with what we calculate